Python-сообщество

sp3 · Май 21, 2011 00:58:46

Здравствуйте.
Мне нужно скачать файл со странички http://forums.thinkingwithportals.com/downloads.php?view=detail&df_id=470
Проблема в том, что прямой ссылки нет, а есть кнопка download

<form action="./downloads.php" method="post" name="download" > 
				
				<input type="hidden" name="file_version" value="0" />&nbsp;&raquo;&nbsp;<input type="submit" name="submit" value="Download" class="button1" /> 
				<input type="hidden" name="hotlink_id" value="6fede94291739792ae7c734d60e20cc9" /> 
				<input type="hidden" name="df_id" value="478" /> 
<input type="hidden" name="modcp" value="0" /> 
<input type="hidden" name="cat_id" value="10" /> 
<input type="hidden" name="hotlink_id" value="6fede94291739792ae7c734d60e20cc9" /> 
<input type="hidden" name="view" value="load" /> 
</form>

нашел в этой теме http://python.su/forum/viewtopic.php?id=5323
код от Ferroman'a

#----------------------------------------------------------------------
#
# Author:      Laszlo Nagy
#
# Copyright:   (c) 2005 by Szoftver Messias Bt.
# Licence:     BSD style
#
#
#----------------------------------------------------------------------
import os
from hashlib import md5
import urllib2
import mimetypes
import cookielib
import logging


class MozillaCacher(object):
    """A dictionary like object, that can cache results on a storage device.

    Every key is used verbatim as a file name inside ``cachedir`` and the
    cached value is the raw content of that file.
    """
    def __init__(self,cachedir='.cache'):
        """Remember the cache directory and create it when it is missing.

        @param cachedir: Directory that holds one file per cached key.
        """
        self.cachedir = cachedir
        # The logger must exist before the debug() call below; without it a
        # missing cache directory ends in AttributeError instead of mkdir().
        self.logger = logging.getLogger("libs.MozillaEmulator")
        if not os.path.isdir(cachedir):
            self.logger.debug('Create cache dir')
            os.mkdir(cachedir)

    def name2fname(self,name):
        """Return the path of the file that stores the value for ``name``."""
        return os.path.join(self.cachedir,name)

    def _checked_fname(self,name):
        """Validate that ``name`` is a str and return its cache file path.

        @raise TypeError: ``name`` is not a str.
        """
        if not isinstance(name,str):
            raise TypeError()
        return self.name2fname(name)

    def __getitem__(self,name):
        """Return the cached bytes stored under ``name``.

        @raise TypeError: ``name`` is not a str.
        @raise IndexError: nothing is cached under ``name``.
        """
        fname = self._checked_fname(name)
        if not os.path.isfile(fname):
            raise IndexError()
        f = open(fname,'rb')
        try:
            return f.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            f.close()

    def __setitem__(self,name,value):
        """Store ``value`` under ``name``, replacing any previous entry.

        @raise TypeError: ``name`` is not a str.
        """
        fname = self._checked_fname(name)
        if os.path.isfile(fname):
            os.unlink(fname)
        f = open(fname,'wb+')
        try:
            f.write(value)
        finally:
            f.close()

    def __delitem__(self,name):
        """Remove the entry for ``name``; a missing entry is silently ignored.

        @raise TypeError: ``name`` is not a str.
        """
        fname = self._checked_fname(name)
        if os.path.isfile(fname):
            os.unlink(fname)

    def __iter__(self):
        """Iteration over the cache is not supported."""
        raise NotImplementedError()

    def has_key(self,name):
        """Return True when a cache file exists for ``name``."""
        return os.path.isfile(self.name2fname(name))

class MozillaEmulator(object):
    """HTTP client on top of urllib2 that sends browser-like request headers
    and keeps cookies in one cookie jar shared by all of its requests."""

    def __init__(self,cacher=None,trycount=0):
        """Create a new MozillaEmulator object.

        @param cacher: A dictionary like object, that can cache search results on a storage device.
            You can use a simple dictionary here, but it is not recommended.
            You can also put None here to disable caching completely.
        @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
                A value of 0 means no retrying. A value of 1 means one retry. etc."""
        self.cacher = cacher
        self.cookies = cookielib.CookieJar()
        self.trycount = trycount
        self.debug = False
        self.logger = logging.getLogger("libs.MozillaEmulator")

    def _hash(self,data):
        """Return the hexadecimal MD5 digest of ``data`` (used as cache key).

        Text that is not already a byte string is encoded as UTF-8 first so
        that URLs containing non-ASCII characters can be hashed as well.
        """
        if not isinstance(data,bytes):
            data = data.encode('utf-8')
        h = md5()
        h.update(data)
        return h.hexdigest()

    def build_opener(self,url,postdata=None,extraheaders=None,forbid_redirect=False):
        """Build the request object and the opener that will send it.

        @param url: The URL to request.
        @param postdata: Request body; it is handed to urllib2.Request as data.
        @param extraheaders: Dict of HTTP headers that are added to (or
            override) the default request headers.
        @param forbid_redirect: Use HTTPNoRedirector instead of the standard
            redirect handler.
        @return: A (request, opener) tuple.
        """
        txheaders = {
            'Accept':'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language':'en,hu;q=0.8,en-us;q=0.5,hu-hu;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '300',
            'Connection': 'keep-alive',
        }
        if extraheaders is not None:
            txheaders.update(extraheaders)
        # postdata is already attached to the request here, so no separate
        # add_data() call is needed afterwards.
        req = urllib2.Request(url, postdata, txheaders)
        self.cookies.add_cookie_header(req)
        if forbid_redirect:
            redirector = HTTPNoRedirector()
        else:
            redirector = urllib2.HTTPRedirectHandler()

        http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
        https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)

        u = urllib2.build_opener(http_handler,https_handler,urllib2.HTTPCookieProcessor(self.cookies),redirector)
        u.addheaders = [('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; hu-HU; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
        return (req,u)

    def _read_body(self,response,length,fd,onprogress):
        """Read ``response`` in 1KB chunks, reporting progress after each one.

        Chunks are written to ``fd`` when it is given, otherwise they are
        collected and returned as one string (empty when ``fd`` was used).
        """
        chunks = []
        received = 0
        while True:
            chunk = response.read(1024)
            if not chunk:
                break
            received += len(chunk)
            if fd:
                fd.write(chunk)
            else:
                chunks.append(chunk)
            if onprogress:
                onprogress(length,received)
        # Joining once avoids the quadratic cost of repeated concatenation.
        return b''.join(chunks)

    def download(self,url,postdata=None,extraheaders=None,forbid_redirect=False,
            trycount=None,fd=None,onprogress=None,only_head=False,retry=True):
        """Download an URL with GET or POST methods.

        @param postdata: It can be a string that will be POST-ed to the URL.
            When None is given, the method will be GET instead.
        @param extraheaders: You can add/modify HTTP headers with a dict here.
        @param forbid_redirect: Set this flag if you do not want to handle
            HTTP 301 and 302 redirects.
        @param trycount: Specify the maximum number of retries here.
            0 means no retry on error. Using -1 means infinite retrying.
            None means the default value (that is self.trycount).
        @param fd: You can pass a file descriptor here. In this case,
            the data will be written into the file. Please note that
            when you save the raw data into a file then it won't be cached.
        @param onprogress: A function that has two parameters:
            the size of the resource and the downloaded size. This will be
            called for each 1KB chunk. (If the HTTP header does not contain
            the content-length field, then the size parameter will be zero!)
        @param only_head: Create the openerdirector and return it. In other
            words, this will not retrieve any content except HTTP headers.
        @param retry: When true (the default) the URL is always fetched, even
            if the cacher already holds an entry for it. The cache key is
            derived from the URL only; postdata is not part of it.

        @return: The raw HTML page data, unless fd was specified. When fd
            was given, the return value is undefined.
        @raise urllib2.URLError: The request still failed after the allowed
            number of retries.
        """
        if trycount is None:
            trycount = self.trycount
        use_cache = self.cacher is not None
        key = None
        if use_cache:
            key = self._hash(url)
        cnt = 0
        while True:
            try:
                if use_cache and (not retry) and self.cacher.has_key(key):
                    return self.cacher[key]
                req,u = self.build_opener(url,postdata,extraheaders,forbid_redirect)
                try:
                    openerdirector = u.open(req)
                except urllib2.URLError as e:
                    self.logger.exception("Can't open connection")
                    self.logger.error(e)
                    self.logger.debug('%s %s',req.get_method(),url)
                    # There is no response object when open() fails; only an
                    # HTTPError carries a status code and response headers.
                    if isinstance(e,urllib2.HTTPError):
                        self.logger.debug('%s %s',e.code,e.msg)
                        self.logger.debug(e.headers)
                    # Hand the failure to the retry logic of the outer handler.
                    raise
                self.cookies.extract_cookies(openerdirector,req)
                if only_head:
                    return openerdirector
                if 'content-length' in openerdirector.headers:
                    length = int(openerdirector.headers['content-length'])
                else:
                    length = 0
                data = self._read_body(openerdirector,length,fd,onprogress)
                if use_cache and not fd:
                    self.cacher[key] = data
                return data
            except urllib2.URLError:
                self.logger.debug("MozillaEmulator: urllib2.URLError, retryting %i ",cnt)
                cnt += 1
                if (trycount > -1) and (trycount < cnt):
                    raise


    def post_multipart(self,url,fields, files, fields2=None, forbid_redirect=False, onprogress=None):
        """Post fields and files to an http host as multipart/form-data.
        fields is a sequence of (name, value) elements for regular form fields.
        files is a sequence of (name, filename, value) elements for data to be uploaded as files
        fields2 is an optional second sequence of (name, value) elements.
        The request is sent through download() with trycount=1.
        Return the server's response page.
        """
        # encode_multipart_formdata() accepts exactly (fields, files); passing
        # fields2 as a third argument raised TypeError on every call.
        # NOTE(review): fields2 is assumed to hold additional regular form
        # fields and is appended after fields -- confirm the intended meaning.
        all_fields = list(fields)
        if fields2:
            all_fields.extend(fields2)
        content_type, post_data = encode_multipart_formdata(all_fields, files)
        result = self.download(url,post_data,{
            'Content-Type': content_type,
            'Content-Length': str(len(post_data))
        },forbid_redirect=forbid_redirect,trycount=1, onprogress=onprogress
        )
        return result


class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
    """This is a custom http redirect handler that FORBIDS redirection.

    Instead of following a 301/302 response it raises urllib2.HTTPError.
    The exception gets a ``newurl`` attribute with the redirect target, or
    None when the response names no target.
    """
    def http_error_302(self, req, fp, code, msg, headers):
        """Raise HTTPError for the redirect response instead of following it.

        @raise urllib2.HTTPError: always; for codes 301 and 302 the exception
            carries ``newurl`` (taken from the 'location' header, else from
            the 'uri' header, else None).
        """
        e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        if e.code in (301,302):
            # Default to None so a redirect without a target header still
            # surfaces as HTTPError rather than as an unbound-name error.
            newurl = None
            if 'location' in headers:
                newurl = headers.getheaders('location')[0]
            elif 'uri' in headers:
                newurl = headers.getheaders('uri')[0]
            e.newurl = newurl
        raise e

    # The base class binds http_error_301 to its own http_error_302, so
    # overriding http_error_302 alone leaves 301 redirects being followed.
    http_error_301 = http_error_302



def encode_multipart_formdata(fields, files):
    """Encode form fields and file uploads as a multipart/form-data body.

    fields -- sequence of (name, value) pairs for regular form fields.
    files  -- sequence of (name, filename, value) triples for file uploads.

    Returns a (content_type, body) tuple; content_type names the boundary
    that separates the parts inside body.
    """
    boundary = '-----------------------------2385384675469946141908084969'
    delimiter = '--' + boundary
    parts = []
    for name, content in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % name,
            '',
            content,
        ])
    for name, fname, content in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (name, fname),
            'Content-Type: %s' % get_content_type(fname),
            '',
            content,
        ])
    # Closing delimiter, followed by an empty item so the body ends in CRLF.
    parts.extend([delimiter + '--', ''])
    return 'multipart/form-data; boundary=%s' % boundary, '\r\n'.join(parts)

def get_content_type(filename):
    """Return the MIME type guessed from filename, or
    'application/octet-stream' when no type can be guessed."""
    guessed = mimetypes.guess_type(filename)[0]
    if guessed:
        return guessed
    return 'application/octet-stream'

Но ввиду полного отсутствия знаний в этой области загрузить файл у меня не получилось :(

# Try to save the file behind the download page to a local file.
s = MozillaEmulator()
url = 'http://forums.thinkingwithportals.com/downloads.php?view=detail&df_id=478'
f = open(r'C:\ddd/gg.hz', 'wb')
# NOTE(review): build_opener() merges extraheaders into the HTTP request
# headers, so the <form> attributes below are sent as headers, not as form
# fields. No postdata is passed, which per the download() docstring makes
# this a GET request for the detail page rather than a POST of the form.
s.download(url,fd = f,extraheaders = {'action':"./downloads.php", 'method':"post", 'name':"download"})
f.close()

Что я неправильно делаю?

Александр Кошелев · Май 21, 2011 16:01:32

sp3
Но ввиду полного отсутствия знаний в этой области загрузить файл у меня не получилось :(

Что значит не получилось? В чем конкретная проблема?

sp3 · Май 21, 2011 16:39:04

Поставил tamper data на firefox
Оказывается, кнопка перенаправляет на другой ресурс с прямой ссылкой для загрузки.
Сейчас проблема в получении заголовков ответа на POST-запрос
запрос при нажатии кнопки на http://forums.thinkingwithportals.com/downloads.php?view=detail&df_id=470

Host=forums.thinkingwithportals.com
User-Agent=Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.17) Gecko/20110420 Firefox/3.6.17
Accept=text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language=ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding=gzip,deflate
Accept-Charset=windows-1251,utf-8;q=0.7,*;q=0.7
Keep-Alive=115
Connection=keep-alive
Referer=http://forums.thinkingwithportals.com/downloads.php?view=detail&df_id=470
Cookie=phpbb3_hwh84_u=1; phpbb3_hwh84_k=; phpbb3_hwh84_sid=15adbcde4fb2a39c341f92d11b08e9af; __utma=234265473.673183499.1305960446.1305979668.1305984523.4; __utmz=234265473.1305960446.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=234265473; __utmb=234265473.1.10.1305984523
Content-Type=application/x-www-form-urlencoded
Content-Length=156
POSTDATA=file_version=0&submit=Download&hotlink_id=59d466de085c73c82f931842f08e3f7e&df_id=470&modcp=0&cat_id=10&hotlink_id=59d466de085c73c82f931842f08e3f7e&view=load

ответ

Status=Found - 302
Date=Sat, 21 May 2011 13:29:48 GMT
Server=Apache/2.2.9 (Debian) PHP/5.2.6-1+lenny10 with Suhosin-Patch
X-Powered-By=PHP/5.2.6-1+lenny10
Content-Disposition=attachment; filename="sp_enrichment_2.zip"
Location=http://mirror.pointysoftware.net/thinkingwithportals/downloads/portal2/maps/sp_enrichment_2.zip
Content-Length=0
Keep-Alive=timeout=15
Connection=Keep-Alive
Content-Type=application/octet-stream

как получить эти заголовки на питоне?

# Send the form fields captured with Tamper Data and print the headers of
# the response that comes back.
import urllib2
url = 'http://forums.thinkingwithportals.com/downloads.php?view=detail&df_id=470'
# Form body copied from the captured POSTDATA line of the browser request.
postdata='file_version=0&submit=Download&hotlink_id=59d466de085c73c82f931842f08e3f7e&df_id=470&modcp=0&cat_id=10&hotlink_id=59d466de085c73c82f931842f08e3f7e&view=load'



# Browser-like request headers. Unlike the captured browser request, no
# Cookie or Referer header is included here.
txheaders = {
            'Accept':'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language':'en,hu;q=0.8,en-us;q=0.5,hu-hu;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '300',
            'Connection': 'keep-alive'}

req = urllib2.Request(url, postdata, txheaders)
# NOTE(review): urlopen() presumably follows the 302 redirect on its own, in
# which case r.headers belong to the final response and not to the 302 reply
# shown in the capture -- confirm against the urllib2 documentation.
r = urllib2.urlopen(req)

print r.headers

так не получается :(

Python-сообщество

Уведомления

#1 Май 21, 2011 00:58:46

загрузка файла

#2 Май 21, 2011 16:01:32

загрузка файла

#3 Май 21, 2011 16:39:04

загрузка файла

Board footer