from http import cookies, client
from urllib.parse import urlencode, urlsplit, urljoin
from lxml.html import document_fromstring
class SimpleBrowser(object):
    """Minimal cookie-aware HTTP(S) client built on ``http.client``.

    The browser keeps a cookie jar and a list of extra request headers,
    sends every request with ``Connection: close`` and follows HTTP
    redirects.  Successful (200) responses are parsed with lxml's
    ``document_fromstring``.
    """

    # Upper bound on redirects followed by a single request() call, so a
    # redirect loop on the server cannot make the client spin forever.
    _MAX_REDIRECTS = 10

    def __init__(self, headers=None, timeout=10):
        """Create a browser.

        headers -- optional iterable of ``(name, value)`` tuples sent with
                   every request; defaults to a ``User-Agent`` header.
        timeout -- socket timeout in seconds passed to the connection.
        """
        self._cookies = cookies.SimpleCookie()
        # Copy the caller's sequence: appending to it in place would leak
        # our 'Connection' header into an object the caller still owns.
        if headers is not None:
            self._headers = list(headers)
        else:
            self._headers = [('User-Agent', 'SimpleBrowser/0.1')]
        self._headers.append(('Connection', 'close'))
        self._host = None
        self._scheme = None
        self._conn = None
        self._timeout = timeout

    def __ensure_connection(self, host, scheme):
        """Make ``self._conn`` a connection object for ``scheme://host``.

        Raises ValueError for an unsupported scheme or a missing host.
        Validation happens before any state is touched, so a rejected URL
        leaves the current connection usable.
        """
        if scheme == 'http':
            factory = client.HTTPConnection
        elif scheme == 'https':
            factory = client.HTTPSConnection
        else:
            raise ValueError('Unknown scheme: %s' % scheme)
        if not host:
            raise ValueError('URL has no host and no previous host is known')
        # The scheme is part of the key: http://h and https://h need
        # different connection classes even though the host is the same.
        if (self._conn is not None and self._host == host
                and self._scheme == scheme):
            return
        if self._conn is not None:
            self._conn.close()
        self._conn = factory(host, timeout=self._timeout)
        self._host = host
        self._scheme = scheme

    def __prepare_url(self, url):
        """Select the connection for ``url`` and return its request target.

        A URL without a host reuses the current host; a URL without a
        scheme reuses the current scheme (``http`` if there is none yet).
        The returned target is ``path`` or ``path?query`` and always
        starts with ``/``.
        """
        parsed = urlsplit(url)
        host = parsed.netloc or self._host
        scheme = parsed.scheme or self._scheme or 'http'
        self.__ensure_connection(host, scheme)
        target = parsed.path or '/'
        if not target.startswith('/'):
            target = '/' + target
        # Only append '?' when there really is a query string.
        if parsed.query:
            target = '%s?%s' % (target, parsed.query)
        return target

    def __cookie_header(self):
        """Return the jar as a single ``Cookie`` header value ('' if empty)."""
        return '; '.join(morsel.OutputString(attrs=[])
                         for _, morsel in sorted(self._cookies.items()))

    def __exchange(self, method, target, body, form_encoded):
        """Send one request on the current connection and read the reply.

        Returns ``(status, reason, headers, payload)``.  The connection is
        closed afterwards even on failure, so the next request starts from
        a clean state (``http.client`` reconnects automatically).
        """
        conn = self._conn
        try:
            conn.putrequest(method, target)
            present = set()
            for header in self._headers:
                conn.putheader(*header)
                if isinstance(header[0], str):
                    present.add(header[0].lower())
            if body is not None:
                if form_encoded and 'content-type' not in present:
                    conn.putheader('Content-Type',
                                   'application/x-www-form-urlencoded')
                # putrequest()/endheaders() never add Content-Length, so
                # it has to be supplied here for a sized body.
                if (isinstance(body, (bytes, bytearray))
                        and 'content-length' not in present):
                    conn.putheader('Content-Length', str(len(body)))
            cookie_value = self.__cookie_header()
            if cookie_value:
                conn.putheader('Cookie', cookie_value)
            conn.endheaders(body)
            resp = conn.getresponse()
            payload = resp.read()
            return (resp.status, resp.reason, resp.headers, payload)
        finally:
            conn.close()

    def close(self):
        """Close the current connection, if any, and forget its host."""
        if self._conn is not None:
            self._conn.close()
        self._conn = None
        self._host = None
        self._scheme = None

    def add_header(self, name, value):
        """Add a header that will be sent with every subsequent request."""
        self._headers.append((name, value))

    def add_cookie(self, name, value):
        """Store a cookie that will be sent with subsequent requests."""
        self._cookies[name] = value

    def request(self, req, url, data=None):
        """Perform an HTTP request, following redirects.

        req  -- HTTP method name, e.g. ``'GET'`` or ``'POST'``.
        url  -- absolute URL, or a path relative to the current host.
        data -- optional body: a dict (sent form-urlencoded), a str
                (encoded as UTF-8) or a bytes-like object.

        Returns ``(client.OK, document)`` for a 200 response, where
        ``document`` is the lxml tree of the body, or ``(status, reason)``
        for any other final response.  At most ``_MAX_REDIRECTS``
        redirects are followed; after that the last redirect response is
        returned as ``(status, reason)``.

        Raises ValueError if the URL scheme is unsupported or no host can
        be determined.
        """
        form_encoded = isinstance(data, dict)
        if form_encoded:
            data = urlencode(data)
        # http.client can only write bytes to the socket.
        if isinstance(data, str):
            data = data.encode('utf-8')

        redirect_statuses = (
            client.MOVED_PERMANENTLY, client.FOUND, client.SEE_OTHER,
            client.TEMPORARY_REDIRECT, client.PERMANENT_REDIRECT,
        )
        method = req
        redirects_left = self._MAX_REDIRECTS
        while True:
            target = self.__prepare_url(url)
            base_url = '%s://%s%s' % (self._scheme, self._host, target)
            status, reason, headers, payload = self.__exchange(
                method, target, data, form_encoded)

            # get_all() is case-insensitive and keeps every occurrence of
            # a repeated header, unlike dict(resp.getheaders()).
            for name in ('Set-Cookie', 'Set-Cookie2'):
                for raw_cookie in headers.get_all(name, []):
                    self._cookies.load(raw_cookie)

            if status == client.OK:
                return (client.OK, document_fromstring(payload))

            location = headers.get('Location')
            if (status not in redirect_statuses or not location
                    or redirects_left <= 0):
                return (status, reason)
            redirects_left -= 1
            # Location may be relative; resolve it against the URL that
            # produced this response.
            url = urljoin(base_url, location)
            # Mirror browser behaviour: 303 always, and 301/302 for POST,
            # are re-issued as a body-less GET.  307/308 keep the method.
            verb = method.upper()
            if ((status == client.SEE_OTHER and verb != 'HEAD')
                    or (status in (client.MOVED_PERMANENTLY, client.FOUND)
                        and verb == 'POST')):
                method, data, form_encoded = 'GET', None, False

    def get(self, url):
        """Shortcut for ``request('GET', url)``."""
        return self.request('GET', url)

    def post(self, url, data):
        """Shortcut for ``request('POST', url, data)``."""
        return self.request('POST', url, data)
if __name__ == '__main__':
    # Demo: fetch the Moscow forecast page and print the text of every
    # "today" forecast element, provided the server answered 200 OK.
    browser = SimpleBrowser()
    status, page = browser.get('http://pogoda.yandex.ru/moscow/')
    if status == client.OK:
        forecast_nodes = page.cssselect("div.b-forecast__tday")
        for node in forecast_nodes:
            print(node.text)
P.S. В том, что первоначальный код сработал в 2.x, но при этом не работал в 3.x, ‘виновата’ именно версия:
http://docs.python.org/release/3.2/library/urllib.request.html
According to the letter of RFC 2616, 301 and 302 responses to POST requests must not be automatically redirected without confirmation by the user. In reality, browsers do allow automatic redirection of these responses, changing the POST to a GET, and urllib reproduces this behaviour.