Форум сайта python.su
0
Вот код:
#!/usr/bin/env python3
"""Scrape a company listing page on estateline.ru and print the result.

For every company row on the listing page the script follows the link to
the company's detail page and collects its description and regions.
"""
import time
from urllib.parse import urljoin
from urllib.request import urlopen

from lxml.html import fromstring

# Listing page that is scraped.
URL = 'http://www.estateline.ru/companies/sales-production/noise-heat-insulation/'
# CSS selector for one company entry on the listing page.
ITEM_PATH = 'tbody tr .name'
# CSS selector for the description box on a company detail page.
DESCR_PATH = '.profiler .rightProf .itemBox'
# CSS selector for the container whose children hold the region names.
REGION_PATH = '.itemleft'


def _fetch_document(url):
    """Download *url*, decode the body as UTF-8 and return the parsed tree."""
    # The context manager closes the HTTP response even if read() or
    # decode() raises.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    return fromstring(html)


def parse_compan():
    """Collect every company from the listing page at ``URL``.

    Returns:
        A list of dicts with the keys ``name``, ``town``, ``url``,
        ``descr`` and ``regions`` (a list with the ``.text`` of each
        child of the first ``REGION_PATH`` element).

    Raises:
        IndexError: if an expected element is missing from a page
            (every selector result is indexed with ``[0]``).
    """
    # The accumulator must exist before the loop; without this line the
    # first append() raised NameError.
    compan = []

    list_doc = _fetch_document(URL)
    for elem in list_doc.cssselect(ITEM_PATH):
        a = elem.cssselect('a.text')[0]
        small = elem.cssselect('small')[0]
        # Links on the listing page may be relative; resolve against URL.
        url = urljoin(URL, a.get('href'))

        details_doc = _fetch_document(url)
        descr_elem = details_doc.cssselect(DESCR_PATH)[0]
        # Iterating an lxml element yields its direct children.
        region_container = details_doc.cssselect(REGION_PATH)[0]

        compan.append({
            'name': a.text,
            'town': small.text,
            'url': url,
            'descr': descr_elem.text_content(),
            'regions': [region_elem.text for region_elem in region_container],
        })
    return compan


def main():
    """Scrape the listing, print the collected companies, then pause 3 s."""
    compan = parse_compan()
    print(compan)
    time.sleep(3)


if __name__ == '__main__':
    main()
Отредактировано islate (Май 2, 2015 04:32:56)
Офлайн
5
def parse_compan(): compan = [] ...
Офлайн
0
Paranoia_Agent, спасибо большое, помогло. +
Офлайн