Чтобы не ждать долго, парсер всего сайта сузил до 1 страницы
Spider:
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector

from orphanage.items import OrphanageItem


class OrphanSpider(CrawlSpider):
    """Crawl www.detskiedomiki.ru and scrape one institution detail page.

    The crawl starts at the guide index and hands every link accepted by
    ``rules`` to :meth:`parse_item`.
    """

    name = "detskiedomiki"
    allowed_domains = ["www.detskiedomiki.ru"]
    start_urls = ["http://www.detskiedomiki.ru/guide/child/"]

    rules = (
        # Site-wide rule, disabled so that a test run only visits one page.
        #Rule(SgmlLinkExtractor(allow=('act=home_reg', 'act=home_zone')), follow=True),
        # NOTE(review): `allow` is treated as a regular expression, so the
        # unescaped '?' and '.' are metacharacters here, and the host has no
        # "www." prefix unlike allowed_domains -- confirm this matches the
        # intended URL.
        Rule(SgmlLinkExtractor(allow=('http://detskiedomiki.ru/?act=home_more&id=6278&z_id=3&part_id=65')), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build an ``OrphanageItem`` from the label/value table of a page.

        Each field is read from the ``<td>`` that follows the ``<td>`` whose
        text equals the given (Russian) label.

        :param response: the downloaded detail page.
        :returns: the item produced by the loader.
        """
        hxs = HtmlXPathSelector(response)
        # NOTE(review): OrphanLoader is not defined in this snippet;
        # presumably an XPathItemLoader subclass (TakeFirst and
        # XPathItemLoader are imported but otherwise unused) -- confirm.
        l = OrphanLoader(OrphanageItem(), hxs)
        # Registration number is currently not collected.
        # l.add_xpath('id', "//td[text()='%s']/following-sibling::td/text()" % u"Рег. номер:")
        l.add_xpath('region', "//td[text()='%s']/following-sibling::td/text()" % u"Регион:")
        l.add_xpath('district', "//td[text()='%s']/following-sibling::td/text()" % u"Район:")
        l.add_xpath('type', "//td[text()='%s']/following-sibling::td/text()" % u"Тип учреждения:")
        # The name value is wrapped in <strong>, unlike the other fields.
        l.add_xpath('name', "//td[text()='%s']/following-sibling::td/strong/text()" % u"Название:")
        return l.load_item()


# --- pipeline ---
import json


class FasttorrentPipeline(object):
    """Append every scraped item to ``items.txt`` as one JSON object per line."""

    def __init__(self):
        # Binary mode: lines are encoded to UTF-8 explicitly before writing.
        self.file = open('items.txt', 'wb')

    def process_item(self, item, spider):
        """Serialize ``item`` to a JSON line and write it to the output file.

        :param item: the scraped item (converted with ``dict(item)``).
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, unchanged.
        """
        # ensure_ascii=False keeps non-ASCII text (e.g. Cyrillic) readable
        # instead of emitting \uXXXX escape sequences.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()
Как вывести название нормально — читаемой кириллицей, а не в виде последовательностей \uXXXX?