Уведомления

Группа в Telegram: @pythonsu

#1 Фев. 17, 2017 17:03:17

MetalHead
От: Ленгер
Зарегистрирован: 2013-12-17
Сообщения: 88
Репутация: +  1  -
Профиль   Отправить e-mail  

Парсер на lxml

Добрый день, товарищи. Много времени впустую потратил на исправление бага с кодировкой. Не пойму, почему бинарная строка не кодируется в строку на utf-8. Главное, функция rectify_elements обрабатывает корректно. Кстати, на версии 3.4 нет таких проблем, а вот на 3.6 не идет вообще никак. Спасибо.

 UnicodeEncodeError: 'ascii' codec can't encode character '\u2027' in position 50: ordinal not in range(128)
Source Code
 # -*- coding: utf-8 -*-
from lxml import html, etree
import requests
import csv
class Parser:
    def __init__(self, word):
        """Fetch the dictionary page for ``word`` and export its senses.

        The page is downloaded and parsed, the sense lines and the line
        indexes to skip are collected, and ``get_word`` is then called to
        write the CSV rows.

        Args:
            word: Headword to look up. It is interpolated into the request
                URL and into the ``"<word>__"`` id prefix used by the XPath
                queries.

        If the HTTP request itself fails, a message is printed and
        construction stops early: only ``self.word`` is set and nothing is
        parsed or exported.
        """
        self.word = word
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        try:
            page = requests.get('http://www.ldoceonline.com/dictionary/%s' % self.word, headers=headers)
        except requests.RequestException:
            # Only request failures are treated as a network problem; a bare
            # ``except`` would also swallow KeyboardInterrupt and plain bugs.
            print("An error occured. Check your network connection.")
            return

        self.tree = html.fromstring(page.content)
        # Every span whose id starts with "<word>__" is one sense line.
        self.lines = self.tree.xpath('//span[starts-with(@id,"%s__")]' % word)
        # Ids of sense spans sitting directly inside "Tail" / "assetlink"
        # spans; their numeric suffixes are skipped by get_word.
        tails = self.tree.xpath('//span[@class="Tail"]/span[starts-with(@id,"%s__")]/@id' % word)
        assetlinks = self.tree.xpath('//span[@class="assetlink"]/span[starts-with(@id,"%s__")]/@id' % word)
        self.black_indexes = []  # filled by get_black_indexes below
        self.get_black_indexes(tails)
        self.get_black_indexes(assetlinks)

        self.homnum = self.tree.xpath('//span[@class="HOMNUM"]/text()')
        self.homnum_num = len(self.homnum)

        # Despite the name this holds the XPath result (the example texts),
        # not a count.
        self.examples_num = self.tree.xpath('//span[@class="EXAMPLE"]/text()')
        self.get_word()
        
    def get_word(self):
        """Walk every sense line of the page and append CSV rows for its examples.

        Reads ``self.tree``, ``self.lines``, ``self.word`` and
        ``self.black_indexes``. Extends ``self.black_indexes``, sets
        ``self.double_lines`` and ``self.ampronounce``, downloads the
        headword and example audio files, writes one CSV row per example
        through ``append_csv``, and calls ``get_corpus_examples`` when the
        final ``example_sum`` does not exceed ``min_examples``.
        """
        # counters and digit data
        lines_sum = len(self.lines) 
        lines_counter = 1 
        dictentries_sum = len(self.tree.xpath('//span[@class="dictentry"]'))
        lines_tempt_counter = 1
        # Blacklist every line whose span directly contains more than one
        # child span with an id starting with "<word>__".
        while lines_tempt_counter <= lines_sum:
            self.double_lines = self.tree.xpath('//span[@id="%s__%d"]/span[starts-with(@id,"%s__")]' %
                                           (self.word, lines_tempt_counter, self.word))
            if len(self.double_lines) > 1:
                self.black_indexes.append(lines_tempt_counter)
                
            lines_tempt_counter += 1
        # For each dictentry, record the numeric suffix of the last id found
        # inside it; these mark where one entry ends and the next begins.
        # NOTE(review): dictentry[-1] raises IndexError if an entry has no
        # matching span -- confirm this cannot happen on real pages.
        dictentry_temp_counter = 1
        dictentry_last_coordinates = []
        while dictentries_sum >= dictentry_temp_counter:
            dictentry = self.tree.xpath('//span[@class="dictentry"][%s]//span[starts-with(@id,"%s__")]/@id' % (dictentry_temp_counter, self.word))
            dictentry_last_coordinates.append(self.get_destination(dictentry[-1]))
            dictentry_temp_counter += 1
        print(dictentry_last_coordinates)
        # NOTE(review): when the page has no dictentry the loop above never
        # binds ``dictentry`` and this ``del`` raises.
        del dictentry, dictentry_temp_counter
        
        # constant data
        # ``title`` is not used anywhere else in this method.
        title = self.tree.xpath('//span[@class="pagetitle span"]/text()')
        example_sum = 0
        min_examples = 5 # minimum num of examples for each word
        dictentry_temp_counter = 1
        # First whitespace-separated token of the first matching data-src-mp3
        # attribute; the [0] raises IndexError when nothing matches.
        ampronounce = list(self.tree.xpath('//span[@class="speaker amefile fa fa-volume-up"]/@data-src-mp3'))[0].split()[0]
        self.download_audio(ampronounce)
        # From here on only the file name part of the URL is kept.
        ampronounce = self.split_audio_url(ampronounce)
        self.ampronounce = ampronounce
        
        while lines_sum >= lines_counter:
            if lines_counter in self.black_indexes:
                print("oops, black list ->", lines_counter)
            else:
                element = self.getElements(dictentry_temp_counter, lines_counter)
                # ``'//' or '/'`` always evaluates to '//'.
                definition = list(self.tree.xpath('//span[@id="%s__%d"]%sspan[@class="DEF"]'
                                                % (self.word, lines_counter, '//' or '/')))
                definition = self.check_span(definition)
                # With no DEF match ``definition`` is "" here, so the [0]
                # raises IndexError and the empty string is kept as is.
                try:
                    definition = etree.tostring(definition[0])
                    definition = definition.decode('utf-8')
                except IndexError:
                    pass
                # Collapse the raw XPath lists into plain strings.
                element = self.rectify_elements(element)
                if len(element["dictionary_source"]) == 0:
                        element["dictionary_source"] = "n/a" 
                example = list(self.tree.xpath('//span[@id="%s__%d"]//span[@class="EXAMPLE"]'
                                             % (self.word, lines_counter)))
                example_pronounce = list(self.tree.xpath('//span[@id="%s__%d"]//span[@class="EXAMPLE"]//span/@data-src-mp3' %
                                                              (self.word, lines_counter)))
                
                # One CSV row per example of this line.
                example_temp_counter = 0
                while example_temp_counter != len(example):
                    ex = etree.tostring(example[example_temp_counter])
                    ex = ex.decode('utf-8')
                    # Audio is matched to examples by position; an example
                    # without a counterpart gets an empty audio name.
                    try:
                        example_pro = example_pronounce[example_temp_counter].split()[0]
                        self.download_audio(example_pro)
                        example_pro = self.split_audio_url(example_pro)
                    except IndexError:
                        example_pro = ""
                        
                    ex_parsed = len(example)
                    # NOTE(review): example_sum is overwritten with
                    # append_csv's return value (ex_parsed) and then raised
                    # by len(example) again, so it is not a running total
                    # across lines -- confirm the intended counting.
                    example_sum = self.append_csv(element, ampronounce, definition, ex, example_pro, ex_parsed)
                    example_sum += len(example) #need to return
                    print(example_sum)
                    
                    example_temp_counter += 1
 
                
            # Move on to the next dictentry once its last line has been
            # handled, except after the final entry.
            if lines_counter in dictentry_last_coordinates and lines_counter != dictentry_last_coordinates[-1]:
                
                dictentry_temp_counter += 1
                print("WEEEE CHANGE TO -> ", dictentry_temp_counter)
                
            lines_counter += 1
            
        print("dictentry", dictentry_temp_counter-1, "exams", example_sum)
        # Fall back to corpus examples when too few examples were counted.
        if example_sum <= min_examples:
            self.get_corpus_examples()
        
    def get_destination(self, obj): # get last element
        return int(obj[len(self.word) + 2:])
    def get_black_indexes(self, points):
        for index in points:
            self.black_indexes.append(self.get_destination(index))
        return
    def check_span(self, obj):
        if len(obj) != 0:
            return obj
        else:
            blank = ""
            return blank
    def rectify_elements(self, element):
        for item in element:
            if self.check_span(element[item]) == element[item]:
                        
                element[item] = self.check_span(element[item])
                        
                if len(element[item]) > 0 and len(element[item]) < 2:
                        element[item] = element[item][0]
                        element[item] = str(element[item]).encode()
                        element[item] = element[item].decode('utf-8')
                else:
                    element[item] = self.extract_from_list(element[item])
            else:
                element[item] = ""
        return element
    def getElements(self, dictentry_temp_counter, lines_counter):
        element = {}
        element["dictionary_source"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="dictionary_intro span"]/text()'
                                            % dictentry_temp_counter))
        element["pos"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="POS"]/text()'
                                              % (dictentry_temp_counter, '//' or '/')))
        element["tags"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sa[@class="topic"]/text()'
                                                % (dictentry_temp_counter, '//' or '/')))   
        element["hyphenation"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HYPHENATION"]/text()'
                                                    % (dictentry_temp_counter, '//' or '/')))
        element["homnum"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HOMNUM"]/text()'
                                                 % (dictentry_temp_counter, '//' or '/'))) 
        element["pron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="PRON"]/text()'
                                               % (dictentry_temp_counter, '//' or '/'))) 
        element["amevarpron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="AMEVARPRON"]/text()' % dictentry_temp_counter))
        element["freq"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="FREQ"]/@title'
                                               % (dictentry_temp_counter, '//' or '/')))
        element["signpost"] = list(self.tree.xpath('//span[@id="%s__%d"]%sspan[@class="SIGNPOST"]/text()'
                                                   % (self.word, lines_counter, '//' or '/'))) 
        element["gram"] = list(self.tree.xpath('//span[@id="%s__%d"]/span[@class="GRAM"]/text()'
                                               % (self.word, lines_counter)))
        return element
    def append_csv(self, element, ampronounce, definition, ex, example_pro, ex_parsed=None):
        example_sum = 0
        example = ex
        with open('dictionary-output.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',',quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow([element["dictionary_source"], self.word, ampronounce, element["hyphenation"],
                             element["homnum"], element["pron"], element["amevarpron"], element["pos"],
                             element["freq"], element["signpost"], element["gram"], definition,
                             self.highlight_word(ex), example_pro, "", element["tags"]])
        
            print(self.word, "was added to csv")
            if ex_parsed != None:
                example_sum += ex_parsed
                return example_sum
            else:
                return True
    def extract_from_list(self, row):
        return '%s' % ', '.join(row)
    def download_audio(self, url):
        """Download ``url`` and save the response body to a local file.

        The file is named after the last ``/``-separated segment of ``url``
        (see ``split_audio_url``) and is opened relative to the current
        working directory.

        Args:
            url: Address of the audio file to fetch.
        """
        link_var = requests.get(url)
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(self.split_audio_url(url), "wb") as out:
            out.write(link_var.content)
        print("Audio was downloaded!")
    def split_audio_url(self, raw):
        return raw.split('/')[-1]
    def highlight_word(self, word):
        result = word.replace(self.word, '<span class="hightlight-word">%s</span>' % self.word)
        return result
    
    def get_corpus_examples(self):
        """Append one CSV row for every corpus example group on the page.

        For each ``dictentry`` position the matching ``exaGroup cexa<N>
        exaGroup`` spans are serialized and written through ``append_csv``
        with ``"Corpus(US)"`` as the dictionary source. Reads ``self.tree``
        and ``self.ampronounce``.
        """
        dictentries_sum = len(self.tree.xpath('//span[@class="dictentry"]'))
        for counter in range(1, dictentries_sum + 1):
            corpus = list(self.tree.xpath('//span[@class="exaGroup cexa%s exaGroup"]' % counter))
            definition = self.get_corpus_definition(counter)

            if not corpus:
                print('do not have corpus')
                continue

            for item in corpus:
                one_corpus = etree.tostring(item).decode('utf-8')
                element = self.get_corpus_elements(counter)
                element["dictionary_source"] = "Corpus(US)"
                # Write the serialized example itself. It used to be computed
                # and then dropped, leaving the example column empty.
                self.append_csv(element, self.ampronounce, definition, one_corpus, "")
    def get_corpus_definition(self, dict_id):
        """Return the first ``DEF`` span of a dictentry serialized as text.

        Args:
            dict_id: Position of the ``dictentry`` span to search in.

        Returns:
            The serialized markup decoded as UTF-8, or ``""`` when the entry
            has no ``DEF`` span.
        """
        xpath_query = '//span[@class="dictentry"][%d]%sspan[@class="DEF"]' % (dict_id, '//')
        found = list(self.tree.xpath(xpath_query))
        try:
            # Indexing an empty result raises IndexError, handled below.
            markup = etree.tostring(found[0])
            return markup.decode('utf-8')
        except IndexError:
            return ""
    def get_corpus_elements(self, dictentry_temp_counter):
        element = {}
        element["pos"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="POS"]/text()'
                                              % (dictentry_temp_counter, '//' or '/'))) or ""
        element["tags"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sa[@class="topic"]/text()'
                                                % (dictentry_temp_counter, '//' or '/'))) or "" 
        element["hyphenation"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HYPHENATION"]/text()'
                                                    % (dictentry_temp_counter, '//' or '/')))
        element["homnum"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HOMNUM"]/text()'
                                                 % (dictentry_temp_counter, '//' or '/'))) or ""
        element["pron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="PRON"]/text()'
                                               % (dictentry_temp_counter, '//' or '/'))) or ""
        
        element["amevarpron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="AMEVARPRON"]/text()'
                                                     % dictentry_temp_counter)) or ""
        element["freq"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="FREQ"]/@title'
                                               % (dictentry_temp_counter, '//' or '/'))) or ""
        element["signpost"] = "" 
        element["gram"] = ""
        element = self.rectify_elements(element)
        print("from corpus", element)
        return element
        
if __name__ == "__main__":
    import sys

    # Take the headword from the command line when one is given; fall back
    # to the original sample word so a plain run behaves as before.
    obj = Parser(sys.argv[1] if len(sys.argv) > 1 else "caterpillar")

Офлайн

#2 Фев. 18, 2017 05:20:03

py.user.next
От:
Зарегистрирован: 2010-04-29
Сообщения: 9873
Репутация: +  853  -
Профиль   Отправить e-mail  

Парсер на lxml

MetalHead
 with open('dictionary-output.csv', 'a', newline='') as csvfile:
Надо указывать кодировку при открытии файла в текстовом режиме. Текстовый режим - это не бинарный режим (без буквы b).



Отредактировано py.user.next (Фев. 18, 2017 05:20:58)

Офлайн

#3 Фев. 18, 2017 11:47:02

MetalHead
От: Ленгер
Зарегистрирован: 2013-12-17
Сообщения: 88
Репутация: +  1  -
Профиль   Отправить e-mail  

Парсер на lxml

py.user.next
Действительно, а я долго бился. Спасибо огромное.

Офлайн

Board footer

Модераторировать

Powered by DjangoBB

Lo-Fi Version