Форум сайта python.su
Добрый день товарищи. Много времени потратил на исправление бага с кодировкой впустую. Не пойму почему бинарная строка не кодируется в строку на utf-8. Главное функция rectify_elements обрабатывает корректно. Кстати на версии 3.4 нет таких проблем, а вот на 3.6 не идет вообще никак. Спасибо.
UnicodeEncodeError: 'ascii' codec can't encode character '\u2027' in position 50: ordinal not in range(128)
# -*- coding: <utf-8> -*- from lxml import html, etree import requests import csv class Parser: def __init__(self, word): self.word = word number = 2 try: headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} page = requests.get('http://www.ldoceonline.com/dictionary/%s' % self.word, headers=headers) except: print("An error occured. Check your network connection.") return None self.tree = html.fromstring(page.content) self.lines = self.tree.xpath('//span[starts-with(@id,"%s__")]' % word) tails = self.tree.xpath('//span[@class="Tail"]/span[starts-with(@id,"%s__")]/@id' % word) assetlinks = self.tree.xpath('//span[@class="assetlink"]/span[starts-with(@id,"%s__")]/@id' % word) self.black_indexes = [] # will extend from the beneath function self.get_black_indexes(tails) self.get_black_indexes(assetlinks) self.homnum = self.tree.xpath('//span[@class="HOMNUM"]/text()') self.homnum_num = len(self.homnum) self.examples_num = self.tree.xpath('//span[@class="EXAMPLE"]/text()') self.get_word() del page def get_word(self): # counters and digit data lines_sum = len(self.lines) lines_counter = 1 dictentries_sum = len(self.tree.xpath('//span[@class="dictentry"]')) lines_tempt_counter = 1 while lines_tempt_counter <= lines_sum: self.double_lines = self.tree.xpath('//span[@id="%s__%d"]/span[starts-with(@id,"%s__")]' % (self.word, lines_tempt_counter, self.word)) if len(self.double_lines) > 1: self.black_indexes.append(lines_tempt_counter) lines_tempt_counter += 1 # get some points dictentry_temp_counter = 1 dictentry_last_coordinates = [] while dictentries_sum >= dictentry_temp_counter: dictentry = self.tree.xpath('//span[@class="dictentry"][%s]//span[starts-with(@id,"%s__")]/@id' % (dictentry_temp_counter, self.word)) dictentry_last_coordinates.append(self.get_destination(dictentry[-1])) dictentry_temp_counter += 1 print(dictentry_last_coordinates) del dictentry, dictentry_temp_counter # constant data title = self.tree.xpath('//span[@class="pagetitle span"]/text()') example_sum = 0 min_examples = 5 # minimum num of examples for each word dictentry_temp_counter = 1 ampronounce = list(self.tree.xpath('//span[@class="speaker amefile fa fa-volume-up"]/@data-src-mp3'))[0].split()[0] self.download_audio(ampronounce) ampronounce = self.split_audio_url(ampronounce) self.ampronounce = ampronounce while lines_sum >= lines_counter: if lines_counter in self.black_indexes: print("oops, black list ->", lines_counter) else: element = self.getElements(dictentry_temp_counter, lines_counter) definition = list(self.tree.xpath('//span[@id="%s__%d"]%sspan[@class="DEF"]' % (self.word, lines_counter, '//' or '/'))) definition = self.check_span(definition) try: definition = etree.tostring(definition[0]) definition = definition.decode('utf-8') except IndexError: pass #was here element = self.rectify_elements(element) if len(element["dictionary_source"]) == 0: element["dictionary_source"] = "n/a" example = list(self.tree.xpath('//span[@id="%s__%d"]//span[@class="EXAMPLE"]' % (self.word, lines_counter))) example_pronounce = list(self.tree.xpath('//span[@id="%s__%d"]//span[@class="EXAMPLE"]//span/@data-src-mp3' % (self.word, lines_counter))) example_temp_counter = 0 while example_temp_counter != len(example): ex = etree.tostring(example[example_temp_counter]) ex = ex.decode('utf-8') try: example_pro = example_pronounce[example_temp_counter].split()[0] self.download_audio(example_pro) example_pro = self.split_audio_url(example_pro) except IndexError: example_pro = "" ex_parsed = len(example) example_sum = self.append_csv(element, ampronounce, definition, ex, example_pro, ex_parsed) example_sum += len(example) #need to return print(example_sum) example_temp_counter += 1 if lines_counter in dictentry_last_coordinates and lines_counter != dictentry_last_coordinates[-1]: dictentry_temp_counter += 1 print("WEEEE CHANGE TO -> ", dictentry_temp_counter) lines_counter += 1 print("dictentry", dictentry_temp_counter-1, "exams", example_sum) if example_sum <= min_examples: self.get_corpus_examples() #pass def get_destination(self, obj): # get last element return int(obj[len(self.word) + 2:]) def get_black_indexes(self, points): for index in points: self.black_indexes.append(self.get_destination(index)) return def check_span(self, obj): if len(obj) != 0: return obj else: blank = "" return blank def rectify_elements(self, element): for item in element: if self.check_span(element[item]) == element[item]: element[item] = self.check_span(element[item]) if len(element[item]) > 0 and len(element[item]) < 2: element[item] = element[item][0] element[item] = str(element[item]).encode() element[item] = element[item].decode('utf-8') else: element[item] = self.extract_from_list(element[item]) else: element[item] = "" return element def getElements(self, dictentry_temp_counter, lines_counter): element = {} element["dictionary_source"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="dictionary_intro span"]/text()' % dictentry_temp_counter)) element["pos"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="POS"]/text()' % (dictentry_temp_counter, '//' or '/'))) element["tags"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sa[@class="topic"]/text()' % (dictentry_temp_counter, '//' or '/'))) element["hyphenation"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HYPHENATION"]/text()' % (dictentry_temp_counter, '//' or '/'))) element["homnum"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HOMNUM"]/text()' % (dictentry_temp_counter, '//' or '/'))) element["pron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="PRON"]/text()' % (dictentry_temp_counter, '//' or '/'))) element["amevarpron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="AMEVARPRON"]/text()' % dictentry_temp_counter)) element["freq"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="FREQ"]/@title' % (dictentry_temp_counter, '//' or '/'))) element["signpost"] = list(self.tree.xpath('//span[@id="%s__%d"]%sspan[@class="SIGNPOST"]/text()' % (self.word, lines_counter, '//' or '/'))) element["gram"] = list(self.tree.xpath('//span[@id="%s__%d"]/span[@class="GRAM"]/text()' % (self.word, lines_counter))) return element def append_csv(self, element, ampronounce, definition, ex, example_pro, ex_parsed=None): example_sum = 0 example = ex with open('dictionary-output.csv', 'a', newline='') as csvfile: writer = csv.writer(csvfile, delimiter=',',quotechar="'", quoting=csv.QUOTE_NONNUMERIC) writer.writerow([element["dictionary_source"], self.word, ampronounce, element["hyphenation"], element["homnum"], element["pron"], element["amevarpron"], element["pos"], element["freq"], element["signpost"], element["gram"], definition, self.highlight_word(ex), example_pro, "", element["tags"]]) print(self.word, "was added to csv") if ex_parsed != None: example_sum += ex_parsed return example_sum else: return True def extract_from_list(self, row): return '%s' % ', '.join(row) def download_audio(self, url): link_var = requests.get(url) out = open(self.split_audio_url(url), "wb") out.write(link_var.content) out.close() print("Audio was downloaded!") return def split_audio_url(self, raw): return raw.split('/')[-1] def highlight_word(self, word): result = word.replace(self.word, '<span class="hightlight-word">%s</span>' % self.word) return result def get_corpus_examples(self): dictentries_sum = len(self.tree.xpath('//span[@class="dictentry"]')) counter = 1 while counter <= dictentries_sum: corpus = list(self.tree.xpath('//span[@class="exaGroup cexa%s exaGroup"]' % counter)) definition = self.get_corpus_definition(counter) if len(corpus) > 0: for item in corpus: one_corpus = etree.tostring(item) one_corpus = one_corpus.decode('utf-8') element = self.get_corpus_elements(counter) element["dictionary_source"] = "Corpus(US)" self.append_csv(element, self.ampronounce, definition, "", "") #print(one_corpus) else: print('do not have corpus') counter += 1 def get_corpus_definition(self, dict_id): definition = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="DEF"]' % (dict_id, '//' or '/'))) definition = self.check_span(definition) try: definition = etree.tostring(definition[0]) definition = definition.decode('utf-8') return definition except IndexError: return "" def get_corpus_elements(self, dictentry_temp_counter): element = {} element["pos"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="POS"]/text()' % (dictentry_temp_counter, '//' or '/'))) or "" element["tags"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sa[@class="topic"]/text()' % (dictentry_temp_counter, '//' or '/'))) or "" element["hyphenation"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HYPHENATION"]/text()' % (dictentry_temp_counter, '//' or '/'))) element["homnum"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HOMNUM"]/text()' % (dictentry_temp_counter, '//' or '/'))) or "" element["pron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="PRON"]/text()' % (dictentry_temp_counter, '//' or '/'))) or "" element["amevarpron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="AMEVARPRON"]/text()' % dictentry_temp_counter)) or "" element["freq"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="FREQ"]/@title' % (dictentry_temp_counter, '//' or '/'))) or "" element["signpost"] = "" element["gram"] = "" element = self.rectify_elements(element) print("from corpus", element) return element if __name__ == "__main__": obj = Parser("caterpillar")
Офлайн
MetalHeadНадо указывать кодировку при открытии файла в текстовом режиме. Текстовый режим - это не бинарный режим (без буквы b).with open('dictionary-output.csv', 'a', newline='') as csvfile:
Отредактировано py.user.next (Фев. 18, 2017 05:20:58)
Офлайн
py.user.next
Действительно, а я долго бился. Спасибо огромное.
Офлайн