Добрый день товарищи. Много времени потратил на исправление бага с кодировкой впустую. Не пойму почему бинарная строка не кодируется в строку на utf-8. Главное функция rectify_elements обрабатывает корректно. Кстати на версии 3.4 нет таких проблем, а вот на 3.6 не идет вообще никак. Спасибо.
# -*- coding: <utf-8> -*-
from lxml import html, etree
import requests
import csv
class Parser:
def __init__(self, word):
self.word = word
number = 2
try:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = requests.get('http://www.ldoceonline.com/dictionary/%s' % self.word, headers=headers)
except:
print("An error occured. Check your network connection.")
return None
self.tree = html.fromstring(page.content)
self.lines = self.tree.xpath('//span[starts-with(@id,"%s__")]' % word)
tails = self.tree.xpath('//span[@class="Tail"]/span[starts-with(@id,"%s__")]/@id' % word)
assetlinks = self.tree.xpath('//span[@class="assetlink"]/span[starts-with(@id,"%s__")]/@id' % word)
self.black_indexes = [] # will extend from the beneath function
self.get_black_indexes(tails)
self.get_black_indexes(assetlinks)
self.homnum = self.tree.xpath('//span[@class="HOMNUM"]/text()')
self.homnum_num = len(self.homnum)
self.examples_num = self.tree.xpath('//span[@class="EXAMPLE"]/text()')
self.get_word()
del page
def get_word(self):
# counters and digit data
lines_sum = len(self.lines)
lines_counter = 1
dictentries_sum = len(self.tree.xpath('//span[@class="dictentry"]'))
lines_tempt_counter = 1
while lines_tempt_counter <= lines_sum:
self.double_lines = self.tree.xpath('//span[@id="%s__%d"]/span[starts-with(@id,"%s__")]' %
(self.word, lines_tempt_counter, self.word))
if len(self.double_lines) > 1:
self.black_indexes.append(lines_tempt_counter)
lines_tempt_counter += 1
# get some points
dictentry_temp_counter = 1
dictentry_last_coordinates = []
while dictentries_sum >= dictentry_temp_counter:
dictentry = self.tree.xpath('//span[@class="dictentry"][%s]//span[starts-with(@id,"%s__")]/@id' % (dictentry_temp_counter, self.word))
dictentry_last_coordinates.append(self.get_destination(dictentry[-1]))
dictentry_temp_counter += 1
print(dictentry_last_coordinates)
del dictentry, dictentry_temp_counter
# constant data
title = self.tree.xpath('//span[@class="pagetitle span"]/text()')
example_sum = 0
min_examples = 5 # minimum num of examples for each word
dictentry_temp_counter = 1
ampronounce = list(self.tree.xpath('//span[@class="speaker amefile fa fa-volume-up"]/@data-src-mp3'))[0].split()[0]
self.download_audio(ampronounce)
ampronounce = self.split_audio_url(ampronounce)
self.ampronounce = ampronounce
while lines_sum >= lines_counter:
if lines_counter in self.black_indexes:
print("oops, black list ->", lines_counter)
else:
element = self.getElements(dictentry_temp_counter, lines_counter)
definition = list(self.tree.xpath('//span[@id="%s__%d"]%sspan[@class="DEF"]'
% (self.word, lines_counter, '//' or '/')))
definition = self.check_span(definition)
try:
definition = etree.tostring(definition[0])
definition = definition.decode('utf-8')
except IndexError:
pass
#was here
element = self.rectify_elements(element)
if len(element["dictionary_source"]) == 0:
element["dictionary_source"] = "n/a"
example = list(self.tree.xpath('//span[@id="%s__%d"]//span[@class="EXAMPLE"]'
% (self.word, lines_counter)))
example_pronounce = list(self.tree.xpath('//span[@id="%s__%d"]//span[@class="EXAMPLE"]//span/@data-src-mp3' %
(self.word, lines_counter)))
example_temp_counter = 0
while example_temp_counter != len(example):
ex = etree.tostring(example[example_temp_counter])
ex = ex.decode('utf-8')
try:
example_pro = example_pronounce[example_temp_counter].split()[0]
self.download_audio(example_pro)
example_pro = self.split_audio_url(example_pro)
except IndexError:
example_pro = ""
ex_parsed = len(example)
example_sum = self.append_csv(element, ampronounce, definition, ex, example_pro, ex_parsed)
example_sum += len(example) #need to return
print(example_sum)
example_temp_counter += 1
if lines_counter in dictentry_last_coordinates and lines_counter != dictentry_last_coordinates[-1]:
dictentry_temp_counter += 1
print("WEEEE CHANGE TO -> ", dictentry_temp_counter)
lines_counter += 1
print("dictentry", dictentry_temp_counter-1, "exams", example_sum)
if example_sum <= min_examples:
self.get_corpus_examples()
#pass
def get_destination(self, obj): # get last element
return int(obj[len(self.word) + 2:])
def get_black_indexes(self, points):
for index in points:
self.black_indexes.append(self.get_destination(index))
return
def check_span(self, obj):
if len(obj) != 0:
return obj
else:
blank = ""
return blank
def rectify_elements(self, element):
for item in element:
if self.check_span(element[item]) == element[item]:
element[item] = self.check_span(element[item])
if len(element[item]) > 0 and len(element[item]) < 2:
element[item] = element[item][0]
element[item] = str(element[item]).encode()
element[item] = element[item].decode('utf-8')
else:
element[item] = self.extract_from_list(element[item])
else:
element[item] = ""
return element
def getElements(self, dictentry_temp_counter, lines_counter):
element = {}
element["dictionary_source"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="dictionary_intro span"]/text()'
% dictentry_temp_counter))
element["pos"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="POS"]/text()'
% (dictentry_temp_counter, '//' or '/')))
element["tags"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sa[@class="topic"]/text()'
% (dictentry_temp_counter, '//' or '/')))
element["hyphenation"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HYPHENATION"]/text()'
% (dictentry_temp_counter, '//' or '/')))
element["homnum"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HOMNUM"]/text()'
% (dictentry_temp_counter, '//' or '/')))
element["pron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="PRON"]/text()'
% (dictentry_temp_counter, '//' or '/')))
element["amevarpron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="AMEVARPRON"]/text()' % dictentry_temp_counter))
element["freq"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="FREQ"]/@title'
% (dictentry_temp_counter, '//' or '/')))
element["signpost"] = list(self.tree.xpath('//span[@id="%s__%d"]%sspan[@class="SIGNPOST"]/text()'
% (self.word, lines_counter, '//' or '/')))
element["gram"] = list(self.tree.xpath('//span[@id="%s__%d"]/span[@class="GRAM"]/text()'
% (self.word, lines_counter)))
return element
def append_csv(self, element, ampronounce, definition, ex, example_pro, ex_parsed=None):
example_sum = 0
example = ex
with open('dictionary-output.csv', 'a', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',',quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
writer.writerow([element["dictionary_source"], self.word, ampronounce, element["hyphenation"],
element["homnum"], element["pron"], element["amevarpron"], element["pos"],
element["freq"], element["signpost"], element["gram"], definition,
self.highlight_word(ex), example_pro, "", element["tags"]])
print(self.word, "was added to csv")
if ex_parsed != None:
example_sum += ex_parsed
return example_sum
else:
return True
def extract_from_list(self, row):
return '%s' % ', '.join(row)
def download_audio(self, url):
link_var = requests.get(url)
out = open(self.split_audio_url(url), "wb")
out.write(link_var.content)
out.close()
print("Audio was downloaded!")
return
def split_audio_url(self, raw):
return raw.split('/')[-1]
def highlight_word(self, word):
result = word.replace(self.word, '<span class="hightlight-word">%s</span>' % self.word)
return result
def get_corpus_examples(self):
dictentries_sum = len(self.tree.xpath('//span[@class="dictentry"]'))
counter = 1
while counter <= dictentries_sum:
corpus = list(self.tree.xpath('//span[@class="exaGroup cexa%s exaGroup"]' % counter))
definition = self.get_corpus_definition(counter)
if len(corpus) > 0:
for item in corpus:
one_corpus = etree.tostring(item)
one_corpus = one_corpus.decode('utf-8')
element = self.get_corpus_elements(counter)
element["dictionary_source"] = "Corpus(US)"
self.append_csv(element, self.ampronounce, definition, "", "")
#print(one_corpus)
else:
print('do not have corpus')
counter += 1
def get_corpus_definition(self, dict_id):
definition = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="DEF"]'
% (dict_id, '//' or '/')))
definition = self.check_span(definition)
try:
definition = etree.tostring(definition[0])
definition = definition.decode('utf-8')
return definition
except IndexError:
return ""
def get_corpus_elements(self, dictentry_temp_counter):
element = {}
element["pos"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="POS"]/text()'
% (dictentry_temp_counter, '//' or '/'))) or ""
element["tags"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sa[@class="topic"]/text()'
% (dictentry_temp_counter, '//' or '/'))) or ""
element["hyphenation"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HYPHENATION"]/text()'
% (dictentry_temp_counter, '//' or '/')))
element["homnum"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="HOMNUM"]/text()'
% (dictentry_temp_counter, '//' or '/'))) or ""
element["pron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="PRON"]/text()'
% (dictentry_temp_counter, '//' or '/'))) or ""
element["amevarpron"] = list(self.tree.xpath('//span[@class="dictentry"][%d]//span[@class="AMEVARPRON"]/text()'
% dictentry_temp_counter)) or ""
element["freq"] = list(self.tree.xpath('//span[@class="dictentry"][%d]%sspan[@class="FREQ"]/@title'
% (dictentry_temp_counter, '//' or '/'))) or ""
element["signpost"] = ""
element["gram"] = ""
element = self.rectify_elements(element)
print("from corpus", element)
return element
if __name__ == "__main__":
obj = Parser("caterpillar")