Форум сайта python.su
Добрый день! Есть веб-ресурс со списком различных названий, которые я извлекаю с помощью html5lib + lxml. Первые 10–15 страниц распарсились нормально (на каждой по 50 значений), но затем при разборе очередной страницы я получил следующую ошибку:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "vkp.py", line 32, in get_url
return html5lib.parse(t, treebuilder = "lxml", namespaceHTMLElements = False), t
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 38, in parse
return p.parse(doc, encoding=encoding)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 211, in parse
parseMeta=parseMeta, useChardet=useChardet)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 111, in _parse
self.mainLoop()
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 179, in mainLoop
self.phase.processStartTag(token)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 447, in processStartTag
self.startTagHandler[token["name"]](token)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 447, in processStartTag
self.startTagHandler[token["name"]](token)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 1050, in startTagA
self.addFormattingElement(token)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 918, in addFormattingElement
self.tree.insertElement(token)
File "/usr/lib/pymodules/python2.6/html5lib/treebuilders/_base.py", line 259, in insertElementNormal
element.attributes = token["data"]
File "/usr/lib/pymodules/python2.6/html5lib/treebuilders/etree_lxml.py", line 219, in _setAttributes
self._attributes = Attributes(self, attributes)
File "/usr/lib/pymodules/python2.6/html5lib/treebuilders/etree_lxml.py", line 189, in __init__
self._element._element.attrib[name] = value
File "lxml.etree.pyx", line 1945, in lxml.etree._Attrib.__setitem__ (src/lxml/lxml.etree.c:42933)
File "apihelpers.pxi", line 488, in lxml.etree._setAttributeValue (src/lxml/lxml.etree.c:13918)
File "apihelpers.pxi", line 1295, in lxml.etree._utf8 (src/lxml/lxml.etree.c:20212)
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes
Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
Офлайн