import pyPdf
def getPDFContent():
content = ""
# Load PDF into pyPDF
pdf = pyPdf.PdfFileReader(file(filenamePdf, 'rb'))
# Iterate pages
for i in range(0, pdf.getNumPages()):
# Extract text from page and add to content
content += pdf.getPage(i).extractText() + " \n"
# Collapse whitespace
content = u" ".join(content.replace(u"\xa0", u" ").strip().split())
return content
f = open(filenameTxt,'w+')
f.write(getPDFContent())
f.close()
но при попытке парсинга вываливает:
line 8, in getPDFContent for i in range(0, pdf.getNumPages()):
line 534, in readFromStream
raise utils.PdfReadError, "multiple definitions in dictionary"
pyPdf.utils.PdfReadError: multiple definitions in dictionary