Python PDFMIner - PDF в CSV
Я хочу иметь возможность конвертировать PDF-файлы в CSV-файлы и нашел несколько полезных скриптов, но, будучи новичком в Python, у меня возник вопрос:
Где вы указываете путь к файлу PDF и CSV, который вы хотите распечатать?
Я использую Python 2.7.11 и PDFMiner 20140328.
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
fp = file(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
print data
if __name__ == '__main__':
pdfparser(sys.argv[1])
1 ответ
Вот некоторый модифицированный код из этого SO ответа, написанного tgray:
def pdf_to_csv(filename, separator, threshold):
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
self.separator = separator
self.threshold = threshold
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda: {})
for child in self.cur_item._objs: # <-- changed
if isinstance(child, LTChar):
(_, _, x, y) = child.bbox
line = lines[int(-y)]
line[x] = child._text.encode(self.codec) # <-- changed
for y in sorted(lines.keys()):
line = lines[y]
self.line_creator(line)
self.outfp.write(self.line_creator(line))
self.outfp.write("\n")
def line_creator(self, line):
keys = sorted(line.keys())
# calculate the average distange between each character on this row
average_distance = sum([keys[i] - keys[i - 1] for i in range(1, len(keys))]) / len(keys)
# append the first character to the result
result = [line[keys[0]]]
for i in range(1, len(keys)):
# if the distance between this character and the last character is greater than the average*threshold
if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
# append the separator into that position
result.append(self.separator)
# append the character
result.append(line[keys[i]])
printable_line = ''.join(result)
return printable_line
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
# becuase my test documents are utf-8 (note: utf-8 is the default codec)
fp = open(filename, 'rb')
interpreter = PDFPageInterpreter(rsrc, device)
for i, page in enumerate(PDFPage.get_pages(fp)):
outfp.write("START PAGE %d\n" % i)
if page is not None:
print 'none'
interpreter.process_page(page)
outfp.write("END PAGE %d\n" % i)
device.close()
fp.close()
return outfp.getvalue()
if __name__ == '__main__':
# the separator to use with the CSV
separator = ';'
# the distance multiplier after which a character is considered part of a new word/column/block. Usually 1.5 works quite well
threshold = 1.5
print pdf_to_csv('myLovelyFile.pdf', separator, threshold)
Основным различием между ответом в ссылке и этим является метод line_creator, который пытается извлечь некоторую структуру из PDF.
Должен работать с PDFminer 20140328.