PDF to Excel using advanced Python NLP and Computer Vision (a.k.a. Document AI) - Part 1
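Installation (the pip lines install the Python packages; the brew line is a Homebrew command for the system libraries):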
pip install matplotlib
pip install "camelot-py[cv]"
brew install ghostscript tcl-tk
import camelot
#import camelot.io as camelot
import pandas as pd
import PyPDF2 as pyPdf
import matplotlib.pyplot as plt
#import tabula
# Extract the tables from the PDF, then draw camelot's 'text' and 'grid'
# plots for the first extracted table, showing each figure in turn.
tables = camelot.read_pdf('icic.pdf')
first_table = tables[0]
for plot_kind in ('text', 'grid'):
    camelot.plot(first_table, kind=plot_kind)
    plt.show()
# Count the pages of the PDF, then run read_pdf once per page so that the
# first page can be limited to a fixed region while later pages are read whole.
#
# NOTE(review): PdfFileReader / getNumPages are the legacy PyPDF2 names;
# PyPDF2 3.x only offers PdfReader and len(reader.pages) -- confirm which
# version is installed before upgrading.
with open("icic.pdf", mode='rb') as pdf_file:
    # The handle is only needed to count pages; close it as soon as that is done.
    reader = pyPdf.PdfFileReader(pdf_file)
    n = reader.getNumPages()

# NOTE(review): read_pdf is not bound by any import visible in this file
# (the tabula import above is commented out). The `area=` / `pages=` keywords
# look like tabula-py's read_pdf -- confirm and restore that import, otherwise
# this loop raises NameError.
df = []  # one read_pdf result per page, in page order
for page in map(str, range(1, n + 1)):  # page numbers are 1-based strings
    if page == "1":
        # First page only: restrict extraction to a fixed rectangle.
        # Presumably (top, left, bottom, right) in points -- TODO confirm.
        df.append(read_pdf(r"icic.pdf", area=(530, 12.75, 790.5, 561), pages=page))
    else:
        df.append(read_pdf(r"icic.pdf", pages=page))
Comments
Post a Comment