# betterzila_assignment / invoice_convertor.py
# Pratik Dwivedi
# New App
# 25b98b6
import PyPDF2, os, re
import pandas as pd
class InvoiceConvertor:
    """Read invoice PDFs, pull out a fixed set of fields and store them in a CSV.

    Only files whose name starts with ``invoice`` and ends with ``.pdf`` are
    read. The field labels and regular expressions are hard-coded for one
    specific invoice layout; fields that cannot be found are simply left out
    of the result (and reported on stdout).

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()
    """

    # (output column, pattern) pairs for fields that sit directly after a label.
    # Group 1 of each pattern is the value; the order here is the column order.
    _LABELLED_FIELDS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    def __init__(self):
        # Extracted text of every invoice read so far, one string per PDF.
        self.invoices = []

    def read_pdfs(self, path):
        """Extract the text of every invoice PDF in *path*.

        Args:
            path: Directory to scan. A trailing path separator is optional.

        Returns:
            ``self.invoices``, the accumulated list of extracted texts
            (repeated calls keep appending to the same list).
        """
        # Sorted so that the row order of the resulting CSV is reproducible.
        for file_name in sorted(os.listdir(path)):
            if not file_name.startswith('invoice'):
                continue
            # Skip non-PDF files such as the 'invoice.csv' this class writes,
            # which would otherwise be handed to the PDF parser.
            if not file_name.lower().endswith('.pdf'):
                continue
            # 'with' guarantees the handle is closed even if parsing fails.
            with open(os.path.join(path, file_name), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''.join(
                    page.extract_text() or '' for page in pdf_reader.pages
                )
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as="invoice.csv"):
        """Append one invoice's fields as a row to the CSV file *save_as*.

        The file is created if it does not exist. Columns are the union of
        the existing columns and the keys of *details*.

        Args:
            details: Mapping of column name to scalar value for one invoice.
                An empty mapping is ignored.
            save_as: Path of the CSV file to create or extend.
        """
        if not details:
            # A row without columns would produce a CSV that cannot be re-read.
            return
        frame = pd.DataFrame(details, index=[0])
        if os.path.exists(save_as):
            try:
                # dtype=str keeps values such as '007' or '499.00' exactly as
                # written instead of re-inferring them as numbers.
                existing = pd.read_csv(save_as, dtype=str)
            except pd.errors.EmptyDataError:
                existing = None
            if existing is not None:
                frame = pd.concat([existing, frame], ignore_index=True)
        frame.to_csv(save_as, index=False)

    @staticmethod
    def _search(pattern, text):
        """Return group 1 of the first match of *pattern*, stripped, or None."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else None

    @staticmethod
    def _extract_item(text):
        """Return the first item description of the invoice, or None."""
        # Narrow the search to the text between the first '1' and 'TOTAL:'.
        table = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
        if not table:
            return None
        # The description follows the 'Amount' header and the row number '1'
        # and runs up to the first rupee sign.
        name = re.search(r'\nAmount\n1([\s\S]+?)₹', table.group(1))
        return name.group(1).strip() if name else None

    @staticmethod
    def _extract_total(text):
        """Return the invoice total as printed (text), or None."""
        summary = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
        if not summary:
            return None
        # The total is the value after the second rupee sign in this section.
        parts = summary.group(1).split('₹')
        if len(parts) < 3:
            return None
        return parts[2].split('\n')[0]

    def extract_invoice_details(self, text):
        """Parse the text of one invoice into a dict of fields.

        Every field is looked up independently, so a missing label only
        affects its own field. Missing fields are omitted from the result
        and a message is printed for each of them.

        Args:
            text: Full extracted text of a single invoice.

        Returns:
            Dict with any of the keys 'Order Number', 'Invoice Number',
            'Order Date', 'Invoice Details', 'Invoice Date',
            'Billing Address', 'Shipping Address', 'PAN', 'Item',
            'Total Amount', 'GSTIN' and 'Sold By', in that order.
        """
        invoice_details = {}

        for field, pattern in self._LABELLED_FIELDS:
            value = self._search(pattern, text)
            if value is None:
                print(f'{field} not found')
            else:
                invoice_details[field] = value

        item_name = self._extract_item(text)
        if item_name is None:
            print("No item found in the invoice.")
        else:
            invoice_details['Item'] = item_name

        total_amount = self._extract_total(text)
        if total_amount is None:
            print("No total amount found in the invoice.")
        else:
            invoice_details['Total Amount'] = total_amount

        gstin = self._search(r'GST Registration No: ([\s\S]+?) ', text)
        if gstin is None:
            print("No GSTIN found in the invoice.")
        else:
            invoice_details['GSTIN'] = gstin

        seller = self._search(r'By :([\s\S]+?)PAN No:', text)
        if seller is None:
            print("No seller found in the invoice.")
        else:
            invoice_details['Sold By'] = seller

        return invoice_details

    def convert(self, save_as="invoice.csv"):
        """Append every invoice read so far to *save_as* and return the CSV.

        Rows are appended to an already existing file, so calling this twice
        writes the same invoices twice.

        Args:
            save_as: Path of the CSV file to create or extend.

        Returns:
            The full content of *save_as* as a DataFrame; an empty DataFrame
            if the file does not exist or holds no data.
        """
        for invoice in self.invoices:
            self.save_as_csv(self.extract_invoice_details(invoice), save_as)
        if not os.path.exists(save_as):
            return pd.DataFrame()
        try:
            return pd.read_csv(save_as)
        except pd.errors.EmptyDataError:
            return pd.DataFrame()