Spaces:
Sleeping
Sleeping
import PyPDF2, os, re | |
import pandas as pd | |
class InvoiceConvertor:
    """
    Collect the text of invoice PDFs and turn the recognised fields into CSV rows.

    Only files whose name starts with 'invoice' and ends with '.pdf' are read.
    Field extraction is regex based and tied to one fixed invoice layout.

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()
    """

    # (output column, regex) pairs; group 1 of each regex is the stored value.
    # The order here is the column order of the resulting CSV.
    _FIELD_PATTERNS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    def __init__(self):
        # Raw extracted text, one string per PDF read so far.
        self.invoices = []

    def read_pdfs(self, path):
        """
        Read every 'invoice*.pdf' file in ``path`` and store its text.

        Texts are appended to ``self.invoices``, so calling this method again
        adds to the texts that are already stored.

        Args:
            path: Directory to scan (with or without a trailing separator).

        Returns:
            The list ``self.invoices`` with all texts collected so far.
        """
        # Sorted so the row order of the resulting CSV is reproducible.
        for file in sorted(os.listdir(path)):
            # The extension check keeps non-PDF files such as the generated
            # 'invoice.csv' from being handed to the PDF reader.
            if not (file.startswith('invoice') and file.lower().endswith('.pdf')):
                continue
            # 'with' closes the handle even if the PDF cannot be parsed.
            with open(os.path.join(path, file), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # 'or ""' guards against a page yielding no text at all.
                text = ''.join(
                    page.extract_text() or '' for page in pdf_reader.pages
                )
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as="invoice.csv"):
        """
        Append one invoice as a single row to the CSV file ``save_as``.

        The file is created when it does not exist yet; otherwise the existing
        rows are kept and the new row is added below them.

        Args:
            details: Mapping of column name to scalar value for one invoice.
            save_as: Path of the CSV file to create or extend.
        """
        df = pd.DataFrame(details, index=[0])
        if os.path.exists(save_as):
            # dtype=str keeps earlier rows exactly as written; type inference
            # would rewrite values such as '118.00' -> 118.0 or '001' -> 1.
            existing = pd.read_csv(save_as, dtype=str)
            df = pd.concat([existing, df], ignore_index=True)
        df.to_csv(save_as, index=False)

    def extract_invoice_details(self, text):
        """
        Extract the known invoice fields from the text of one invoice.

        Every field is looked up independently: a field that cannot be found
        is reported on stdout and left out of the result, and the remaining
        fields are still extracted.

        Args:
            text: Text of a single invoice as produced by ``read_pdfs``.

        Returns:
            dict mapping field name to the extracted string. Possible keys are
            the names in ``_FIELD_PATTERNS`` plus 'Item', 'Total Amount',
            'GSTIN' and 'Sold By'.
        """
        invoice_details = {}
        for field, pattern in self._FIELD_PATTERNS:
            match = re.search(pattern, text)
            if match:
                invoice_details[field] = match.group(1).strip()
            else:
                print(f'{field} not found')

        # Item name: the text between '\nAmount\n1' and the next rupee sign,
        # searched only in the part of the invoice that precedes 'TOTAL:'.
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text)
        item_name_match = None
        if item_match:
            item_name_match = re.search(
                r'\nAmount\n1([\s\S]+?)₹', item_match.group(1)
            )
        if item_name_match:
            invoice_details['Item'] = item_name_match.group(1).strip()
        else:
            print("No item found in the invoice.")

        # Total: first line of the text after the second rupee sign that
        # follows 'TOTAL:'.
        # NOTE(review): presumably the first amount is the tax and the second
        # the grand total - confirm against a real invoice.
        total_match = re.search(r'TOTAL:([\s\S]+?)only', text)
        amounts = total_match.group(1).split('₹') if total_match else []
        if len(amounts) > 2:
            invoice_details['Total Amount'] = amounts[2].split('\n')[0]
        else:
            print("No total amount found in the invoice.")

        # The GSTIN value runs up to the first space after the label.
        gstin_match = re.search(r'GST Registration No: ([\s\S]+?) ', text)
        if gstin_match:
            invoice_details['GSTIN'] = gstin_match.group(1).strip()
        else:
            print("No GSTIN found in the invoice.")

        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
        if by_match:
            invoice_details['Sold By'] = by_match.group(1).strip()
        else:
            print("No seller found in the invoice.")
        return invoice_details

    def convert(self, save_as="invoice.csv"):
        """
        Extract all stored invoices and append them to the CSV ``save_as``.

        Rows are appended to an already existing file, so converting the same
        invoices twice stores them twice.

        Args:
            save_as: Path of the CSV file to create or extend.

        Returns:
            DataFrame read back from ``save_as``; an empty DataFrame when the
            file does not exist because nothing was written.
        """
        for invoice in self.invoices:
            details = self.extract_invoice_details(invoice)
            # An invoice without any recognised field would produce a row
            # without columns, so it is skipped.
            if details:
                self.save_as_csv(details, save_as)
        if not os.path.exists(save_as):
            return pd.DataFrame()
        return pd.read_csv(save_as)