import os
import re

import pandas as pd

try:
    import PyPDF2
except ImportError:
    # Only read_pdfs() needs the PDF backend; text parsing and CSV export
    # remain usable (and testable) without it.
    PyPDF2 = None


class InvoiceConvertor:
    """
    Read every PDF whose file name starts with 'invoice' in a user-supplied
    directory, pull the invoice fields out of the extracted text with regular
    expressions, and append them as rows to a CSV file.

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()
    """

    # (column name, pattern) pairs for fields captured by a single regex
    # group. The order here is the column order of the resulting record.
    _FIELD_PATTERNS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    def __init__(self):
        # Raw extracted text, one string per PDF that has been read.
        self.invoices = []

    def read_pdfs(self, path: str) -> list:
        """
        Extract the text of every 'invoice*.pdf' file found in `path`.

        The text of each file (all pages concatenated) is appended to
        `self.invoices`. Files are processed in sorted name order so that
        repeated runs produce rows in the same order.

        Args:
            path: Directory containing the invoice PDFs. A trailing path
                separator is optional.

        Returns:
            `self.invoices`, i.e. all texts read so far by this instance.

        Raises:
            ImportError: If PyPDF2 is not installed.
            OSError: If `path` cannot be listed or a file cannot be opened.
        """
        if PyPDF2 is None:
            raise ImportError("PyPDF2 is required to read PDF files.")

        for file_name in sorted(os.listdir(path)):
            # The '.pdf' check keeps the class from trying to parse its own
            # 'invoice.csv' output when it lives in the same directory.
            if not file_name.startswith('invoice'):
                continue
            if not file_name.lower().endswith('.pdf'):
                continue

            with open(os.path.join(path, file_name), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # Treat a page with no extractable text as empty rather
                # than failing on a None result.
                text = ''.join(
                    page.extract_text() or '' for page in pdf_reader.pages
                )
            self.invoices.append(text)
        return self.invoices

    def _append_rows(self, rows, save_as):
        """Append the non-empty dict records in `rows` to the CSV `save_as`."""
        rows = [row for row in rows if row]
        if not rows:
            return

        frame = pd.DataFrame(rows)
        # A zero-byte file cannot be parsed by read_csv; treat it as absent.
        if os.path.exists(save_as) and os.path.getsize(save_as) > 0:
            existing = pd.read_csv(save_as)
            frame = pd.concat([existing, frame], ignore_index=True)
        frame.to_csv(save_as, index=False)

    def save_as_csv(self, details, save_as="invoice.csv"):
        """
        Append one invoice record to a CSV file, creating it if necessary.

        Args:
            details: Mapping of column name to scalar value for one invoice.
                An empty mapping is ignored so that no blank row is written.
            save_as: Path of the CSV file to create or extend.
        """
        self._append_rows([details], save_as)

    def extract_invoice_details(self, text: str) -> dict:
        """
        Parse the invoice fields out of the text extracted from one PDF.

        Every field is looked up independently: a field that cannot be found
        is reported on stdout and left out of the result, without affecting
        the remaining fields.

        Args:
            text: Full extracted text of a single invoice.

        Returns:
            Dict with any of the keys 'Order Number', 'Invoice Number',
            'Order Date', 'Invoice Details', 'Invoice Date',
            'Billing Address', 'Shipping Address', 'PAN', 'Item',
            'Total Amount', 'GSTIN' and 'Sold By' that were found.
        """
        invoice_details = {}

        for field, pattern in self._FIELD_PATTERNS:
            match = re.search(pattern, text)
            if match:
                invoice_details[field] = match.group(1).strip()
            else:
                print(f"{field} not found in the invoice.")

        # The item name sits between the '1' that follows the 'Amount'
        # column header and the first rupee sign, somewhere before 'TOTAL:'.
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text)
        name_match = None
        if item_match:
            name_match = re.search(
                r'\nAmount\n1([\s\S]+?)₹', item_match.group(1)
            )
        if name_match:
            invoice_details['Item'] = name_match.group(1).strip()
        else:
            print("No item found in the invoice.")

        # Between 'TOTAL:' and 'only' the second rupee amount is taken as
        # the grand total; only the rest of its line is kept.
        total_match = re.search(r'TOTAL:([\s\S]+?)only', text)
        amounts = total_match.group(1).split('₹') if total_match else []
        if len(amounts) > 2:
            invoice_details['Total Amount'] = amounts[2].split('\n')[0]
        else:
            print("No total amount found in the invoice.")

        gstin_match = re.search(r'GST Registration No: ([\s\S]+?) ', text)
        if gstin_match:
            invoice_details['GSTIN'] = gstin_match.group(1).strip()
        else:
            print("No GSTIN found in the invoice.")

        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
        if by_match:
            invoice_details['Sold By'] = by_match.group(1).strip()
        else:
            print("No seller found in the invoice.")

        return invoice_details

    def convert(self, save_as="invoice.csv"):
        """
        Parse every invoice read so far and append the records to a CSV.

        Rows are appended to whatever `save_as` already contains, so calling
        this twice on the same instance writes the same invoices twice.

        Args:
            save_as: Path of the CSV file to create or extend.

        Returns:
            DataFrame with the full contents of `save_as` after the update,
            or an empty DataFrame if the file does not exist because there
            was nothing to write.
        """
        records = [
            self.extract_invoice_details(invoice) for invoice in self.invoices
        ]
        # Single read/append/write instead of one round trip per invoice.
        self._append_rows(records, save_as)

        if not os.path.exists(save_as):
            return pd.DataFrame()
        return pd.read_csv(save_as)