Spaces:

Dekode
/

betterzila_assignment

Sleeping

betterzila_assignment / invoice_convertor.py

Pratik Dwivedi

New App

25b98b6 about 1 year ago

3.75 kB

	import PyPDF2, os, re
	import pandas as pd

	class InvoiceConvertor:
	    """
	    Read PDF invoices from a directory and convert them to rows of a CSV file.

	    Only files whose name starts with 'invoice' and ends with '.pdf' are read.
	    Fields are pulled out of the extracted text with fixed regular
	    expressions, so the class is tied to one specific invoice layout.

	    Usage:
	        convertor = InvoiceConvertor()
	        convertor.read_pdfs('path_to_pdfs')
	        result_df = convertor.convert()
	    """

	    # (output column, pattern) pairs for the header fields. Group 1 of each
	    # pattern is the value; the order here is the column order in the CSV.
	    _HEADER_PATTERNS = (
	        ('Order Number', r'Order Number: (\S+)'),
	        ('Invoice Number', r'Invoice Number : (\S+)'),
	        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
	        ('Invoice Details', r'Invoice Details : (\S+)'),
	        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
	        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
	        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
	        ('PAN', r'PAN No:(\S+)'),
	    )

	    def __init__(self):
	        # Raw text of every invoice read so far, one string per PDF.
	        self.invoices = []

	    def read_pdfs(self, path):
	        """
	        Extract the text of every 'invoice*.pdf' file found directly in `path`.

	        The text of each matching file (all pages concatenated) is appended
	        to `self.invoices`; repeated calls accumulate.

	        Args:
	            path: Directory to scan. A trailing separator is optional.

	        Returns:
	            The `self.invoices` list.
	        """
	        # Sorted so the row order of the resulting CSV is reproducible.
	        for file_name in sorted(os.listdir(path)):
	            if not file_name.startswith('invoice'):
	                continue
	            # Skip non-PDF files such as the generated 'invoice.csv'.
	            if not file_name.lower().endswith('.pdf'):
	                continue
	            with open(os.path.join(path, file_name), 'rb') as pdf_file:
	                pdf_reader = PyPDF2.PdfReader(pdf_file)
	                text = ''.join(
	                    page.extract_text() or '' for page in pdf_reader.pages
	                )
	            self.invoices.append(text)
	        return self.invoices

	    def save_as_csv(self, details, save_as="invoice.csv"):
	        """
	        Append one invoice's details as a row of the CSV file `save_as`.

	        If the file already exists its content is loaded and the new row is
	        concatenated to it; otherwise a new file is created.

	        Args:
	            details: Mapping of column name to scalar value for one invoice.
	            save_as: Path of the CSV file to create or extend.
	        """
	        if not details:
	            # Nothing was extracted; writing would produce a header-less
	            # file (or a blank row) that cannot be read back reliably.
	            return
	        row = pd.DataFrame(details, index=[0])
	        if os.path.exists(save_as):
	            existing = pd.read_csv(save_as)
	            row = pd.concat([existing, row], ignore_index=True)
	        row.to_csv(save_as, index=False)

	    @staticmethod
	    def _search_group(pattern, text):
	        """Return group 1 of the first match of `pattern` in `text`, else None."""
	        match = re.search(pattern, text)
	        return match.group(1) if match else None

	    def extract_invoice_details(self, text):
	        """
	        Parse the text of one invoice into a dictionary of fields.

	        Every field is looked up independently: a field that cannot be found
	        is reported with a printed message and left out of the result, and
	        does not prevent the remaining fields from being extracted.

	        Args:
	            text: Full text of one invoice.

	        Returns:
	            A dict with any of the keys 'Order Number', 'Invoice Number',
	            'Order Date', 'Invoice Details', 'Invoice Date',
	            'Billing Address', 'Shipping Address', 'PAN', 'Item',
	            'Total Amount', 'GSTIN' and 'Sold By' that were found.
	        """
	        invoice_details = {}

	        for field, pattern in self._HEADER_PATTERNS:
	            value = self._search_group(pattern, text)
	            if value is None:
	                print(f"{field} not found in the invoice.")
	            else:
	                invoice_details[field] = value.strip()

	        # Item name: inside the span running from the first '1' in the text
	        # up to 'TOTAL:', take what sits between the 'Amount' column header
	        # followed by row number 1 and the first rupee sign.
	        item_name = None
	        item_info = self._search_group(r'1([\s\S]+?)TOTAL:', text)
	        if item_info is not None:
	            item_name = self._search_group(r'\nAmount\n1([\s\S]+?)₹', item_info)
	        if item_name is not None:
	            invoice_details['Item'] = item_name.strip()
	        else:
	            print("No item found in the invoice.")

	        # Total amount: the second rupee figure after 'TOTAL:', cut at the
	        # end of its line.
	        total_info = self._search_group(r'TOTAL:([\s\S]+?)only', text)
	        amounts = total_info.split('₹') if total_info is not None else []
	        if len(amounts) > 2:
	            invoice_details['Total Amount'] = amounts[2].split('\n')[0]
	        else:
	            print("No total amount found in the invoice.")

	        # \S+ stops at any whitespace, so a number that ends its line is not
	        # merged with the start of the following line.
	        gstin = self._search_group(r'GST Registration No:\s*(\S+)', text)
	        if gstin is not None:
	            invoice_details['GSTIN'] = gstin
	        else:
	            print("No GSTIN found in the invoice.")

	        seller = self._search_group(r'By :([\s\S]+?)PAN No:', text)
	        if seller is not None:
	            invoice_details['Sold By'] = seller.strip()
	        else:
	            print("No seller found in the invoice.")

	        return invoice_details

	    def convert(self, save_as="invoice.csv"):
	        """
	        Extract the details of every loaded invoice and append them to a CSV.

	        Args:
	            save_as: Path of the CSV file to create or extend.

	        Returns:
	            A DataFrame with the full content of `save_as` after writing, or
	            an empty DataFrame when nothing was written and the file does
	            not exist.
	        """
	        for invoice in self.invoices:
	            details = self.extract_invoice_details(invoice)
	            self.save_as_csv(details, save_as)
	        if not os.path.exists(save_as):
	            return pd.DataFrame()
	        return pd.read_csv(save_as)