File size: 3,745 Bytes
25b98b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import PyPDF2, os, re
import pandas as pd

class InvoiceConvertor():
    """
    Read the PDF files whose names start with 'invoice' from a directory and
    convert the details found in their text into rows of a CSV file.

    Usage:
    convertor = InvoiceConvertor()
    convertor.read_pdfs('path_to_pdfs')
    result_df = convertor.convert()

    Note: rows are appended to an already existing CSV, and convert() walks
    over every text held in self.invoices, so calling convert() twice on the
    same instance writes the same rows twice.
    """

    # (output key, regex) pairs for the fields captured by a single group.
    # Each pattern is searched independently so that one missing field does
    # not prevent the others from being extracted.
    _SIMPLE_FIELDS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    def __init__(self):
        # Extracted text of every invoice read so far, one string per PDF.
        self.invoices = []

    def read_pdfs(self, path):
        """
        Extract the text of every 'invoice*.pdf' file found directly in path.

        Args:
            path: directory to scan (with or without a trailing separator).

        Returns:
            self.invoices, extended with one text string per PDF read.
        """
        # Sorted so the resulting row order does not depend on the OS listing order.
        for file in sorted(os.listdir(path)):
            # Require the .pdf suffix as well: the default output file
            # 'invoice.csv' also starts with 'invoice' and is not a PDF.
            if not file.startswith('invoice') or not file.lower().endswith('.pdf'):
                continue
            full_path = os.path.join(path, file)
            if not os.path.isfile(full_path):
                continue
            # The pages are read while the file is still open; the context
            # manager closes it even if parsing raises.
            with open(full_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''.join(
                    page.extract_text() or '' for page in pdf_reader.pages
                )
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as="invoice.csv"):
        """
        Append one invoice (a dict of field -> value) as a row of a CSV file.

        If save_as already exists its rows are kept and the new row is
        appended; otherwise the file is created. An empty details dict is
        skipped, since it would only produce a row without any data.
        """
        if not details:
            print("No invoice details to save.")
            return

        new_row = pd.DataFrame([details])
        existing = None
        if os.path.exists(save_as):
            try:
                existing = pd.read_csv(save_as)
            except pd.errors.EmptyDataError:
                # The file exists but has no header/rows: treat it as new.
                existing = None

        if existing is None:
            df = new_row
        else:
            df = pd.concat([existing, new_row], ignore_index=True)
        df.to_csv(save_as, index=False)

    @staticmethod
    def _first_group(pattern, text):
        """Return the stripped first group of pattern in text, or None if absent."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else None

    def extract_invoice_details(self, text):
        """
        Pull the known invoice fields out of the text of one invoice.

        Args:
            text: the text extracted from one invoice PDF.

        Returns:
            dict with an entry for every field that was found. Fields that
            are missing are left out and reported with a printed message;
            no exception is raised for a missing or malformed field.
        """
        invoice_details = {}

        for field, pattern in self._SIMPLE_FIELDS:
            value = self._first_group(pattern, text)
            if value is None:
                print(f"{field} not found in the invoice.")
            else:
                invoice_details[field] = value

        # Item name: sits between the "Amount" column header followed by the
        # row number 1 and the first rupee sign, inside the part before TOTAL:.
        item_name = None
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
        if item_match:
            item_name = self._first_group(
                r'\nAmount\n1([\s\S]+?)₹', item_match.group(1)
            )
        if item_name is None:
            print("No item found in the invoice.")
        else:
            invoice_details['Item'] = item_name

        # Total amount: the value after the second rupee sign that follows
        # TOTAL:, up to the end of that line.
        total_amount = None
        total_match = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
        if total_match:
            parts = total_match.group(1).split('₹')
            if len(parts) > 2:
                total_amount = parts[2].split('\n')[0]
        if total_amount is None:
            print("No total amount found in the invoice.")
        else:
            invoice_details['Total Amount'] = total_amount

        gstin = self._first_group(r'GST Registration No: ([\s\S]+?) ', text)
        if gstin is None:
            print("No GSTIN found in the invoice.")
        else:
            invoice_details['GSTIN'] = gstin

        seller = self._first_group(r'By :([\s\S]+?)PAN No:', text)
        if seller is None:
            print("No seller found in the invoice.")
        else:
            invoice_details['Sold By'] = seller

        return invoice_details

    def convert(self, save_as="invoice.csv"):
        """
        Extract the details of every text in self.invoices, append them to
        the CSV file save_as and return that file's content.

        Args:
            save_as: path of the CSV file to append to and read back.

        Returns:
            DataFrame read back from save_as; an empty DataFrame when the
            file does not exist or is empty (nothing was written).
        """
        for invoice in self.invoices:
            details = self.extract_invoice_details(invoice)
            self.save_as_csv(details, save_as=save_as)

        if not os.path.exists(save_as):
            return pd.DataFrame()
        try:
            return pd.read_csv(save_as)
        except pd.errors.EmptyDataError:
            return pd.DataFrame()