Pratik Dwivedi committed on
Commit
25b98b6
·
1 Parent(s): 837a786
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from invoice_convertor import InvoiceConvertor
4
def main():
    """Render the Streamlit page that turns uploaded invoice PDFs into a CSV.

    Uploaded PDFs are staged in the local ``data/`` directory, handed to
    ``InvoiceConvertor`` for text extraction, and the resulting table is shown
    on the page together with a download button.  The staged PDFs are removed
    afterwards, even when conversion fails.  A separate button deletes the
    ``invoice.csv`` file in the working directory.
    """
    st.set_page_config(layout="wide")
    st.title('Amazon Invoice Convertor')
    st.write('This app converts your Amazon invoice pdfs to a csv file.')
    convertor = InvoiceConvertor()
    files = st.file_uploader('Upload your invoice pdfs', type=['pdf'], accept_multiple_files=True)
    if files:
        # The staging directory does not exist on a fresh checkout/deployment.
        os.makedirs('data/', exist_ok=True)
        try:
            for file in files:
                # Keep only the last path component so a crafted upload name
                # cannot write outside of 'data/'.
                safe_name = os.path.basename(file.name)
                if not safe_name:
                    continue
                with open(os.path.join('data/', safe_name), 'wb') as f:
                    f.write(file.getbuffer())
            # read_pdfs returns the list of extracted texts; it only picks up
            # files whose names start with 'invoice'.
            invoices = convertor.read_pdfs('data/')
            if invoices:
                result_df = convertor.convert()
                st.write(result_df)
                st.download_button('Download csv', data=result_df.to_csv(), file_name='invoice.csv', mime='text/csv')
            else:
                # Without this guard convert() would run with nothing to
                # convert and either fail or show stale data.
                st.warning("None of the uploaded files has a name starting with 'invoice'; nothing to convert.")
        finally:
            # Always drop the staged uploads so they are not re-read on the
            # next run of the script.
            for name in os.listdir('data/'):
                os.remove(os.path.join('data/', name))
    if st.button('Clear csv file') and os.path.exists('invoice.csv'):
        os.remove('invoice.csv')


if __name__ == '__main__':
    main()
extractor.ipynb ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import PyPDF2, os\n",
10
+ "import pandas as pd"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "def read_pdf(path):\n",
20
+ " pdf_file = open(path, 'rb')\n",
21
+ " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
22
+ " text = ''\n",
23
+ " for page_num in range(len(pdf_reader.pages)):\n",
24
+ " page = pdf_reader.pages[page_num]\n",
25
+ " text += page.extract_text()\n",
26
+ " pdf_file.close()\n",
27
+ " return text\n",
28
+ "\n",
29
+ "invoices = []\n",
30
+ "path = 'invoices/'\n",
31
+ "\n",
32
+ "for file in os.listdir(path):\n",
33
+ " if file.startswith('invoice'):\n",
34
+ " text = read_pdf(path + file)\n",
35
+ " print(text)\n",
36
+ " invoices.append(text)"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "import os\n",
46
+ "def save_as_csv(details, save_as = \"invoice.csv\"):\n",
47
+ " # if the csv already exists then concat a new one to it, else create a new one\n",
48
+ " if os.path.exists(save_as):\n",
49
+ " df = pd.read_csv(save_as)\n",
50
+ " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
51
+ " else: \n",
52
+ " df = pd.DataFrame(details, index=[0])\n",
53
+ " df.to_csv(save_as, index=False)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "import re\n",
63
+ "\n",
64
+ "def extract_invoice_details(text):\n",
65
+ " invoice_details = {}\n",
66
+ " try:\n",
67
+ " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
68
+ " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
69
+ " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
70
+ " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
71
+ " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
72
+ " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
73
+ " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
74
+ " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
75
+ " except:\n",
76
+ " print('Order Number not found')\n",
77
+ " \n",
78
+ " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
79
+ " if item_match:\n",
80
+ " item_info = item_match.group(1)\n",
81
+ " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
82
+ " invoice_details['Item'] = item_name\n",
83
+ " print(item_name)\n",
84
+ " else:\n",
85
+ " print(\"No item found in the invoice.\")\n",
86
+ " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
87
+ " if total_mount_match:\n",
88
+ " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
89
+ " invoice_details['Total Amount'] = total_mount\n",
90
+ " else:\n",
91
+ " print(\"No total amount found in the invoice.\")\n",
92
+ " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
93
+ " if gstin_match:\n",
94
+ " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
95
+ " else:\n",
96
+ " print(\"No GSTIN found in the invoice.\")\n",
97
+ " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
98
+ " if by_match:\n",
99
+ " invoice_details['Sold By'] = by_match.group(1).strip()\n",
100
+ " else:\n",
101
+ " print(\"No seller found in the invoice.\")\n",
102
+ " \n",
103
+ " return invoice_details"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "for invoice in invoices:\n",
113
+ " # print(invoice)\n",
114
+ " details = extract_invoice_details(invoice)\n",
115
+ " save_as_csv(details)"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "df = pd.read_csv('invoice.csv')\n",
125
+ "df.head(10)"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 8,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "import PyPDF2, os, re\n",
135
+ "import pandas as pd\n",
136
+ "\n",
137
+ "class InvoiceConvertor:\n",
138
+ " \"\"\"\n",
139
+ " This class is hardcoded to read all pdf files that start with 'invoice' in the given user given path and convert them to a csv file.\n",
140
+ " \n",
141
+ " Usage:\n",
142
+ " convertor = InvoiceConvertor()\n",
143
+ " convertor.read_pdfs('path_to_pdfs')\n",
144
+ " result_df = convertor.convert()\n",
145
+ "\n",
146
+ " \"\"\"\n",
147
+ " def __init__(self):\n",
148
+ " self.invoices = []\n",
149
+ " \n",
150
+ " def read_pdfs(self,path):\n",
151
+ " for file in os.listdir(path):\n",
152
+ " if file.startswith('invoice'):\n",
153
+ " pdf_file = open(path + file, 'rb')\n",
154
+ " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
155
+ " text = ''\n",
156
+ " for page_num in range(len(pdf_reader.pages)):\n",
157
+ " page = pdf_reader.pages[page_num]\n",
158
+ " text += page.extract_text()\n",
159
+ " pdf_file.close()\n",
160
+ " self.invoices.append(text)\n",
161
+ " return self.invoices\n",
162
+ " \n",
163
+ " def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
164
+ " # if the csv already exists then concat a new one to it, else create a new one\n",
165
+ " if os.path.exists(save_as):\n",
166
+ " df = pd.read_csv(save_as)\n",
167
+ " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
168
+ " else: \n",
169
+ " df = pd.DataFrame(details, index=[0])\n",
170
+ " df.to_csv(save_as, index=False)\n",
171
+ " \n",
172
+ " def extract_invoice_details(self, text):\n",
173
+ " invoice_details = {}\n",
174
+ " try:\n",
175
+ " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
176
+ " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
177
+ " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
178
+ " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
179
+ " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
180
+ " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
181
+ " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
182
+ " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
183
+ " except:\n",
184
+ " print('Order Number not found')\n",
185
+ "\n",
186
+ " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
187
+ " if item_match:\n",
188
+ " item_info = item_match.group(1)\n",
189
+ " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
190
+ " invoice_details['Item'] = item_name\n",
191
+ " # print(item_name)\n",
192
+ " else:\n",
193
+ " print(\"No item found in the invoice.\")\n",
194
+ " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
195
+ " if total_mount_match:\n",
196
+ " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
197
+ " invoice_details['Total Amount'] = total_mount\n",
198
+ " else:\n",
199
+ " print(\"No total amount found in the invoice.\")\n",
200
+ " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
201
+ " if gstin_match:\n",
202
+ " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
203
+ " else:\n",
204
+ " print(\"No GSTIN found in the invoice.\")\n",
205
+ " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
206
+ " if by_match:\n",
207
+ " invoice_details['Sold By'] = by_match.group(1).strip()\n",
208
+ " else:\n",
209
+ " print(\"No seller found in the invoice.\")\n",
210
+ " return invoice_details\n",
211
+ " \n",
212
+ " def convert(self):\n",
213
+ " for invoice in self.invoices:\n",
214
+ " details = self.extract_invoice_details(invoice)\n",
215
+ " self.save_as_csv(details)\n",
216
+ " return pd.read_csv('invoice.csv')"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 9,
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "name": "stdout",
226
+ "output_type": "stream",
227
+ "text": [
228
+ "Order Number not found\n"
229
+ ]
230
+ },
231
+ {
232
+ "data": {
233
+ "text/html": [
234
+ "<div>\n",
235
+ "<style scoped>\n",
236
+ " .dataframe tbody tr th:only-of-type {\n",
237
+ " vertical-align: middle;\n",
238
+ " }\n",
239
+ "\n",
240
+ " .dataframe tbody tr th {\n",
241
+ " vertical-align: top;\n",
242
+ " }\n",
243
+ "\n",
244
+ " .dataframe thead th {\n",
245
+ " text-align: right;\n",
246
+ " }\n",
247
+ "</style>\n",
248
+ "<table border=\"1\" class=\"dataframe\">\n",
249
+ " <thead>\n",
250
+ " <tr style=\"text-align: right;\">\n",
251
+ " <th></th>\n",
252
+ " <th>Order Number</th>\n",
253
+ " <th>Invoice Number</th>\n",
254
+ " <th>Order Date</th>\n",
255
+ " <th>Invoice Details</th>\n",
256
+ " <th>Invoice Date</th>\n",
257
+ " <th>Billing Address</th>\n",
258
+ " <th>Shipping Address</th>\n",
259
+ " <th>PAN</th>\n",
260
+ " <th>Item</th>\n",
261
+ " <th>Total Amount</th>\n",
262
+ " <th>GSTIN</th>\n",
263
+ " <th>Sold By</th>\n",
264
+ " </tr>\n",
265
+ " </thead>\n",
266
+ " <tbody>\n",
267
+ " <tr>\n",
268
+ " <th>0</th>\n",
269
+ " <td>402-7035529-3886722</td>\n",
270
+ " <td>NAG1-192347</td>\n",
271
+ " <td>17.08.2023</td>\n",
272
+ " <td>MH-NAG1-1034-2324</td>\n",
273
+ " <td>17.08.2023</td>\n",
274
+ " <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
275
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
276
+ " <td>AALCA0171E</td>\n",
277
+ " <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
278
+ " <td>458.0</td>\n",
279
+ " <td>27AALCA0171E1ZZ</td>\n",
280
+ " <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
281
+ " </tr>\n",
282
+ " <tr>\n",
283
+ " <th>1</th>\n",
284
+ " <td>402-7035529-3886722</td>\n",
285
+ " <td>BOM5-1379800</td>\n",
286
+ " <td>17.08.2023</td>\n",
287
+ " <td>MH-BOM5-1034-2324</td>\n",
288
+ " <td>17.08.2023</td>\n",
289
+ " <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
290
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
291
+ " <td>AALCA0171E</td>\n",
292
+ " <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
293
+ " <td>13,099.00</td>\n",
294
+ " <td>27AALCA0171E1ZZ</td>\n",
295
+ " <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
296
+ " </tr>\n",
297
+ " <tr>\n",
298
+ " <th>2</th>\n",
299
+ " <td>405-4419941-9848328</td>\n",
300
+ " <td>DEX3-4683</td>\n",
301
+ " <td>23.07.2023</td>\n",
302
+ " <td>DL-DEX3-157533501-2324</td>\n",
303
+ " <td>23.07.2023</td>\n",
304
+ " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
305
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
306
+ " <td>ABEPW6057C</td>\n",
307
+ " <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
308
+ " <td>474.00</td>\n",
309
+ " <td>07ABEPW6057C1ZK</td>\n",
310
+ " <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th>3</th>\n",
314
+ " <td>405-4419941-9848328</td>\n",
315
+ " <td>HYD8-29019</td>\n",
316
+ " <td>23.07.2023</td>\n",
317
+ " <td>TG-HYD8-817549015-2324</td>\n",
318
+ " <td>23.07.2023</td>\n",
319
+ " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
320
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
321
+ " <td>AACCN8253B</td>\n",
322
+ " <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
323
+ " <td>399.00</td>\n",
324
+ " <td>36AACCN8253B1ZN</td>\n",
325
+ " <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
326
+ " </tr>\n",
327
+ " <tr>\n",
328
+ " <th>4</th>\n",
329
+ " <td>405-0015964-5687515</td>\n",
330
+ " <td>IN-5040</td>\n",
331
+ " <td>23.07.2023</td>\n",
332
+ " <td>DL-1922955505-2324</td>\n",
333
+ " <td>23.07.2023</td>\n",
334
+ " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
335
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
336
+ " <td>JISPS4412R</td>\n",
337
+ " <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
338
+ " <td>149.00</td>\n",
339
+ " <td>07JISPS4412R1Z4</td>\n",
340
+ " <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
341
+ " </tr>\n",
342
+ " <tr>\n",
343
+ " <th>5</th>\n",
344
+ " <td>408-4974466-7793143</td>\n",
345
+ " <td>JPX2-223775</td>\n",
346
+ " <td>02.01.2024</td>\n",
347
+ " <td>RJ-JPX2-1317922175-2324</td>\n",
348
+ " <td>02.01.2024</td>\n",
349
+ " <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
350
+ " <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
351
+ " <td>AADCV4254H</td>\n",
352
+ " <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
353
+ " <td>569.00</td>\n",
354
+ " <td>08AADCV4254H1Z8</td>\n",
355
+ " <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
356
+ " </tr>\n",
357
+ " <tr>\n",
358
+ " <th>6</th>\n",
359
+ " <td>NaN</td>\n",
360
+ " <td>NaN</td>\n",
361
+ " <td>NaN</td>\n",
362
+ " <td>NaN</td>\n",
363
+ " <td>NaN</td>\n",
364
+ " <td>NaN</td>\n",
365
+ " <td>NaN</td>\n",
366
+ " <td>NaN</td>\n",
367
+ " <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
368
+ " <td>6,320.00</td>\n",
369
+ " <td>36AARCA3925C1ZQBilling</td>\n",
370
+ " <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
371
+ " </tr>\n",
372
+ " </tbody>\n",
373
+ "</table>\n",
374
+ "</div>"
375
+ ],
376
+ "text/plain": [
377
+ " Order Number Invoice Number Order Date Invoice Details \\\n",
378
+ "0 402-7035529-3886722 NAG1-192347 17.08.2023 MH-NAG1-1034-2324 \n",
379
+ "1 402-7035529-3886722 BOM5-1379800 17.08.2023 MH-BOM5-1034-2324 \n",
380
+ "2 405-4419941-9848328 DEX3-4683 23.07.2023 DL-DEX3-157533501-2324 \n",
381
+ "3 405-4419941-9848328 HYD8-29019 23.07.2023 TG-HYD8-817549015-2324 \n",
382
+ "4 405-0015964-5687515 IN-5040 23.07.2023 DL-1922955505-2324 \n",
383
+ "5 408-4974466-7793143 JPX2-223775 02.01.2024 RJ-JPX2-1317922175-2324 \n",
384
+ "6 NaN NaN NaN NaN \n",
385
+ "\n",
386
+ " Invoice Date Billing Address \\\n",
387
+ "0 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
388
+ "1 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
389
+ "2 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
390
+ "3 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
391
+ "4 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
392
+ "5 02.01.2024 Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA... \n",
393
+ "6 NaN NaN \n",
394
+ "\n",
395
+ " Shipping Address PAN \\\n",
396
+ "0 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
397
+ "1 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
398
+ "2 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... ABEPW6057C \n",
399
+ "3 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... AACCN8253B \n",
400
+ "4 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... JISPS4412R \n",
401
+ "5 Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE... AADCV4254H \n",
402
+ "6 NaN NaN \n",
403
+ "\n",
404
+ " Item Total Amount \\\n",
405
+ "0 Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph... 458.0 \n",
406
+ "1 LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc... 13,099.00 \n",
407
+ "2 Amozo Easy Fit Tempered Glass Screen Protector... 474.00 \n",
408
+ "3 ESR for iPhone 13/14 Cover, Shockproof Drop Pr... 399.00 \n",
409
+ "4 imluckies Camera Lens Protector Compatible wit... 149.00 \n",
410
+ "5 Amazon Basics Sleek Rechargeable LED Table Lam... 569.00 \n",
411
+ "6 Saregama Carvaan Telugu - Portable Music Playe... 6,320.00 \n",
412
+ "\n",
413
+ " GSTIN Sold By \n",
414
+ "0 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*TCI Supply Chain... \n",
415
+ "1 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*Renaissance indu... \n",
416
+ "2 07ABEPW6057C1ZK RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co... \n",
417
+ "3 36AACCN8253B1ZN TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp... \n",
418
+ "4 07JISPS4412R1Z4 M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA... \n",
419
+ "5 08AADCV4254H1Z8 ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ... \n",
420
+ "6 36AARCA3925C1ZQBilling AATS Connect Private Limited \\n* GMR Airport C... "
421
+ ]
422
+ },
423
+ "execution_count": 9,
424
+ "metadata": {},
425
+ "output_type": "execute_result"
426
+ }
427
+ ],
428
+ "source": [
429
+ "invoice_convertor = InvoiceConvertor()\n",
430
+ "invoice_convertor.read_pdfs('invoices/')\n",
431
+ "res = invoice_convertor.convert()\n",
432
+ "res.head(10)"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "metadata": {},
439
+ "outputs": [],
440
+ "source": []
441
+ }
442
+ ],
443
+ "metadata": {
444
+ "kernelspec": {
445
+ "display_name": "resparser",
446
+ "language": "python",
447
+ "name": "python3"
448
+ },
449
+ "language_info": {
450
+ "codemirror_mode": {
451
+ "name": "ipython",
452
+ "version": 3
453
+ },
454
+ "file_extension": ".py",
455
+ "mimetype": "text/x-python",
456
+ "name": "python",
457
+ "nbconvert_exporter": "python",
458
+ "pygments_lexer": "ipython3",
459
+ "version": "3.9.16"
460
+ }
461
+ },
462
+ "nbformat": 4,
463
+ "nbformat_minor": 2
464
+ }
invoice_convertor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2, os, re
2
+ import pandas as pd
3
+
4
class InvoiceConvertor():
    """
    Read invoice PDFs from a directory and collect their details in a CSV file.

    Only files whose names start with 'invoice' are read from the path given
    to ``read_pdfs``.  ``convert`` appends one row per read invoice to
    'invoice.csv' in the working directory (rows already in that file are
    kept) and returns the whole file as a DataFrame.

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()

    """

    # Header fields that are each captured by group 1 of a single pattern,
    # listed in the column order the CSV uses.
    _FIELD_PATTERNS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    # A GSTIN is a fixed 15-character code.  Matching its structure keeps the
    # capture from swallowing text that PDF extraction glues onto it
    # (e.g. '36AARCA3925C1ZQBilling').
    _GSTIN_PATTERN = r'GST Registration No: (\d{2}[A-Z]{5}\d{4}[A-Z][0-9A-Z]Z[0-9A-Z])'
    # Looser pattern kept as a fallback for values that do not look like a
    # standard GSTIN: everything up to the next space.
    _GSTIN_FALLBACK_PATTERN = r'GST Registration No: ([\s\S]+?) '

    def __init__(self):
        # Extracted text of every PDF read so far, one string per file.
        self.invoices = []

    def read_pdfs(self, path):
        """Extract the text of every 'invoice*' file in ``path``.

        Args:
            path: Directory containing the PDFs; a trailing separator is
                optional.

        Returns:
            ``self.invoices``, the list of extracted texts accumulated so far.
        """
        # Sorted so the row order does not depend on the file system.
        for file in sorted(os.listdir(path)):
            if not file.startswith('invoice'):
                continue
            # 'with' closes the handle even when the PDF cannot be parsed.
            with open(os.path.join(path, file), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''.join((page.extract_text() or '') for page in pdf_reader.pages)
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as = "invoice.csv"):
        """Append one row of invoice details to a CSV file, creating it if needed.

        Args:
            details: Mapping of column name to value for a single invoice.
            save_as: Path of the CSV file to create or extend.
        """
        if not details:
            # A row without any column would produce an empty file that
            # pandas cannot read back.
            return
        row = pd.DataFrame(details, index=[0])
        df = row
        if os.path.exists(save_as):
            try:
                df = pd.concat([pd.read_csv(save_as), row], ignore_index=True)
            except pd.errors.EmptyDataError:
                # Existing file has no header/rows: start over with this row.
                df = row
        df.to_csv(save_as, index=False)

    def extract_invoice_details(self, text):
        """Pull the known invoice fields out of the extracted PDF text.

        Every field is looked up independently; a field that cannot be found
        is reported on stdout and left out of the result instead of aborting
        the remaining lookups.

        Args:
            text: Full text of one invoice.

        Returns:
            Dict mapping field name to the extracted string.
        """
        invoice_details = {}
        for field, pattern in self._FIELD_PATTERNS:
            field_match = re.search(pattern, text)
            if field_match:
                invoice_details[field] = field_match.group(1).strip()
            else:
                print(f'{field} not found')

        # The item name sits between the 'Amount' column header followed by
        # the row number '1' and the first rupee sign, before 'TOTAL:'.
        item_name_match = None
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
        if item_match:
            item_name_match = re.search(r'\nAmount\n1([\s\S]+?)₹', item_match.group(1))
        if item_name_match:
            invoice_details['Item'] = item_name_match.group(1).strip()
        else:
            print("No item found in the invoice.")

        # The total is the second rupee amount after 'TOTAL:'.
        amounts = []
        total_mount_match = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
        if total_mount_match:
            amounts = total_mount_match.group(1).split('₹')
        if len(amounts) > 2:
            invoice_details['Total Amount'] = amounts[2].split('\n')[0]
        else:
            print("No total amount found in the invoice.")

        gstin_match = (re.search(self._GSTIN_PATTERN, text)
                       or re.search(self._GSTIN_FALLBACK_PATTERN, text))
        if gstin_match:
            invoice_details['GSTIN'] = gstin_match.group(1).strip()
        else:
            print("No GSTIN found in the invoice.")

        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
        if by_match:
            invoice_details['Sold By'] = by_match.group(1).strip()
        else:
            print("No seller found in the invoice.")
        return invoice_details

    def convert(self):
        """Write every read invoice to 'invoice.csv' and return the file's contents.

        Returns:
            DataFrame with all rows of 'invoice.csv', or an empty DataFrame
            when the file does not exist (nothing was written).
        """
        for invoice in self.invoices:
            details = self.extract_invoice_details(invoice)
            self.save_as_csv(details)
        if not os.path.exists('invoice.csv'):
            return pd.DataFrame()
        return pd.read_csv('invoice.csv')
84
+
invoices/invoice1.pdf ADDED
Binary file (48.3 kB). View file
 
invoices/invoice2.pdf ADDED
Binary file (48.4 kB). View file
 
invoices/invoice3.pdf ADDED
Binary file (54.2 kB). View file
 
invoices/invoice4.pdf ADDED
Binary file (103 kB). View file
 
invoices/invoice5.pdf ADDED
Binary file (48 kB). View file
 
invoices/invoice7.pdf ADDED
Binary file (50.2 kB). View file
 
invoices/invoice8.pdf ADDED
Binary file (43.9 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit==1.32.2
2
+ pyPDF2==3.0.1
3
+ pandas==1.3.5
4
+ regex==2023.12.25