Pratik Dwivedi committed on
Commit
837a786
·
2 Parent(s): 5abbd23 d24f176
app.py DELETED
@@ -1,24 +0,0 @@
1
- import streamlit as st
2
- import os
3
- from invoice_convertor import InvoiceConvertor
4
def main():
    """Run the Streamlit page that turns uploaded invoice PDFs into a CSV.

    Uploaded files are written to the ``data/`` working directory, parsed by
    ``InvoiceConvertor``, shown as a table with a download button, and the
    working directory is emptied again afterwards. A separate button removes
    the accumulated ``invoice.csv``.
    """
    upload_dir = 'data/'

    st.set_page_config(layout="wide")
    st.title('Amazon Invoice Convertor')
    st.write('This app converts your Amazon invoice pdfs to a csv file.')
    convertor = InvoiceConvertor()
    files = st.file_uploader('Upload your invoice pdfs', type=['pdf'], accept_multiple_files=True)
    if files:
        # The working directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(upload_dir, exist_ok=True)
        try:
            for file in files:
                # Keep only the final path component so an upload name
                # cannot point outside the working directory.
                safe_name = os.path.basename(file.name)
                with open(os.path.join(upload_dir, safe_name), 'wb') as f:
                    f.write(file.getbuffer())
            convertor.read_pdfs(upload_dir)
            result_df = convertor.convert()
            st.write(result_df)
            st.download_button('Download csv', data=result_df.to_csv(), file_name='invoice.csv', mime='text/csv')
        finally:
            # Empty the working directory even when conversion fails, so
            # stale PDFs are not picked up by the next run.
            for name in os.listdir(upload_dir):
                os.remove(os.path.join(upload_dir, name))
    if st.button('Clear csv file') and os.path.exists('invoice.csv'):
        os.remove('invoice.csv')


if __name__ == '__main__':
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extractor.ipynb DELETED
@@ -1,464 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import PyPDF2, os\n",
10
- "import pandas as pd"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": null,
16
- "metadata": {},
17
- "outputs": [],
18
- "source": [
19
- "def read_pdf(path):\n",
20
- " pdf_file = open(path, 'rb')\n",
21
- " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
22
- " text = ''\n",
23
- " for page_num in range(len(pdf_reader.pages)):\n",
24
- " page = pdf_reader.pages[page_num]\n",
25
- " text += page.extract_text()\n",
26
- " pdf_file.close()\n",
27
- " return text\n",
28
- "\n",
29
- "invoices = []\n",
30
- "path = 'invoices/'\n",
31
- "\n",
32
- "for file in os.listdir(path):\n",
33
- " if file.startswith('invoice'):\n",
34
- " text = read_pdf(path + file)\n",
35
- " print(text)\n",
36
- " invoices.append(text)"
37
- ]
38
- },
39
- {
40
- "cell_type": "code",
41
- "execution_count": null,
42
- "metadata": {},
43
- "outputs": [],
44
- "source": [
45
- "import os\n",
46
- "def save_as_csv(details, save_as = \"invoice.csv\"):\n",
47
- " # if the csv already exists then concat a new one to it, else create a new one\n",
48
- " if os.path.exists(save_as):\n",
49
- " df = pd.read_csv(save_as)\n",
50
- " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
51
- " else: \n",
52
- " df = pd.DataFrame(details, index=[0])\n",
53
- " df.to_csv(save_as, index=False)"
54
- ]
55
- },
56
- {
57
- "cell_type": "code",
58
- "execution_count": null,
59
- "metadata": {},
60
- "outputs": [],
61
- "source": [
62
- "import re\n",
63
- "\n",
64
- "def extract_invoice_details(text):\n",
65
- " invoice_details = {}\n",
66
- " try:\n",
67
- " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
68
- " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
69
- " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
70
- " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
71
- " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
72
- " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
73
- " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
74
- " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
75
- " except:\n",
76
- " print('Order Number not found')\n",
77
- " \n",
78
- " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
79
- " if item_match:\n",
80
- " item_info = item_match.group(1)\n",
81
- " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
82
- " invoice_details['Item'] = item_name\n",
83
- " print(item_name)\n",
84
- " else:\n",
85
- " print(\"No item found in the invoice.\")\n",
86
- " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
87
- " if total_mount_match:\n",
88
- " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
89
- " invoice_details['Total Amount'] = total_mount\n",
90
- " else:\n",
91
- " print(\"No total amount found in the invoice.\")\n",
92
- " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
93
- " if gstin_match:\n",
94
- " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
95
- " else:\n",
96
- " print(\"No GSTIN found in the invoice.\")\n",
97
- " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
98
- " if by_match:\n",
99
- " invoice_details['Sold By'] = by_match.group(1).strip()\n",
100
- " else:\n",
101
- " print(\"No seller found in the invoice.\")\n",
102
- " \n",
103
- " return invoice_details"
104
- ]
105
- },
106
- {
107
- "cell_type": "code",
108
- "execution_count": null,
109
- "metadata": {},
110
- "outputs": [],
111
- "source": [
112
- "for invoice in invoices:\n",
113
- " # print(invoice)\n",
114
- " details = extract_invoice_details(invoice)\n",
115
- " save_as_csv(details)"
116
- ]
117
- },
118
- {
119
- "cell_type": "code",
120
- "execution_count": null,
121
- "metadata": {},
122
- "outputs": [],
123
- "source": [
124
- "df = pd.read_csv('invoice.csv')\n",
125
- "df.head(10)"
126
- ]
127
- },
128
- {
129
- "cell_type": "code",
130
- "execution_count": 8,
131
- "metadata": {},
132
- "outputs": [],
133
- "source": [
134
- "import PyPDF2, os, re\n",
135
- "import pandas as pd\n",
136
- "\n",
137
- "class InvoiceConvertor:\n",
138
- " \"\"\"\n",
139
- " This class is hardcoded to read all pdf files that start with 'invoice' in the given user given path and convert them to a csv file.\n",
140
- " \n",
141
- " Usage:\n",
142
- " convertor = InvoiceConvertor()\n",
143
- " convertor.read_pdfs('path_to_pdfs')\n",
144
- " result_df = convertor.convert()\n",
145
- "\n",
146
- " \"\"\"\n",
147
- " def __init__(self):\n",
148
- " self.invoices = []\n",
149
- " \n",
150
- " def read_pdfs(self,path):\n",
151
- " for file in os.listdir(path):\n",
152
- " if file.startswith('invoice'):\n",
153
- " pdf_file = open(path + file, 'rb')\n",
154
- " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
155
- " text = ''\n",
156
- " for page_num in range(len(pdf_reader.pages)):\n",
157
- " page = pdf_reader.pages[page_num]\n",
158
- " text += page.extract_text()\n",
159
- " pdf_file.close()\n",
160
- " self.invoices.append(text)\n",
161
- " return self.invoices\n",
162
- " \n",
163
- " def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
164
- " # if the csv already exists then concat a new one to it, else create a new one\n",
165
- " if os.path.exists(save_as):\n",
166
- " df = pd.read_csv(save_as)\n",
167
- " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
168
- " else: \n",
169
- " df = pd.DataFrame(details, index=[0])\n",
170
- " df.to_csv(save_as, index=False)\n",
171
- " \n",
172
- " def extract_invoice_details(self, text):\n",
173
- " invoice_details = {}\n",
174
- " try:\n",
175
- " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
176
- " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
177
- " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
178
- " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
179
- " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
180
- " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
181
- " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
182
- " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
183
- " except:\n",
184
- " print('Order Number not found')\n",
185
- "\n",
186
- " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
187
- " if item_match:\n",
188
- " item_info = item_match.group(1)\n",
189
- " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
190
- " invoice_details['Item'] = item_name\n",
191
- " # print(item_name)\n",
192
- " else:\n",
193
- " print(\"No item found in the invoice.\")\n",
194
- " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
195
- " if total_mount_match:\n",
196
- " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
197
- " invoice_details['Total Amount'] = total_mount\n",
198
- " else:\n",
199
- " print(\"No total amount found in the invoice.\")\n",
200
- " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
201
- " if gstin_match:\n",
202
- " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
203
- " else:\n",
204
- " print(\"No GSTIN found in the invoice.\")\n",
205
- " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
206
- " if by_match:\n",
207
- " invoice_details['Sold By'] = by_match.group(1).strip()\n",
208
- " else:\n",
209
- " print(\"No seller found in the invoice.\")\n",
210
- " return invoice_details\n",
211
- " \n",
212
- " def convert(self):\n",
213
- " for invoice in self.invoices:\n",
214
- " details = self.extract_invoice_details(invoice)\n",
215
- " self.save_as_csv(details)\n",
216
- " return pd.read_csv('invoice.csv')"
217
- ]
218
- },
219
- {
220
- "cell_type": "code",
221
- "execution_count": 9,
222
- "metadata": {},
223
- "outputs": [
224
- {
225
- "name": "stdout",
226
- "output_type": "stream",
227
- "text": [
228
- "Order Number not found\n"
229
- ]
230
- },
231
- {
232
- "data": {
233
- "text/html": [
234
- "<div>\n",
235
- "<style scoped>\n",
236
- " .dataframe tbody tr th:only-of-type {\n",
237
- " vertical-align: middle;\n",
238
- " }\n",
239
- "\n",
240
- " .dataframe tbody tr th {\n",
241
- " vertical-align: top;\n",
242
- " }\n",
243
- "\n",
244
- " .dataframe thead th {\n",
245
- " text-align: right;\n",
246
- " }\n",
247
- "</style>\n",
248
- "<table border=\"1\" class=\"dataframe\">\n",
249
- " <thead>\n",
250
- " <tr style=\"text-align: right;\">\n",
251
- " <th></th>\n",
252
- " <th>Order Number</th>\n",
253
- " <th>Invoice Number</th>\n",
254
- " <th>Order Date</th>\n",
255
- " <th>Invoice Details</th>\n",
256
- " <th>Invoice Date</th>\n",
257
- " <th>Billing Address</th>\n",
258
- " <th>Shipping Address</th>\n",
259
- " <th>PAN</th>\n",
260
- " <th>Item</th>\n",
261
- " <th>Total Amount</th>\n",
262
- " <th>GSTIN</th>\n",
263
- " <th>Sold By</th>\n",
264
- " </tr>\n",
265
- " </thead>\n",
266
- " <tbody>\n",
267
- " <tr>\n",
268
- " <th>0</th>\n",
269
- " <td>402-7035529-3886722</td>\n",
270
- " <td>NAG1-192347</td>\n",
271
- " <td>17.08.2023</td>\n",
272
- " <td>MH-NAG1-1034-2324</td>\n",
273
- " <td>17.08.2023</td>\n",
274
- " <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
275
- " <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
276
- " <td>AALCA0171E</td>\n",
277
- " <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
278
- " <td>458.0</td>\n",
279
- " <td>27AALCA0171E1ZZ</td>\n",
280
- " <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
281
- " </tr>\n",
282
- " <tr>\n",
283
- " <th>1</th>\n",
284
- " <td>402-7035529-3886722</td>\n",
285
- " <td>BOM5-1379800</td>\n",
286
- " <td>17.08.2023</td>\n",
287
- " <td>MH-BOM5-1034-2324</td>\n",
288
- " <td>17.08.2023</td>\n",
289
- " <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
290
- " <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
291
- " <td>AALCA0171E</td>\n",
292
- " <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
293
- " <td>13,099.00</td>\n",
294
- " <td>27AALCA0171E1ZZ</td>\n",
295
- " <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
296
- " </tr>\n",
297
- " <tr>\n",
298
- " <th>2</th>\n",
299
- " <td>405-4419941-9848328</td>\n",
300
- " <td>DEX3-4683</td>\n",
301
- " <td>23.07.2023</td>\n",
302
- " <td>DL-DEX3-157533501-2324</td>\n",
303
- " <td>23.07.2023</td>\n",
304
- " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
305
- " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
306
- " <td>ABEPW6057C</td>\n",
307
- " <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
308
- " <td>474.00</td>\n",
309
- " <td>07ABEPW6057C1ZK</td>\n",
310
- " <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
311
- " </tr>\n",
312
- " <tr>\n",
313
- " <th>3</th>\n",
314
- " <td>405-4419941-9848328</td>\n",
315
- " <td>HYD8-29019</td>\n",
316
- " <td>23.07.2023</td>\n",
317
- " <td>TG-HYD8-817549015-2324</td>\n",
318
- " <td>23.07.2023</td>\n",
319
- " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
320
- " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
321
- " <td>AACCN8253B</td>\n",
322
- " <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
323
- " <td>399.00</td>\n",
324
- " <td>36AACCN8253B1ZN</td>\n",
325
- " <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
326
- " </tr>\n",
327
- " <tr>\n",
328
- " <th>4</th>\n",
329
- " <td>405-0015964-5687515</td>\n",
330
- " <td>IN-5040</td>\n",
331
- " <td>23.07.2023</td>\n",
332
- " <td>DL-1922955505-2324</td>\n",
333
- " <td>23.07.2023</td>\n",
334
- " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
335
- " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
336
- " <td>JISPS4412R</td>\n",
337
- " <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
338
- " <td>149.00</td>\n",
339
- " <td>07JISPS4412R1Z4</td>\n",
340
- " <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
341
- " </tr>\n",
342
- " <tr>\n",
343
- " <th>5</th>\n",
344
- " <td>408-4974466-7793143</td>\n",
345
- " <td>JPX2-223775</td>\n",
346
- " <td>02.01.2024</td>\n",
347
- " <td>RJ-JPX2-1317922175-2324</td>\n",
348
- " <td>02.01.2024</td>\n",
349
- " <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
350
- " <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
351
- " <td>AADCV4254H</td>\n",
352
- " <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
353
- " <td>569.00</td>\n",
354
- " <td>08AADCV4254H1Z8</td>\n",
355
- " <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
356
- " </tr>\n",
357
- " <tr>\n",
358
- " <th>6</th>\n",
359
- " <td>NaN</td>\n",
360
- " <td>NaN</td>\n",
361
- " <td>NaN</td>\n",
362
- " <td>NaN</td>\n",
363
- " <td>NaN</td>\n",
364
- " <td>NaN</td>\n",
365
- " <td>NaN</td>\n",
366
- " <td>NaN</td>\n",
367
- " <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
368
- " <td>6,320.00</td>\n",
369
- " <td>36AARCA3925C1ZQBilling</td>\n",
370
- " <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
371
- " </tr>\n",
372
- " </tbody>\n",
373
- "</table>\n",
374
- "</div>"
375
- ],
376
- "text/plain": [
377
- " Order Number Invoice Number Order Date Invoice Details \\\n",
378
- "0 402-7035529-3886722 NAG1-192347 17.08.2023 MH-NAG1-1034-2324 \n",
379
- "1 402-7035529-3886722 BOM5-1379800 17.08.2023 MH-BOM5-1034-2324 \n",
380
- "2 405-4419941-9848328 DEX3-4683 23.07.2023 DL-DEX3-157533501-2324 \n",
381
- "3 405-4419941-9848328 HYD8-29019 23.07.2023 TG-HYD8-817549015-2324 \n",
382
- "4 405-0015964-5687515 IN-5040 23.07.2023 DL-1922955505-2324 \n",
383
- "5 408-4974466-7793143 JPX2-223775 02.01.2024 RJ-JPX2-1317922175-2324 \n",
384
- "6 NaN NaN NaN NaN \n",
385
- "\n",
386
- " Invoice Date Billing Address \\\n",
387
- "0 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
388
- "1 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
389
- "2 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
390
- "3 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
391
- "4 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
392
- "5 02.01.2024 Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA... \n",
393
- "6 NaN NaN \n",
394
- "\n",
395
- " Shipping Address PAN \\\n",
396
- "0 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
397
- "1 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
398
- "2 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... ABEPW6057C \n",
399
- "3 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... AACCN8253B \n",
400
- "4 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... JISPS4412R \n",
401
- "5 Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE... AADCV4254H \n",
402
- "6 NaN NaN \n",
403
- "\n",
404
- " Item Total Amount \\\n",
405
- "0 Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph... 458.0 \n",
406
- "1 LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc... 13,099.00 \n",
407
- "2 Amozo Easy Fit Tempered Glass Screen Protector... 474.00 \n",
408
- "3 ESR for iPhone 13/14 Cover, Shockproof Drop Pr... 399.00 \n",
409
- "4 imluckies Camera Lens Protector Compatible wit... 149.00 \n",
410
- "5 Amazon Basics Sleek Rechargeable LED Table Lam... 569.00 \n",
411
- "6 Saregama Carvaan Telugu - Portable Music Playe... 6,320.00 \n",
412
- "\n",
413
- " GSTIN Sold By \n",
414
- "0 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*TCI Supply Chain... \n",
415
- "1 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*Renaissance indu... \n",
416
- "2 07ABEPW6057C1ZK RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co... \n",
417
- "3 36AACCN8253B1ZN TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp... \n",
418
- "4 07JISPS4412R1Z4 M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA... \n",
419
- "5 08AADCV4254H1Z8 ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ... \n",
420
- "6 36AARCA3925C1ZQBilling AATS Connect Private Limited \\n* GMR Airport C... "
421
- ]
422
- },
423
- "execution_count": 9,
424
- "metadata": {},
425
- "output_type": "execute_result"
426
- }
427
- ],
428
- "source": [
429
- "invoice_convertor = InvoiceConvertor()\n",
430
- "invoice_convertor.read_pdfs('invoices/')\n",
431
- "res = invoice_convertor.convert()\n",
432
- "res.head(10)"
433
- ]
434
- },
435
- {
436
- "cell_type": "code",
437
- "execution_count": null,
438
- "metadata": {},
439
- "outputs": [],
440
- "source": []
441
- }
442
- ],
443
- "metadata": {
444
- "kernelspec": {
445
- "display_name": "resparser",
446
- "language": "python",
447
- "name": "python3"
448
- },
449
- "language_info": {
450
- "codemirror_mode": {
451
- "name": "ipython",
452
- "version": 3
453
- },
454
- "file_extension": ".py",
455
- "mimetype": "text/x-python",
456
- "name": "python",
457
- "nbconvert_exporter": "python",
458
- "pygments_lexer": "ipython3",
459
- "version": "3.9.16"
460
- }
461
- },
462
- "nbformat": 4,
463
- "nbformat_minor": 2
464
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
invoice_convertor.py DELETED
@@ -1,84 +0,0 @@
1
- import PyPDF2, os, re
2
- import pandas as pd
3
-
4
class InvoiceConvertor():
    """Convert Amazon invoice PDFs into rows of a CSV file.

    Only files whose names start with 'invoice' are read from the given
    directory. Extracted rows are appended to 'invoice.csv' in the current
    working directory.

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()
    """

    # (column name, pattern) pairs for fields captured by one regex group.
    _FIELD_PATTERNS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    def __init__(self):
        # Extracted text of every PDF read so far, one string per file.
        self.invoices = []

    def read_pdfs(self, path):
        """Read the text of every 'invoice*' file in *path*.

        The text of each file is appended to ``self.invoices``.

        Returns:
            The accumulated list ``self.invoices``.
        """
        # Sorted so the resulting row order does not depend on the filesystem.
        for file in sorted(os.listdir(path)):
            if not file.startswith('invoice'):
                continue
            # os.path.join works whether or not *path* ends with a separator;
            # the context manager closes the handle even if parsing raises.
            with open(os.path.join(path, file), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as="invoice.csv"):
        """Append *details* (a dict of column -> value) as one row of *save_as*.

        The file is created when it does not exist yet. An empty *details*
        dict is skipped so that no column-less CSV is written.
        """
        if not details:
            return
        df = pd.DataFrame([details])
        if os.path.exists(save_as) and os.path.getsize(save_as) > 0:
            # dtype=str keeps values such as '458.00' exactly as extracted
            # instead of letting pandas re-infer them as floats.
            existing = pd.read_csv(save_as, dtype=str)
            df = pd.concat([existing, df], ignore_index=True)
        df.to_csv(save_as, index=False)

    def extract_invoice_details(self, text):
        """Extract the known invoice fields from *text*.

        Each field is looked up independently; a field that cannot be found
        is reported on stdout and left out of the result, without affecting
        the other fields.

        Returns:
            A dict mapping column names to the extracted strings.
        """
        invoice_details = {}
        for field, pattern in self._FIELD_PATTERNS:
            match = re.search(pattern, text)
            if match:
                invoice_details[field] = match.group(1).strip()
            else:
                print(f'{field} not found in the invoice.')

        # The item name sits between the "Amount" header and the first rupee
        # sign, inside the region that precedes "TOTAL:".
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
        name_match = None
        if item_match:
            name_match = re.search(r'\nAmount\n1([\s\S]+?)₹', item_match.group(1))
        if name_match:
            invoice_details['Item'] = name_match.group(1).strip()
        else:
            print("No item found in the invoice.")

        # The grand total is the second rupee amount after "TOTAL:".
        total_match = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
        amounts = total_match.group(1).split('₹') if total_match else []
        if len(amounts) > 2:
            invoice_details['Total Amount'] = amounts[2].split('\n')[0].strip()
        else:
            print("No total amount found in the invoice.")

        # Prefer exactly 15 upper-case alphanumerics so text glued onto the
        # number (e.g. "...1ZQBilling") is not captured; otherwise fall back
        # to "everything up to the next space".
        gstin_match = (
            re.search(r'GST Registration No:\s*([0-9A-Z]{15})', text)
            or re.search(r'GST Registration No: ([\s\S]+?) ', text)
        )
        if gstin_match:
            invoice_details['GSTIN'] = gstin_match.group(1).strip()
        else:
            print("No GSTIN found in the invoice.")

        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
        if by_match:
            invoice_details['Sold By'] = by_match.group(1).strip()
        else:
            print("No seller found in the invoice.")
        return invoice_details

    def convert(self):
        """Append every invoice in ``self.invoices`` to 'invoice.csv'.

        Returns:
            The full contents of 'invoice.csv' as a DataFrame of strings, or
            an empty DataFrame when the file does not exist or is empty.
        """
        for invoice in self.invoices:
            details = self.extract_invoice_details(invoice)
            self.save_as_csv(details)
        if not os.path.exists('invoice.csv') or os.path.getsize('invoice.csv') == 0:
            return pd.DataFrame()
        return pd.read_csv('invoice.csv', dtype=str)
84
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
invoices/invoice1.pdf DELETED
Binary file (48.3 kB)
 
invoices/invoice2.pdf DELETED
Binary file (48.4 kB)
 
invoices/invoice3.pdf DELETED
Binary file (54.2 kB)
 
invoices/invoice4.pdf DELETED
Binary file (103 kB)
 
invoices/invoice5.pdf DELETED
Binary file (48 kB)
 
invoices/invoice7.pdf DELETED
Binary file (50.2 kB)
 
invoices/invoice8.pdf DELETED
Binary file (43.9 kB)
 
requirements.txt DELETED
@@ -1,4 +0,0 @@
1
- streamlit==1.32.2
2
- pyPDF2==3.0.1
3
- pandas==1.3.5
4
- regex==2023.12.25