Pratik Dwivedi committed on
Commit
5abbd23
·
1 Parent(s): 0e5546c
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore DELETED
@@ -1,3 +0,0 @@
1
- .env
2
- .gitattributes
3
- secrets.toml
 
 
 
 
README.md DELETED
@@ -1,13 +0,0 @@
1
- ---
2
- title: Betterzila Assignment
3
- emoji: 📉
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.30.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from invoice_convertor import InvoiceConvertor
4
def main():
    """Streamlit entry point: upload invoice PDFs, convert them, offer a CSV.

    Each uploaded PDF is written to the ``data/`` staging directory, parsed
    with ``InvoiceConvertor``, shown as a table and offered for download.
    The staging directory is emptied afterwards, even if conversion fails.
    A separate button deletes the accumulated ``invoice.csv``.
    """
    st.set_page_config(layout="wide")
    st.title('Amazon Invoice Convertor')
    st.write('This app converts your Amazon invoice pdfs to a csv file.')
    convertor = InvoiceConvertor()
    files = st.file_uploader('Upload your invoice pdfs', type=['pdf'], accept_multiple_files=True)
    if files:
        upload_dir = 'data/'
        # The directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(upload_dir, exist_ok=True)
        try:
            for file in files:
                # The name comes from the client: keep only the final path
                # component so it cannot escape the staging directory.
                safe_name = os.path.basename(file.name)
                # read_pdfs() only picks up files starting with 'invoice';
                # prefix other uploads so they are not silently skipped.
                if not safe_name.startswith('invoice'):
                    safe_name = 'invoice_' + safe_name
                with open(os.path.join(upload_dir, safe_name), 'wb') as f:
                    f.write(file.getbuffer())
            convertor.read_pdfs(upload_dir)
            result_df = convertor.convert()
        finally:
            # Always remove staged uploads so a failed run does not leak
            # files into the next one.
            for name in os.listdir(upload_dir):
                os.remove(os.path.join(upload_dir, name))
        st.write(result_df)
        st.download_button('Download csv', data=result_df.to_csv(), file_name='invoice.csv', mime='text/csv')
    if st.button('Clear csv file') and os.path.exists('invoice.csv'):
        os.remove('invoice.csv')


if __name__ == '__main__':
    main()
application.py DELETED
@@ -1,60 +0,0 @@
1
- import streamlit as st
2
- from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
3
- from langchain.text_splitter import CharacterTextSplitter
4
- from langchain_community.embeddings import HuggingFaceInstructEmbeddings
5
- from langchain_community.vectorstores import FAISS
6
- from langchain.chains import ConversationalRetrievalChain
7
- from langchain_community.llms import HuggingFaceHub
8
- from langchain.memory import ConversationBufferMemory
9
-
10
def make_vectorstore(embeddings):
    """Build a FAISS vector store from the PDFs in the ``data`` directory.

    Args:
        embeddings: Embedding model handed to ``FAISS.from_documents``.

    Returns:
        The FAISS store built from the split documents.
    """
    # Load whatever the directory loader finds under "data".
    pages = PyPDFDirectoryLoader("data").load()

    # Split with a target chunk size of 1400 and no overlap between chunks.
    splitter = CharacterTextSplitter(chunk_size=1400, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    # Embed the chunks and index them.
    return FAISS.from_documents(chunks, embeddings)
27
-
28
def get_conversation(vectorstore):
    """Create a conversational retrieval chain backed by ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies context documents.

    Returns:
        A ``ConversationalRetrievalChain`` with buffer memory attached.
    """
    # Memory object that stores the conversation history under "chat_history".
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
        huggingfacehub_api_token=st.secrets["hf_token"],
    )

    # ConversationalRetrievalChain is constructed with from_llm();
    # from_chain_type() is the RetrievalQA constructor and is not defined here.
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )

    return conversation_chain
40
-
41
def get_response(conversation_chain, query):
    """Run ``query`` through ``conversation_chain`` and return its result."""
    return conversation_chain.run(query)
45
-
46
def main():
    """Build the retrieval chain and render answers to three fixed queries."""
    st.title("BetterZila RAG Enabled LLM")
    embeddings = HuggingFaceInstructEmbeddings(model_name="google/t5-v1_1-xl", model_kwargs={'device': 'cpu'})
    vectorstore = make_vectorstore(embeddings)
    conversation_chain = get_conversation(vectorstore)
    queries = ["Can you give me an example from history where the enemy was crushed totally from the book?", "What's the point of making myself less accessible?", "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"]
    for query in queries:
        st.subheader(f"Query: {query}")
        response = get_response(conversation_chain, query)
        st.write(query)
        # chain.run() yields the answer text itself, so indexing it with a
        # string key raises TypeError. Accept a mapping too in case the
        # helper is changed to return the full output dict.
        if isinstance(response, dict):
            response = response.get("llm_response", response)
        st.write(response)
    st.success("Responses generated!")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/48lawsofpower.pdf DELETED
Binary file (105 kB)
 
extractor.ipynb ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import PyPDF2, os\n",
10
+ "import pandas as pd"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "def read_pdf(path):\n",
20
+ " pdf_file = open(path, 'rb')\n",
21
+ " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
22
+ " text = ''\n",
23
+ " for page_num in range(len(pdf_reader.pages)):\n",
24
+ " page = pdf_reader.pages[page_num]\n",
25
+ " text += page.extract_text()\n",
26
+ " pdf_file.close()\n",
27
+ " return text\n",
28
+ "\n",
29
+ "invoices = []\n",
30
+ "path = 'invoices/'\n",
31
+ "\n",
32
+ "for file in os.listdir(path):\n",
33
+ " if file.startswith('invoice'):\n",
34
+ " text = read_pdf(path + file)\n",
35
+ " print(text)\n",
36
+ " invoices.append(text)"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "import os\n",
46
+ "def save_as_csv(details, save_as = \"invoice.csv\"):\n",
47
+ " # if the csv already exists then concat a new one to it, else create a new one\n",
48
+ " if os.path.exists(save_as):\n",
49
+ " df = pd.read_csv(save_as)\n",
50
+ " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
51
+ " else: \n",
52
+ " df = pd.DataFrame(details, index=[0])\n",
53
+ " df.to_csv(save_as, index=False)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "import re\n",
63
+ "\n",
64
+ "def extract_invoice_details(text):\n",
65
+ " invoice_details = {}\n",
66
+ " try:\n",
67
+ " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
68
+ " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
69
+ " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
70
+ " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
71
+ " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
72
+ " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
73
+ " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
74
+ " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
75
+ " except:\n",
76
+ " print('Order Number not found')\n",
77
+ " \n",
78
+ " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
79
+ " if item_match:\n",
80
+ " item_info = item_match.group(1)\n",
81
+ " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
82
+ " invoice_details['Item'] = item_name\n",
83
+ " print(item_name)\n",
84
+ " else:\n",
85
+ " print(\"No item found in the invoice.\")\n",
86
+ " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
87
+ " if total_mount_match:\n",
88
+ " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
89
+ " invoice_details['Total Amount'] = total_mount\n",
90
+ " else:\n",
91
+ " print(\"No total amount found in the invoice.\")\n",
92
+ " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
93
+ " if gstin_match:\n",
94
+ " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
95
+ " else:\n",
96
+ " print(\"No GSTIN found in the invoice.\")\n",
97
+ " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
98
+ " if by_match:\n",
99
+ " invoice_details['Sold By'] = by_match.group(1).strip()\n",
100
+ " else:\n",
101
+ " print(\"No seller found in the invoice.\")\n",
102
+ " \n",
103
+ " return invoice_details"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "for invoice in invoices:\n",
113
+ " # print(invoice)\n",
114
+ " details = extract_invoice_details(invoice)\n",
115
+ " save_as_csv(details)"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "df = pd.read_csv('invoice.csv')\n",
125
+ "df.head(10)"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 8,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "import PyPDF2, os, re\n",
135
+ "import pandas as pd\n",
136
+ "\n",
137
+ "class InvoiceConvertor:\n",
138
+ " \"\"\"\n",
139
+ " This class is hardcoded to read all pdf files that start with 'invoice' in the user-supplied path and convert them to a csv file.\n",
140
+ " \n",
141
+ " Usage:\n",
142
+ " convertor = InvoiceConvertor()\n",
143
+ " convertor.read_pdfs('path_to_pdfs')\n",
144
+ " result_df = convertor.convert()\n",
145
+ "\n",
146
+ " \"\"\"\n",
147
+ " def __init__(self):\n",
148
+ " self.invoices = []\n",
149
+ " \n",
150
+ " def read_pdfs(self,path):\n",
151
+ " for file in os.listdir(path):\n",
152
+ " if file.startswith('invoice'):\n",
153
+ " pdf_file = open(path + file, 'rb')\n",
154
+ " pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
155
+ " text = ''\n",
156
+ " for page_num in range(len(pdf_reader.pages)):\n",
157
+ " page = pdf_reader.pages[page_num]\n",
158
+ " text += page.extract_text()\n",
159
+ " pdf_file.close()\n",
160
+ " self.invoices.append(text)\n",
161
+ " return self.invoices\n",
162
+ " \n",
163
+ " def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
164
+ " # if the csv already exists then concat a new one to it, else create a new one\n",
165
+ " if os.path.exists(save_as):\n",
166
+ " df = pd.read_csv(save_as)\n",
167
+ " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
168
+ " else: \n",
169
+ " df = pd.DataFrame(details, index=[0])\n",
170
+ " df.to_csv(save_as, index=False)\n",
171
+ " \n",
172
+ " def extract_invoice_details(self, text):\n",
173
+ " invoice_details = {}\n",
174
+ " try:\n",
175
+ " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
176
+ " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
177
+ " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
178
+ " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
179
+ " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
180
+ " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
181
+ " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
182
+ " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
183
+ " except:\n",
184
+ " print('Order Number not found')\n",
185
+ "\n",
186
+ " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
187
+ " if item_match:\n",
188
+ " item_info = item_match.group(1)\n",
189
+ " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
190
+ " invoice_details['Item'] = item_name\n",
191
+ " # print(item_name)\n",
192
+ " else:\n",
193
+ " print(\"No item found in the invoice.\")\n",
194
+ " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
195
+ " if total_mount_match:\n",
196
+ " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
197
+ " invoice_details['Total Amount'] = total_mount\n",
198
+ " else:\n",
199
+ " print(\"No total amount found in the invoice.\")\n",
200
+ " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
201
+ " if gstin_match:\n",
202
+ " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
203
+ " else:\n",
204
+ " print(\"No GSTIN found in the invoice.\")\n",
205
+ " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
206
+ " if by_match:\n",
207
+ " invoice_details['Sold By'] = by_match.group(1).strip()\n",
208
+ " else:\n",
209
+ " print(\"No seller found in the invoice.\")\n",
210
+ " return invoice_details\n",
211
+ " \n",
212
+ " def convert(self):\n",
213
+ " for invoice in self.invoices:\n",
214
+ " details = self.extract_invoice_details(invoice)\n",
215
+ " self.save_as_csv(details)\n",
216
+ " return pd.read_csv('invoice.csv')"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 9,
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "name": "stdout",
226
+ "output_type": "stream",
227
+ "text": [
228
+ "Order Number not found\n"
229
+ ]
230
+ },
231
+ {
232
+ "data": {
233
+ "text/html": [
234
+ "<div>\n",
235
+ "<style scoped>\n",
236
+ " .dataframe tbody tr th:only-of-type {\n",
237
+ " vertical-align: middle;\n",
238
+ " }\n",
239
+ "\n",
240
+ " .dataframe tbody tr th {\n",
241
+ " vertical-align: top;\n",
242
+ " }\n",
243
+ "\n",
244
+ " .dataframe thead th {\n",
245
+ " text-align: right;\n",
246
+ " }\n",
247
+ "</style>\n",
248
+ "<table border=\"1\" class=\"dataframe\">\n",
249
+ " <thead>\n",
250
+ " <tr style=\"text-align: right;\">\n",
251
+ " <th></th>\n",
252
+ " <th>Order Number</th>\n",
253
+ " <th>Invoice Number</th>\n",
254
+ " <th>Order Date</th>\n",
255
+ " <th>Invoice Details</th>\n",
256
+ " <th>Invoice Date</th>\n",
257
+ " <th>Billing Address</th>\n",
258
+ " <th>Shipping Address</th>\n",
259
+ " <th>PAN</th>\n",
260
+ " <th>Item</th>\n",
261
+ " <th>Total Amount</th>\n",
262
+ " <th>GSTIN</th>\n",
263
+ " <th>Sold By</th>\n",
264
+ " </tr>\n",
265
+ " </thead>\n",
266
+ " <tbody>\n",
267
+ " <tr>\n",
268
+ " <th>0</th>\n",
269
+ " <td>402-7035529-3886722</td>\n",
270
+ " <td>NAG1-192347</td>\n",
271
+ " <td>17.08.2023</td>\n",
272
+ " <td>MH-NAG1-1034-2324</td>\n",
273
+ " <td>17.08.2023</td>\n",
274
+ " <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
275
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
276
+ " <td>AALCA0171E</td>\n",
277
+ " <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
278
+ " <td>458.0</td>\n",
279
+ " <td>27AALCA0171E1ZZ</td>\n",
280
+ " <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
281
+ " </tr>\n",
282
+ " <tr>\n",
283
+ " <th>1</th>\n",
284
+ " <td>402-7035529-3886722</td>\n",
285
+ " <td>BOM5-1379800</td>\n",
286
+ " <td>17.08.2023</td>\n",
287
+ " <td>MH-BOM5-1034-2324</td>\n",
288
+ " <td>17.08.2023</td>\n",
289
+ " <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
290
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
291
+ " <td>AALCA0171E</td>\n",
292
+ " <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
293
+ " <td>13,099.00</td>\n",
294
+ " <td>27AALCA0171E1ZZ</td>\n",
295
+ " <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
296
+ " </tr>\n",
297
+ " <tr>\n",
298
+ " <th>2</th>\n",
299
+ " <td>405-4419941-9848328</td>\n",
300
+ " <td>DEX3-4683</td>\n",
301
+ " <td>23.07.2023</td>\n",
302
+ " <td>DL-DEX3-157533501-2324</td>\n",
303
+ " <td>23.07.2023</td>\n",
304
+ " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
305
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
306
+ " <td>ABEPW6057C</td>\n",
307
+ " <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
308
+ " <td>474.00</td>\n",
309
+ " <td>07ABEPW6057C1ZK</td>\n",
310
+ " <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th>3</th>\n",
314
+ " <td>405-4419941-9848328</td>\n",
315
+ " <td>HYD8-29019</td>\n",
316
+ " <td>23.07.2023</td>\n",
317
+ " <td>TG-HYD8-817549015-2324</td>\n",
318
+ " <td>23.07.2023</td>\n",
319
+ " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
320
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
321
+ " <td>AACCN8253B</td>\n",
322
+ " <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
323
+ " <td>399.00</td>\n",
324
+ " <td>36AACCN8253B1ZN</td>\n",
325
+ " <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
326
+ " </tr>\n",
327
+ " <tr>\n",
328
+ " <th>4</th>\n",
329
+ " <td>405-0015964-5687515</td>\n",
330
+ " <td>IN-5040</td>\n",
331
+ " <td>23.07.2023</td>\n",
332
+ " <td>DL-1922955505-2324</td>\n",
333
+ " <td>23.07.2023</td>\n",
334
+ " <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
335
+ " <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
336
+ " <td>JISPS4412R</td>\n",
337
+ " <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
338
+ " <td>149.00</td>\n",
339
+ " <td>07JISPS4412R1Z4</td>\n",
340
+ " <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
341
+ " </tr>\n",
342
+ " <tr>\n",
343
+ " <th>5</th>\n",
344
+ " <td>408-4974466-7793143</td>\n",
345
+ " <td>JPX2-223775</td>\n",
346
+ " <td>02.01.2024</td>\n",
347
+ " <td>RJ-JPX2-1317922175-2324</td>\n",
348
+ " <td>02.01.2024</td>\n",
349
+ " <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
350
+ " <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
351
+ " <td>AADCV4254H</td>\n",
352
+ " <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
353
+ " <td>569.00</td>\n",
354
+ " <td>08AADCV4254H1Z8</td>\n",
355
+ " <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
356
+ " </tr>\n",
357
+ " <tr>\n",
358
+ " <th>6</th>\n",
359
+ " <td>NaN</td>\n",
360
+ " <td>NaN</td>\n",
361
+ " <td>NaN</td>\n",
362
+ " <td>NaN</td>\n",
363
+ " <td>NaN</td>\n",
364
+ " <td>NaN</td>\n",
365
+ " <td>NaN</td>\n",
366
+ " <td>NaN</td>\n",
367
+ " <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
368
+ " <td>6,320.00</td>\n",
369
+ " <td>36AARCA3925C1ZQBilling</td>\n",
370
+ " <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
371
+ " </tr>\n",
372
+ " </tbody>\n",
373
+ "</table>\n",
374
+ "</div>"
375
+ ],
376
+ "text/plain": [
377
+ " Order Number Invoice Number Order Date Invoice Details \\\n",
378
+ "0 402-7035529-3886722 NAG1-192347 17.08.2023 MH-NAG1-1034-2324 \n",
379
+ "1 402-7035529-3886722 BOM5-1379800 17.08.2023 MH-BOM5-1034-2324 \n",
380
+ "2 405-4419941-9848328 DEX3-4683 23.07.2023 DL-DEX3-157533501-2324 \n",
381
+ "3 405-4419941-9848328 HYD8-29019 23.07.2023 TG-HYD8-817549015-2324 \n",
382
+ "4 405-0015964-5687515 IN-5040 23.07.2023 DL-1922955505-2324 \n",
383
+ "5 408-4974466-7793143 JPX2-223775 02.01.2024 RJ-JPX2-1317922175-2324 \n",
384
+ "6 NaN NaN NaN NaN \n",
385
+ "\n",
386
+ " Invoice Date Billing Address \\\n",
387
+ "0 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
388
+ "1 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
389
+ "2 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
390
+ "3 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
391
+ "4 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
392
+ "5 02.01.2024 Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA... \n",
393
+ "6 NaN NaN \n",
394
+ "\n",
395
+ " Shipping Address PAN \\\n",
396
+ "0 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
397
+ "1 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
398
+ "2 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... ABEPW6057C \n",
399
+ "3 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... AACCN8253B \n",
400
+ "4 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... JISPS4412R \n",
401
+ "5 Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE... AADCV4254H \n",
402
+ "6 NaN NaN \n",
403
+ "\n",
404
+ " Item Total Amount \\\n",
405
+ "0 Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph... 458.0 \n",
406
+ "1 LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc... 13,099.00 \n",
407
+ "2 Amozo Easy Fit Tempered Glass Screen Protector... 474.00 \n",
408
+ "3 ESR for iPhone 13/14 Cover, Shockproof Drop Pr... 399.00 \n",
409
+ "4 imluckies Camera Lens Protector Compatible wit... 149.00 \n",
410
+ "5 Amazon Basics Sleek Rechargeable LED Table Lam... 569.00 \n",
411
+ "6 Saregama Carvaan Telugu - Portable Music Playe... 6,320.00 \n",
412
+ "\n",
413
+ " GSTIN Sold By \n",
414
+ "0 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*TCI Supply Chain... \n",
415
+ "1 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*Renaissance indu... \n",
416
+ "2 07ABEPW6057C1ZK RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co... \n",
417
+ "3 36AACCN8253B1ZN TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp... \n",
418
+ "4 07JISPS4412R1Z4 M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA... \n",
419
+ "5 08AADCV4254H1Z8 ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ... \n",
420
+ "6 36AARCA3925C1ZQBilling AATS Connect Private Limited \\n* GMR Airport C... "
421
+ ]
422
+ },
423
+ "execution_count": 9,
424
+ "metadata": {},
425
+ "output_type": "execute_result"
426
+ }
427
+ ],
428
+ "source": [
429
+ "invoice_convertor = InvoiceConvertor()\n",
430
+ "invoice_convertor.read_pdfs('invoices/')\n",
431
+ "res = invoice_convertor.convert()\n",
432
+ "res.head(10)"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "metadata": {},
439
+ "outputs": [],
440
+ "source": []
441
+ }
442
+ ],
443
+ "metadata": {
444
+ "kernelspec": {
445
+ "display_name": "resparser",
446
+ "language": "python",
447
+ "name": "python3"
448
+ },
449
+ "language_info": {
450
+ "codemirror_mode": {
451
+ "name": "ipython",
452
+ "version": 3
453
+ },
454
+ "file_extension": ".py",
455
+ "mimetype": "text/x-python",
456
+ "name": "python",
457
+ "nbconvert_exporter": "python",
458
+ "pygments_lexer": "ipython3",
459
+ "version": "3.9.16"
460
+ }
461
+ },
462
+ "nbformat": 4,
463
+ "nbformat_minor": 2
464
+ }
invoice_convertor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2, os, re
2
+ import pandas as pd
3
+
4
class InvoiceConvertor:
    """Convert Amazon invoice PDFs into rows of a CSV file.

    Only files whose name starts with 'invoice' in the user-supplied
    directory are read.

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()
    """

    # (column name, regex) pairs for the header fields; group 1 is the value.
    # Order here is the column order of the resulting CSV.
    _HEADER_PATTERNS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    # Strict 15-character GSTIN shape (2 digits, 10-character PAN, entity
    # character, 'Z', check character). Tried first so that text glued onto
    # the number by PDF extraction (e.g. '...1ZQBilling') is not captured.
    _GSTIN_STRICT = r'GST Registration No:\s*([0-9]{2}[A-Z]{5}[0-9]{4}[A-Z][0-9A-Z]Z[0-9A-Z])'
    # Original, looser pattern kept as a fallback for unexpected formats.
    _GSTIN_LOOSE = r'GST Registration No: ([\s\S]+?) '

    def __init__(self):
        # Extracted text of every PDF read so far, one string per file.
        self.invoices = []

    def read_pdfs(self, path):
        """Read every 'invoice*' PDF in ``path`` and collect its text.

        Args:
            path: Directory containing the PDFs (trailing slash optional).

        Returns:
            The accumulated list of extracted texts (``self.invoices``).
        """
        # Sorted so the row order does not depend on directory listing order.
        for file in sorted(os.listdir(path)):
            if not file.startswith('invoice'):
                continue
            # 'with' guarantees the handle is closed even if parsing fails.
            with open(os.path.join(path, file), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as="invoice.csv"):
        """Append one invoice row to ``save_as``, creating the file if needed.

        Args:
            details: Mapping of column name to value for a single invoice.
            save_as: Path of the CSV file to create or extend.
        """
        if not details:
            # An empty row would produce a CSV that cannot be read back.
            return
        frame = pd.DataFrame([details])
        if os.path.exists(save_as):
            try:
                # dtype=str stops values such as '458.00' from being
                # re-inferred as floats on every append.
                existing = pd.read_csv(save_as, dtype=str)
            except pd.errors.EmptyDataError:
                existing = None
            if existing is not None:
                frame = pd.concat([existing, frame], ignore_index=True)
        frame.to_csv(save_as, index=False)

    def extract_invoice_details(self, text):
        """Pull the known invoice fields out of extracted PDF text.

        Every field is looked up independently, so a missing field is
        reported and skipped without discarding the others.

        Args:
            text: Full text of one invoice.

        Returns:
            dict mapping column name to the extracted string; fields that
            could not be found are absent.
        """
        invoice_details = {}
        for field, pattern in self._HEADER_PATTERNS:
            match = re.search(pattern, text)
            if match:
                invoice_details[field] = match.group(1).strip()
            else:
                print(f"{field} not found in the invoice.")

        # The item description sits between the 'Amount' header followed by
        # the row number '1' and the first rupee sign, before 'TOTAL:'.
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text)
        name_match = None
        if item_match:
            name_match = re.search(r'\nAmount\n1([\s\S]+?)₹', item_match.group(1))
        if name_match:
            invoice_details['Item'] = name_match.group(1).strip()
        else:
            print("No item found in the invoice.")

        # After 'TOTAL:' the second rupee amount is taken as the total.
        total_match = re.search(r'TOTAL:([\s\S]+?)only', text)
        amounts = total_match.group(1).split('₹') if total_match else []
        if len(amounts) > 2:
            invoice_details['Total Amount'] = amounts[2].split('\n')[0].strip()
        else:
            print("No total amount found in the invoice.")

        gstin_match = re.search(self._GSTIN_STRICT, text) or re.search(self._GSTIN_LOOSE, text)
        if gstin_match:
            invoice_details['GSTIN'] = gstin_match.group(1).strip()
        else:
            print("No GSTIN found in the invoice.")

        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
        if by_match:
            invoice_details['Sold By'] = by_match.group(1).strip()
        else:
            print("No seller found in the invoice.")
        return invoice_details

    def convert(self):
        """Extract every loaded invoice, append it to invoice.csv, return it.

        Returns:
            DataFrame read back from 'invoice.csv', or an empty DataFrame
            when no file exists because nothing was written.
        """
        for invoice in self.invoices:
            self.save_as_csv(self.extract_invoice_details(invoice))
        if not os.path.exists('invoice.csv'):
            return pd.DataFrame()
        return pd.read_csv('invoice.csv')
84
+
invoices/invoice1.pdf ADDED
Binary file (48.3 kB). View file
 
invoices/invoice2.pdf ADDED
Binary file (48.4 kB). View file
 
invoices/invoice3.pdf ADDED
Binary file (54.2 kB). View file
 
invoices/invoice4.pdf ADDED
Binary file (103 kB). View file
 
invoices/invoice5.pdf ADDED
Binary file (48 kB). View file
 
invoices/invoice7.pdf ADDED
Binary file (50.2 kB). View file
 
invoices/invoice8.pdf ADDED
Binary file (43.9 kB). View file
 
requirements.txt CHANGED
@@ -1,10 +1,4 @@
1
- transformers
2
- langchain
3
- langchain-community
4
- InstructorEmbedding
5
- streamlit
6
- PyPDF2
7
- sentence-transformers
8
- python-dotenv
9
- pypdf
10
- faiss-cpu
 
1
+ streamlit==1.32.2
2
+ pyPDF2==3.0.1
3
+ pandas==1.3.5
4
+ regex==2023.12.25
 
 
 
 
 
 
secrets.toml DELETED
@@ -1 +0,0 @@
1
- hf_token="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"