Spaces:
Sleeping
Sleeping
Pratik Dwivedi
committed on
Commit
·
5abbd23
1
Parent(s):
0e5546c
New app
Browse files- .gitattributes +0 -35
- .gitignore +0 -3
- README.md +0 -13
- app.py +24 -0
- application.py +0 -60
- data/48lawsofpower.pdf +0 -0
- extractor.ipynb +464 -0
- invoice_convertor.py +84 -0
- invoices/invoice1.pdf +0 -0
- invoices/invoice2.pdf +0 -0
- invoices/invoice3.pdf +0 -0
- invoices/invoice4.pdf +0 -0
- invoices/invoice5.pdf +0 -0
- invoices/invoice7.pdf +0 -0
- invoices/invoice8.pdf +0 -0
- requirements.txt +4 -10
- secrets.toml +0 -1
.gitattributes
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
.env
|
2 |
-
.gitattributes
|
3 |
-
secrets.toml
|
|
|
|
|
|
|
|
README.md
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: Betterzila Assignment
|
3 |
-
emoji: 📉
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: purple
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.30.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: apache-2.0
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
from invoice_convertor import InvoiceConvertor
|
4 |
+
def main():
    """Render the Streamlit page that converts uploaded Amazon invoice PDFs to a CSV.

    Uploaded PDFs are written to a scratch ``data/`` directory, parsed with
    ``InvoiceConvertor``, shown as a table and offered as a CSV download.
    The scratch directory is emptied afterwards, even if conversion fails.
    """
    st.set_page_config(layout="wide")
    st.title('Amazon Invoice Convertor')
    st.write('This app converts your Amazon invoice pdfs to a csv file.')
    convertor = InvoiceConvertor()
    files = st.file_uploader('Upload your invoice pdfs', type=['pdf'], accept_multiple_files=True)
    if files:
        upload_dir = 'data/'
        # The directory is not guaranteed to exist (an empty directory is not
        # tracked by git), so create it instead of failing on the first write.
        os.makedirs(upload_dir, exist_ok=True)
        try:
            for file in files:
                # Keep only the final path component of the client-supplied
                # name so the upload cannot be written outside upload_dir.
                safe_name = os.path.basename(file.name)
                # InvoiceConvertor.read_pdfs only picks up files whose names
                # start with 'invoice'; prefix other uploads so they are not
                # silently skipped.
                if not safe_name.startswith('invoice'):
                    safe_name = 'invoice_' + safe_name
                with open(os.path.join(upload_dir, safe_name), 'wb') as f:
                    f.write(file.getbuffer())
            convertor.read_pdfs(upload_dir)
            # NOTE(review): convert() appends to a shared 'invoice.csv', so a
            # Streamlit rerun with the same uploads adds the same rows again;
            # confirm whether that accumulation is intended.
            result_df = convertor.convert()
        finally:
            # Always remove the uploaded copies, even when parsing raised.
            for leftover in os.listdir(upload_dir):
                os.remove(os.path.join(upload_dir, leftover))
        st.write(result_df)
        st.download_button('Download csv', data=result_df.to_csv(), file_name='invoice.csv', mime='text/csv')
    if st.button('Clear csv file') and os.path.exists('invoice.csv'):
        os.remove('invoice.csv')
|
22 |
+
|
23 |
+
if __name__ == '__main__':
|
24 |
+
main()
|
application.py
DELETED
@@ -1,60 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
|
3 |
-
from langchain.text_splitter import CharacterTextSplitter
|
4 |
-
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
5 |
-
from langchain_community.vectorstores import FAISS
|
6 |
-
from langchain.chains import ConversationalRetrievalChain
|
7 |
-
from langchain_community.llms import HuggingFaceHub
|
8 |
-
from langchain.memory import ConversationBufferMemory
|
9 |
-
|
10 |
-
def make_vectorstore(embeddings):
|
11 |
-
# use glob to find all the pdf files in the data folder in the base directory
|
12 |
-
loader = PyPDFDirectoryLoader("data")
|
13 |
-
|
14 |
-
# load the documents
|
15 |
-
documents = loader.load()
|
16 |
-
|
17 |
-
# split the documents into chunks of 1400 characters with 0 overlap
|
18 |
-
text_splitter = CharacterTextSplitter(chunk_size=1400, chunk_overlap=0)
|
19 |
-
|
20 |
-
# split the documents into chunks of 1400 characters with 0 overlap
|
21 |
-
texts = text_splitter.split_documents(documents)
|
22 |
-
|
23 |
-
# create a vector store from the documents
|
24 |
-
docsearch = FAISS.from_documents(texts, embeddings)
|
25 |
-
|
26 |
-
return docsearch
|
27 |
-
|
28 |
-
def get_conversation(vectorstore):
|
29 |
-
|
30 |
-
# create a memory object to store the conversation history
|
31 |
-
memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True,)
|
32 |
-
|
33 |
-
conversation_chain = ConversationalRetrievalChain.from_chain_type(
|
34 |
-
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512}, huggingfacehub_api_token = st.secrets["hf_token"]),
|
35 |
-
chain_type="stuff",
|
36 |
-
retriever=vectorstore.as_retriever(),
|
37 |
-
memory=memory)
|
38 |
-
|
39 |
-
return conversation_chain
|
40 |
-
|
41 |
-
def get_response(conversation_chain, query):
|
42 |
-
# get the response
|
43 |
-
response = conversation_chain.run(query)
|
44 |
-
return response
|
45 |
-
|
46 |
-
def main():
|
47 |
-
st.title("BetterZila RAG Enabled LLM")
|
48 |
-
embeddings = HuggingFaceInstructEmbeddings(model_name="google/t5-v1_1-xl", model_kwargs = {'device': 'cpu'})
|
49 |
-
vectorstore = make_vectorstore(embeddings)
|
50 |
-
conversation_chain = get_conversation(vectorstore)
|
51 |
-
queries = ["Can you give me an example from history where the enemy was crushed totally from the book?", "What's the point of making myself less accessible?", "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"]
|
52 |
-
for query in queries:
|
53 |
-
st.subheader(f"Query: {query}")
|
54 |
-
response = get_response(conversation_chain, query)
|
55 |
-
st.write(query)
|
56 |
-
st.write(response["llm_response"])
|
57 |
-
st.success("Responses generated!")
|
58 |
-
|
59 |
-
if __name__ == "__main__":
|
60 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/48lawsofpower.pdf
DELETED
Binary file (105 kB)
|
|
extractor.ipynb
ADDED
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import PyPDF2, os\n",
|
10 |
+
"import pandas as pd"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": null,
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [],
|
18 |
+
"source": [
|
19 |
+
"def read_pdf(path):\n",
|
20 |
+
" pdf_file = open(path, 'rb')\n",
|
21 |
+
" pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
|
22 |
+
" text = ''\n",
|
23 |
+
" for page_num in range(len(pdf_reader.pages)):\n",
|
24 |
+
" page = pdf_reader.pages[page_num]\n",
|
25 |
+
" text += page.extract_text()\n",
|
26 |
+
" pdf_file.close()\n",
|
27 |
+
" return text\n",
|
28 |
+
"\n",
|
29 |
+
"invoices = []\n",
|
30 |
+
"path = 'invoices/'\n",
|
31 |
+
"\n",
|
32 |
+
"for file in os.listdir(path):\n",
|
33 |
+
" if file.startswith('invoice'):\n",
|
34 |
+
" text = read_pdf(path + file)\n",
|
35 |
+
" print(text)\n",
|
36 |
+
" invoices.append(text)"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"cell_type": "code",
|
41 |
+
"execution_count": null,
|
42 |
+
"metadata": {},
|
43 |
+
"outputs": [],
|
44 |
+
"source": [
|
45 |
+
"import os\n",
|
46 |
+
"def save_as_csv(details, save_as = \"invoice.csv\"):\n",
|
47 |
+
" # if the csv already exists then concat a new one to it, else create a new one\n",
|
48 |
+
" if os.path.exists(save_as):\n",
|
49 |
+
" df = pd.read_csv(save_as)\n",
|
50 |
+
" df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
|
51 |
+
" else: \n",
|
52 |
+
" df = pd.DataFrame(details, index=[0])\n",
|
53 |
+
" df.to_csv(save_as, index=False)"
|
54 |
+
]
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"cell_type": "code",
|
58 |
+
"execution_count": null,
|
59 |
+
"metadata": {},
|
60 |
+
"outputs": [],
|
61 |
+
"source": [
|
62 |
+
"import re\n",
|
63 |
+
"\n",
|
64 |
+
"def extract_invoice_details(text):\n",
|
65 |
+
" invoice_details = {}\n",
|
66 |
+
" try:\n",
|
67 |
+
" invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
|
68 |
+
" invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
|
69 |
+
" invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
|
70 |
+
" invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
|
71 |
+
" invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
|
72 |
+
" invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
|
73 |
+
" invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
|
74 |
+
" invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
|
75 |
+
" except:\n",
|
76 |
+
" print('Order Number not found')\n",
|
77 |
+
" \n",
|
78 |
+
" item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
|
79 |
+
" if item_match:\n",
|
80 |
+
" item_info = item_match.group(1)\n",
|
81 |
+
" item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
|
82 |
+
" invoice_details['Item'] = item_name\n",
|
83 |
+
" print(item_name)\n",
|
84 |
+
" else:\n",
|
85 |
+
" print(\"No item found in the invoice.\")\n",
|
86 |
+
" total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
|
87 |
+
" if total_mount_match:\n",
|
88 |
+
" total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
|
89 |
+
" invoice_details['Total Amount'] = total_mount\n",
|
90 |
+
" else:\n",
|
91 |
+
" print(\"No total amount found in the invoice.\")\n",
|
92 |
+
" gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
|
93 |
+
" if gstin_match:\n",
|
94 |
+
" invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
|
95 |
+
" else:\n",
|
96 |
+
" print(\"No GSTIN found in the invoice.\")\n",
|
97 |
+
" by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
|
98 |
+
" if by_match:\n",
|
99 |
+
" invoice_details['Sold By'] = by_match.group(1).strip()\n",
|
100 |
+
" else:\n",
|
101 |
+
" print(\"No seller found in the invoice.\")\n",
|
102 |
+
" \n",
|
103 |
+
" return invoice_details"
|
104 |
+
]
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"execution_count": null,
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [],
|
111 |
+
"source": [
|
112 |
+
"for invoice in invoices:\n",
|
113 |
+
" # print(invoice)\n",
|
114 |
+
" details = extract_invoice_details(invoice)\n",
|
115 |
+
" save_as_csv(details)"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"cell_type": "code",
|
120 |
+
"execution_count": null,
|
121 |
+
"metadata": {},
|
122 |
+
"outputs": [],
|
123 |
+
"source": [
|
124 |
+
"df = pd.read_csv('invoice.csv')\n",
|
125 |
+
"df.head(10)"
|
126 |
+
]
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"cell_type": "code",
|
130 |
+
"execution_count": 8,
|
131 |
+
"metadata": {},
|
132 |
+
"outputs": [],
|
133 |
+
"source": [
|
134 |
+
"import PyPDF2, os, re\n",
|
135 |
+
"import pandas as pd\n",
|
136 |
+
"\n",
|
137 |
+
"class InvoiceConvertor:\n",
|
138 |
+
" \"\"\"\n",
|
139 |
+
" This class is hardcoded to read all pdf files that start with 'invoice' in the given user given path and convert them to a csv file.\n",
|
140 |
+
" \n",
|
141 |
+
" Usage:\n",
|
142 |
+
" convertor = InvoiceConvertor()\n",
|
143 |
+
" convertor.read_pdfs('path_to_pdfs')\n",
|
144 |
+
" result_df = convertor.convert()\n",
|
145 |
+
"\n",
|
146 |
+
" \"\"\"\n",
|
147 |
+
" def __init__(self):\n",
|
148 |
+
" self.invoices = []\n",
|
149 |
+
" \n",
|
150 |
+
" def read_pdfs(self,path):\n",
|
151 |
+
" for file in os.listdir(path):\n",
|
152 |
+
" if file.startswith('invoice'):\n",
|
153 |
+
" pdf_file = open(path + file, 'rb')\n",
|
154 |
+
" pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
|
155 |
+
" text = ''\n",
|
156 |
+
" for page_num in range(len(pdf_reader.pages)):\n",
|
157 |
+
" page = pdf_reader.pages[page_num]\n",
|
158 |
+
" text += page.extract_text()\n",
|
159 |
+
" pdf_file.close()\n",
|
160 |
+
" self.invoices.append(text)\n",
|
161 |
+
" return self.invoices\n",
|
162 |
+
" \n",
|
163 |
+
" def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
|
164 |
+
" # if the csv already exists then concat a new one to it, else create a new one\n",
|
165 |
+
" if os.path.exists(save_as):\n",
|
166 |
+
" df = pd.read_csv(save_as)\n",
|
167 |
+
" df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
|
168 |
+
" else: \n",
|
169 |
+
" df = pd.DataFrame(details, index=[0])\n",
|
170 |
+
" df.to_csv(save_as, index=False)\n",
|
171 |
+
" \n",
|
172 |
+
" def extract_invoice_details(self, text):\n",
|
173 |
+
" invoice_details = {}\n",
|
174 |
+
" try:\n",
|
175 |
+
" invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
|
176 |
+
" invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
|
177 |
+
" invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
|
178 |
+
" invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
|
179 |
+
" invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
|
180 |
+
" invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
|
181 |
+
" invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
|
182 |
+
" invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
|
183 |
+
" except:\n",
|
184 |
+
" print('Order Number not found')\n",
|
185 |
+
"\n",
|
186 |
+
" item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
|
187 |
+
" if item_match:\n",
|
188 |
+
" item_info = item_match.group(1)\n",
|
189 |
+
" item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
|
190 |
+
" invoice_details['Item'] = item_name\n",
|
191 |
+
" # print(item_name)\n",
|
192 |
+
" else:\n",
|
193 |
+
" print(\"No item found in the invoice.\")\n",
|
194 |
+
" total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
|
195 |
+
" if total_mount_match:\n",
|
196 |
+
" total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
|
197 |
+
" invoice_details['Total Amount'] = total_mount\n",
|
198 |
+
" else:\n",
|
199 |
+
" print(\"No total amount found in the invoice.\")\n",
|
200 |
+
" gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
|
201 |
+
" if gstin_match:\n",
|
202 |
+
" invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
|
203 |
+
" else:\n",
|
204 |
+
" print(\"No GSTIN found in the invoice.\")\n",
|
205 |
+
" by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
|
206 |
+
" if by_match:\n",
|
207 |
+
" invoice_details['Sold By'] = by_match.group(1).strip()\n",
|
208 |
+
" else:\n",
|
209 |
+
" print(\"No seller found in the invoice.\")\n",
|
210 |
+
" return invoice_details\n",
|
211 |
+
" \n",
|
212 |
+
" def convert(self):\n",
|
213 |
+
" for invoice in self.invoices:\n",
|
214 |
+
" details = self.extract_invoice_details(invoice)\n",
|
215 |
+
" self.save_as_csv(details)\n",
|
216 |
+
" return pd.read_csv('invoice.csv')"
|
217 |
+
]
|
218 |
+
},
|
219 |
+
{
|
220 |
+
"cell_type": "code",
|
221 |
+
"execution_count": 9,
|
222 |
+
"metadata": {},
|
223 |
+
"outputs": [
|
224 |
+
{
|
225 |
+
"name": "stdout",
|
226 |
+
"output_type": "stream",
|
227 |
+
"text": [
|
228 |
+
"Order Number not found\n"
|
229 |
+
]
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"data": {
|
233 |
+
"text/html": [
|
234 |
+
"<div>\n",
|
235 |
+
"<style scoped>\n",
|
236 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
237 |
+
" vertical-align: middle;\n",
|
238 |
+
" }\n",
|
239 |
+
"\n",
|
240 |
+
" .dataframe tbody tr th {\n",
|
241 |
+
" vertical-align: top;\n",
|
242 |
+
" }\n",
|
243 |
+
"\n",
|
244 |
+
" .dataframe thead th {\n",
|
245 |
+
" text-align: right;\n",
|
246 |
+
" }\n",
|
247 |
+
"</style>\n",
|
248 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
249 |
+
" <thead>\n",
|
250 |
+
" <tr style=\"text-align: right;\">\n",
|
251 |
+
" <th></th>\n",
|
252 |
+
" <th>Order Number</th>\n",
|
253 |
+
" <th>Invoice Number</th>\n",
|
254 |
+
" <th>Order Date</th>\n",
|
255 |
+
" <th>Invoice Details</th>\n",
|
256 |
+
" <th>Invoice Date</th>\n",
|
257 |
+
" <th>Billing Address</th>\n",
|
258 |
+
" <th>Shipping Address</th>\n",
|
259 |
+
" <th>PAN</th>\n",
|
260 |
+
" <th>Item</th>\n",
|
261 |
+
" <th>Total Amount</th>\n",
|
262 |
+
" <th>GSTIN</th>\n",
|
263 |
+
" <th>Sold By</th>\n",
|
264 |
+
" </tr>\n",
|
265 |
+
" </thead>\n",
|
266 |
+
" <tbody>\n",
|
267 |
+
" <tr>\n",
|
268 |
+
" <th>0</th>\n",
|
269 |
+
" <td>402-7035529-3886722</td>\n",
|
270 |
+
" <td>NAG1-192347</td>\n",
|
271 |
+
" <td>17.08.2023</td>\n",
|
272 |
+
" <td>MH-NAG1-1034-2324</td>\n",
|
273 |
+
" <td>17.08.2023</td>\n",
|
274 |
+
" <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
|
275 |
+
" <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
|
276 |
+
" <td>AALCA0171E</td>\n",
|
277 |
+
" <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
|
278 |
+
" <td>458.0</td>\n",
|
279 |
+
" <td>27AALCA0171E1ZZ</td>\n",
|
280 |
+
" <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
|
281 |
+
" </tr>\n",
|
282 |
+
" <tr>\n",
|
283 |
+
" <th>1</th>\n",
|
284 |
+
" <td>402-7035529-3886722</td>\n",
|
285 |
+
" <td>BOM5-1379800</td>\n",
|
286 |
+
" <td>17.08.2023</td>\n",
|
287 |
+
" <td>MH-BOM5-1034-2324</td>\n",
|
288 |
+
" <td>17.08.2023</td>\n",
|
289 |
+
" <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
|
290 |
+
" <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
|
291 |
+
" <td>AALCA0171E</td>\n",
|
292 |
+
" <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
|
293 |
+
" <td>13,099.00</td>\n",
|
294 |
+
" <td>27AALCA0171E1ZZ</td>\n",
|
295 |
+
" <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
|
296 |
+
" </tr>\n",
|
297 |
+
" <tr>\n",
|
298 |
+
" <th>2</th>\n",
|
299 |
+
" <td>405-4419941-9848328</td>\n",
|
300 |
+
" <td>DEX3-4683</td>\n",
|
301 |
+
" <td>23.07.2023</td>\n",
|
302 |
+
" <td>DL-DEX3-157533501-2324</td>\n",
|
303 |
+
" <td>23.07.2023</td>\n",
|
304 |
+
" <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
|
305 |
+
" <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
|
306 |
+
" <td>ABEPW6057C</td>\n",
|
307 |
+
" <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
|
308 |
+
" <td>474.00</td>\n",
|
309 |
+
" <td>07ABEPW6057C1ZK</td>\n",
|
310 |
+
" <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
|
311 |
+
" </tr>\n",
|
312 |
+
" <tr>\n",
|
313 |
+
" <th>3</th>\n",
|
314 |
+
" <td>405-4419941-9848328</td>\n",
|
315 |
+
" <td>HYD8-29019</td>\n",
|
316 |
+
" <td>23.07.2023</td>\n",
|
317 |
+
" <td>TG-HYD8-817549015-2324</td>\n",
|
318 |
+
" <td>23.07.2023</td>\n",
|
319 |
+
" <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
|
320 |
+
" <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
|
321 |
+
" <td>AACCN8253B</td>\n",
|
322 |
+
" <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
|
323 |
+
" <td>399.00</td>\n",
|
324 |
+
" <td>36AACCN8253B1ZN</td>\n",
|
325 |
+
" <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
|
326 |
+
" </tr>\n",
|
327 |
+
" <tr>\n",
|
328 |
+
" <th>4</th>\n",
|
329 |
+
" <td>405-0015964-5687515</td>\n",
|
330 |
+
" <td>IN-5040</td>\n",
|
331 |
+
" <td>23.07.2023</td>\n",
|
332 |
+
" <td>DL-1922955505-2324</td>\n",
|
333 |
+
" <td>23.07.2023</td>\n",
|
334 |
+
" <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
|
335 |
+
" <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
|
336 |
+
" <td>JISPS4412R</td>\n",
|
337 |
+
" <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
|
338 |
+
" <td>149.00</td>\n",
|
339 |
+
" <td>07JISPS4412R1Z4</td>\n",
|
340 |
+
" <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
|
341 |
+
" </tr>\n",
|
342 |
+
" <tr>\n",
|
343 |
+
" <th>5</th>\n",
|
344 |
+
" <td>408-4974466-7793143</td>\n",
|
345 |
+
" <td>JPX2-223775</td>\n",
|
346 |
+
" <td>02.01.2024</td>\n",
|
347 |
+
" <td>RJ-JPX2-1317922175-2324</td>\n",
|
348 |
+
" <td>02.01.2024</td>\n",
|
349 |
+
" <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
|
350 |
+
" <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
|
351 |
+
" <td>AADCV4254H</td>\n",
|
352 |
+
" <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
|
353 |
+
" <td>569.00</td>\n",
|
354 |
+
" <td>08AADCV4254H1Z8</td>\n",
|
355 |
+
" <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
|
356 |
+
" </tr>\n",
|
357 |
+
" <tr>\n",
|
358 |
+
" <th>6</th>\n",
|
359 |
+
" <td>NaN</td>\n",
|
360 |
+
" <td>NaN</td>\n",
|
361 |
+
" <td>NaN</td>\n",
|
362 |
+
" <td>NaN</td>\n",
|
363 |
+
" <td>NaN</td>\n",
|
364 |
+
" <td>NaN</td>\n",
|
365 |
+
" <td>NaN</td>\n",
|
366 |
+
" <td>NaN</td>\n",
|
367 |
+
" <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
|
368 |
+
" <td>6,320.00</td>\n",
|
369 |
+
" <td>36AARCA3925C1ZQBilling</td>\n",
|
370 |
+
" <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
|
371 |
+
" </tr>\n",
|
372 |
+
" </tbody>\n",
|
373 |
+
"</table>\n",
|
374 |
+
"</div>"
|
375 |
+
],
|
376 |
+
"text/plain": [
|
377 |
+
" Order Number Invoice Number Order Date Invoice Details \\\n",
|
378 |
+
"0 402-7035529-3886722 NAG1-192347 17.08.2023 MH-NAG1-1034-2324 \n",
|
379 |
+
"1 402-7035529-3886722 BOM5-1379800 17.08.2023 MH-BOM5-1034-2324 \n",
|
380 |
+
"2 405-4419941-9848328 DEX3-4683 23.07.2023 DL-DEX3-157533501-2324 \n",
|
381 |
+
"3 405-4419941-9848328 HYD8-29019 23.07.2023 TG-HYD8-817549015-2324 \n",
|
382 |
+
"4 405-0015964-5687515 IN-5040 23.07.2023 DL-1922955505-2324 \n",
|
383 |
+
"5 408-4974466-7793143 JPX2-223775 02.01.2024 RJ-JPX2-1317922175-2324 \n",
|
384 |
+
"6 NaN NaN NaN NaN \n",
|
385 |
+
"\n",
|
386 |
+
" Invoice Date Billing Address \\\n",
|
387 |
+
"0 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
|
388 |
+
"1 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n",
|
389 |
+
"2 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
|
390 |
+
"3 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
|
391 |
+
"4 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n",
|
392 |
+
"5 02.01.2024 Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA... \n",
|
393 |
+
"6 NaN NaN \n",
|
394 |
+
"\n",
|
395 |
+
" Shipping Address PAN \\\n",
|
396 |
+
"0 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
|
397 |
+
"1 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n",
|
398 |
+
"2 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... ABEPW6057C \n",
|
399 |
+
"3 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... AACCN8253B \n",
|
400 |
+
"4 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... JISPS4412R \n",
|
401 |
+
"5 Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE... AADCV4254H \n",
|
402 |
+
"6 NaN NaN \n",
|
403 |
+
"\n",
|
404 |
+
" Item Total Amount \\\n",
|
405 |
+
"0 Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph... 458.0 \n",
|
406 |
+
"1 LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc... 13,099.00 \n",
|
407 |
+
"2 Amozo Easy Fit Tempered Glass Screen Protector... 474.00 \n",
|
408 |
+
"3 ESR for iPhone 13/14 Cover, Shockproof Drop Pr... 399.00 \n",
|
409 |
+
"4 imluckies Camera Lens Protector Compatible wit... 149.00 \n",
|
410 |
+
"5 Amazon Basics Sleek Rechargeable LED Table Lam... 569.00 \n",
|
411 |
+
"6 Saregama Carvaan Telugu - Portable Music Playe... 6,320.00 \n",
|
412 |
+
"\n",
|
413 |
+
" GSTIN Sold By \n",
|
414 |
+
"0 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*TCI Supply Chain... \n",
|
415 |
+
"1 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*Renaissance indu... \n",
|
416 |
+
"2 07ABEPW6057C1ZK RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co... \n",
|
417 |
+
"3 36AACCN8253B1ZN TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp... \n",
|
418 |
+
"4 07JISPS4412R1Z4 M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA... \n",
|
419 |
+
"5 08AADCV4254H1Z8 ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ... \n",
|
420 |
+
"6 36AARCA3925C1ZQBilling AATS Connect Private Limited \\n* GMR Airport C... "
|
421 |
+
]
|
422 |
+
},
|
423 |
+
"execution_count": 9,
|
424 |
+
"metadata": {},
|
425 |
+
"output_type": "execute_result"
|
426 |
+
}
|
427 |
+
],
|
428 |
+
"source": [
|
429 |
+
"invoice_convertor = InvoiceConvertor()\n",
|
430 |
+
"invoice_convertor.read_pdfs('invoices/')\n",
|
431 |
+
"res = invoice_convertor.convert()\n",
|
432 |
+
"res.head(10)"
|
433 |
+
]
|
434 |
+
},
|
435 |
+
{
|
436 |
+
"cell_type": "code",
|
437 |
+
"execution_count": null,
|
438 |
+
"metadata": {},
|
439 |
+
"outputs": [],
|
440 |
+
"source": []
|
441 |
+
}
|
442 |
+
],
|
443 |
+
"metadata": {
|
444 |
+
"kernelspec": {
|
445 |
+
"display_name": "resparser",
|
446 |
+
"language": "python",
|
447 |
+
"name": "python3"
|
448 |
+
},
|
449 |
+
"language_info": {
|
450 |
+
"codemirror_mode": {
|
451 |
+
"name": "ipython",
|
452 |
+
"version": 3
|
453 |
+
},
|
454 |
+
"file_extension": ".py",
|
455 |
+
"mimetype": "text/x-python",
|
456 |
+
"name": "python",
|
457 |
+
"nbconvert_exporter": "python",
|
458 |
+
"pygments_lexer": "ipython3",
|
459 |
+
"version": "3.9.16"
|
460 |
+
}
|
461 |
+
},
|
462 |
+
"nbformat": 4,
|
463 |
+
"nbformat_minor": 2
|
464 |
+
}
|
invoice_convertor.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2, os, re
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
class InvoiceConvertor():
    """
    Convert Amazon invoice PDFs into rows of a CSV file.

    Only files in the given directory whose names start with 'invoice' are read.

    Usage:
        convertor = InvoiceConvertor()
        convertor.read_pdfs('path_to_pdfs')
        result_df = convertor.convert()

    """

    # (column name, regex) pairs for the header fields; group 1 holds the value.
    _HEADER_PATTERNS = (
        ('Order Number', r'Order Number: (\S+)'),
        ('Invoice Number', r'Invoice Number : (\S+)'),
        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
        ('Invoice Details', r'Invoice Details : (\S+)'),
        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
        ('PAN', r'PAN No:(\S+)'),
    )

    def __init__(self):
        # Extracted text of every PDF read so far, one string per file.
        self.invoices = []

    def read_pdfs(self, path):
        """Extract the text of every 'invoice*' file in *path*.

        The text of each file (all pages concatenated) is appended to
        ``self.invoices``.

        Args:
            path: Directory containing the PDFs; a trailing slash is optional.

        Returns:
            The accumulated ``self.invoices`` list.
        """
        # Sorted so the resulting rows come out in a stable order.
        for file in sorted(os.listdir(path)):
            if not file.startswith('invoice'):
                continue
            # `with` closes the handle even if the PDF cannot be parsed.
            with open(os.path.join(path, file), 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
            self.invoices.append(text)
        return self.invoices

    def save_as_csv(self, details, save_as="invoice.csv"):
        """Append one row of invoice details to *save_as*, creating it if needed.

        Args:
            details: Mapping of column name to extracted value.
            save_as: Path of the CSV file to create or extend.
        """
        row = pd.DataFrame(details, index=[0])
        if os.path.exists(save_as):
            df = pd.concat([pd.read_csv(save_as), row], ignore_index=True)
        else:
            df = row
        df.to_csv(save_as, index=False)

    def extract_invoice_details(self, text):
        """Pull the known fields out of one invoice's extracted text.

        Each field is looked up independently, so a missing field never
        prevents the others from being captured. A message is printed for
        every field that cannot be found.

        Args:
            text: Text extracted from a single invoice PDF.

        Returns:
            Dict of column name to value containing only the fields found.
        """
        invoice_details = {}

        for field, pattern in self._HEADER_PATTERNS:
            match = re.search(pattern, text)
            if match:
                invoice_details[field] = match.group(1).strip()
            else:
                print(f'{field} not found')

        # The item name sits between the "Amount" column header followed by
        # the serial number "1" and the first rupee sign, all before "TOTAL:".
        item_match = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
        name_match = None
        if item_match:
            name_match = re.search(r'\nAmount\n1([\s\S]+?)₹', item_match.group(1))
        if name_match:
            invoice_details['Item'] = name_match.group(1).strip()
        else:
            print("No item found in the invoice.")

        # After "TOTAL:" the second rupee-prefixed figure is used as the total.
        total_match = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
        amounts = total_match.group(1).split('₹') if total_match else []
        if len(amounts) > 2:
            invoice_details['Total Amount'] = amounts[2].split('\n')[0]
        else:
            print("No total amount found in the invoice.")

        # Prefer an exact 15-character identifier so text glued to the number
        # (e.g. "...1ZQBilling") is not captured; otherwise fall back to
        # everything up to the next space.
        gstin_match = (
            re.search(r'GST Registration No: ([0-9A-Z]{15})', text)
            or re.search(r'GST Registration No: ([\s\S]+?) ', text)
        )
        if gstin_match:
            invoice_details['GSTIN'] = gstin_match.group(1).strip()
        else:
            print("No GSTIN found in the invoice.")

        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
        if by_match:
            invoice_details['Sold By'] = by_match.group(1).strip()
        else:
            print("No seller found in the invoice.")

        return invoice_details

    def convert(self, save_as="invoice.csv"):
        """Extract details from every read invoice and append them to the CSV.

        Args:
            save_as: Path of the CSV file to create or extend.

        Returns:
            DataFrame with the full contents of *save_as*, or an empty
            DataFrame when the file does not exist because nothing was written.
        """
        for invoice in self.invoices:
            details = self.extract_invoice_details(invoice)
            # Skip invoices where nothing matched rather than writing a blank row.
            if details:
                self.save_as_csv(details, save_as)
        if not os.path.exists(save_as):
            return pd.DataFrame()
        return pd.read_csv(save_as)
|
84 |
+
|
invoices/invoice1.pdf
ADDED
Binary file (48.3 kB). View file
|
|
invoices/invoice2.pdf
ADDED
Binary file (48.4 kB). View file
|
|
invoices/invoice3.pdf
ADDED
Binary file (54.2 kB). View file
|
|
invoices/invoice4.pdf
ADDED
Binary file (103 kB). View file
|
|
invoices/invoice5.pdf
ADDED
Binary file (48 kB). View file
|
|
invoices/invoice7.pdf
ADDED
Binary file (50.2 kB). View file
|
|
invoices/invoice8.pdf
ADDED
Binary file (43.9 kB). View file
|
|
requirements.txt
CHANGED
@@ -1,10 +1,4 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
streamlit
|
6 |
-
PyPDF2
|
7 |
-
sentence-transformers
|
8 |
-
python-dotenv
|
9 |
-
pypdf
|
10 |
-
faiss-cpu
|
|
|
1 |
+
streamlit==1.32.2
|
2 |
+
pyPDF2==3.0.1
|
3 |
+
pandas==1.3.5
|
4 |
+
regex==2023.12.25
|
|
|
|
|
|
|
|
|
|
|
|
secrets.toml
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
hf_token="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"
|
|
|
|