Spaces:

Dekode
/

betterzila_assignment

Sleeping

App Files Community

Pratik Dwivedi committed on Mar 21, 2024

Commit

5abbd23

1 Parent(s): 0e5546c

New app

Browse files

Files changed (17) hide show

.gitattributes +0 -35
.gitignore +0 -3
README.md +0 -13
app.py +24 -0
application.py +0 -60
data/48lawsofpower.pdf +0 -0
extractor.ipynb +464 -0
invoice_convertor.py +84 -0
invoices/invoice1.pdf +0 -0
invoices/invoice2.pdf +0 -0
invoices/invoice3.pdf +0 -0
invoices/invoice4.pdf +0 -0
invoices/invoice5.pdf +0 -0
invoices/invoice7.pdf +0 -0
invoices/invoice8.pdf +0 -0
requirements.txt +4 -10
secrets.toml +0 -1

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,3 +0,0 @@
-.env
-.gitattributes
-secrets.toml

README.md DELETED Viewed

@@ -1,13 +0,0 @@
----
-title: Betterzila Assignment
-emoji: 📉
-colorFrom: gray
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.30.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import streamlit as st
+import os
+from invoice_convertor import InvoiceConvertor
+def main():
+    """Render the Streamlit page that converts uploaded Amazon invoice PDFs to CSV.
+
+    Uploaded PDFs are written to a scratch directory, parsed with
+    InvoiceConvertor, shown as a table and offered as a CSV download.
+    The scratch directory is emptied on every run, and a button lets the
+    user delete the accumulated 'invoice.csv'.
+    """
+    upload_dir = 'data/'
+    st.set_page_config(layout="wide")
+    st.title('Amazon Invoice Convertor')
+    st.write('This app converts your Amazon invoice pdfs to a csv file.')
+    convertor = InvoiceConvertor()
+    files = st.file_uploader('Upload your invoice pdfs', type=['pdf'], accept_multiple_files=True)
+    # The scratch directory may not exist on a fresh checkout (git does not
+    # track empty directories), so create it instead of crashing below.
+    os.makedirs(upload_dir, exist_ok=True)
+    try:
+        if files:
+            for file in files:
+                # Keep only the final path component so a crafted upload name
+                # cannot write outside the scratch directory.
+                name = os.path.basename(file.name)
+                # read_pdfs() only picks up files whose name starts with
+                # 'invoice'; prefix other uploads so they are not silently skipped.
+                if not name.startswith('invoice'):
+                    name = 'invoice_' + name
+                with open(os.path.join(upload_dir, name), 'wb') as f:
+                    f.write(file.getbuffer())
+            convertor.read_pdfs(upload_dir)
+            try:
+                result_df = convertor.convert()
+            except FileNotFoundError:
+                # convert() reads 'invoice.csv' back from disk; it is missing
+                # when no invoice was written.
+                st.warning('No invoice data could be extracted from the uploaded files.')
+            else:
+                st.write(result_df)
+                st.download_button('Download csv', data=result_df.to_csv(), file_name='invoice.csv', mime='text/csv')
+            # NOTE(review): 'invoice.csv' is appended to on every script rerun while
+            # files are still selected, so rows can repeat - confirm this is intended.
+    finally:
+        # Always drop the uploaded copies, even when parsing failed.
+        for leftover in os.listdir(upload_dir):
+            os.remove(os.path.join(upload_dir, leftover))
+    if st.button('Clear csv file') and os.path.exists('invoice.csv'):
+        os.remove('invoice.csv')
+if __name__ == '__main__':
+    main()

application.py DELETED Viewed

@@ -1,60 +0,0 @@
-import streamlit as st
-from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceInstructEmbeddings
-from langchain_community.vectorstores import FAISS
-from langchain.chains import ConversationalRetrievalChain
-from langchain_community.llms import HuggingFaceHub
-from langchain.memory import ConversationBufferMemory
-def make_vectorstore(embeddings):
-    # use glob to find all the pdf files in the data folder in the base directory
-    loader = PyPDFDirectoryLoader("data")
-    # load the documents
-    documents = loader.load()
-    # split the documents into chunks of 1400 characters with 0 overlap
-    text_splitter = CharacterTextSplitter(chunk_size=1400, chunk_overlap=0)
-    # split the documents into chunks of 1400 characters with 0 overlap
-    texts = text_splitter.split_documents(documents)
-    # create a vector store from the documents
-    docsearch = FAISS.from_documents(texts, embeddings)
-    return docsearch
-def get_conversation(vectorstore):
-    # create a memory object to store the conversation history
-    memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True,)
-    conversation_chain = ConversationalRetrievalChain.from_chain_type(
-        llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512}, huggingfacehub_api_token = st.secrets["hf_token"]),
-        chain_type="stuff",
-        retriever=vectorstore.as_retriever(),
-        memory=memory)
-    return conversation_chain
-def get_response(conversation_chain, query):
-    # get the response
-    response = conversation_chain.run(query)
-    return response
-def main():
-    st.title("BetterZila RAG Enabled LLM")
-    embeddings = HuggingFaceInstructEmbeddings(model_name="google/t5-v1_1-xl", model_kwargs = {'device': 'cpu'})
-    vectorstore = make_vectorstore(embeddings)
-    conversation_chain = get_conversation(vectorstore)
-    queries = ["Can you give me an example from history where the enemy was crushed totally from the book?", "What's the point of making myself less accessible?", "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"]
-    for query in queries:
-        st.subheader(f"Query: {query}")
-        response = get_response(conversation_chain, query)
-        st.write(query)
-        st.write(response["llm_response"])
-    st.success("Responses generated!")
-if __name__ == "__main__":
-    main()

data/48lawsofpower.pdf DELETED Viewed

Binary file (105 kB)

extractor.ipynb ADDED Viewed

	@@ -0,0 +1,464 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import PyPDF2, os\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_pdf(path):\n",
+    "    pdf_file = open(path, 'rb')\n",
+    "    pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
+    "    text = ''\n",
+    "    for page_num in range(len(pdf_reader.pages)):\n",
+    "        page = pdf_reader.pages[page_num]\n",
+    "        text += page.extract_text()\n",
+    "    pdf_file.close()\n",
+    "    return text\n",
+    "\n",
+    "invoices = []\n",
+    "path = 'invoices/'\n",
+    "\n",
+    "for file in os.listdir(path):\n",
+    "    if file.startswith('invoice'):\n",
+    "        text = read_pdf(path + file)\n",
+    "        print(text)\n",
+    "        invoices.append(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "def save_as_csv(details, save_as = \"invoice.csv\"):\n",
+    "    # if the csv already exists then concat a new one to it, else create a new one\n",
+    "    if os.path.exists(save_as):\n",
+    "        df = pd.read_csv(save_as)\n",
+    "        df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
+    "    else:  \n",
+    "        df = pd.DataFrame(details, index=[0])\n",
+    "    df.to_csv(save_as, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def extract_invoice_details(text):\n",
+    "    invoice_details = {}\n",
+    "    try:\n",
+    "        invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
+    "        invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
+    "        invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "        invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
+    "        invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "        invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
+    "        invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
+    "        invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
+    "    except:\n",
+    "        print('Order Number not found')\n",
+    "    \n",
+    "    item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
+    "    if item_match:\n",
+    "        item_info = item_match.group(1)\n",
+    "        item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
+    "        invoice_details['Item'] = item_name\n",
+    "        print(item_name)\n",
+    "    else:\n",
+    "        print(\"No item found in the invoice.\")\n",
+    "    total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
+    "    if total_mount_match:\n",
+    "        total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
+    "        invoice_details['Total Amount'] = total_mount\n",
+    "    else:\n",
+    "        print(\"No total amount found in the invoice.\")\n",
+    "    gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
+    "    if gstin_match:\n",
+    "        invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
+    "    else:\n",
+    "        print(\"No GSTIN found in the invoice.\")\n",
+    "    by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
+    "    if by_match:\n",
+    "        invoice_details['Sold By'] = by_match.group(1).strip()\n",
+    "    else:\n",
+    "        print(\"No seller found in the invoice.\")\n",
+    "        \n",
+    "    return invoice_details"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for invoice in invoices:\n",
+    "    # print(invoice)\n",
+    "    details = extract_invoice_details(invoice)\n",
+    "    save_as_csv(details)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('invoice.csv')\n",
+    "df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import PyPDF2, os, re\n",
+    "import pandas as pd\n",
+    "\n",
+    "class InvoiceConvertor:\n",
+    "    \"\"\"\n",
+    "    This class is hardcoded to read all pdf files that start with 'invoice' in the user-supplied path and convert them to a csv file.\n",
+    "    \n",
+    "    Usage:\n",
+    "    convertor = InvoiceConvertor()\n",
+    "    convertor.read_pdfs('path_to_pdfs')\n",
+    "    result_df = convertor.convert()\n",
+    "\n",
+    "    \"\"\"\n",
+    "    def __init__(self):\n",
+    "        self.invoices = []\n",
+    "        \n",
+    "    def read_pdfs(self,path):\n",
+    "        for file in os.listdir(path):\n",
+    "            if file.startswith('invoice'):\n",
+    "                pdf_file = open(path + file, 'rb')\n",
+    "                pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
+    "                text = ''\n",
+    "                for page_num in range(len(pdf_reader.pages)):\n",
+    "                    page = pdf_reader.pages[page_num]\n",
+    "                    text += page.extract_text()\n",
+    "                pdf_file.close()\n",
+    "                self.invoices.append(text)\n",
+    "        return self.invoices\n",
+    "    \n",
+    "    def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
+    "        # if the csv already exists then concat a new one to it, else create a new one\n",
+    "        if os.path.exists(save_as):\n",
+    "            df = pd.read_csv(save_as)\n",
+    "            df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
+    "        else:  \n",
+    "            df = pd.DataFrame(details, index=[0])\n",
+    "        df.to_csv(save_as, index=False)\n",
+    "        \n",
+    "    def extract_invoice_details(self, text):\n",
+    "        invoice_details = {}\n",
+    "        try:\n",
+    "            invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
+    "            invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
+    "            invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "            invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
+    "            invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "            invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
+    "            invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
+    "            invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
+    "        except:\n",
+    "            print('Order Number not found')\n",
+    "\n",
+    "        item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
+    "        if item_match:\n",
+    "            item_info = item_match.group(1)\n",
+    "            item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
+    "            invoice_details['Item'] = item_name\n",
+    "            # print(item_name)\n",
+    "        else:\n",
+    "            print(\"No item found in the invoice.\")\n",
+    "        total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
+    "        if total_mount_match:\n",
+    "            total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
+    "            invoice_details['Total Amount'] = total_mount\n",
+    "        else:\n",
+    "            print(\"No total amount found in the invoice.\")\n",
+    "        gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
+    "        if gstin_match:\n",
+    "            invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
+    "        else:\n",
+    "            print(\"No GSTIN found in the invoice.\")\n",
+    "        by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
+    "        if by_match:\n",
+    "            invoice_details['Sold By'] = by_match.group(1).strip()\n",
+    "        else:\n",
+    "            print(\"No seller found in the invoice.\")\n",
+    "        return invoice_details\n",
+    "    \n",
+    "    def convert(self):\n",
+    "        for invoice in self.invoices:\n",
+    "            details = self.extract_invoice_details(invoice)\n",
+    "            self.save_as_csv(details)\n",
+    "        return pd.read_csv('invoice.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Order Number not found\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Order Number</th>\n",
+       "      <th>Invoice Number</th>\n",
+       "      <th>Order Date</th>\n",
+       "      <th>Invoice Details</th>\n",
+       "      <th>Invoice Date</th>\n",
+       "      <th>Billing Address</th>\n",
+       "      <th>Shipping Address</th>\n",
+       "      <th>PAN</th>\n",
+       "      <th>Item</th>\n",
+       "      <th>Total Amount</th>\n",
+       "      <th>GSTIN</th>\n",
+       "      <th>Sold By</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>402-7035529-3886722</td>\n",
+       "      <td>NAG1-192347</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>MH-NAG1-1034-2324</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
+       "      <td>AALCA0171E</td>\n",
+       "      <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
+       "      <td>458.0</td>\n",
+       "      <td>27AALCA0171E1ZZ</td>\n",
+       "      <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>402-7035529-3886722</td>\n",
+       "      <td>BOM5-1379800</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>MH-BOM5-1034-2324</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
+       "      <td>AALCA0171E</td>\n",
+       "      <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
+       "      <td>13,099.00</td>\n",
+       "      <td>27AALCA0171E1ZZ</td>\n",
+       "      <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>405-4419941-9848328</td>\n",
+       "      <td>DEX3-4683</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>DL-DEX3-157533501-2324</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
+       "      <td>ABEPW6057C</td>\n",
+       "      <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
+       "      <td>474.00</td>\n",
+       "      <td>07ABEPW6057C1ZK</td>\n",
+       "      <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>405-4419941-9848328</td>\n",
+       "      <td>HYD8-29019</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>TG-HYD8-817549015-2324</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
+       "      <td>AACCN8253B</td>\n",
+       "      <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
+       "      <td>399.00</td>\n",
+       "      <td>36AACCN8253B1ZN</td>\n",
+       "      <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>405-0015964-5687515</td>\n",
+       "      <td>IN-5040</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>DL-1922955505-2324</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
+       "      <td>JISPS4412R</td>\n",
+       "      <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
+       "      <td>149.00</td>\n",
+       "      <td>07JISPS4412R1Z4</td>\n",
+       "      <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>408-4974466-7793143</td>\n",
+       "      <td>JPX2-223775</td>\n",
+       "      <td>02.01.2024</td>\n",
+       "      <td>RJ-JPX2-1317922175-2324</td>\n",
+       "      <td>02.01.2024</td>\n",
+       "      <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
+       "      <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
+       "      <td>AADCV4254H</td>\n",
+       "      <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
+       "      <td>569.00</td>\n",
+       "      <td>08AADCV4254H1Z8</td>\n",
+       "      <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
+       "      <td>6,320.00</td>\n",
+       "      <td>36AARCA3925C1ZQBilling</td>\n",
+       "      <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          Order Number Invoice Number  Order Date          Invoice Details  \\\n",
+       "0  402-7035529-3886722    NAG1-192347  17.08.2023        MH-NAG1-1034-2324   \n",
+       "1  402-7035529-3886722   BOM5-1379800  17.08.2023        MH-BOM5-1034-2324   \n",
+       "2  405-4419941-9848328      DEX3-4683  23.07.2023   DL-DEX3-157533501-2324   \n",
+       "3  405-4419941-9848328     HYD8-29019  23.07.2023   TG-HYD8-817549015-2324   \n",
+       "4  405-0015964-5687515        IN-5040  23.07.2023       DL-1922955505-2324   \n",
+       "5  408-4974466-7793143    JPX2-223775  02.01.2024  RJ-JPX2-1317922175-2324   \n",
+       "6                  NaN            NaN         NaN                      NaN   \n",
+       "\n",
+       "  Invoice Date                                    Billing Address  \\\n",
+       "0   17.08.2023  Pratik Dwivedi \\nBennett University, Plot Nos ...   \n",
+       "1   17.08.2023  Pratik Dwivedi \\nBennett University, Plot Nos ...   \n",
+       "2   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
+       "3   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
+       "4   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
+       "5   02.01.2024  Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...   \n",
+       "6          NaN                                                NaN   \n",
+       "\n",
+       "                                    Shipping Address         PAN  \\\n",
+       "0  Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...  AALCA0171E   \n",
+       "1  Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...  AALCA0171E   \n",
+       "2  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  ABEPW6057C   \n",
+       "3  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  AACCN8253B   \n",
+       "4  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  JISPS4412R   \n",
+       "5  Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...  AADCV4254H   \n",
+       "6                                                NaN         NaN   \n",
+       "\n",
+       "                                                Item Total Amount  \\\n",
+       "0  Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...        458.0   \n",
+       "1  LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...    13,099.00   \n",
+       "2  Amozo Easy Fit Tempered Glass Screen Protector...       474.00   \n",
+       "3  ESR for iPhone 13/14 Cover, Shockproof Drop Pr...       399.00   \n",
+       "4  imluckies Camera Lens Protector Compatible wit...       149.00   \n",
+       "5  Amazon Basics Sleek Rechargeable LED Table Lam...       569.00   \n",
+       "6  Saregama Carvaan Telugu - Portable Music Playe...     6,320.00   \n",
+       "\n",
+       "                    GSTIN                                            Sold By  \n",
+       "0         27AALCA0171E1ZZ  Appario Retail Private Ltd \\n*TCI Supply Chain...  \n",
+       "1         27AALCA0171E1ZZ  Appario Retail Private Ltd \\n*Renaissance indu...  \n",
+       "2         07ABEPW6057C1ZK  RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...  \n",
+       "3         36AACCN8253B1ZN  TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...  \n",
+       "4         07JISPS4412R1Z4  M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...  \n",
+       "5         08AADCV4254H1Z8  ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...  \n",
+       "6  36AARCA3925C1ZQBilling  AATS Connect Private Limited \\n* GMR Airport C...  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "invoice_convertor = InvoiceConvertor()\n",
+    "invoice_convertor.read_pdfs('invoices/')\n",
+    "res = invoice_convertor.convert()\n",
+    "res.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "resparser",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

invoice_convertor.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import PyPDF2, os, re
+import pandas as pd
+class InvoiceConvertor():
+    """
+    Read every pdf file whose name starts with 'invoice' in a user-supplied directory and convert the extracted details to a csv file.
+    Usage:
+    convertor = InvoiceConvertor()
+    convertor.read_pdfs('path_to_pdfs')
+    result_df = convertor.convert()
+    """
+    # (csv column, regex) pairs for fields captured by a single group.
+    # The order is kept because it determines the column order of the csv.
+    _FIELD_PATTERNS = (
+        ('Order Number', r'Order Number: (\S+)'),
+        ('Invoice Number', r'Invoice Number : (\S+)'),
+        ('Order Date', r'Order Date: (\d{2}\.\d{2}\.\d{4})'),
+        ('Invoice Details', r'Invoice Details : (\S+)'),
+        ('Invoice Date', r'Invoice Date : (\d{2}\.\d{2}\.\d{4})'),
+        ('Billing Address', r'Billing Address :([\s\S]+?)Shipping Address :'),
+        ('Shipping Address', r'Shipping Address :([\s\S]+?)Place of supply:'),
+        ('PAN', r'PAN No:(\S+)'),
+    )
+    def __init__(self):
+        # Extracted text of every pdf read so far, one string per file.
+        self.invoices = []
+    @staticmethod
+    def _first_group(pattern, text):
+        """Return the stripped first capture group of pattern in text, or None when there is no match."""
+        match = re.search(pattern, text)
+        return match.group(1).strip() if match else None
+    def read_pdfs(self, path):
+        """
+        Extract the text of every file in path whose name starts with 'invoice'.
+        The text of each file is appended to self.invoices, which is also returned.
+        """
+        for file in os.listdir(path):
+            if file.startswith('invoice'):
+                # os.path.join works with or without a trailing separator on path;
+                # the with-block closes the file even if parsing raises.
+                with open(os.path.join(path, file), 'rb') as pdf_file:
+                    pdf_reader = PyPDF2.PdfReader(pdf_file)
+                    text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
+                self.invoices.append(text)
+        return self.invoices
+    def save_as_csv(self, details, save_as = "invoice.csv"):
+        """Append details as one row to the csv at save_as, creating the file when it does not exist."""
+        row = pd.DataFrame(details, index=[0])
+        if os.path.exists(save_as):
+            df = pd.concat([pd.read_csv(save_as), row], ignore_index=True)
+        else:
+            df = row
+        df.to_csv(save_as, index=False)
+    def extract_invoice_details(self, text):
+        """
+        Parse the text of one invoice into a dict keyed by csv column name.
+        Every field is looked up independently, so one missing field does not
+        prevent the others from being extracted. Fields that cannot be found
+        are reported on stdout and left out of the returned dict.
+        """
+        invoice_details = {}
+        for column, pattern in self._FIELD_PATTERNS:
+            value = self._first_group(pattern, text)
+            if value is None:
+                print(f'{column} not found in the invoice.')
+            else:
+                invoice_details[column] = value
+        # The item name sits between the 'Amount' table header and the first
+        # rupee sign, somewhere before the 'TOTAL:' row.
+        item_match = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
+        item_name = self._first_group(r'\nAmount\n1([\s\S]+?)₹', item_match.group(1)) if item_match else None
+        if item_name is None:
+            print("No item found in the invoice.")
+        else:
+            invoice_details['Item'] = item_name
+        # Between 'TOTAL:' and 'only' the grand total is the text after the
+        # second rupee sign, up to the end of that line.
+        total_mount_match = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
+        amounts = total_mount_match.group(1).split('₹') if total_mount_match else []
+        if len(amounts) > 2:
+            invoice_details['Total Amount'] = amounts[2].split('\n')[0]
+        else:
+            print("No total amount found in the invoice.")
+        # Prefer a strict 15-character match: the extracted text may have no
+        # whitespace between the number and the following label, which made
+        # the looser pattern return values such as '36AARCA3925C1ZQBilling'.
+        gstin = self._first_group(r'GST Registration No: ([0-9A-Z]{15})', text)
+        if gstin is None:
+            gstin = self._first_group(r'GST Registration No: ([\s\S]+?) ', text)
+        if gstin is None:
+            print("No GSTIN found in the invoice.")
+        else:
+            invoice_details['GSTIN'] = gstin
+        sold_by = self._first_group(r'By :([\s\S]+?)PAN No:', text)
+        if sold_by is None:
+            print("No seller found in the invoice.")
+        else:
+            invoice_details['Sold By'] = sold_by
+        return invoice_details
+    def convert(self):
+        """
+        Extract the details of every invoice read so far, append them to 'invoice.csv' and return the csv as a DataFrame.
+        An empty DataFrame is returned when no csv exists.
+        """
+        for invoice in self.invoices:
+            details = self.extract_invoice_details(invoice)
+            # Skip invoices from which nothing could be extracted rather than
+            # appending a row with no values.
+            if details:
+                self.save_as_csv(details)
+        if not os.path.exists('invoice.csv'):
+            return pd.DataFrame()
+        return pd.read_csv('invoice.csv')

invoices/invoice1.pdf ADDED Viewed

Binary file (48.3 kB). View file

invoices/invoice2.pdf ADDED Viewed

Binary file (48.4 kB). View file

invoices/invoice3.pdf ADDED Viewed

Binary file (54.2 kB). View file

invoices/invoice4.pdf ADDED Viewed

Binary file (103 kB). View file

invoices/invoice5.pdf ADDED Viewed

Binary file (48 kB). View file

invoices/invoice7.pdf ADDED Viewed

Binary file (50.2 kB). View file

invoices/invoice8.pdf ADDED Viewed

Binary file (43.9 kB). View file

requirements.txt CHANGED Viewed

@@ -1,10 +1,4 @@
-transformers
-langchain
-langchain-community
-InstructorEmbedding
-streamlit
-PyPDF2
-sentence-transformers
-python-dotenv
-pypdf
-faiss-cpu

+streamlit==1.32.2
+pyPDF2==3.0.1
+pandas==1.3.5
+regex==2023.12.25

secrets.toml DELETED Viewed

	@@ -1 +0,0 @@
1	- hf_token="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"