{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import PyPDF2, os\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def read_pdf(path):\n", " pdf_file = open(path, 'rb')\n", " pdf_reader = PyPDF2.PdfReader(pdf_file)\n", " text = ''\n", " for page_num in range(len(pdf_reader.pages)):\n", " page = pdf_reader.pages[page_num]\n", " text += page.extract_text()\n", " pdf_file.close()\n", " return text\n", "\n", "invoices = []\n", "path = 'invoices/'\n", "\n", "for file in os.listdir(path):\n", " if file.startswith('invoice'):\n", " text = read_pdf(path + file)\n", " print(text)\n", " invoices.append(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "def save_as_csv(details, save_as = \"invoice.csv\"):\n", " # if the csv already exists then concat a new one to it, else create a new one\n", " if os.path.exists(save_as):\n", " df = pd.read_csv(save_as)\n", " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n", " else: \n", " df = pd.DataFrame(details, index=[0])\n", " df.to_csv(save_as, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def extract_invoice_details(text):\n", " invoice_details = {}\n", " try:\n", " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n", " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n", " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n", " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n", " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n", " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n", " except:\n", " print('Order Number not found')\n", " \n", " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n", " if item_match:\n", " item_info = item_match.group(1)\n", " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n", " invoice_details['Item'] = item_name\n", " print(item_name)\n", " else:\n", " print(\"No item found in the invoice.\")\n", " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n", " if total_mount_match:\n", " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n", " invoice_details['Total Amount'] = total_mount\n", " else:\n", " print(\"No total amount found in the invoice.\")\n", " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n", " if gstin_match:\n", " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n", " else:\n", " print(\"No GSTIN found in the invoice.\")\n", " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n", " if by_match:\n", " invoice_details['Sold By'] = by_match.group(1).strip()\n", " else:\n", " print(\"No seller found in the invoice.\")\n", " \n", " return invoice_details" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for invoice in invoices:\n", " # print(invoice)\n", " details = extract_invoice_details(invoice)\n", " save_as_csv(details)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('invoice.csv')\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import PyPDF2, os, re\n", "import pandas as pd\n", "\n", "class InvoiceConvertor:\n", " \"\"\"\n", " This class is hardcoded to read all pdf files that start with 'invoice' in the given user given path and convert them to a csv file.\n", " \n", " Usage:\n", " convertor = InvoiceConvertor()\n", " convertor.read_pdfs('path_to_pdfs')\n", " result_df = convertor.convert()\n", "\n", " \"\"\"\n", " def __init__(self):\n", " self.invoices = []\n", " \n", " def read_pdfs(self,path):\n", " for file in os.listdir(path):\n", " if file.startswith('invoice'):\n", " pdf_file = open(path + file, 'rb')\n", " pdf_reader = PyPDF2.PdfReader(pdf_file)\n", " text = ''\n", " for page_num in range(len(pdf_reader.pages)):\n", " page = pdf_reader.pages[page_num]\n", " text += page.extract_text()\n", " pdf_file.close()\n", " self.invoices.append(text)\n", " return self.invoices\n", " \n", " def save_as_csv(self, details, save_as = \"invoice.csv\"):\n", " # if the csv already exists then concat a new one to it, else create a new one\n", " if os.path.exists(save_as):\n", " df = pd.read_csv(save_as)\n", " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n", " else: \n", " df = pd.DataFrame(details, index=[0])\n", " df.to_csv(save_as, index=False)\n", " \n", " def extract_invoice_details(self, text):\n", " invoice_details = {}\n", " try:\n", " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n", " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n", " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n", " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n", " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n", " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n", " except:\n", " print('Order Number not found')\n", "\n", " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n", " if item_match:\n", " item_info = item_match.group(1)\n", " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n", " invoice_details['Item'] = item_name\n", " # print(item_name)\n", " else:\n", " print(\"No item found in the invoice.\")\n", " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n", " if total_mount_match:\n", " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n", " invoice_details['Total Amount'] = total_mount\n", " else:\n", " print(\"No total amount found in the invoice.\")\n", " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n", " if gstin_match:\n", " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n", " else:\n", " print(\"No GSTIN found in the invoice.\")\n", " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n", " if by_match:\n", " invoice_details['Sold By'] = by_match.group(1).strip()\n", " else:\n", " print(\"No seller found in the invoice.\")\n", " return invoice_details\n", " \n", " def convert(self):\n", " for invoice in self.invoices:\n", " details = self.extract_invoice_details(invoice)\n", " self.save_as_csv(details)\n", " return pd.read_csv('invoice.csv')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Order Number not found\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Order NumberInvoice NumberOrder DateInvoice DetailsInvoice DateBilling AddressShipping AddressPANItemTotal AmountGSTINSold By
0402-7035529-3886722NAG1-19234717.08.2023MH-NAG1-1034-232417.08.2023Pratik Dwivedi \\nBennett University, Plot Nos ...Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...AALCA0171ECosmic Byte CB-EP-05 Wired Gaming in Ear Earph...458.027AALCA0171E1ZZAppario Retail Private Ltd \\n*TCI Supply Chain...
1402-7035529-3886722BOM5-137980017.08.2023MH-BOM5-1034-232417.08.2023Pratik Dwivedi \\nBennett University, Plot Nos ...Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...AALCA0171ELG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...13,099.0027AALCA0171E1ZZAppario Retail Private Ltd \\n*Renaissance indu...
2405-4419941-9848328DEX3-468323.07.2023DL-DEX3-157533501-232423.07.2023Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...ABEPW6057CAmozo Easy Fit Tempered Glass Screen Protector...474.0007ABEPW6057C1ZKRADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...
3405-4419941-9848328HYD8-2901923.07.2023TG-HYD8-817549015-232423.07.2023Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...AACCN8253BESR for iPhone 13/14 Cover, Shockproof Drop Pr...399.0036AACCN8253B1ZNTIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...
4405-0015964-5687515IN-504023.07.2023DL-1922955505-232423.07.2023Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...JISPS4412Rimluckies Camera Lens Protector Compatible wit...149.0007JISPS4412R1Z4M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...
5408-4974466-7793143JPX2-22377502.01.2024RJ-JPX2-1317922175-232402.01.2024Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...AADCV4254HAmazon Basics Sleek Rechargeable LED Table Lam...569.0008AADCV4254H1Z8ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...
6NaNNaNNaNNaNNaNNaNNaNNaNSaregama Carvaan Telugu - Portable Music Playe...6,320.0036AARCA3925C1ZQBillingAATS Connect Private Limited \\n* GMR Airport C...
\n", "
" ], "text/plain": [ " Order Number Invoice Number Order Date Invoice Details \\\n", "0 402-7035529-3886722 NAG1-192347 17.08.2023 MH-NAG1-1034-2324 \n", "1 402-7035529-3886722 BOM5-1379800 17.08.2023 MH-BOM5-1034-2324 \n", "2 405-4419941-9848328 DEX3-4683 23.07.2023 DL-DEX3-157533501-2324 \n", "3 405-4419941-9848328 HYD8-29019 23.07.2023 TG-HYD8-817549015-2324 \n", "4 405-0015964-5687515 IN-5040 23.07.2023 DL-1922955505-2324 \n", "5 408-4974466-7793143 JPX2-223775 02.01.2024 RJ-JPX2-1317922175-2324 \n", "6 NaN NaN NaN NaN \n", "\n", " Invoice Date Billing Address \\\n", "0 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n", "1 17.08.2023 Pratik Dwivedi \\nBennett University, Plot Nos ... \n", "2 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n", "3 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n", "4 23.07.2023 Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... \n", "5 02.01.2024 Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA... \n", "6 NaN NaN \n", "\n", " Shipping Address PAN \\\n", "0 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n", "1 Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... AALCA0171E \n", "2 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... ABEPW6057C \n", "3 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... AACCN8253B \n", "4 Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... JISPS4412R \n", "5 Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE... AADCV4254H \n", "6 NaN NaN \n", "\n", " Item Total Amount \\\n", "0 Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph... 458.0 \n", "1 LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc... 13,099.00 \n", "2 Amozo Easy Fit Tempered Glass Screen Protector... 474.00 \n", "3 ESR for iPhone 13/14 Cover, Shockproof Drop Pr... 399.00 \n", "4 imluckies Camera Lens Protector Compatible wit... 149.00 \n", "5 Amazon Basics Sleek Rechargeable LED Table Lam... 569.00 \n", "6 Saregama Carvaan Telugu - Portable Music Playe... 6,320.00 \n", "\n", " GSTIN Sold By \n", "0 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*TCI Supply Chain... \n", "1 27AALCA0171E1ZZ Appario Retail Private Ltd \\n*Renaissance indu... \n", "2 07ABEPW6057C1ZK RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co... \n", "3 36AACCN8253B1ZN TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp... \n", "4 07JISPS4412R1Z4 M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA... \n", "5 08AADCV4254H1Z8 ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ... \n", "6 36AARCA3925C1ZQBilling AATS Connect Private Limited \\n* GMR Airport C... " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invoice_convertor = InvoiceConvertor()\n", "invoice_convertor.read_pdfs('invoices/')\n", "res = invoice_convertor.convert()\n", "res.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "resparser", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 2 }