{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import PyPDF2, os\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def read_pdf(path):\n", " pdf_file = open(path, 'rb')\n", " pdf_reader = PyPDF2.PdfReader(pdf_file)\n", " text = ''\n", " for page_num in range(len(pdf_reader.pages)):\n", " page = pdf_reader.pages[page_num]\n", " text += page.extract_text()\n", " pdf_file.close()\n", " return text\n", "\n", "invoices = []\n", "path = 'invoices/'\n", "\n", "for file in os.listdir(path):\n", " if file.startswith('invoice'):\n", " text = read_pdf(path + file)\n", " print(text)\n", " invoices.append(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "def save_as_csv(details, save_as = \"invoice.csv\"):\n", " # if the csv already exists then concat a new one to it, else create a new one\n", " if os.path.exists(save_as):\n", " df = pd.read_csv(save_as)\n", " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n", " else: \n", " df = pd.DataFrame(details, index=[0])\n", " df.to_csv(save_as, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def extract_invoice_details(text):\n", " invoice_details = {}\n", " try:\n", " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n", " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n", " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n", " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n", " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n", " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n", " except:\n", " print('Order Number not found')\n", " \n", " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n", " if item_match:\n", " item_info = item_match.group(1)\n", " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n", " invoice_details['Item'] = item_name\n", " print(item_name)\n", " else:\n", " print(\"No item found in the invoice.\")\n", " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n", " if total_mount_match:\n", " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n", " invoice_details['Total Amount'] = total_mount\n", " else:\n", " print(\"No total amount found in the invoice.\")\n", " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n", " if gstin_match:\n", " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n", " else:\n", " print(\"No GSTIN found in the invoice.\")\n", " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n", " if by_match:\n", " invoice_details['Sold By'] = by_match.group(1).strip()\n", " else:\n", " print(\"No seller found in the invoice.\")\n", " \n", " return invoice_details" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for invoice in invoices:\n", " # print(invoice)\n", " details = extract_invoice_details(invoice)\n", " save_as_csv(details)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('invoice.csv')\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import PyPDF2, os, re\n", "import pandas as pd\n", "\n", "class InvoiceConvertor:\n", " \"\"\"\n", " This class is hardcoded to read all pdf files that start with 'invoice' in the given user given path and convert them to a csv file.\n", " \n", " Usage:\n", " convertor = InvoiceConvertor()\n", " convertor.read_pdfs('path_to_pdfs')\n", " result_df = convertor.convert()\n", "\n", " \"\"\"\n", " def __init__(self):\n", " self.invoices = []\n", " \n", " def read_pdfs(self,path):\n", " for file in os.listdir(path):\n", " if file.startswith('invoice'):\n", " pdf_file = open(path + file, 'rb')\n", " pdf_reader = PyPDF2.PdfReader(pdf_file)\n", " text = ''\n", " for page_num in range(len(pdf_reader.pages)):\n", " page = pdf_reader.pages[page_num]\n", " text += page.extract_text()\n", " pdf_file.close()\n", " self.invoices.append(text)\n", " return self.invoices\n", " \n", " def save_as_csv(self, details, save_as = \"invoice.csv\"):\n", " # if the csv already exists then concat a new one to it, else create a new one\n", " if os.path.exists(save_as):\n", " df = pd.read_csv(save_as)\n", " df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n", " else: \n", " df = pd.DataFrame(details, index=[0])\n", " df.to_csv(save_as, index=False)\n", " \n", " def extract_invoice_details(self, text):\n", " invoice_details = {}\n", " try:\n", " invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n", " invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n", " invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n", " invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n", " invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n", " invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n", " invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n", " except:\n", " print('Order Number not found')\n", "\n", " item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n", " if item_match:\n", " item_info = item_match.group(1)\n", " item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n", " invoice_details['Item'] = item_name\n", " # print(item_name)\n", " else:\n", " print(\"No item found in the invoice.\")\n", " total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n", " if total_mount_match:\n", " total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n", " invoice_details['Total Amount'] = total_mount\n", " else:\n", " print(\"No total amount found in the invoice.\")\n", " gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n", " if gstin_match:\n", " invoice_details['GSTIN'] = gstin_match.group(1).strip()\n", " else:\n", " print(\"No GSTIN found in the invoice.\")\n", " by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n", " if by_match:\n", " invoice_details['Sold By'] = by_match.group(1).strip()\n", " else:\n", " print(\"No seller found in the invoice.\")\n", " return invoice_details\n", " \n", " def convert(self):\n", " for invoice in self.invoices:\n", " details = self.extract_invoice_details(invoice)\n", " self.save_as_csv(details)\n", " return pd.read_csv('invoice.csv')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Order Number not found\n" ] }, { "data": { "text/html": [ "
\n", " | Order Number | \n", "Invoice Number | \n", "Order Date | \n", "Invoice Details | \n", "Invoice Date | \n", "Billing Address | \n", "Shipping Address | \n", "PAN | \n", "Item | \n", "Total Amount | \n", "GSTIN | \n", "Sold By | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "402-7035529-3886722 | \n", "NAG1-192347 | \n", "17.08.2023 | \n", "MH-NAG1-1034-2324 | \n", "17.08.2023 | \n", "Pratik Dwivedi \\nBennett University, Plot Nos ... | \n", "Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... | \n", "AALCA0171E | \n", "Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph... | \n", "458.0 | \n", "27AALCA0171E1ZZ | \n", "Appario Retail Private Ltd \\n*TCI Supply Chain... | \n", "
1 | \n", "402-7035529-3886722 | \n", "BOM5-1379800 | \n", "17.08.2023 | \n", "MH-BOM5-1034-2324 | \n", "17.08.2023 | \n", "Pratik Dwivedi \\nBennett University, Plot Nos ... | \n", "Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ... | \n", "AALCA0171E | \n", "LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc... | \n", "13,099.00 | \n", "27AALCA0171E1ZZ | \n", "Appario Retail Private Ltd \\n*Renaissance indu... | \n", "
2 | \n", "405-4419941-9848328 | \n", "DEX3-4683 | \n", "23.07.2023 | \n", "DL-DEX3-157533501-2324 | \n", "23.07.2023 | \n", "Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... | \n", "Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... | \n", "ABEPW6057C | \n", "Amozo Easy Fit Tempered Glass Screen Protector... | \n", "474.00 | \n", "07ABEPW6057C1ZK | \n", "RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co... | \n", "
3 | \n", "405-4419941-9848328 | \n", "HYD8-29019 | \n", "23.07.2023 | \n", "TG-HYD8-817549015-2324 | \n", "23.07.2023 | \n", "Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... | \n", "Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... | \n", "AACCN8253B | \n", "ESR for iPhone 13/14 Cover, Shockproof Drop Pr... | \n", "399.00 | \n", "36AACCN8253B1ZN | \n", "TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp... | \n", "
4 | \n", "405-0015964-5687515 | \n", "IN-5040 | \n", "23.07.2023 | \n", "DL-1922955505-2324 | \n", "23.07.2023 | \n", "Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N... | \n", "Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto... | \n", "JISPS4412R | \n", "imluckies Camera Lens Protector Compatible wit... | \n", "149.00 | \n", "07JISPS4412R1Z4 | \n", "M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA... | \n", "
5 | \n", "408-4974466-7793143 | \n", "JPX2-223775 | \n", "02.01.2024 | \n", "RJ-JPX2-1317922175-2324 | \n", "02.01.2024 | \n", "Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA... | \n", "Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE... | \n", "AADCV4254H | \n", "Amazon Basics Sleek Rechargeable LED Table Lam... | \n", "569.00 | \n", "08AADCV4254H1Z8 | \n", "ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ... | \n", "
6 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "Saregama Carvaan Telugu - Portable Music Playe... | \n", "6,320.00 | \n", "36AARCA3925C1ZQBilling | \n", "AATS Connect Private Limited \\n* GMR Airport C... | \n", "