{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2, os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_pdf(path):\n",
    "    \"\"\"Return the text of every page of the PDF at ``path``, concatenated.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the PDF file to open.\n",
    "\n",
    "    Returns:\n",
    "        str: Page texts in page order, joined with no separator.\n",
    "    \"\"\"\n",
    "    # The context manager closes the handle even when parsing raises.\n",
    "    with open(path, 'rb') as pdf_file:\n",
    "        pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
    "        # `or ''` keeps the join safe if a page yields no text (None).\n",
    "        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)\n",
    "\n",
    "invoices = []\n",
    "path = 'invoices/'\n",
    "\n",
    "# Sorted so the row order is reproducible; os.listdir order is arbitrary.\n",
    "for file in sorted(os.listdir(path)):\n",
    "    # Only PDFs are parseable; skip other files that share the prefix.\n",
    "    if file.startswith('invoice') and file.lower().endswith('.pdf'):\n",
    "        # os.path.join also works when `path` has no trailing separator.\n",
    "        text = read_pdf(os.path.join(path, file))\n",
    "        print(text)\n",
    "        invoices.append(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "def save_as_csv(details, save_as = \"invoice.csv\"):\n",
    "    \"\"\"Append one invoice record to a CSV file, creating the file if needed.\n",
    "\n",
    "    Args:\n",
    "        details: Mapping of column name to scalar value for one invoice.\n",
    "        save_as: Path of the CSV file to create or extend.\n",
    "\n",
    "    Note:\n",
    "        An empty ``details`` mapping is skipped because it has no columns\n",
    "        to write.\n",
    "    \"\"\"\n",
    "    if not details:\n",
    "        return\n",
    "    new_row = pd.DataFrame(details, index=[0])\n",
    "    if os.path.exists(save_as):\n",
    "        # Read every column as text: type inference would otherwise rewrite\n",
    "        # stored values on each append (e.g. '458.00' comes back as 458.0).\n",
    "        existing = pd.read_csv(save_as, dtype=str)\n",
    "        df = pd.concat([existing, new_row], ignore_index=True)\n",
    "    else:\n",
    "        df = new_row\n",
    "    df.to_csv(save_as, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_invoice_details(text):\n",
    "    \"\"\"Parse the text of one invoice into a dict of named fields.\n",
    "\n",
    "    Every field is searched for on its own, so a missing field does not\n",
    "    discard the ones that follow it. A field that cannot be found is\n",
    "    reported on stdout and left out of the result.\n",
    "\n",
    "    Args:\n",
    "        text: Full text extracted from a single invoice PDF.\n",
    "\n",
    "    Returns:\n",
    "        dict: The fields that were found, keyed by 'Order Number',\n",
    "        'Invoice Number', 'Order Date', 'Invoice Details', 'Invoice Date',\n",
    "        'Billing Address', 'Shipping Address', 'PAN', 'Item',\n",
    "        'Total Amount', 'GSTIN' and 'Sold By'.\n",
    "    \"\"\"\n",
    "    def first_group(pattern, source):\n",
    "        # Stripped first capture group of `pattern` in `source`, or None.\n",
    "        match = re.search(pattern, source)\n",
    "        return match.group(1).strip() if match else None\n",
    "\n",
    "    # (result key, regex whose first group is the value), in output order.\n",
    "    header_patterns = (\n",
    "        ('Order Number', r'Order Number: (\\S+)'),\n",
    "        ('Invoice Number', r'Invoice Number : (\\S+)'),\n",
    "        ('Order Date', r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})'),\n",
    "        ('Invoice Details', r'Invoice Details : (\\S+)'),\n",
    "        ('Invoice Date', r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})'),\n",
    "        ('Billing Address', r'Billing Address :([\\s\\S]+?)Shipping Address :'),\n",
    "        ('Shipping Address', r'Shipping Address :([\\s\\S]+?)Place of supply:'),\n",
    "        ('PAN', r'PAN No:(\\S+)'),\n",
    "    )\n",
    "\n",
    "    invoice_details = {}\n",
    "    for field, pattern in header_patterns:\n",
    "        value = first_group(pattern, text)\n",
    "        if value is None:\n",
    "            print(f'{field} not found')\n",
    "        else:\n",
    "            invoice_details[field] = value\n",
    "\n",
    "    # The item name sits between the 'Amount' line followed by '1' and the\n",
    "    # first rupee sign, inside the span that ends at 'TOTAL:'.\n",
    "    item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text)\n",
    "    item_name = None\n",
    "    if item_match:\n",
    "        item_name = first_group(r'\\nAmount\\n1([\\s\\S]+?)₹', item_match.group(1))\n",
    "    if item_name is None:\n",
    "        print(\"No item found in the invoice.\")\n",
    "    else:\n",
    "        invoice_details['Item'] = item_name\n",
    "        print(item_name)\n",
    "\n",
    "    # The total is whatever follows the second rupee sign after 'TOTAL:',\n",
    "    # up to the end of that line.\n",
    "    total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text)\n",
    "    rupee_parts = total_mount_match.group(1).split('₹') if total_mount_match else []\n",
    "    if len(rupee_parts) > 2:\n",
    "        invoice_details['Total Amount'] = rupee_parts[2].split('\\n')[0]\n",
    "    else:\n",
    "        print(\"No total amount found in the invoice.\")\n",
    "\n",
    "    # Prefer the fixed 15-character GSTIN layout so text fused onto the number\n",
    "    # by the PDF extraction (e.g. '...1ZQBilling') is not captured; otherwise\n",
    "    # fall back to everything up to the next space.\n",
    "    gstin = first_group(r'GST Registration No:\\s*(\\d{2}[A-Z]{5}\\d{4}[A-Z][0-9A-Z]Z[0-9A-Z])', text)\n",
    "    if gstin is None:\n",
    "        gstin = first_group(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
    "    if gstin is None:\n",
    "        print(\"No GSTIN found in the invoice.\")\n",
    "    else:\n",
    "        invoice_details['GSTIN'] = gstin\n",
    "\n",
    "    seller = first_group(r'By :([\\s\\S]+?)PAN No:', text)\n",
    "    if seller is None:\n",
    "        print(\"No seller found in the invoice.\")\n",
    "    else:\n",
    "        invoice_details['Sold By'] = seller\n",
    "\n",
    "    return invoice_details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse each extracted invoice text and append its fields to the CSV.\n",
    "for invoice in invoices:\n",
    "    save_as_csv(extract_invoice_details(invoice))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every column as text so identifiers and amounts display exactly as stored.\n",
    "df = pd.read_csv('invoice.csv', dtype=str)\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2, os, re\n",
    "import pandas as pd\n",
    "\n",
    "class InvoiceConvertor:\n",
    "    \"\"\"\n",
    "    Read every PDF whose name starts with 'invoice' from a directory, parse\n",
    "    the invoice fields out of its text and append them to a CSV file.\n",
    "\n",
    "    Usage:\n",
    "    convertor = InvoiceConvertor()\n",
    "    convertor.read_pdfs('path_to_pdfs')\n",
    "    result_df = convertor.convert()\n",
    "\n",
    "    \"\"\"\n",
    "\n",
    "    # (result key, regex whose first group is the value), in output order.\n",
    "    _HEADER_PATTERNS = (\n",
    "        ('Order Number', r'Order Number: (\\S+)'),\n",
    "        ('Invoice Number', r'Invoice Number : (\\S+)'),\n",
    "        ('Order Date', r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})'),\n",
    "        ('Invoice Details', r'Invoice Details : (\\S+)'),\n",
    "        ('Invoice Date', r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})'),\n",
    "        ('Billing Address', r'Billing Address :([\\s\\S]+?)Shipping Address :'),\n",
    "        ('Shipping Address', r'Shipping Address :([\\s\\S]+?)Place of supply:'),\n",
    "        ('PAN', r'PAN No:(\\S+)'),\n",
    "    )\n",
    "\n",
    "    def __init__(self):\n",
    "        # Extracted text of each PDF read so far, one string per file.\n",
    "        self.invoices = []\n",
    "\n",
    "    def read_pdfs(self, path):\n",
    "        \"\"\"Load the text of every 'invoice*.pdf' file directly inside ``path``.\n",
    "\n",
    "        Args:\n",
    "            path: Directory to scan.\n",
    "\n",
    "        Returns:\n",
    "            list: ``self.invoices``, with one text string appended per PDF read.\n",
    "        \"\"\"\n",
    "        # Sorted so the row order is reproducible; os.listdir order is arbitrary.\n",
    "        for file in sorted(os.listdir(path)):\n",
    "            # Only PDFs are parseable; skip other files that share the prefix.\n",
    "            if not (file.startswith('invoice') and file.lower().endswith('.pdf')):\n",
    "                continue\n",
    "            # os.path.join also works when `path` has no trailing separator;\n",
    "            # the context manager closes the handle even when parsing raises.\n",
    "            with open(os.path.join(path, file), 'rb') as pdf_file:\n",
    "                pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
    "                # `or ''` keeps the join safe if a page yields no text (None).\n",
    "                text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)\n",
    "            self.invoices.append(text)\n",
    "        return self.invoices\n",
    "\n",
    "    def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
    "        \"\"\"Append one invoice record to a CSV file, creating the file if needed.\n",
    "\n",
    "        Args:\n",
    "            details: Mapping of column name to scalar value for one invoice.\n",
    "            save_as: Path of the CSV file to create or extend.\n",
    "\n",
    "        Note:\n",
    "            An empty ``details`` mapping is skipped because it has no columns\n",
    "            to write.\n",
    "        \"\"\"\n",
    "        if not details:\n",
    "            return\n",
    "        new_row = pd.DataFrame(details, index=[0])\n",
    "        if os.path.exists(save_as):\n",
    "            # Read every column as text: type inference would otherwise rewrite\n",
    "            # stored values on each append (e.g. '458.00' comes back as 458.0).\n",
    "            existing = pd.read_csv(save_as, dtype=str)\n",
    "            df = pd.concat([existing, new_row], ignore_index=True)\n",
    "        else:\n",
    "            df = new_row\n",
    "        df.to_csv(save_as, index=False)\n",
    "\n",
    "    @staticmethod\n",
    "    def _first_group(pattern, source):\n",
    "        \"\"\"Return the stripped first capture group of ``pattern`` in ``source``, or None.\"\"\"\n",
    "        match = re.search(pattern, source)\n",
    "        return match.group(1).strip() if match else None\n",
    "\n",
    "    def extract_invoice_details(self, text):\n",
    "        \"\"\"Parse the text of one invoice into a dict of named fields.\n",
    "\n",
    "        Every field is searched for on its own, so a missing field does not\n",
    "        discard the ones that follow it. A field that cannot be found is\n",
    "        reported on stdout and left out of the result.\n",
    "\n",
    "        Args:\n",
    "            text: Full text extracted from a single invoice PDF.\n",
    "\n",
    "        Returns:\n",
    "            dict: The fields that were found, keyed by 'Order Number',\n",
    "            'Invoice Number', 'Order Date', 'Invoice Details', 'Invoice Date',\n",
    "            'Billing Address', 'Shipping Address', 'PAN', 'Item',\n",
    "            'Total Amount', 'GSTIN' and 'Sold By'.\n",
    "        \"\"\"\n",
    "        invoice_details = {}\n",
    "        for field, pattern in self._HEADER_PATTERNS:\n",
    "            value = self._first_group(pattern, text)\n",
    "            if value is None:\n",
    "                print(f'{field} not found')\n",
    "            else:\n",
    "                invoice_details[field] = value\n",
    "\n",
    "        # The item name sits between the 'Amount' line followed by '1' and the\n",
    "        # first rupee sign, inside the span that ends at 'TOTAL:'.\n",
    "        item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text)\n",
    "        item_name = None\n",
    "        if item_match:\n",
    "            item_name = self._first_group(r'\\nAmount\\n1([\\s\\S]+?)₹', item_match.group(1))\n",
    "        if item_name is None:\n",
    "            print(\"No item found in the invoice.\")\n",
    "        else:\n",
    "            invoice_details['Item'] = item_name\n",
    "\n",
    "        # The total is whatever follows the second rupee sign after 'TOTAL:',\n",
    "        # up to the end of that line.\n",
    "        total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text)\n",
    "        rupee_parts = total_mount_match.group(1).split('₹') if total_mount_match else []\n",
    "        if len(rupee_parts) > 2:\n",
    "            invoice_details['Total Amount'] = rupee_parts[2].split('\\n')[0]\n",
    "        else:\n",
    "            print(\"No total amount found in the invoice.\")\n",
    "\n",
    "        # Prefer the fixed 15-character GSTIN layout so text fused onto the\n",
    "        # number by the PDF extraction (e.g. '...1ZQBilling') is not captured;\n",
    "        # otherwise fall back to everything up to the next space.\n",
    "        gstin = self._first_group(r'GST Registration No:\\s*(\\d{2}[A-Z]{5}\\d{4}[A-Z][0-9A-Z]Z[0-9A-Z])', text)\n",
    "        if gstin is None:\n",
    "            gstin = self._first_group(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
    "        if gstin is None:\n",
    "            print(\"No GSTIN found in the invoice.\")\n",
    "        else:\n",
    "            invoice_details['GSTIN'] = gstin\n",
    "\n",
    "        seller = self._first_group(r'By :([\\s\\S]+?)PAN No:', text)\n",
    "        if seller is None:\n",
    "            print(\"No seller found in the invoice.\")\n",
    "        else:\n",
    "            invoice_details['Sold By'] = seller\n",
    "        return invoice_details\n",
    "\n",
    "    def convert(self, save_as = \"invoice.csv\"):\n",
    "        \"\"\"Parse every loaded invoice, append it to the CSV and return the CSV.\n",
    "\n",
    "        Rows are appended to an existing file, so calling this again with the\n",
    "        same invoices adds them a second time.\n",
    "\n",
    "        Args:\n",
    "            save_as: Path of the CSV file to extend and read back.\n",
    "\n",
    "        Returns:\n",
    "            pandas.DataFrame: Full contents of ``save_as`` with every column\n",
    "            read as text, or an empty frame if the file does not exist.\n",
    "        \"\"\"\n",
    "        for invoice in self.invoices:\n",
    "            details = self.extract_invoice_details(invoice)\n",
    "            self.save_as_csv(details, save_as)\n",
    "        # Nothing was ever written (no invoices loaded and no earlier file).\n",
    "        if not os.path.exists(save_as):\n",
    "            return pd.DataFrame()\n",
    "        return pd.read_csv(save_as, dtype=str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Order Number not found\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Order Number</th>\n",
       "      <th>Invoice Number</th>\n",
       "      <th>Order Date</th>\n",
       "      <th>Invoice Details</th>\n",
       "      <th>Invoice Date</th>\n",
       "      <th>Billing Address</th>\n",
       "      <th>Shipping Address</th>\n",
       "      <th>PAN</th>\n",
       "      <th>Item</th>\n",
       "      <th>Total Amount</th>\n",
       "      <th>GSTIN</th>\n",
       "      <th>Sold By</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>402-7035529-3886722</td>\n",
       "      <td>NAG1-192347</td>\n",
       "      <td>17.08.2023</td>\n",
       "      <td>MH-NAG1-1034-2324</td>\n",
       "      <td>17.08.2023</td>\n",
       "      <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
       "      <td>AALCA0171E</td>\n",
       "      <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
       "      <td>458.0</td>\n",
       "      <td>27AALCA0171E1ZZ</td>\n",
       "      <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>402-7035529-3886722</td>\n",
       "      <td>BOM5-1379800</td>\n",
       "      <td>17.08.2023</td>\n",
       "      <td>MH-BOM5-1034-2324</td>\n",
       "      <td>17.08.2023</td>\n",
       "      <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
       "      <td>AALCA0171E</td>\n",
       "      <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
       "      <td>13,099.00</td>\n",
       "      <td>27AALCA0171E1ZZ</td>\n",
       "      <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>405-4419941-9848328</td>\n",
       "      <td>DEX3-4683</td>\n",
       "      <td>23.07.2023</td>\n",
       "      <td>DL-DEX3-157533501-2324</td>\n",
       "      <td>23.07.2023</td>\n",
       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
       "      <td>ABEPW6057C</td>\n",
       "      <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
       "      <td>474.00</td>\n",
       "      <td>07ABEPW6057C1ZK</td>\n",
       "      <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>405-4419941-9848328</td>\n",
       "      <td>HYD8-29019</td>\n",
       "      <td>23.07.2023</td>\n",
       "      <td>TG-HYD8-817549015-2324</td>\n",
       "      <td>23.07.2023</td>\n",
       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
       "      <td>AACCN8253B</td>\n",
       "      <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
       "      <td>399.00</td>\n",
       "      <td>36AACCN8253B1ZN</td>\n",
       "      <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>405-0015964-5687515</td>\n",
       "      <td>IN-5040</td>\n",
       "      <td>23.07.2023</td>\n",
       "      <td>DL-1922955505-2324</td>\n",
       "      <td>23.07.2023</td>\n",
       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
       "      <td>JISPS4412R</td>\n",
       "      <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
       "      <td>149.00</td>\n",
       "      <td>07JISPS4412R1Z4</td>\n",
       "      <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>408-4974466-7793143</td>\n",
       "      <td>JPX2-223775</td>\n",
       "      <td>02.01.2024</td>\n",
       "      <td>RJ-JPX2-1317922175-2324</td>\n",
       "      <td>02.01.2024</td>\n",
       "      <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
       "      <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
       "      <td>AADCV4254H</td>\n",
       "      <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
       "      <td>569.00</td>\n",
       "      <td>08AADCV4254H1Z8</td>\n",
       "      <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
       "      <td>6,320.00</td>\n",
       "      <td>36AARCA3925C1ZQBilling</td>\n",
       "      <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Order Number Invoice Number  Order Date          Invoice Details  \\\n",
       "0  402-7035529-3886722    NAG1-192347  17.08.2023        MH-NAG1-1034-2324   \n",
       "1  402-7035529-3886722   BOM5-1379800  17.08.2023        MH-BOM5-1034-2324   \n",
       "2  405-4419941-9848328      DEX3-4683  23.07.2023   DL-DEX3-157533501-2324   \n",
       "3  405-4419941-9848328     HYD8-29019  23.07.2023   TG-HYD8-817549015-2324   \n",
       "4  405-0015964-5687515        IN-5040  23.07.2023       DL-1922955505-2324   \n",
       "5  408-4974466-7793143    JPX2-223775  02.01.2024  RJ-JPX2-1317922175-2324   \n",
       "6                  NaN            NaN         NaN                      NaN   \n",
       "\n",
       "  Invoice Date                                    Billing Address  \\\n",
       "0   17.08.2023  Pratik Dwivedi \\nBennett University, Plot Nos ...   \n",
       "1   17.08.2023  Pratik Dwivedi \\nBennett University, Plot Nos ...   \n",
       "2   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
       "3   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
       "4   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
       "5   02.01.2024  Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...   \n",
       "6          NaN                                                NaN   \n",
       "\n",
       "                                    Shipping Address         PAN  \\\n",
       "0  Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...  AALCA0171E   \n",
       "1  Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...  AALCA0171E   \n",
       "2  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  ABEPW6057C   \n",
       "3  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  AACCN8253B   \n",
       "4  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  JISPS4412R   \n",
       "5  Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...  AADCV4254H   \n",
       "6                                                NaN         NaN   \n",
       "\n",
       "                                                Item Total Amount  \\\n",
       "0  Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...        458.0   \n",
       "1  LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...    13,099.00   \n",
       "2  Amozo Easy Fit Tempered Glass Screen Protector...       474.00   \n",
       "3  ESR for iPhone 13/14 Cover, Shockproof Drop Pr...       399.00   \n",
       "4  imluckies Camera Lens Protector Compatible wit...       149.00   \n",
       "5  Amazon Basics Sleek Rechargeable LED Table Lam...       569.00   \n",
       "6  Saregama Carvaan Telugu - Portable Music Playe...     6,320.00   \n",
       "\n",
       "                    GSTIN                                            Sold By  \n",
       "0         27AALCA0171E1ZZ  Appario Retail Private Ltd \\n*TCI Supply Chain...  \n",
       "1         27AALCA0171E1ZZ  Appario Retail Private Ltd \\n*Renaissance indu...  \n",
       "2         07ABEPW6057C1ZK  RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...  \n",
       "3         36AACCN8253B1ZN  TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...  \n",
       "4         07JISPS4412R1Z4  M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...  \n",
       "5         08AADCV4254H1Z8  ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...  \n",
       "6  36AARCA3925C1ZQBilling  AATS Connect Private Limited \\n* GMR Airport C...  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the whole pipeline through the class: load the PDFs, then parse and save them.\n",
    "invoice_convertor = InvoiceConvertor()\n",
    "invoice_convertor.read_pdfs(path='invoices/')\n",
    "res = invoice_convertor.convert()\n",
    "# Preview at most the first ten parsed invoices.\n",
    "res.head(n=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "resparser",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}