Spaces:

AutomaticReimbursementTool
/

demo

Sleeping

App Files Files Community

ankur-bohra committed on Jul 28, 2023

Commit

317211f

•

1 Parent(s): 6db4a81

Initial commit

Browse files

Files changed (24) hide show

.gitattributes +2 -0
.gitignore +3 -0
app.py +10 -0
categories/__init__.py +197 -0
categories/accomodation/__init__.py +41 -0
categories/accomodation/model.py +29 -0
categories/random_/__init__.py +128 -0
categories/random_/model.py +82 -0
categories/travel_cab/__init__.py +37 -0
categories/travel_cab/model.py +19 -0
categories/travel_flight/__init__.py +23 -0
categories/travel_flight/model.py +30 -0
categories/vendor/__init__.py +38 -0
categories/vendor/model.py +46 -0
examples/example1.pdf +3 -0
examples/rotated.jpeg +3 -0
examples/rotated.pdf +3 -0
examples/upright.jpeg +3 -0
examples/upright.pdf +3 -0
extract.py +67 -0
main.py +61 -0
packages.txt +1 -0
processing.py +171 -0
requirements.txt +346 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.conda
+temp*
+__pycache__/

app.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import streamlit as st
+# Streamlit entry point for the demo UI: a page title and a two-column layout.
+st.title("Automatic Reimbursement Tool Demo")
+with st.container():
+    # Two columns are created; only col1 is populated here.
+    col1, col2 = st.columns(2)
+    with col1:
+        st.header("Input")
+        # NOTE(review): the uploader's return value is discarded, so the uploaded
+        # file is not yet handed to any processing code.
+        st.file_uploader("Upload a PDF file or an image", type=["pdf", "png", "jpg", "jpeg"])

categories/__init__.py ADDED Viewed

	@@ -0,0 +1,197 @@

+from enum import Enum
+from . import random_
+from . import accomodation
+from . import travel_cab
+from . import travel_flight
+# from . import vendor
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser
+from langchain.output_parsers.enum import EnumOutputParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from pydantic import BaseModel
+class Category(Enum):
+    """Bill categories the classifier can return; each value is the option string."""
+    ACCOMODATION = "accomodation"
+    TRAVEL_FLIGHT = "travel_flight"
+    TRAVEL_CAB = "travel_cab"
+    # The vendor category is disabled together with its import at the top of this file.
+    # VENDOR = "vendor"
+    RANDOM = "random"
+# Maps each Category to the subpackage whose `chain` and `output_parser` attributes
+# are used by run_category_chain for that category.
+category_modules = {
+    Category.ACCOMODATION: accomodation,
+    Category.TRAVEL_FLIGHT: travel_flight,
+    Category.TRAVEL_CAB: travel_cab,
+    # Category.VENDOR: vendor,
+    Category.RANDOM: random_,
+}
+# Chat model used by the categorizing chain: temperature 0, one completion per call,
+# and no max_tokens limit (the limit is commented out).
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    # max_tokens=300,
+    model_kwargs={
+        "stop": None,
+        "top_p": 1,
+        "frequency_penalty": 0,
+        "presence_penalty": 0,
+    },
+)
+# Build categorizing chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are a classifier that, given a bill's text, states what type of bill "
+    "category it belongs to: accomodation (bills regarding stays), travel (bills "
+    "concerning cab or other land rides), travel (bills concerning flights), random "
+    "(bills concerning deliveries from e-commerce websites like amazon etc) bills.\n"
+    "You may want to see if there are Room Details, Check-in/Check-out Date for "
+    "Accomodation stay; Flight Details, Train Details, Bus Details Cab details for "
+    "Travel; Conference Details for Conference organizers; anything else comes under "
+    "random category. Your answers must be only the appropriate choice e.g. 'option' and "
+    "not 'The given bill belongs to the option category.'\n"
+    "{format_instructions}"
+)
+# The bill text is passed as the human message via the {text} template variable.
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+# Output parser restricted to the members of Category.
+category_parser = EnumOutputParser(enum=Category)
+# Chain combining the model, the two-message prompt and the enum parser.
+categorize_chain = LLMChain(
+    llm=model, prompt=chat_prompt, output_parser=category_parser
+)
+def categorize_text(text: str) -> Category:
+    """Categories the text into one of the categories defined in Category by querying
+    ChatGPT.
+    Args:
+        text(str): The text to categorize.
+    Returns: The category of the text.
+    """
+    return categorize_chain.run(
+        text=text, format_instructions=category_parser.get_format_instructions()
+    )
+def run_category_chain(category: Category, text: str) -> BaseModel | None:
+    """Runs the chain for the given category on the given text.
+    Args:
+        category(Category): The category for which the chain is to be run.
+        text(str): The text on which the chain is to be run.
+    Returns: The output of the chain.
+    """
+    output_parser = category_modules[category].output_parser
+    try:
+        return category_modules[category].chain.run(
+            text=text, format_instructions=output_parser.get_format_instructions()
+        )
+    except Exception as e:
+        print("Error in running chain for category", category, ":", e)
+if __name__ == "__main__":
+    text = """amazonin
+we)
+Sold By :
+Spigen India Pvt. Ltd.
+* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
+37//15/1, 15/2,, Adjacent to Starex School, Village
+- Binola, National Highway -8, Tehsil - Manesar
+Gurgaon, Haryana, 122413
+IN
+PAN No: ABACS5056L
+GST Registration No: O6ABACS5056L12Z5
+Order Number: 407-5335982-7837125
+Order Date: 30.05.2023
+Tax Invoice/Bill of Supply/Cash Memo
+(Original for Recipient)
+Billing Address :
+Praveen Bohra
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+IN
+State/UT Code: 06
+Shipping Address :
+Praveen Bohra
+Praveen Bohra
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+IN
+State/UT Code: 06
+Place of supply: HARYANA
+Place of delivery: HARYANA
+Invoice Number : DEL5-21033
+Invoice Details : HR-DEL5-918080915-2324
+Invoice Date : 30.05.2023
+Description at Tax |Tax /|Tax Total
+p y Rate |Type |Amount|Amount
+Black) | BO8BHLZHBH ( ACS01744INP )
+HSN:39269099
+1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
+1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
+9% |SGST| %76.19
+TOTAL:
+Amount in Words:
+Nine Hundred Ninety-nine only
+Whether tax is payable under reverse charge - No
+For Spigen India Pvt. Ltd.:
+sSoigenrn
+Authorized Signatory
+Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
+2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
+*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
+Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
+Please note that this invoice is not a demand for payment
+Page 1 of 1"""
+    category = categorize_text(text)
+    print("Category:", category)
+    print("\n\n")
+    result = run_category_chain(category, text)
+    print(result)

categories/accomodation/__init__.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from .model import InformationExtractedFromABillReceipt as PydanticModel
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+# Chat model for the accommodation extraction chain.
+# NOTE(review): temperature=0.6 and max_tokens=300 differ from the other category
+# modules in this package, which use temperature=0 and set no token cap — confirm
+# this is intended for an extraction task.
+model = ChatOpenAI(
+    temperature=0.6,
+    max_tokens=300,
+    n=1,
+    request_timeout=None,
+    model_kwargs={
+        'stop': None,
+        'top_p': 1,
+    }
+)
+# Build category chain
+# System prompt listing the hotel-bill fields to extract; {format_instructions} is
+# filled in by the caller when the chain is run.
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are tasked with developing an OCR data extraction system for hotel bills in PDF "
+    "format given as text. The system should extract important information necessary for "
+    "the reimbursement process from a college. Your prompt should fetch the following "
+    "essential details from the hotel bill: hotel name, address, bill number/invoice "
+    "number, booking ID / confirmation ID / booking number, check-in date and time, "
+    "check-out date and time, total amount, booking platform, bill date.\n"
+    "Ensure that the system accurately extracts the above information from the OCR text "
+    "of the hotel bill.\n"
+    "{format_instructions}"
+)
+# The bill text is passed as the human message.
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+# output_parser targets the pydantic model; fixing_parser wraps it and is built with
+# the same model. The chain uses fixing_parser as its output parser.
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)

categories/accomodation/model.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from __future__ import annotations
+from datetime import datetime
+from pydantic import BaseModel, Field
+# Pydantic schema of the fields extracted from a hotel/accommodation bill. All fields
+# are required.
+# NOTE(review): the class docstring is left untouched because pydantic presumably
+# includes it in the generated schema that is sent to the model — confirm before editing.
+class InformationExtractedFromABillReceipt(BaseModel):
+    """
+    1. Hotel Name: [Hotel Name]
+    2. Address: [Hotel Address]
+    3. Bill number/Invoice number: [Bill Number]
+    4. booking ID / Confirmation ID / Booking #: [Booking ID]
+    5. Check-in Date and Time: [Check-in Date Time]
+    6. Check-out Date and Time: [Check-out Date Time]
+    7. Total Amount: [Total Amount Charged]
+    8. Booking platform: [Booking Platform]
+    9. Bill date: [Bill Date]
+    """
+    # NOTE(review): the field is named "hostel_name" while its title says "hotel" —
+    # possibly a typo; renaming it would change the keys of the extracted output.
+    hostel_name: str = Field(..., title="The name of the hotel")
+    address: str = Field(..., title="The address of the hotel")
+    bill_number: str = Field(..., title="The bill number/invoice number")
+    booking_id: str = Field(..., title="The booking ID/confirmation ID/booking number")
+    check_in_date_time: datetime = Field(..., title="The check-in date and time")
+    check_out_date_time: datetime = Field(..., title="The check-out date and time")
+    total_amount_charged: float = Field(..., title="The total amount charged")
+    booking_platform: str = Field(..., title="The booking platform")
+    bill_date: datetime = Field(..., title="The bill date")

categories/random_/__init__.py ADDED Viewed

	@@ -0,0 +1,128 @@

+from .model import InformationExtractedFromABillReceipt as PydanticModel
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+# Chat model for the "random" (generic invoice) extraction chain: temperature 0 and
+# one completion per call.
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    model_kwargs={
+        'stop': None,
+        'top_p': 1,
+        'frequency_penalty': 0,
+        'presence_penalty': 0,
+    }
+)
+# Build category chain
+# NOTE(review): the prompt tells the model to use null for missing details, but most
+# fields of the model in .model are required — a null may fail validation; verify.
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are an information extraction engine that outputs details from OCR processed "
+    "documents like uids, total, tax, name, currency, date, seller details, summary. You "
+    "may use context to make an educated guess about the currency. Use null if you are "
+    "unable to find certain details\n"
+    "{format_instructions}"
+)
+# The OCR text is passed as the human message.
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+# output_parser targets the pydantic model; fixing_parser wraps it and is built with
+# the same model. The chain uses fixing_parser as its output parser.
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
+if __name__ == "__main__":
+    text = """amazonin
+we)
+Sold By :
+Spigen India Pvt. Ltd.
+* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
+37//15/1, 15/2,, Adjacent to Starex School, Village
+- Binola, National Highway -8, Tehsil - Manesar
+Gurgaon, Haryana, 122413
+IN
+PAN No: ABACS5056L
+GST Registration No: O6ABACS5056L12Z5
+Order Number: 407-5335982-7837125
+Order Date: 30.05.2023
+Tax Invoice/Bill of Supply/Cash Memo
+(Original for Recipient)
+Billing Address :
+Praveen Bohra
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+IN
+State/UT Code: 06
+Shipping Address :
+Praveen Bohra
+Praveen Bohra
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+IN
+State/UT Code: 06
+Place of supply: HARYANA
+Place of delivery: HARYANA
+Invoice Number : DEL5-21033
+Invoice Details : HR-DEL5-918080915-2324
+Invoice Date : 30.05.2023
+Description at Tax |Tax /|Tax Total
+p y Rate |Type |Amount|Amount
+Black) | BO8BHLZHBH ( ACS01744INP )
+HSN:39269099
+1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
+1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
+9% |SGST| %76.19
+TOTAL:
+Amount in Words:
+Nine Hundred Ninety-nine only
+Whether tax is payable under reverse charge - No
+For Spigen India Pvt. Ltd.:
+sSoigenrn
+Authorized Signatory
+Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
+2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
+*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
+Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
+Please note that this invoice is not a demand for payment
+Page 1 of 1"""
+    result = chain.run(text=text, format_instructions=fixing_parser.get_format_instructions())
+    print(result.json(indent=4))

categories/random_/model.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# generated by datamodel-codegen:
+#   filename:  schema.json
+#   timestamp: 2023-07-28T11:36:16+00:00
+from __future__ import annotations
+from datetime import date
+from typing import Dict, Optional, Union
+import iso4217
+from pydantic import BaseModel, Field, constr, validator, ValidationError
+# Tax amount variant for GST invoices; one of the two alternatives accepted by the
+# `tax` field of InformationExtractedFromABillReceipt.
+class TaxItem(BaseModel):
+    gst: float = Field(
+        ...,
+        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
+    )
+# Tax amount variant for VAT invoices; the other alternative accepted by the `tax`
+# field of InformationExtractedFromABillReceipt.
+class TaxItem1(BaseModel):
+    vat: float = Field(..., title="The total VAT present in the invoice")
+# Seller tax-number variant holding a GST number; the value must be at least 15
+# characters long.
+class TaxNumberItem(BaseModel):
+    gst_number: constr(min_length=15) = Field(
+        ..., title="The alphanumeric GSTIN/GST number code"
+    )
+# Seller tax-number variant holding a VAT/TIN number.
+class TaxNumberItem1(BaseModel):
+    vat_number: str = Field(..., title="The VAT/TIN number present in older invoices")
+# Seller tax-number variant holding a UIN.
+# NOTE(review): the field is named "ui_number" while the title says "UIN" — possibly
+# meant to be "uin_number"; renaming would change the output keys.
+class TaxNumberItem2(BaseModel):
+    ui_number: str = Field(..., title="The tax UIN issued to foreign entities")
+# Seller information. name/address/contact are optional; tax_number and pan_number
+# are required.
+class SellerDetails(BaseModel):
+    name: Optional[str] = None
+    address: Optional[str] = None
+    contact: Optional[str] = None
+    # Exactly one of the three tax-number shapes (GST, VAT/TIN or UIN).
+    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
+        ..., title="Tax information"
+    )
+    # Constrained to exactly 10 characters.
+    pan_number: constr(min_length=10, max_length=10) = Field(
+        ..., title="The 10-character alphanumeric PAN code"
+    )
+# Identifiers found on the invoice: the invoice number plus a free-form mapping of any
+# other identifier names to their values.
+class UIDs(BaseModel):
+    invoice_number: str = Field(..., title="The invoice number")
+    other_uids: Dict[str, str] = Field(
+        ...,
+        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
+    )
+class InformationExtractedFromABillReceipt(BaseModel):
+    uids: UIDs = Field(..., title="Invoice number and other UIDs")
+    total: float = Field(..., title="Total amount or price")
+    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
+    name: str = Field(
+        ...,
+        title="Name of the person/entity that the invoice item was charged or delivered to",
+    )
+    currency: str = Field(
+        default="INR",
+        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
+    )
+    date: date = Field(
+        ..., title="The date the invoice was issued"
+    )
+    seller_details: SellerDetails = Field(..., title="Information about the seller")
+    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")
+    @validator("currency")
+    @classmethod
+    def check_currency(cls, v: str) -> str:
+        if not iso4217.Currency.__members__.get(v.lower()):
+            raise ValidationError(f"{v} is not a valid ISO 4217 currency code")
+        return v.upper()

categories/travel_cab/__init__.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from .model import InformationExtractedFromABillReceipt as PydanticModel
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+# Chat model for the cab/land-travel extraction chain: temperature 0 and one
+# completion per call.
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    model_kwargs= {
+        'stop': None,
+        'top_p': 1,
+        'frequency_penalty': 0,
+        'presence_penalty': 0,
+    }
+)
+# Build category chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are an information extraction engine that outputs details from OCR processed "
+    "documents such as date/time/place of departure and arrival.\n"
+    "{format_instructions}"
+)
+# The OCR text is passed as the human message.
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+# output_parser targets the pydantic model; fixing_parser wraps it and is built with
+# the same model. The chain uses fixing_parser as its output parser.
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(
+    llm=model, prompt=chat_prompt, output_parser=fixing_parser
+)

categories/travel_cab/model.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from __future__ import annotations
+from datetime import date, time
+from pydantic import BaseModel, Field
+# Pydantic schema of the fields extracted from a cab/land-travel bill: start and end
+# place, date and time, plus the fare. All fields are required.
+# NOTE(review): the empty class docstring is left as is because pydantic presumably
+# puts a non-empty docstring into the schema sent to the model — confirm before editing.
+class InformationExtractedFromABillReceipt(BaseModel):
+    ''''''
+    place_from: str = Field(..., title="place where journey starts")
+    date_from: date = Field(
+        ..., title="date on which journey starts (DD/MM/YYYY)"
+    )
+    time_from: time = Field(..., title="time at which journey starts")
+    place_to: str = Field(..., title="place where journey end")
+    date_to: date = Field(..., title="date on which journey end (DD/MM/YYYY)")
+    time_to: time = Field(..., title="time at which journey end")
+    amount: float = Field(..., title="cost of journey ticket")

categories/travel_flight/__init__.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from .model import InformationExtractedFromABillReceipt as PydanticModel
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+)
+# Chat model for the flight extraction chain.
+model = ChatOpenAI(temperature=0)
+# Build category chain
+# Unlike the sibling category modules, this prompt is a single human message that
+# carries both the format instructions and the bill text; there is no system message.
+human_message_prompt = HumanMessagePromptTemplate.from_template(
+    "Parse through and find the following details from the text extracted from a travel "
+    "bill\n"
+    "{format_instructions}\n"
+    "{text}"
+)
+chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
+# output_parser targets the pydantic model; fixing_parser wraps it and is built with
+# the same model. The chain uses fixing_parser as its output parser.
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)

categories/travel_flight/model.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from __future__ import annotations
+from datetime import date, time
+from pydantic import BaseModel, Field
+# Pydantic schema of the fields extracted from a flight bill: departure and arrival
+# place, date and time, the PNR number and the ticket cost. All fields are required.
+# NOTE(review): the class docstring holds an older ResponseSchema listing of the same
+# fields; it is left untouched because pydantic presumably includes the docstring in
+# the schema sent to the model — confirm before editing.
+class InformationExtractedFromABillReceipt(BaseModel):
+    """
+    response_schemas = [
+        ResponseSchema(name="place (from)", description="place where flight starts/takes-off"),
+        ResponseSchema(name="date (from)", description="date on which flight starts/takes-off (DD/MM/YYYY)"),
+        ResponseSchema(name="time (from)", description="time at which flight starts/takes-off"),
+        ResponseSchema(name="place (to)", description="place where flight end/lands"),
+        ResponseSchema(name="date (to)", description="date on which flight end/lands (DD/MM/YYYY)"),
+        ResponseSchema(name="time (to)", description="time at which flight end/lands"),
+        ResponseSchema(name="PNR Number", description ="PNR Number of flight"),
+        ResponseSchema(name="amount", description="cost of flight ticket")
+    ]"""
+    place_from: str = Field(..., title="place where flight starts/takes-off")
+    date_from: date = Field(
+        ..., title="date on which flight starts/takes-off (DD/MM/YYYY)"
+    )
+    time_from: time = Field(..., title="time at which flight starts/takes-off")
+    place_to: str = Field(..., title="place where flight end/lands")
+    date_to: date = Field(..., title="date on which flight end/lands (DD/MM/YYYY)")
+    time_to: time = Field(..., title="time at which flight end/lands")
+    pnr_number: str = Field(..., title="PNR Number of flight")
+    amount: float = Field(..., title="cost of flight ticket")

categories/vendor/__init__.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from .model import InformationExtractedFromABillReceipt as PydanticModel
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    model_kwargs={
+        "stop": None,
+        "top_p": 1,
+        "frequency_penalty": 0,
+        "presence_penalty": 0,
+    },
+)
+# Build category chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are an information extraction engine that outputs details from OCR processed "
+    "documents like uids, total, tax, addresses, bank details, invoice details, "
+    "participant registration details."
+    "{format_instructions}"
+)
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+print(output_parser.get_format_instructions())
+# exit()
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)

categories/vendor/model.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# generated by datamodel-codegen:
+#   filename:  schema.json
+#   timestamp: 2023-07-28T11:36:16+00:00
+from __future__ import annotations
+from datetime import datetime
+from pydantic import BaseModel, Field, constr, validator, ValidationError
+# Bank account details printed on a vendor invoice. All fields are required strings.
+class BankDetails(BaseModel):
+    """account holder name, bank name, account number, branch, ifs code, swift code"""
+    account_holder_name: str = Field(..., title="The name of the account holder")
+    bank_name: str = Field(..., title="The name of the bank")
+    account_number: str = Field(..., title="The account number")
+    branch: str = Field(..., title="The branch of the bank")
+    ifs_code: str = Field(..., title="The IFS code of the bank")
+    swift_code: str = Field(..., title="The SWIFT code of the bank")
+# Pydantic schema of the fields extracted from a vendor invoice, including nested bank
+# details and participant registration information. All fields are required.
+class InformationExtractedFromABillReceipt(BaseModel):
+    """
+    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
+    bank details: (account holder name, bank name, account number, branch, ifs code, swift
+    code), recipient, registration id, registration fee, registration date/time
+    """
+    # The value must be at least 15 characters long.
+    gstin: constr(min_length=15) = Field(
+        ..., title="The alphanumeric GSTIN/GST number code"
+    )
+    billing_address: str = Field(..., title="The billing address")
+    invoice_number: str = Field(..., title="The invoice number")
+    invoice_date: datetime = Field(..., title="The date-time the invoice was issued")
+    due_date: datetime = Field(..., title="The date-time the invoice is due")
+    total: float = Field(..., title="Total amount or price")
+    balance_due: float = Field(..., title="The amount due")
+    bank_details: BankDetails = Field(..., title="Bank details")
+    recipient: str = Field(
+        ...,
+        title="Name of the person/entity that the invoice item was charged or delivered to",
+    )
+    registration_id: str = Field(..., title="The registration ID")
+    registration_fee: float = Field(..., title="The registration fee")
+    registration_date_time: datetime = Field(..., title="The registration date-time")

examples/example1.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
+size 45782

examples/rotated.jpeg ADDED Viewed

Git LFS Details

SHA256: e98aa24e25b2c3f277c237664cba4616fbe5d80fe3099459fb81e2ef3720d23c
Pointer size: 132 Bytes
Size of remote file: 1.79 MB

examples/rotated.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
+size 333463

examples/upright.jpeg ADDED Viewed

Git LFS Details

SHA256: 728be2c94b4af573145e5e89ffe5c3dfddb12a3055b85e60a23bd7697cff83f7
Pointer size: 132 Bytes
Size of remote file: 2.93 MB

examples/upright.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
+size 325064

extract.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
+"""
+from io import BytesIO
+from typing import List
+import pyocr.tesseract
+import pypdf
+from PIL import Image
+def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
+    """Extracts text from the given PDF file using pypdf.
+    Args:
+        bytes_stream (BytesIO): The PDF file to extract text from.
+    Returns: The extracted text
+    """
+    pdf_reader = pypdf.PdfReader(bytes_stream)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+        text += "\n\n"
+    return text
+def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
+    """Extracts text from the given image using tesseract via pyocr.
+    Args:
+        image(PIL.Image.Image): The image to extract text from.
+    Returns: The extracted text.
+    """
+    if not pyocr.tesseract.is_available():
+        raise Exception("Tesseract is not available.")
+    text = pyocr.tesseract.image_to_string(image, lang="eng")
+    return text
+def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
+    """Extracts text from the given images using tesseract via pyocr.
+    Args:
+        images(List[PIL.Image.Image]): The images to extract text from.
+    Returns: The extracted text.
+    """
+    text = ""
+    for image in images:
+        text += extract_text_from_image_pyocr_tesseract(image)
+        text += "\n\n"
+        image.close()
+    return text
+if __name__ == '__main__':
+    filename = 'examples/upright.pdf'
+    with open(filename, 'rb') as file:
+        bytes_stream = BytesIO(file.read())
+    text = extract_text_from_pdf_pypdf(bytes_stream)
+    print(text)
+    print("-"*25)
+    filename = 'examples/upright.jpeg'
+    image = Image.open(filename)
+    text = extract_text_from_image_pyocr_tesseract(image)
+    print(text)
+    image.close()

main.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from pathlib import Path
+import categories
+import processing
+import extract
+from PIL import Image
+from pydantic import BaseModel
+from io import BytesIO
+def categorize_and_parse_text(text: str) -> BaseModel | None:
+    """Categorizes the text and parses the information from it.
+    Args:
+        text(str): The text to categorize and parse information from.
+    Returns: The result of running the chain for the detected category on the text
+        (whatever categories.run_category_chain returns).
+    """
+    category = categories.categorize_text(text)
+    print("Categorized as category", category)
+    result = categories.run_category_chain(category, text)
+    return result
+def process_pdf(filename: Path) -> BaseModel:
+    """Processes the given PDF file and extracts information from it.
+    Args:
+        filename(Path): The PDF file to process.
+    Returns: The extracted information.
+    """
+    with open(filename, "rb") as f:
+        pdf_bytes = bytes(f.read())
+    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
+    # If the encoded text is too short, a pdf scanner probably added a watermark
+    if len(text) < 20:
+        # Try to extract text from images
+        images = processing.convert_pdf_to_image_pdf2image(pdf_bytes)
+        text = extract.extract_text_from_images_pyocr_tesseract(images)
+    result = categorize_and_parse_text(text)
+    return result
+def process_image(filename: Path) -> BaseModel:
+    """Processes the given image file and extracts information from it.
+    Args:
+        filename(Path): The image file to process.
+    Returns: The extracted information.
+    """
+    image = Image.open(filename)
+    text = extract.extract_text_from_image_pyocr_tesseract(image)
+    image.close()
+    result = categorize_and_parse_text(text)
+    return result
+if __name__ == "__main__":
+    filename = Path("examples/example1.pdf")
+    result = process_pdf(filename)
+    print(result.json(indent=4))

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ poppler-utils

processing.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""Responsible for (pre)processing images and PDFs before they are passed to the OCR
+engine and other miscellaneous actions concerning processing.
+"""
+import os
+from pathlib import Path
+from typing import List
+# import cv2
+# import numpy as np
+import pyocr
+from pdf2image import pdf2image
+from PIL import Image  #, ImageOps
+# DPI used when converting PDF pages to images (pdf2image and ImageMagick paths).
+PDF_CONVERSION_DPI = 300
+# correct_orientation rotates an image only when the detected orientation confidence
+# is strictly greater than this value.
+ROTATION_CONFIDENCE_THRESHOLD = 2.0
+# def rotate_image(image: Image, angle: float):
+#     """Rotates the given image by the given angle.
+#     Args:
+#         image(PIL.Image.Image): The image to be rotated.
+#         angle(float): The angle to rotate the image by.
+#     Returns: The rotated image.
+#     """
+#     image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+#     height, width, _ = image.shape  # Get the image height, width, and channels
+#     # Compute the rotation matrix
+#     rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
+#     # Apply the rotation to the image
+#     rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
+#     rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
+#     return rotated_image
+# class PDF_CONVERTER(enum.Enum):
+#     PDF2IMAGE = 1
+#     IMAGEMAGICK = 2
+def correct_orientation(image: Image.Image) -> Image.Image:
+    """Corrects the orientation of an image if it is not upright.
+    Args:
+        image(PIL.Image.Image): The pillow image to be corrected.
+    Returns: The corrected pillow image as a copy. The original image is not closed.
+    """
+    if not pyocr.tesseract.is_available():
+        raise Exception("Tesseract is not available.")
+    # image = ImageOps.exif_transpose(image)  # EXIF rotation is apparent, not actual
+    orientation_info = {}
+    try:
+        orientation_info = pyocr.tesseract.detect_orientation(image)
+    except pyocr.PyocrException as e:
+        print("Orientation detection failed: {}".format(e))
+    # output = pytesseract.image_to_osd(
+    #     image, config=" --psm 0", output_type=pytesseract.Output.DICT
+    # )
+    angle = orientation_info.get("angle", 0)
+    confidence = orientation_info.get("confidence", 100)
+    # rotate = output["rotate"]
+    # confidence = output["orientation_conf"]
+    if confidence > ROTATION_CONFIDENCE_THRESHOLD:
+        new_image = image.rotate(angle, expand=True)
+    else:
+        new_image = image.copy()
+    return new_image
+def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
+    """Converts a PDF to an image using pdf2image.
+    Args:
+        pdf_bytes(bytes): The bytes of the PDF to be converted.
+    Returns: A list of pillow images corresponding to each page from the PDF.
+    """
+    images = pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
+    return images
+def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
+    """Converts a PDF to an image using ImageMagick.
+    Args:
+        filename(pathlib.Path): The path to the PDF to be converted.
+        dest_folder(pathlib.Path): The destination folder for the converted pages. Pages
+                                   are saved in the folder as page.jpg or as page-01.jpg,
+                                   page-02.jpg, etc.
+    Returns: dest_folder
+    """
+    os.system(f"magick convert"
+                f"-density {PDF_CONVERSION_DPI}"
+                f"{filename}"
+                f"-quality 100"
+                f"{dest_folder/'page.jpg'}")
+    return dest_folder
+def preprocess_image(image: Image.Image) -> Image.Image:
+    """Preprocesses an image for future use with OCR.
+    The following operations are performed:
+      1. Orientation correction
+    Args:
+        image(PIL.Image.Image): The image to be preprocessed.
+    Returns: The preprocessed pillow image.
+    """
+    rotated_image = correct_orientation(image)
+    result = rotated_image
+    image.close()
+    return result
+def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
+    """Preprocesses a PDF for future use with OCR.
+    The following operations are performed:
+      1. PDF to image conversion
+      2. Orientation correction
+    Args:
+        pdf_bytes(bytes): The bytes of the PDF to be preprocessed.
+    Returns: A list of pillow images corresponding to each page from the PDF.
+    """
+    images = convert_pdf_to_image_pdf2image(pdf_bytes)
+    result = []
+    for image in images:
+        new_image = preprocess_image(image)
+        image.close()
+        result.append(new_image)
+    return result
+def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
+    """Preprocesses a PDF for future use with OCR.
+    The following operations are performed:
+      1. PDF to image conversion
+      2. Orientation correction
+    Args:
+        filename(pathlib.Path): The path to the PDF to be preprocessed.
+    Returns: A list of pillow images corresponding to each page from the PDF.
+    """
+    dest_folder = convert_pdf_to_image_ImageMagick(filename, dest_folder)
+    result = []
+    for image in dest_folder.glob("*.jpg"):
+        new_image = preprocess_image(image)
+        image.close()
+        result.append(new_image)
+    return result
+if __name__ == '__main__':
+    filename = 'examples/upright.jpeg'
+    image = Image.open(filename)
+    new_image = preprocess_image(image)
+    image.close()
+    new_image.show()
+    new_image.close()
+    filename = 'examples/rotated.pdf'
+    with open(filename, 'rb') as file:
+        bytes_ = bytes(file.read())
+    images = preprocess_pdf_pdf2image(bytes_)
+    for image in images:
+        image.show()
+        image.close()

requirements.txt ADDED Viewed

	@@ -0,0 +1,346 @@

+aiobotocore==2.5.0
+aiofiles==22.1.0
+aiohttp==3.8.3
+aioitertools==0.7.1
+aiosignal==1.2.0
+aiosqlite==0.18.0
+alabaster==0.7.12
+anyio==3.5.0
+appdirs==1.4.4
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arrow==1.2.3
+astroid==2.14.2
+astropy==5.1
+asttokens==2.2.1
+async-timeout==4.0.2
+atomicwrites==1.4.0
+attrs==22.1.0
+Automat==20.2.0
+autopep8==1.6.0
+Babel==2.11.0
+backcall==0.2.0
+bcrypt==3.2.0
+beautifulsoup4==4.12.2
+binaryornot==0.4.4
+black==0.0
+bleach==4.1.0
+bokeh==3.1.1
+botocore==1.29.76
+Bottleneck==1.3.5
+brotlipy==0.7.0
+certifi==2023.7.22
+cffi==1.15.1
+chardet==4.0.0
+charset-normalizer==2.0.4
+click==8.0.4
+cloudpickle==2.2.1
+colorama==0.4.6
+colorcet==3.0.1
+comm==0.1.3
+constantly==15.1.0
+contourpy==1.0.5
+cookiecutter==1.7.3
+cryptography==39.0.1
+cssselect==1.1.0
+cycler==0.11.0
+cytoolz==0.12.0
+daal4py==2023.1.1
+dask==2023.6.0
+dataclasses-json==0.5.13
+datasets==2.12.0
+datashader==0.15.0
+datashape==0.5.4
+debugpy==1.6.7
+decorator==5.1.1
+defusedxml==0.7.1
+diff-match-patch==20200713
+dill==0.3.6
+distributed==2023.6.0
+docstring-to-markdown==0.11
+docutils==0.18.1
+entrypoints==0.4
+et-xmlfile==1.1.0
+exceptiongroup==1.0.4
+executing==1.2.0
+fastjsonschema==2.16.2
+filelock==3.9.0
+flake8==6.0.0
+Flask==2.2.2
+fonttools==4.25.0
+frozenlist==1.3.3
+fsspec==2023.4.0
+gensim==4.3.0
+greenlet==2.0.1
+h5py==3.7.0
+HeapDict==1.0.1
+holoviews==1.16.2
+huggingface-hub==0.15.1
+hvplot==0.8.4
+hyperlink==21.0.0
+idna==3.4
+imagecodecs==2021.8.26
+imageio==2.26.0
+imagesize==1.4.1
+imbalanced-learn==0.10.1
+importlib-metadata==6.0.0
+incremental==21.3.0
+inflection==0.5.1
+iniconfig==1.1.1
+intake==0.6.8
+intervaltree==3.1.0
+ipykernel==6.22.0
+ipython==8.12.0
+ipython-genutils==0.2.0
+ipywidgets==8.0.4
+iso4217==1.9.20220401
+isort==5.9.3
+itemadapter==0.3.0
+itemloaders==1.0.4
+itsdangerous==2.0.1
+jaraco.classes==3.2.1
+jedi==0.18.2
+jellyfish==0.9.0
+Jinja2==3.1.2
+jinja2-time==0.2.0
+jmespath==0.10.0
+joblib==1.2.0
+json5==0.9.6
+jsonschema==4.17.3
+jupyter==1.0.0
+jupyter_client==8.2.0
+jupyter-console==6.6.3
+jupyter_core==5.3.0
+jupyter-events==0.6.3
+jupyter-server==1.23.6
+jupyter_server_fileid==0.9.0
+jupyter_server_terminals==0.4.4
+jupyter_server_ydoc==0.8.0
+jupyter-ydoc==0.2.4
+jupyterlab==3.6.3
+jupyterlab-pygments==0.1.2
+jupyterlab_server==2.22.0
+jupyterlab-widgets==3.0.5
+keyring==23.13.1
+kiwisolver==1.4.4
+langchain==0.0.245
+langsmith==0.0.15
+lazy_loader==0.2
+lazy-object-proxy==1.6.0
+linkify-it-py==2.0.0
+llvmlite==0.40.0
+lmdb==1.4.1
+locket==1.0.0
+lxml==4.9.2
+lz4==4.3.2
+Markdown==3.4.1
+markdown-it-py==2.2.0
+MarkupSafe==2.1.1
+marshmallow==3.20.1
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mccabe==0.7.0
+mdit-py-plugins==0.3.0
+mdurl==0.1.0
+menuinst==1.4.19
+mistune==3.0.0
+mkl-fft==1.3.6
+mkl-random==1.2.2
+mkl-service==2.4.0
+more-itertools==8.12.0
+mpmath==1.2.1
+msgpack==1.0.3
+multidict==6.0.2
+multipledispatch==0.6.0
+multiprocess==0.70.14
+munkres==1.1.4
+mypy-extensions==0.4.3
+nbclassic==0.5.5
+nbclient==0.5.13
+nbconvert==7.7.3
+nbformat==5.7.0
+nest-asyncio==1.5.6
+networkx==2.8.4
+nltk==3.7
+notebook==6.5.4
+notebook_shim==0.2.2
+numba==0.57.0
+numexpr==2.8.4
+numpy==1.24.3
+numpydoc==1.5.0
+openai==0.27.8
+openapi-schema-pydantic==1.2.4
+opencv-python-headless==4.8.0.74
+openpyxl==3.0.10
+packaging==23.0
+pandas==1.5.3
+pandocfilters==1.5.0
+panel==1.1.0
+param==1.13.0
+paramiko==2.8.1
+parsel==1.6.0
+parso==0.8.3
+partd==1.2.0
+pathspec==0.10.3
+patsy==0.5.3
+pdf2image==1.16.3
+pep8==1.7.1
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.4.0
+pip==23.1.2
+platformdirs==3.5.0
+plotly==5.9.0
+pluggy==1.0.0
+ply==3.11
+pooch==1.4.0
+poyo==0.5.0
+prometheus-client==0.14.1
+prompt-toolkit==3.0.38
+Protego==0.1.16
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==8.0.0
+pyarrow==11.0.0
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycodestyle==2.10.0
+pycparser==2.21
+pyct==0.5.0
+pycurl==7.45.2
+pydantic==1.10.12
+PyDispatcher==2.0.5
+pydocstyle==6.3.0
+pyerfa==2.0.0
+pyflakes==3.0.1
+Pygments==2.15.1
+pylint==2.16.2
+pylint-venv==2.3.0
+pyls-spyder==0.4.0
+PyNaCl==1.5.0
+pyocr==0.8.3
+pyodbc==4.0.34
+pyOpenSSL==23.0.0
+pyparsing==3.0.9
+pypdf==3.13.0
+PyQt5==5.15.7
+PyQt5-sip==12.11.0
+PyQtWebEngine==5.15.4
+pyrsistent==0.18.0
+PySocks==1.7.1
+pytest==7.3.1
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+python-lsp-black==1.2.1
+python-lsp-jsonrpc==1.0.0
+python-lsp-server==1.7.2
+python-slugify==5.0.2
+python-snappy==0.6.1
+pytoolconfig==1.2.5
+pytz==2022.7
+pyviz-comms==2.3.0
+PyWavelets==1.4.1
+pywin32==305.1
+pywin32-ctypes==0.2.0
+pywinpty==2.0.10
+PyYAML==6.0
+pyzmq==25.0.2
+QDarkStyle==3.0.2
+qstylizer==0.2.2
+QtAwesome==1.2.2
+qtconsole==5.4.2
+QtPy==2.2.0
+queuelib==1.5.0
+regex==2022.7.9
+requests==2.29.0
+requests-file==1.5.1
+responses==0.13.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rope==1.7.0
+Rtree==1.0.1
+s3fs==2023.4.0
+sacremoses==0.0.43
+scikit-image==0.20.0
+scikit-learn==1.2.2
+scikit-learn-intelex==20230426.121158
+scipy==1.10.1
+Scrapy==2.8.0
+seaborn==0.12.2
+Send2Trash==1.8.0
+service-identity==18.1.0
+setuptools==67.8.0
+sip==6.6.2
+six==1.16.0
+smart-open==5.2.1
+sniffio==1.2.0
+snowballstemmer==2.2.0
+sortedcontainers==2.4.0
+soupsieve==2.4
+Sphinx==5.0.2
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+spyder==5.4.3
+spyder-kernels==2.4.3
+SQLAlchemy==1.4.39
+stack-data==0.6.2
+statsmodels==0.13.5
+sympy==1.11.1
+tables==3.8.0
+tabulate==0.8.10
+TBB==0.2
+tblib==1.7.0
+tenacity==8.2.2
+terminado==0.17.1
+text-unidecode==1.3
+textdistance==4.2.1
+threadpoolctl==2.2.0
+three-merge==0.1.1
+tifffile==2021.7.2
+tinycss2==1.2.1
+tldextract==3.2.0
+tokenizers==0.13.2
+toml==0.10.2
+tomli==2.0.1
+tomlkit==0.11.1
+toolz==0.12.0
+torch==2.0.1
+tornado==6.3.1
+tqdm==4.65.0
+traitlets==5.9.0
+transformers==4.29.2
+Twisted==22.10.0
+twisted-iocpsupport==1.0.2
+typing_extensions==4.6.3
+typing-inspect==0.9.0
+uc-micro-py==1.0.1
+ujson==5.4.0
+Unidecode==1.2.0
+urllib3==1.26.16
+w3lib==1.21.0
+watchdog==2.1.6
+wcwidth==0.2.6
+webencodings==0.5.1
+websocket-client==0.58.0
+Werkzeug==2.2.3
+whatthepatch==1.0.2
+wheel==0.38.4
+widgetsnbextension==4.0.5
+win-inet-pton==1.1.0
+wrapt==1.14.1
+xarray==2022.11.0
+xlwings==0.29.1
+xxhash==2.0.2
+xyzservices==2022.9.0
+y-py==0.5.9
+yapf==0.31.0
+yarl==1.8.1
+ypy-websocket==0.8.2
+zict==2.2.0
+zipp==3.11.0
+zope.interface==5.4.0