ankur-bohra committed · Commit 317211f
1 parent: 6db4a81
Initial commit

Files changed:
- .gitattributes +2 -0
- .gitignore +3 -0
- app.py +10 -0
- categories/__init__.py +197 -0
- categories/accomodation/__init__.py +41 -0
- categories/accomodation/model.py +29 -0
- categories/random_/__init__.py +128 -0
- categories/random_/model.py +82 -0
- categories/travel_cab/__init__.py +37 -0
- categories/travel_cab/model.py +19 -0
- categories/travel_flight/__init__.py +23 -0
- categories/travel_flight/model.py +30 -0
- categories/vendor/__init__.py +38 -0
- categories/vendor/model.py +46 -0
- examples/example1.pdf +3 -0
- examples/rotated.jpeg +3 -0
- examples/rotated.pdf +3 -0
- examples/upright.jpeg +3 -0
- examples/upright.pdf +3 -0
- extract.py +67 -0
- main.py +61 -0
- packages.txt +1 -0
- processing.py +171 -0
- requirements.txt +346 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.conda
+temp*
+__pycache__/
app.py
ADDED
@@ -0,0 +1,10 @@
+import streamlit as st
+
+st.title("Automatic Reimbursement Tool Demo")
+
+with st.container():
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.header("Input")
+        st.file_uploader("Upload a PDF file or an image", type=["pdf", "png", "jpg", "jpeg"])
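
Note (editor): the committed app.py stops at the uploader. A minimal sketch of how the upload could be wired to process_pdf/process_image from main.py in this same commit is shown below; the temporary-file handling and the output call are illustrative assumptions, not part of the committed file, and they assume OPENAI_API_KEY and Tesseract are available.

# Hypothetical wiring of the Streamlit uploader to main.process_pdf/process_image.
# Not part of the committed app.py; shown only to illustrate the intended flow.
import tempfile
from pathlib import Path

import streamlit as st

import main  # process_pdf / process_image from this commit

uploaded = st.file_uploader("Upload a PDF file or an image", type=["pdf", "png", "jpg", "jpeg"])
if uploaded is not None:
    suffix = Path(uploaded.name).suffix.lower()
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded.getvalue())
        tmp_path = Path(tmp.name)
    # Route PDFs and images to the matching entry point.
    result = main.process_pdf(tmp_path) if suffix == ".pdf" else main.process_image(tmp_path)
    if result is not None:  # run_category_chain returns None if the chain errors
        st.json(result.json())  # pydantic v1 models expose .json()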
categories/__init__.py
ADDED
@@ -0,0 +1,197 @@
+from enum import Enum
+
+from . import random_
+from . import accomodation
+from . import travel_cab
+from . import travel_flight
+
+# from . import vendor
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser
+from langchain.output_parsers.enum import EnumOutputParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from pydantic import BaseModel
+
+
+class Category(Enum):
+    ACCOMODATION = "accomodation"
+    TRAVEL_FLIGHT = "travel_flight"
+    TRAVEL_CAB = "travel_cab"
+    # VENDOR = "vendor"
+    RANDOM = "random"
+
+
+category_modules = {
+    Category.ACCOMODATION: accomodation,
+    Category.TRAVEL_FLIGHT: travel_flight,
+    Category.TRAVEL_CAB: travel_cab,
+    # Category.VENDOR: vendor,
+    Category.RANDOM: random_,
+}
+
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    # max_tokens=300,
+    model_kwargs={
+        "stop": None,
+        "top_p": 1,
+        "frequency_penalty": 0,
+        "presence_penalty": 0,
+    },
+)
+
+# Build categorizing chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are a classifier that, given a bill's text, states what type of bill "
+    "category it belongs to: accomodation (bills regarding stays), travel (bills "
+    "concerning cab or other land rides), travel (bills concerning flights), random "
+    "(bills concerning deliveries from e-commerce websites like amazon etc) bills.\n"
+    "You may want to see if there are Room Details, Check-in/Check-out Date for "
+    "Accomodation stay; Flight Details, Train Details, Bus Details Cab details for "
+    "Travel; Conference Details for Conference organizers; anything else comes under "
+    "random category. Your answers must be only the appropriate choice e.g. 'option' and "
+    "not 'The given bill belongs to the option category.'\n"
+    "{format_instructions}"
+)
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+category_parser = EnumOutputParser(enum=Category)
+categorize_chain = LLMChain(
+    llm=model, prompt=chat_prompt, output_parser=category_parser
+)
+
+
+def categorize_text(text: str) -> Category:
+    """Categorizes the text into one of the categories defined in Category by querying
+    ChatGPT.
+
+    Args:
+        text(str): The text to categorize.
+
+    Returns: The category of the text.
+    """
+    return categorize_chain.run(
+        text=text, format_instructions=category_parser.get_format_instructions()
+    )
+
+
+def run_category_chain(category: Category, text: str) -> BaseModel | None:
+    """Runs the chain for the given category on the given text.
+
+    Args:
+        category(Category): The category for which the chain is to be run.
+        text(str): The text on which the chain is to be run.
+
+    Returns: The output of the chain.
+    """
+    output_parser = category_modules[category].output_parser
+    try:
+        return category_modules[category].chain.run(
+            text=text, format_instructions=output_parser.get_format_instructions()
+        )
+    except Exception as e:
+        print("Error in running chain for category", category, ":", e)
+
+
+if __name__ == "__main__":
+    text = """amazonin
+we)
+
+Sold By :
+
+Spigen India Pvt. Ltd.
+
+* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
+37//15/1, 15/2,, Adjacent to Starex School, Village
+- Binola, National Highway -8, Tehsil - Manesar
+Gurgaon, Haryana, 122413
+
+IN
+
+PAN No: ABACS5056L
+GST Registration No: O6ABACS5056L12Z5
+
+Order Number: 407-5335982-7837125
+Order Date: 30.05.2023
+
+Tax Invoice/Bill of Supply/Cash Memo
+(Original for Recipient)
+
+Billing Address :
+
+Praveen Bohra
+
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+
+IN
+
+State/UT Code: 06
+
+Shipping Address :
+
+Praveen Bohra
+
+Praveen Bohra
+
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+
+IN
+
+State/UT Code: 06
+
+Place of supply: HARYANA
+
+Place of delivery: HARYANA
+
+Invoice Number : DEL5-21033
+Invoice Details : HR-DEL5-918080915-2324
+Invoice Date : 30.05.2023
+
+Description at Tax |Tax /|Tax Total
+p y Rate |Type |Amount|Amount
+
+Black) | BO8BHLZHBH ( ACS01744INP )
+HSN:39269099
+
+1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
+1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
+9% |SGST| %76.19
+
+TOTAL:
+
+Amount in Words:
+Nine Hundred Ninety-nine only
+
+Whether tax is payable under reverse charge - No
+
+For Spigen India Pvt. Ltd.:
+sSoigenrn
+
+Authorized Signatory
+
+Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
+2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
+
+*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
+
+Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
+
+Please note that this invoice is not a demand for payment
+
+Page 1 of 1"""
+    category = categorize_text(text)
+    print("Category:", category)
+
+    print("\n\n")
+    result = run_category_chain(category, text)
+    print(result)
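
Note (editor): run_category_chain only relies on each registered module exposing a module-level `chain` and `output_parser`, so adding a category is mostly a registration exercise (a new `Category` member plus a `category_modules` entry). The sketch below outlines the shape a hypothetical new module would need; the "conference" name and its model import are illustrative assumptions, not part of this commit.

# Hypothetical categories/conference/__init__.py skeleton (illustrative only).
# The contract used by run_category_chain is simply: expose `output_parser`
# and `chain` at module level, mirroring the existing category packages.
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

from .model import InformationExtractedFromABillReceipt as PydanticModel  # assumed model module

model = ChatOpenAI(temperature=0)
chat_prompt = ChatPromptTemplate.from_messages(
    [HumanMessagePromptTemplate.from_template("{format_instructions}\n{text}")]
)
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)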
categories/accomodation/__init__.py
ADDED
@@ -0,0 +1,41 @@
+from .model import InformationExtractedFromABillReceipt as PydanticModel
+
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+
+model = ChatOpenAI(
+    temperature=0.6,
+    max_tokens=300,
+    n=1,
+    request_timeout=None,
+    model_kwargs={
+        'stop': None,
+        'top_p': 1,
+    }
+)
+
+# Build category chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are tasked with developing an OCR data extraction system for hotel bills in PDF "
+    "format given as text. The system should extract important information necessary for "
+    "the reimbursement process from a college. Your prompt should fetch the following "
+    "essential details from the hotel bill: hotel name, address, bill number/invoice "
+    "number, booking ID / confirmation ID / booking number, check-in date and time, "
+    "check-out date and time, total amount, booking platform, bill date.\n"
+    "Ensure that the system accurately extracts the above information from the OCR text "
+    "of the hotel bill.\n"
+    "{format_instructions}"
+)
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
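
Note (editor): each category module follows the same two-stage parsing setup: PydanticOutputParser supplies the JSON schema instructions, and OutputFixingParser re-asks the model to repair output that fails validation. A minimal invocation sketch follows; the `bill_text` variable and the API key setup are assumptions for illustration.

# Minimal sketch of calling the accomodation chain directly; assumes
# OPENAI_API_KEY is set in the environment and `bill_text` holds OCR text.
from categories import accomodation

bill_text = "..."  # OCR text of a hotel bill (placeholder)
parsed = accomodation.chain.run(
    text=bill_text,
    format_instructions=accomodation.output_parser.get_format_instructions(),
)
print(parsed)  # an InformationExtractedFromABillReceipt instance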
categories/accomodation/model.py
ADDED
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from datetime import datetime
+
+from pydantic import BaseModel, Field
+
+
+class InformationExtractedFromABillReceipt(BaseModel):
+    """
+    1. Hotel Name: [Hotel Name]
+    2. Address: [Hotel Address]
+    3. Bill number/Invoice number: [Bill Number]
+    4. booking ID / Confirmation ID / Booking #: [Booking ID]
+    5. Check-in Date and Time: [Check-in Date Time]
+    6. Check-out Date and Time: [Check-out Date Time]
+    7. Total Amount: [Total Amount Charged]
+    8. Booking platform: [Booking Platform]
+    9. Bill date: [Bill Date]
+    """
+
+    hostel_name: str = Field(..., title="The name of the hotel")
+    address: str = Field(..., title="The address of the hotel")
+    bill_number: str = Field(..., title="The bill number/invoice number")
+    booking_id: str = Field(..., title="The booking ID/confirmation ID/booking number")
+    check_in_date_time: datetime = Field(..., title="The check-in date and time")
+    check_out_date_time: datetime = Field(..., title="The check-out date and time")
+    total_amount_charged: float = Field(..., title="The total amount charged")
+    booking_platform: str = Field(..., title="The booking platform")
+    bill_date: datetime = Field(..., title="The bill date")
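
Note (editor): the field titles double as the schema descriptions the LLM sees. For reference, a hand-built instance shows the shape the chain is expected to return; all values below are invented placeholders.

# Illustrative only: constructing the model by hand with placeholder values
# to show the shape the accomodation chain is expected to produce.
from datetime import datetime

from categories.accomodation.model import InformationExtractedFromABillReceipt

sample = InformationExtractedFromABillReceipt(
    hostel_name="Example Hotel",            # note: the field is named hostel_name in this commit
    address="12 Example Street, Gurgaon",   # placeholder address
    bill_number="INV-001",
    booking_id="BK-1234",
    check_in_date_time=datetime(2023, 5, 1, 14, 0),
    check_out_date_time=datetime(2023, 5, 3, 11, 0),
    total_amount_charged=4500.0,
    booking_platform="ExampleBookingSite",
    bill_date=datetime(2023, 5, 3, 11, 5),
)
print(sample.json(indent=2))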
categories/random_/__init__.py
ADDED
@@ -0,0 +1,128 @@
+from .model import InformationExtractedFromABillReceipt as PydanticModel
+
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    model_kwargs={
+        'stop': None,
+        'top_p': 1,
+        'frequency_penalty': 0,
+        'presence_penalty': 0,
+    }
+)
+
+# Build category chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are an information extraction engine that outputs details from OCR processed "
+    "documents like uids, total, tax, name, currency, date, seller details, summary. You "
+    "may use context to make an educated guess about the currency. Use null if you are "
+    "unable to find certain details\n"
+    "{format_instructions}"
+)
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
+
+if __name__ == "__main__":
+    text = """amazonin
+we)
+
+Sold By :
+
+Spigen India Pvt. Ltd.
+
+* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
+37//15/1, 15/2,, Adjacent to Starex School, Village
+- Binola, National Highway -8, Tehsil - Manesar
+Gurgaon, Haryana, 122413
+
+IN
+
+PAN No: ABACS5056L
+GST Registration No: O6ABACS5056L12Z5
+
+Order Number: 407-5335982-7837125
+Order Date: 30.05.2023
+
+Tax Invoice/Bill of Supply/Cash Memo
+(Original for Recipient)
+
+Billing Address :
+
+Praveen Bohra
+
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+
+IN
+
+State/UT Code: 06
+
+Shipping Address :
+
+Praveen Bohra
+
+Praveen Bohra
+
+E-303, ParkView City 2, Sector 49, Sohna Road
+GURGAON, HARYANA, 122018
+
+IN
+
+State/UT Code: 06
+
+Place of supply: HARYANA
+
+Place of delivery: HARYANA
+
+Invoice Number : DEL5-21033
+Invoice Details : HR-DEL5-918080915-2324
+Invoice Date : 30.05.2023
+
+Description at Tax |Tax /|Tax Total
+p y Rate |Type |Amount|Amount
+
+Black) | BO8BHLZHBH ( ACS01744INP )
+HSN:39269099
+
+1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
+1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
+9% |SGST| %76.19
+
+TOTAL:
+
+Amount in Words:
+Nine Hundred Ninety-nine only
+
+Whether tax is payable under reverse charge - No
+
+For Spigen India Pvt. Ltd.:
+sSoigenrn
+
+Authorized Signatory
+
+Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
+2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
+
+*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
+
+Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
+
+Please note that this invoice is not a demand for payment
+
+Page 1 of 1"""
+    result = chain.run(text=text, format_instructions=fixing_parser.get_format_instructions())
+    print(result.json(indent=4))
categories/random_/model.py
ADDED
@@ -0,0 +1,82 @@
+# generated by datamodel-codegen:
+# filename: schema.json
+# timestamp: 2023-07-28T11:36:16+00:00
+
+from __future__ import annotations
+
+from datetime import date
+from typing import Dict, Optional, Union
+
+import iso4217
+from pydantic import BaseModel, Field, constr, validator, ValidationError
+
+
+class TaxItem(BaseModel):
+    gst: float = Field(
+        ...,
+        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
+    )
+
+
+class TaxItem1(BaseModel):
+    vat: float = Field(..., title="The total VAT present in the invoice")
+
+
+class TaxNumberItem(BaseModel):
+    gst_number: constr(min_length=15) = Field(
+        ..., title="The alphanumeric GSTIN/GST number code"
+    )
+
+
+class TaxNumberItem1(BaseModel):
+    vat_number: str = Field(..., title="The VAT/TIN number present in older invoices")
+
+
+class TaxNumberItem2(BaseModel):
+    ui_number: str = Field(..., title="The tax UIN issued to foreign entities")
+
+
+class SellerDetails(BaseModel):
+    name: Optional[str] = None
+    address: Optional[str] = None
+    contact: Optional[str] = None
+    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
+        ..., title="Tax information"
+    )
+    pan_number: constr(min_length=10, max_length=10) = Field(
+        ..., title="The 10-character alphanumeric PAN code"
+    )
+
+
+class UIDs(BaseModel):
+    invoice_number: str = Field(..., title="The invoice number")
+    other_uids: Dict[str, str] = Field(
+        ...,
+        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
+    )
+
+
+class InformationExtractedFromABillReceipt(BaseModel):
+    uids: UIDs = Field(..., title="Invoice number and other UIDs")
+    total: float = Field(..., title="Total amount or price")
+    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
+    name: str = Field(
+        ...,
+        title="Name of the person/entity that the invoice item was charged or delivered to",
+    )
+    currency: str = Field(
+        default="INR",
+        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
+    )
+    date: date = Field(
+        ..., title="The date the invoice was issued"
+    )
+    seller_details: SellerDetails = Field(..., title="Information about the seller")
+    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")
+
+    @validator("currency")
+    @classmethod
+    def check_currency(cls, v: str) -> str:
+        if not iso4217.Currency.__members__.get(v.lower()):
+            raise ValueError(f"{v} is not a valid ISO 4217 currency code")  # pydantic v1 validators raise ValueError; it is surfaced as a ValidationError
+        return v.upper()
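
Note (editor): as the inline comment above notes, the validator raises a plain ValueError, which pydantic v1 then surfaces as a ValidationError on the model. A small sketch of the lookup it performs follows; the currency codes shown are examples only.

# Sketch of the currency check used by the validator: the iso4217 package
# exposes Currency enum members keyed by lowercase code.
import iso4217

for code in ("INR", "USD", "XYZ"):  # "XYZ" is deliberately not a real code
    valid = iso4217.Currency.__members__.get(code.lower()) is not None
    print(code, "valid" if valid else "not a recognised ISO 4217 code")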
categories/travel_cab/__init__.py
ADDED
@@ -0,0 +1,37 @@
+from .model import InformationExtractedFromABillReceipt as PydanticModel
+
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    model_kwargs={
+        'stop': None,
+        'top_p': 1,
+        'frequency_penalty': 0,
+        'presence_penalty': 0,
+    }
+)
+
+# Build categorizing chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are an information extraction engine that outputs details from OCR processed "
+    "documents such as date/time/place of departure and arrival.\n"
+    "{format_instructions}"
+)
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(
+    llm=model, prompt=chat_prompt, output_parser=fixing_parser
+)
categories/travel_cab/model.py
ADDED
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from datetime import date, time
+
+from pydantic import BaseModel, Field
+
+
+class InformationExtractedFromABillReceipt(BaseModel):
+    ''''''
+
+    place_from: str = Field(..., title="place where journey starts")
+    date_from: date = Field(
+        ..., title="date on which journey starts (DD/MM/YYYY)"
+    )
+    time_from: time = Field(..., title="time at which journey starts")
+    place_to: str = Field(..., title="place where journey ends")
+    date_to: date = Field(..., title="date on which journey ends (DD/MM/YYYY)")
+    time_to: time = Field(..., title="time at which journey ends")
+    amount: float = Field(..., title="cost of journey ticket")
categories/travel_flight/__init__.py
ADDED
@@ -0,0 +1,23 @@
+from .model import InformationExtractedFromABillReceipt as PydanticModel
+
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+)
+
+model = ChatOpenAI(temperature=0)
+
+# Build categorizing chain
+human_message_prompt = HumanMessagePromptTemplate.from_template(
+    "Parse through and find the following details from the text extracted from a travel "
+    "bill\n"
+    "{format_instructions}\n"
+    "{text}"
+)
+chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/travel_flight/model.py
ADDED
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from datetime import date, time
+
+from pydantic import BaseModel, Field
+
+
+class InformationExtractedFromABillReceipt(BaseModel):
+    """
+    response_schemas = [
+        ResponseSchema(name="place (from)", description="place where flight starts/takes-off"),
+        ResponseSchema(name="date (from)", description="date on which flight starts/takes-off (DD/MM/YYYY)"),
+        ResponseSchema(name="time (from)", description="time at which flight starts/takes-off"),
+        ResponseSchema(name="place (to)", description="place where flight end/lands"),
+        ResponseSchema(name="date (to)", description="date on which flight end/lands (DD/MM/YYYY)"),
+        ResponseSchema(name="time (to)", description="time at which flight end/lands"),
+        ResponseSchema(name="PNR Number", description ="PNR Number of flight"),
+        ResponseSchema(name="amount", description="cost of flight ticket")
+    ]"""
+
+    place_from: str = Field(..., title="place where flight starts/takes-off")
+    date_from: date = Field(
+        ..., title="date on which flight starts/takes-off (DD/MM/YYYY)"
+    )
+    time_from: time = Field(..., title="time at which flight starts/takes-off")
+    place_to: str = Field(..., title="place where flight ends/lands")
+    date_to: date = Field(..., title="date on which flight ends/lands (DD/MM/YYYY)")
+    time_to: time = Field(..., title="time at which flight ends/lands")
+    pnr_number: str = Field(..., title="PNR Number of flight")
+    amount: float = Field(..., title="cost of flight ticket")
categories/vendor/__init__.py
ADDED
@@ -0,0 +1,38 @@
+from .model import InformationExtractedFromABillReceipt as PydanticModel
+
+from langchain.chains import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+
+model = ChatOpenAI(
+    temperature=0,
+    n=1,
+    model_kwargs={
+        "stop": None,
+        "top_p": 1,
+        "frequency_penalty": 0,
+        "presence_penalty": 0,
+    },
+)
+
+# Build category chain
+system_message_prompt = SystemMessagePromptTemplate.from_template(
+    "You are an information extraction engine that outputs details from OCR processed "
+    "documents like uids, total, tax, addresses, bank details, invoice details, "
+    "participant registration details."
+    "{format_instructions}"
+)
+human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
+chat_prompt = ChatPromptTemplate.from_messages(
+    [system_message_prompt, human_message_prompt]
+)
+output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
+print(output_parser.get_format_instructions())
+# exit()
+fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
+chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/vendor/model.py
ADDED
@@ -0,0 +1,46 @@
+# generated by datamodel-codegen:
+# filename: schema.json
+# timestamp: 2023-07-28T11:36:16+00:00
+
+from __future__ import annotations
+
+from datetime import datetime
+
+from pydantic import BaseModel, Field, constr, validator, ValidationError
+
+
+class BankDetails(BaseModel):
+    """account holder name, bank name, account number, branch, ifs code, swift code"""
+
+    account_holder_name: str = Field(..., title="The name of the account holder")
+    bank_name: str = Field(..., title="The name of the bank")
+    account_number: str = Field(..., title="The account number")
+    branch: str = Field(..., title="The branch of the bank")
+    ifs_code: str = Field(..., title="The IFS code of the bank")
+    swift_code: str = Field(..., title="The SWIFT code of the bank")
+
+
+class InformationExtractedFromABillReceipt(BaseModel):
+    """
+    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
+    bank details: (account holder name, bank name, account number, branch, ifs code, swift
+    code), recipient, registration id, registration fee, registration date/time
+    """
+
+    gstin: constr(min_length=15) = Field(
+        ..., title="The alphanumeric GSTIN/GST number code"
+    )
+    billing_address: str = Field(..., title="The billing address")
+    invoice_number: str = Field(..., title="The invoice number")
+    invoice_date: datetime = Field(..., title="The date-time the invoice was issued")
+    due_date: datetime = Field(..., title="The date-time the invoice is due")
+    total: float = Field(..., title="Total amount or price")
+    balance_due: float = Field(..., title="The amount due")
+    bank_details: BankDetails = Field(..., title="Bank details")
+    recipient: str = Field(
+        ...,
+        title="Name of the person/entity that the invoice item was charged or delivered to",
+    )
+    registration_id: str = Field(..., title="The registration ID")
+    registration_fee: float = Field(..., title="The registration fee")
+    registration_date_time: datetime = Field(..., title="The registration date-time")
examples/example1.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
+size 45782
examples/rotated.jpeg
ADDED
Git LFS Details
examples/rotated.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
+size 333463
examples/upright.jpeg
ADDED
Git LFS Details
examples/upright.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
+size 325064
extract.py
ADDED
@@ -0,0 +1,67 @@
+"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
+"""
+from io import BytesIO
+from typing import List
+
+import pyocr.tesseract
+import pypdf
+from PIL import Image
+
+
+def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
+    """Extracts text from the given PDF file using pypdf.
+
+    Args:
+        bytes_stream (BytesIO): The PDF file to extract text from.
+
+    Returns: The extracted text
+    """
+    pdf_reader = pypdf.PdfReader(bytes_stream)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+        text += "\n\n"
+    return text
+
+
+def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
+    """Extracts text from the given image using tesseract via pyocr.
+
+    Args:
+        image(PIL.Image.Image): The image to extract text from.
+
+    Returns: The extracted text.
+    """
+    if not pyocr.tesseract.is_available():
+        raise Exception("Tesseract is not available.")
+    text = pyocr.tesseract.image_to_string(image, lang="eng")
+    return text
+
+
+def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
+    """Extracts text from the given images using tesseract via pyocr.
+
+    Args:
+        images(List[PIL.Image.Image]): The images to extract text from.
+
+    Returns: The extracted text.
+    """
+    text = ""
+    for image in images:
+        text += extract_text_from_image_pyocr_tesseract(image)
+        text += "\n\n"
+        image.close()
+    return text
+
+if __name__ == '__main__':
+    filename = 'examples/upright.pdf'
+    with open(filename, 'rb') as file:
+        bytes_stream = BytesIO(file.read())
+    text = extract_text_from_pdf_pypdf(bytes_stream)
+    print(text)
+    print("-"*25)
+    filename = 'examples/upright.jpeg'
+    image = Image.open(filename)
+    text = extract_text_from_image_pyocr_tesseract(image)
+    print(text)
+    image.close()
main.py
ADDED
@@ -0,0 +1,61 @@
+from pathlib import Path
+
+import categories
+import processing
+import extract
+from PIL import Image
+from pydantic import BaseModel
+from io import BytesIO
+
+def categorize_and_parse_text(text: str) -> BaseModel:
+    """Categorizes the text and parses the information from it.
+
+    Args:
+        text(str): The text to categorize and parse information from.
+
+    Returns: The information parsed from the text.
+    """
+    category = categories.categorize_text(text)
+    print("Categorized as category", category)
+    result = categories.run_category_chain(category, text)
+    return result
+
+def process_pdf(filename: Path) -> BaseModel:
+    """Processes the given PDF file and extracts information from it.
+
+    Args:
+        filename(Path): The PDF file to process.
+
+    Returns: The extracted information.
+    """
+    with open(filename, "rb") as f:
+        pdf_bytes = bytes(f.read())
+
+    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
+    # If the encoded text is too short, a pdf scanner probably added a watermark
+    if len(text) < 20:
+        # Try to extract text from images
+        images = processing.convert_pdf_to_image_pdf2image(pdf_bytes)
+        text = extract.extract_text_from_images_pyocr_tesseract(images)
+
+    result = categorize_and_parse_text(text)
+    return result
+
+def process_image(filename: Path) -> BaseModel:
+    """Processes the given image file and extracts information from it.
+
+    Args:
+        filename(Path): The image file to process.
+
+    Returns: The extracted information.
+    """
+    image = Image.open(filename)
+    text = extract.extract_text_from_image_pyocr_tesseract(image)
+    image.close()
+    result = categorize_and_parse_text(text)
+    return result
+
+if __name__ == "__main__":
+    filename = Path("examples/example1.pdf")
+    result = process_pdf(filename)
+    print(result.json(indent=4))
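
Note (editor): the __main__ block only exercises the PDF path. Running the image path against the bundled example is symmetrical; the sketch below assumes Tesseract is installed and OPENAI_API_KEY is set, mirroring the PDF example above.

# Sketch: the image entry point against the bundled example.
from pathlib import Path

from main import process_image

result = process_image(Path("examples/upright.jpeg"))
if result is not None:  # run_category_chain returns None if the chain errors
    print(result.json(indent=4))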
packages.txt
ADDED
@@ -0,0 +1 @@
+poppler-utils
processing.py
ADDED
@@ -0,0 +1,171 @@
+"""Responsible for (pre)processing images and PDFs before they are passed to the OCR
+engine and other miscellaneous actions concerning processing.
+"""
+import os
+from pathlib import Path
+from typing import List
+
+# import cv2
+# import numpy as np
+import pyocr
+from pdf2image import pdf2image
+from PIL import Image #, ImageOps
+
+PDF_CONVERSION_DPI = 300
+ROTATION_CONFIDENCE_THRESHOLD = 2.0
+
+# def rotate_image(image: Image, angle: float):
+#     """Rotates the given image by the given angle.
+
+#     Args:
+#         image(PIL.Image.Image): The image to be rotated.
+#         angle(float): The angle to rotate the image by.
+
+#     Returns: The rotated image.
+#     """
+#     image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+#     height, width, _ = image.shape  # Get the image height, width, and channels
+#     # Compute the rotation matrix
+#     rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
+#     # Apply the rotation to the image
+#     rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
+#     rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
+#     return rotated_image
+
+
+# class PDF_CONVERTER(enum.Enum):
+#     PDF2IMAGE = 1
+#     IMAGEMAGICK = 2
+
+
+def correct_orientation(image: Image.Image) -> Image.Image:
+    """Corrects the orientation of an image if it is not upright.
+
+    Args:
+        image(PIL.Image.Image): The pillow image to be corrected.
+
+    Returns: The corrected pillow image as a copy. The original image is not closed.
+    """
+    if not pyocr.tesseract.is_available():
+        raise Exception("Tesseract is not available.")
+
+    # image = ImageOps.exif_transpose(image)  # EXIF rotation is apparent, not actual
+    orientation_info = {}
+    try:
+        orientation_info = pyocr.tesseract.detect_orientation(image)
+    except pyocr.PyocrException as e:
+        print("Orientation detection failed: {}".format(e))
+    # output = pytesseract.image_to_osd(
+    #     image, config=" --psm 0", output_type=pytesseract.Output.DICT
+    # )
+    angle = orientation_info.get("angle", 0)
+    confidence = orientation_info.get("confidence", 100)
+    # rotate = output["rotate"]
+    # confidence = output["orientation_conf"]
+
+    if confidence > ROTATION_CONFIDENCE_THRESHOLD:
+        new_image = image.rotate(angle, expand=True)
+    else:
+        new_image = image.copy()
+    return new_image
+
+
+def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
+    """Converts a PDF to an image using pdf2image.
+
+    Args:
+        pdf_bytes(bytes): The bytes of the PDF to be converted.
+
+    Returns: A list of pillow images corresponding to each page from the PDF.
+    """
+    images = pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
+    return images
+
+
+def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
+    """Converts a PDF to an image using ImageMagick.
+
+    Args:
+        filename(pathlib.Path): The path to the PDF to be converted.
+        dest_folder(pathlib.Path): The destination folder for the converted pages. Pages
+                                   are saved in the folder as page.jpg or as page-01.jpg,
+                                   page-02.jpg, etc.
+
+    Returns: dest_folder
+    """
+    os.system(f"magick convert "  # trailing spaces keep the shell arguments separated
+              f"-density {PDF_CONVERSION_DPI} "
+              f"{filename} "
+              f"-quality 100 "
+              f"{dest_folder/'page.jpg'}")
+    return dest_folder
+
+
+def preprocess_image(image: Image.Image) -> Image.Image:
+    """Preprocesses an image for future use with OCR.
+    The following operations are performed:
+    1. Orientation correction
+
+    Args:
+        image(PIL.Image.Image): The image to be preprocessed.
+
+    Returns: The preprocessed pillow image.
+    """
+    rotated_image = correct_orientation(image)
+    result = rotated_image
+    image.close()
+    return result
+
+def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
+    """Preprocesses a PDF for future use with OCR.
+    The following operations are performed:
+    1. PDF to image conversion
+    2. Orientation correction
+
+    Args:
+        pdf_bytes(bytes): The bytes of the PDF to be preprocessed.
+
+    Returns: A list of pillow images corresponding to each page from the PDF.
+    """
+    images = convert_pdf_to_image_pdf2image(pdf_bytes)
+    result = []
+    for image in images:
+        new_image = preprocess_image(image)
+        image.close()
+        result.append(new_image)
+    return result
+
+def preprocess_pdf_ImageMagick(filename: Path, dest_folder: Path) -> List[Image.Image]:
+    """Preprocesses a PDF for future use with OCR.
+    The following operations are performed:
+    1. PDF to image conversion
+    2. Orientation correction
+
+    Args:
+        filename(pathlib.Path): The path to the PDF to be preprocessed.
+        dest_folder(pathlib.Path): The folder the converted pages are written to.
+
+    Returns: A list of pillow images corresponding to each page from the PDF.
+    """
+    dest_folder = convert_pdf_to_image_ImageMagick(filename, dest_folder)
+    result = []
+    for page_path in dest_folder.glob("*.jpg"):
+        image = Image.open(page_path)  # pages are written to disk by ImageMagick
+        new_image = preprocess_image(image)  # preprocess_image closes the source image
+        result.append(new_image)
+    return result
+
+if __name__ == '__main__':
+    filename = 'examples/upright.jpeg'
+    image = Image.open(filename)
+    new_image = preprocess_image(image)
+    image.close()
+    new_image.show()
+    new_image.close()
+
+    filename = 'examples/rotated.pdf'
+    with open(filename, 'rb') as file:
+        bytes_ = bytes(file.read())
+    images = preprocess_pdf_pdf2image(bytes_)
+    for image in images:
+        image.show()
+        image.close()
requirements.txt
ADDED
@@ -0,0 +1,346 @@
+aiobotocore==2.5.0
+aiofiles==22.1.0
+aiohttp==3.8.3
+aioitertools==0.7.1
+aiosignal==1.2.0
+aiosqlite==0.18.0
+alabaster==0.7.12
+anyio==3.5.0
+appdirs==1.4.4
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arrow==1.2.3
+astroid==2.14.2
+astropy==5.1
+asttokens==2.2.1
+async-timeout==4.0.2
+atomicwrites==1.4.0
+attrs==22.1.0
+Automat==20.2.0
+autopep8==1.6.0
+Babel==2.11.0
+backcall==0.2.0
+bcrypt==3.2.0
+beautifulsoup4==4.12.2
+binaryornot==0.4.4
+black==0.0
+bleach==4.1.0
+bokeh==3.1.1
+botocore==1.29.76
+Bottleneck==1.3.5
+brotlipy==0.7.0
+certifi==2023.7.22
+cffi==1.15.1
+chardet==4.0.0
+charset-normalizer==2.0.4
+click==8.0.4
+cloudpickle==2.2.1
+colorama==0.4.6
+colorcet==3.0.1
+comm==0.1.3
+constantly==15.1.0
+contourpy==1.0.5
+cookiecutter==1.7.3
+cryptography==39.0.1
+cssselect==1.1.0
+cycler==0.11.0
+cytoolz==0.12.0
+daal4py==2023.1.1
+dask==2023.6.0
+dataclasses-json==0.5.13
+datasets==2.12.0
+datashader==0.15.0
+datashape==0.5.4
+debugpy==1.6.7
+decorator==5.1.1
+defusedxml==0.7.1
+diff-match-patch==20200713
+dill==0.3.6
+distributed==2023.6.0
+docstring-to-markdown==0.11
+docutils==0.18.1
+entrypoints==0.4
+et-xmlfile==1.1.0
+exceptiongroup==1.0.4
+executing==1.2.0
+fastjsonschema==2.16.2
+filelock==3.9.0
+flake8==6.0.0
+Flask==2.2.2
+fonttools==4.25.0
+frozenlist==1.3.3
+fsspec==2023.4.0
+gensim==4.3.0
+greenlet==2.0.1
+h5py==3.7.0
+HeapDict==1.0.1
+holoviews==1.16.2
+huggingface-hub==0.15.1
+hvplot==0.8.4
+hyperlink==21.0.0
+idna==3.4
+imagecodecs==2021.8.26
+imageio==2.26.0
+imagesize==1.4.1
+imbalanced-learn==0.10.1
+importlib-metadata==6.0.0
+incremental==21.3.0
+inflection==0.5.1
+iniconfig==1.1.1
+intake==0.6.8
+intervaltree==3.1.0
+ipykernel==6.22.0
+ipython==8.12.0
+ipython-genutils==0.2.0
+ipywidgets==8.0.4
+iso4217==1.9.20220401
+isort==5.9.3
+itemadapter==0.3.0
+itemloaders==1.0.4
+itsdangerous==2.0.1
+jaraco.classes==3.2.1
+jedi==0.18.2
+jellyfish==0.9.0
+Jinja2==3.1.2
+jinja2-time==0.2.0
+jmespath==0.10.0
+joblib==1.2.0
+json5==0.9.6
+jsonschema==4.17.3
+jupyter==1.0.0
+jupyter_client==8.2.0
+jupyter-console==6.6.3
+jupyter_core==5.3.0
+jupyter-events==0.6.3
+jupyter-server==1.23.6
+jupyter_server_fileid==0.9.0
+jupyter_server_terminals==0.4.4
+jupyter_server_ydoc==0.8.0
+jupyter-ydoc==0.2.4
+jupyterlab==3.6.3
+jupyterlab-pygments==0.1.2
+jupyterlab_server==2.22.0
+jupyterlab-widgets==3.0.5
+keyring==23.13.1
+kiwisolver==1.4.4
+langchain==0.0.245
+langsmith==0.0.15
+lazy_loader==0.2
+lazy-object-proxy==1.6.0
+linkify-it-py==2.0.0
+llvmlite==0.40.0
+lmdb==1.4.1
+locket==1.0.0
+lxml==4.9.2
+lz4==4.3.2
+Markdown==3.4.1
+markdown-it-py==2.2.0
+MarkupSafe==2.1.1
+marshmallow==3.20.1
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mccabe==0.7.0
+mdit-py-plugins==0.3.0
+mdurl==0.1.0
+menuinst==1.4.19
+mistune==3.0.0
+mkl-fft==1.3.6
+mkl-random==1.2.2
+mkl-service==2.4.0
+more-itertools==8.12.0
+mpmath==1.2.1
+msgpack==1.0.3
+multidict==6.0.2
+multipledispatch==0.6.0
+multiprocess==0.70.14
+munkres==1.1.4
+mypy-extensions==0.4.3
+nbclassic==0.5.5
+nbclient==0.5.13
+nbconvert==7.7.3
+nbformat==5.7.0
+nest-asyncio==1.5.6
+networkx==2.8.4
+nltk==3.7
+notebook==6.5.4
+notebook_shim==0.2.2
+numba==0.57.0
+numexpr==2.8.4
+numpy==1.24.3
+numpydoc==1.5.0
+openai==0.27.8
+openapi-schema-pydantic==1.2.4
+opencv-python-headless==4.8.0.74
+openpyxl==3.0.10
+packaging==23.0
+pandas==1.5.3
+pandocfilters==1.5.0
+panel==1.1.0
+param==1.13.0
+paramiko==2.8.1
+parsel==1.6.0
+parso==0.8.3
+partd==1.2.0
+pathspec==0.10.3
+patsy==0.5.3
+pdf2image==1.16.3
+pep8==1.7.1
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.4.0
+pip==23.1.2
+platformdirs==3.5.0
+plotly==5.9.0
+pluggy==1.0.0
+ply==3.11
+pooch==1.4.0
+poyo==0.5.0
+prometheus-client==0.14.1
+prompt-toolkit==3.0.38
+Protego==0.1.16
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==8.0.0
+pyarrow==11.0.0
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycodestyle==2.10.0
+pycparser==2.21
+pyct==0.5.0
+pycurl==7.45.2
+pydantic==1.10.12
+PyDispatcher==2.0.5
+pydocstyle==6.3.0
+pyerfa==2.0.0
+pyflakes==3.0.1
+Pygments==2.15.1
+pylint==2.16.2
+pylint-venv==2.3.0
+pyls-spyder==0.4.0
+PyNaCl==1.5.0
+pyocr==0.8.3
+pyodbc==4.0.34
+pyOpenSSL==23.0.0
+pyparsing==3.0.9
+pypdf==3.13.0
+PyQt5==5.15.7
+PyQt5-sip==12.11.0
+PyQtWebEngine==5.15.4
+pyrsistent==0.18.0
+PySocks==1.7.1
+pytest==7.3.1
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+python-lsp-black==1.2.1
+python-lsp-jsonrpc==1.0.0
+python-lsp-server==1.7.2
+python-slugify==5.0.2
+python-snappy==0.6.1
+pytoolconfig==1.2.5
+pytz==2022.7
+pyviz-comms==2.3.0
+PyWavelets==1.4.1
+pywin32==305.1
+pywin32-ctypes==0.2.0
+pywinpty==2.0.10
+PyYAML==6.0
+pyzmq==25.0.2
+QDarkStyle==3.0.2
+qstylizer==0.2.2
+QtAwesome==1.2.2
+qtconsole==5.4.2
+QtPy==2.2.0
+queuelib==1.5.0
+regex==2022.7.9
+requests==2.29.0
+requests-file==1.5.1
+responses==0.13.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rope==1.7.0
+Rtree==1.0.1
+s3fs==2023.4.0
+sacremoses==0.0.43
+scikit-image==0.20.0
+scikit-learn==1.2.2
+scikit-learn-intelex==20230426.121158
+scipy==1.10.1
+Scrapy==2.8.0
+seaborn==0.12.2
+Send2Trash==1.8.0
+service-identity==18.1.0
+setuptools==67.8.0
+sip==6.6.2
+six==1.16.0
+smart-open==5.2.1
+sniffio==1.2.0
+snowballstemmer==2.2.0
+sortedcontainers==2.4.0
+soupsieve==2.4
+Sphinx==5.0.2
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+spyder==5.4.3
+spyder-kernels==2.4.3
+SQLAlchemy==1.4.39
+stack-data==0.6.2
+statsmodels==0.13.5
+sympy==1.11.1
+tables==3.8.0
+tabulate==0.8.10
+TBB==0.2
+tblib==1.7.0
+tenacity==8.2.2
+terminado==0.17.1
+text-unidecode==1.3
+textdistance==4.2.1
+threadpoolctl==2.2.0
+three-merge==0.1.1
+tifffile==2021.7.2
+tinycss2==1.2.1
+tldextract==3.2.0
+tokenizers==0.13.2
+toml==0.10.2
+tomli==2.0.1
+tomlkit==0.11.1
+toolz==0.12.0
+torch==2.0.1
+tornado==6.3.1
+tqdm==4.65.0
+traitlets==5.9.0
+transformers==4.29.2
+Twisted==22.10.0
+twisted-iocpsupport==1.0.2
+typing_extensions==4.6.3
+typing-inspect==0.9.0
+uc-micro-py==1.0.1
+ujson==5.4.0
+Unidecode==1.2.0
+urllib3==1.26.16
+w3lib==1.21.0
+watchdog==2.1.6
+wcwidth==0.2.6
+webencodings==0.5.1
+websocket-client==0.58.0
+Werkzeug==2.2.3
+whatthepatch==1.0.2
+wheel==0.38.4
+widgetsnbextension==4.0.5
+win-inet-pton==1.1.0
+wrapt==1.14.1
+xarray==2022.11.0
+xlwings==0.29.1
+xxhash==2.0.2
+xyzservices==2022.9.0
+y-py==0.5.9
+yapf==0.31.0
+yarl==1.8.1
+ypy-websocket==0.8.2
+zict==2.2.0
+zipp==3.11.0
+zope.interface==5.4.0