ankur-bohra committed on
Commit
317211f
1 Parent(s): 6db4a81

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
37
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .conda
2
+ temp*
3
+ __pycache__/
app.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
# Page heading for the demo app.
st.title("Automatic Reimbursement Tool Demo")

# Two side-by-side columns; only the first one is populated in this revision.
# NOTE(review): block nesting was reconstructed from a flattened diff view -
# confirm against the real file.
page = st.container()
with page:
    col1, col2 = st.columns(2)

    with col1:
        st.header("Input")
        # Accept the bill either as a PDF or as one of the listed image types.
        st.file_uploader(
            "Upload a PDF file or an image",
            type=["pdf", "png", "jpg", "jpeg"],
        )
categories/__init__.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+ from . import random_
4
+ from . import accomodation
5
+ from . import travel_cab
6
+ from . import travel_flight
7
+
8
+ # from . import vendor
9
+ from langchain.chains import LLMChain
10
+ from langchain.chat_models import ChatOpenAI
11
+ from langchain.output_parsers import PydanticOutputParser
12
+ from langchain.output_parsers.enum import EnumOutputParser
13
+ from langchain.prompts import (
14
+ ChatPromptTemplate,
15
+ HumanMessagePromptTemplate,
16
+ SystemMessagePromptTemplate,
17
+ )
18
+ from pydantic import BaseModel
19
+
20
+
21
class Category(Enum):
    """Bill categories known to the pipeline.

    Each member's value is the lowercase string label for that category.
    """

    ACCOMODATION = "accomodation"
    TRAVEL_FLIGHT = "travel_flight"
    TRAVEL_CAB = "travel_cab"
    # VENDOR = "vendor"  # currently disabled
    RANDOM = "random"
27
+
28
+
29
# Dispatch table: each Category maps to the sub-package whose `chain` and
# `output_parser` attributes are used to extract that category's fields.
category_modules = {
    Category.ACCOMODATION: accomodation,
    Category.TRAVEL_FLIGHT: travel_flight,
    Category.TRAVEL_CAB: travel_cab,
    # Category.VENDOR: vendor,
    Category.RANDOM: random_,
}

# Chat model used only for classification: temperature 0 and a single
# completion keep the answer as repeatable as the API allows.
model = ChatOpenAI(
    temperature=0,
    n=1,
    # max_tokens=300,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build categorizing chain.
# The category labels in the prompt are spelled exactly like the Category
# values, because the enum parser below only accepts those exact strings.
# The previous wording called both travel categories "travel" and hinted at a
# conference category that does not exist (vendor is disabled), both of which
# invite answers the parser would reject.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are a classifier that, given a bill's text, states what type of bill "
    "category it belongs to: accomodation (bills regarding stays), travel_cab (bills "
    "concerning cab or other land rides), travel_flight (bills concerning flights), "
    "random (bills concerning deliveries from e-commerce websites like amazon etc) "
    "bills.\n"
    "You may want to see if there are Room Details, Check-in/Check-out Date for "
    "accomodation; Flight Details for travel_flight; Train Details, Bus Details, Cab "
    "Details for travel_cab; anything else comes under the random category. Your "
    "answers must be only the appropriate choice e.g. 'option' and "
    "not 'The given bill belongs to the option category.'\n"
    "{format_instructions}"
)
# The human turn carries nothing but the raw bill text.
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Converts the model's one-word answer into a Category member.
category_parser = EnumOutputParser(enum=Category)
categorize_chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=category_parser
)
70
+
71
+
72
def categorize_text(text: str) -> Category:
    """Classify a bill's text into one of the ``Category`` members.

    The text is sent through ``categorize_chain`` together with the format
    instructions produced by ``category_parser``.

    Args:
        text(str): The text to categorize.

    Returns: The category of the text.
    """
    format_instructions = category_parser.get_format_instructions()
    return categorize_chain.run(text=text, format_instructions=format_instructions)
84
+
85
+
86
def run_category_chain(category: Category, text: str) -> BaseModel | None:
    """Run the extraction chain registered for ``category`` on ``text``.

    Args:
        category(Category): The category for which the chain is to be run.
        text(str): The text on which the chain is to be run.

    Returns: The output of the chain, or None if running the chain raised
        (the error is printed rather than propagated).

    Raises:
        KeyError: If ``category`` has no entry in ``category_modules``.
    """
    module = category_modules[category]
    output_parser = module.output_parser
    try:
        # The chain access and the format-instruction call stay inside the
        # ``try`` so that any failure there is reported the same way.
        return module.chain.run(
            text=text,
            format_instructions=output_parser.get_format_instructions(),
        )
    except Exception as e:
        # Best effort: report the failure and hand None back to the caller.
        print("Error in running chain for category", category, ":", e)
        return None
102
+
103
+
104
+ if __name__ == "__main__":
105
+ text = """amazonin
106
+ we)
107
+
108
+ Sold By :
109
+
110
+ Spigen India Pvt. Ltd.
111
+
112
+ * Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
113
+ 37//15/1, 15/2,, Adjacent to Starex School, Village
114
+ - Binola, National Highway -8, Tehsil - Manesar
115
+ Gurgaon, Haryana, 122413
116
+
117
+ IN
118
+
119
+ PAN No: ABACS5056L
120
+ GST Registration No: O6ABACS5056L12Z5
121
+
122
+ Order Number: 407-5335982-7837125
123
+ Order Date: 30.05.2023
124
+
125
+ Tax Invoice/Bill of Supply/Cash Memo
126
+ (Original for Recipient)
127
+
128
+ Billing Address :
129
+
130
+ Praveen Bohra
131
+
132
+ E-303, ParkView City 2, Sector 49, Sohna Road
133
+ GURGAON, HARYANA, 122018
134
+
135
+ IN
136
+
137
+ State/UT Code: 06
138
+
139
+ Shipping Address :
140
+
141
+ Praveen Bohra
142
+
143
+ Praveen Bohra
144
+
145
+ E-303, ParkView City 2, Sector 49, Sohna Road
146
+ GURGAON, HARYANA, 122018
147
+
148
+ IN
149
+
150
+ State/UT Code: 06
151
+
152
+ Place of supply: HARYANA
153
+
154
+ Place of delivery: HARYANA
155
+
156
+ Invoice Number : DEL5-21033
157
+ Invoice Details : HR-DEL5-918080915-2324
158
+ Invoice Date : 30.05.2023
159
+
160
+ Description at Tax |Tax /|Tax Total
161
+ p y Rate |Type |Amount|Amount
162
+
163
+ Black) | BO8BHLZHBH ( ACS01744INP )
164
+ HSN:39269099
165
+
166
+ 1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
167
+ 1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
168
+ 9% |SGST| %76.19
169
+
170
+ TOTAL:
171
+
172
+ Amount in Words:
173
+ Nine Hundred Ninety-nine only
174
+
175
+ Whether tax is payable under reverse charge - No
176
+
177
+ For Spigen India Pvt. Ltd.:
178
+ sSoigenrn
179
+
180
+ Authorized Signatory
181
+
182
+ Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
183
+ 2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
184
+
185
+ *ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
186
+
187
+ Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
188
+
189
+ Please note that this invoice is not a demand for payment
190
+
191
+ Page 1 of 1"""
192
+ category = categorize_text(text)
193
+ print("Category:", category)
194
+
195
+ print("\n\n")
196
+ result = run_category_chain(category, text)
197
+ print(result)
categories/accomodation/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Chat model for hotel-bill extraction.
# NOTE(review): temperature is 0.6 here while the sibling category modules
# use 0 - confirm that the extra randomness is intended for an extraction task.
model = ChatOpenAI(
    temperature=0.6,
    max_tokens=300,
    n=1,
    request_timeout=None,
    model_kwargs={
        "stop": None,
        "top_p": 1,
    },
)

# Build category chain: a system message describing the task (followed by the
# parser's format instructions) and a human message carrying the bill text.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are tasked with developing an OCR data extraction system for hotel bills in PDF "
    "format given as text. The system should extract important information necessary for "
    "the reimbursement process from a college. Your prompt should fetch the following "
    "essential details from the hotel bill: hotel name, address, bill number/invoice "
    "number, booking ID / confirmation ID / booking number, check-in date and time, "
    "check-out date and time, total amount, booking platform, bill date.\n"
    "Ensure that the system accurately extracts the above information from the OCR text "
    "of the hotel bill.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# Strict parser for the pydantic model, wrapped in a fixing parser that is
# backed by the same chat model.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/accomodation/model.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class InformationExtractedFromABillReceipt(BaseModel):
    """
    1. Hotel Name: [Hotel Name]
    2. Address: [Hotel Address]
    3. Bill number/Invoice number: [Bill Number]
    4. booking ID / Confirmation ID / Booking #: [Booking ID]
    5. Check-in Date and Time: [Check-in Date Time]
    6. Check-out Date and Time: [Check-out Date Time]
    7. Total Amount: [Total Amount Charged]
    8. Booking platform: [Booking Platform]
    9. Bill date: [Bill Date]
    """

    # The docstring above is left verbatim: pydantic exposes it as the schema
    # description. All fields below are required.

    # NOTE(review): the field is spelled "hostel" while its title and the
    # docstring say "hotel" - likely a typo, but renaming it would change the
    # emitted key and attribute name.
    hostel_name: str = Field(..., title="The name of the hotel")
    address: str = Field(..., title="The address of the hotel")

    # Identifiers printed on the bill.
    bill_number: str = Field(..., title="The bill number/invoice number")
    booking_id: str = Field(
        ..., title="The booking ID/confirmation ID/booking number"
    )

    # Stay period.
    check_in_date_time: datetime = Field(..., title="The check-in date and time")
    check_out_date_time: datetime = Field(..., title="The check-out date and time")

    # Payment details.
    total_amount_charged: float = Field(..., title="The total amount charged")
    booking_platform: str = Field(..., title="The booking platform")
    bill_date: datetime = Field(..., title="The bill date")
categories/random_/__init__.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Chat model for generic invoice extraction: temperature 0, one completion.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build category chain: a system message describing the task (followed by the
# parser's format instructions) and a human message carrying the bill text.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, name, currency, date, seller details, summary. You "
    "may use context to make an educated guess about the currency. Use null if you are "
    "unable to find certain details\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# Strict parser for the pydantic model, wrapped in a fixing parser that is
# backed by the same chat model.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
38
+
39
+ if __name__ == "__main__":
40
+ text = """amazonin
41
+ we)
42
+
43
+ Sold By :
44
+
45
+ Spigen India Pvt. Ltd.
46
+
47
+ * Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
48
+ 37//15/1, 15/2,, Adjacent to Starex School, Village
49
+ - Binola, National Highway -8, Tehsil - Manesar
50
+ Gurgaon, Haryana, 122413
51
+
52
+ IN
53
+
54
+ PAN No: ABACS5056L
55
+ GST Registration No: O6ABACS5056L12Z5
56
+
57
+ Order Number: 407-5335982-7837125
58
+ Order Date: 30.05.2023
59
+
60
+ Tax Invoice/Bill of Supply/Cash Memo
61
+ (Original for Recipient)
62
+
63
+ Billing Address :
64
+
65
+ Praveen Bohra
66
+
67
+ E-303, ParkView City 2, Sector 49, Sohna Road
68
+ GURGAON, HARYANA, 122018
69
+
70
+ IN
71
+
72
+ State/UT Code: 06
73
+
74
+ Shipping Address :
75
+
76
+ Praveen Bohra
77
+
78
+ Praveen Bohra
79
+
80
+ E-303, ParkView City 2, Sector 49, Sohna Road
81
+ GURGAON, HARYANA, 122018
82
+
83
+ IN
84
+
85
+ State/UT Code: 06
86
+
87
+ Place of supply: HARYANA
88
+
89
+ Place of delivery: HARYANA
90
+
91
+ Invoice Number : DEL5-21033
92
+ Invoice Details : HR-DEL5-918080915-2324
93
+ Invoice Date : 30.05.2023
94
+
95
+ Description at Tax |Tax /|Tax Total
96
+ p y Rate |Type |Amount|Amount
97
+
98
+ Black) | BO8BHLZHBH ( ACS01744INP )
99
+ HSN:39269099
100
+
101
+ 1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
102
+ 1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
103
+ 9% |SGST| %76.19
104
+
105
+ TOTAL:
106
+
107
+ Amount in Words:
108
+ Nine Hundred Ninety-nine only
109
+
110
+ Whether tax is payable under reverse charge - No
111
+
112
+ For Spigen India Pvt. Ltd.:
113
+ sSoigenrn
114
+
115
+ Authorized Signatory
116
+
117
+ Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
118
+ 2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
119
+
120
+ *ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
121
+
122
+ Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
123
+
124
+ Please note that this invoice is not a demand for payment
125
+
126
+ Page 1 of 1"""
127
+ result = chain.run(text=text, format_instructions=fixing_parser.get_format_instructions())
128
+ print(result.json(indent=4))
categories/random_/model.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generated by datamodel-codegen:
2
+ # filename: schema.json
3
+ # timestamp: 2023-07-28T11:36:16+00:00
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import date
8
+ from typing import Dict, Optional, Union
9
+
10
+ import iso4217
11
+ from pydantic import BaseModel, Field, constr, validator, ValidationError
12
+
13
+
14
class TaxItem(BaseModel):
    # GST variant of the invoice's tax amount: one required float.
    gst: float = Field(
        ...,
        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
    )
19
+
20
+
21
class TaxItem1(BaseModel):
    # VAT variant of the invoice's tax amount: one required float.
    vat: float = Field(
        ...,
        title="The total VAT present in the invoice",
    )
23
+
24
+
25
class TaxNumberItem(BaseModel):
    # GST registration number; constrained to at least 15 characters.
    gst_number: constr(min_length=15) = Field(
        ...,
        title="The alphanumeric GSTIN/GST number code",
    )
29
+
30
+
31
class TaxNumberItem1(BaseModel):
    # VAT/TIN registration number, free-form string.
    vat_number: str = Field(
        ...,
        title="The VAT/TIN number present in older invoices",
    )
33
+
34
+
35
class TaxNumberItem2(BaseModel):
    # UIN registration number, free-form string.
    # NOTE(review): the field is named ``ui_number`` although the title says
    # "UIN" - confirm whether the name is intentional.
    ui_number: str = Field(
        ...,
        title="The tax UIN issued to foreign entities",
    )
37
+
38
+
39
class SellerDetails(BaseModel):
    # Optional descriptive fields; each defaults to None when absent.
    name: Optional[str] = None
    address: Optional[str] = None
    contact: Optional[str] = None

    # Required: exactly one of the GST / VAT / UIN number shapes.
    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
        ...,
        title="Tax information",
    )

    # Required: constrained to exactly 10 characters.
    pan_number: constr(min_length=10, max_length=10) = Field(
        ...,
        title="The 10-character alphanumeric PAN code",
    )
49
+
50
+
51
class UIDs(BaseModel):
    # The invoice number is kept separate from every other identifier.
    invoice_number: str = Field(
        ...,
        title="The invoice number",
    )

    # Any remaining identifiers, as a mapping from label to value.
    other_uids: Dict[str, str] = Field(
        ...,
        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
    )
57
+
58
+
59
class InformationExtractedFromABillReceipt(BaseModel):
    # Top-level result for a generic ("random") invoice. No docstring is added
    # on purpose: pydantic would publish it as the schema description.

    uids: UIDs = Field(..., title="Invoice number and other UIDs")
    total: float = Field(..., title="Total amount or price")
    # Either the GST or the VAT shape of the tax amount.
    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
    name: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    # The only field with a default; checked by ``check_currency`` below.
    currency: str = Field(
        default="INR",
        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
    )
    date: date = Field(
        ..., title="The date the invoice was issued"
    )
    seller_details: SellerDetails = Field(..., title="Information about the seller")
    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")

    @validator("currency")
    @classmethod
    def check_currency(cls, v: str) -> str:
        """Validate ``currency`` as an ISO 4217 code and normalize it to upper case.

        Args:
            v: The currency code supplied for the field.

        Returns: The code in upper case.

        Raises:
            ValueError: If the lowercased code is not a member name of
                ``iso4217.Currency``.
        """
        # The lookup is done on the lowercased code against the enum's member names.
        if not iso4217.Currency.__members__.get(v.lower()):
            # Validators signal failure with ValueError. pydantic's own
            # ValidationError cannot be constructed from a bare message, so
            # the intended text was never reported.
            raise ValueError(f"{v} is not a valid ISO 4217 currency code")
        return v.upper()
categories/travel_cab/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Chat model for cab/land-travel bill extraction: temperature 0, one completion.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build category chain: a system message describing the task (followed by the
# parser's format instructions) and a human message carrying the bill text.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents such as date/time/place of departure and arrival.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# Strict parser for the pydantic model, wrapped in a fixing parser that is
# backed by the same chat model.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/travel_cab/model.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import date, time
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class InformationExtractedFromABillReceipt(BaseModel):
    ''''''

    # The empty docstring above is kept as-is. All fields below are required.

    # Start of the journey.
    place_from: str = Field(..., title="place where journey starts")
    date_from: date = Field(..., title="date on which journey starts (DD/MM/YYYY)")
    time_from: time = Field(..., title="time at which journey starts")

    # End of the journey.
    place_to: str = Field(..., title="place where journey end")
    date_to: date = Field(..., title="date on which journey end (DD/MM/YYYY)")
    time_to: time = Field(..., title="time at which journey end")

    # Fare.
    amount: float = Field(..., title="cost of journey ticket")
categories/travel_flight/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ )
10
+
11
# Chat model for flight-bill extraction; only the temperature is overridden.
model = ChatOpenAI(temperature=0)

# Build category chain. Unlike the sibling modules this prompt is a single
# human message holding the task, the format instructions and the bill text.
human_message_prompt = HumanMessagePromptTemplate.from_template(
    "Parse through and find the following details from the text extracted from a travel "
    "bill\n"
    "{format_instructions}\n"
    "{text}"
)
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])

# Strict parser for the pydantic model, wrapped in a fixing parser that is
# backed by the same chat model.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/travel_flight/model.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import date, time
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class InformationExtractedFromABillReceipt(BaseModel):
    """
    response_schemas = [
        ResponseSchema(name="place (from)", description="place where flight starts/takes-off"),
        ResponseSchema(name="date (from)", description="date on which flight starts/takes-off (DD/MM/YYYY)"),
        ResponseSchema(name="time (from)", description="time at which flight starts/takes-off"),
        ResponseSchema(name="place (to)", description="place where flight end/lands"),
        ResponseSchema(name="date (to)", description="date on which flight end/lands (DD/MM/YYYY)"),
        ResponseSchema(name="time (to)", description="time at which flight end/lands"),
        ResponseSchema(name="PNR Number", description ="PNR Number of flight"),
        ResponseSchema(name="amount", description="cost of flight ticket")
    ]"""

    # NOTE(review): the docstring text is kept verbatim because pydantic
    # exposes it as the schema description; its inner indentation was
    # reconstructed from a flattened view - confirm against the real file.
    # All fields below are required.

    # Departure.
    place_from: str = Field(..., title="place where flight starts/takes-off")
    date_from: date = Field(
        ...,
        title="date on which flight starts/takes-off (DD/MM/YYYY)",
    )
    time_from: time = Field(..., title="time at which flight starts/takes-off")

    # Arrival.
    place_to: str = Field(..., title="place where flight end/lands")
    date_to: date = Field(..., title="date on which flight end/lands (DD/MM/YYYY)")
    time_to: time = Field(..., title="time at which flight end/lands")

    # Booking reference and fare.
    pnr_number: str = Field(..., title="PNR Number of flight")
    amount: float = Field(..., title="cost of flight ticket")
categories/vendor/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import InformationExtractedFromABillReceipt as PydanticModel
2
+
3
+ from langchain.chains import LLMChain
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
6
+ from langchain.prompts import (
7
+ ChatPromptTemplate,
8
+ HumanMessagePromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ )
11
+
12
# Chat model for vendor/registration invoice extraction: temperature 0, one
# completion.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build category chain: a system message describing the task (followed by the
# parser's format instructions) and a human message carrying the bill text.
# A newline now separates the task sentence from the format instructions, as
# in the sibling category prompts; previously the two were glued together.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, addresses, bank details, invoice details, "
    "participant registration details.\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
# NOTE(review): this print runs every time the module is imported and looks
# like a debugging leftover - consider removing it once nothing relies on it.
print(output_parser.get_format_instructions())
# Fixing parser backed by the same chat model, wrapping the strict parser.
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
categories/vendor/model.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generated by datamodel-codegen:
2
+ # filename: schema.json
3
+ # timestamp: 2023-07-28T11:36:16+00:00
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import datetime
8
+
9
+ from pydantic import BaseModel, Field, constr, validator, ValidationError
10
+
11
+
12
class BankDetails(BaseModel):
    """account holder name, bank name, account number, branch, ifs code, swift code"""

    # The docstring above is left verbatim: pydantic exposes it as the schema
    # description. All fields are required strings.

    # Account identity.
    account_holder_name: str = Field(..., title="The name of the account holder")
    account_number: str = Field(..., title="The account number")

    # Bank identity and routing codes.
    bank_name: str = Field(..., title="The name of the bank")
    branch: str = Field(..., title="The branch of the bank")
    ifs_code: str = Field(..., title="The IFS code of the bank")
    swift_code: str = Field(..., title="The SWIFT code of the bank")
21
+
22
+
23
class InformationExtractedFromABillReceipt(BaseModel):
    """
    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
    bank details: (account holder name, bank name, account number, branch, ifs code, swift
    code), recipient, registration id, registration fee, registration date/time
    """

    # The docstring above is left verbatim: pydantic exposes it as the schema
    # description. All fields below are required.

    # Seller tax id; constrained to at least 15 characters.
    gstin: constr(min_length=15) = Field(
        ...,
        title="The alphanumeric GSTIN/GST number code",
    )
    billing_address: str = Field(..., title="The billing address")

    # Invoice identity and dates.
    invoice_number: str = Field(..., title="The invoice number")
    invoice_date: datetime = Field(..., title="The date-time the invoice was issued")
    due_date: datetime = Field(..., title="The date-time the invoice is due")

    # Amounts.
    total: float = Field(..., title="Total amount or price")
    balance_due: float = Field(..., title="The amount due")

    # Where the payment goes and who is billed.
    bank_details: BankDetails = Field(..., title="Bank details")
    recipient: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )

    # Registration details.
    registration_id: str = Field(..., title="The registration ID")
    registration_fee: float = Field(..., title="The registration fee")
    registration_date_time: datetime = Field(..., title="The registration date-time")
examples/example1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
3
+ size 45782
examples/rotated.jpeg ADDED

Git LFS Details

  • SHA256: e98aa24e25b2c3f277c237664cba4616fbe5d80fe3099459fb81e2ef3720d23c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
examples/rotated.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
3
+ size 333463
examples/upright.jpeg ADDED

Git LFS Details

  • SHA256: 728be2c94b4af573145e5e89ffe5c3dfddb12a3055b85e60a23bd7697cff83f7
  • Pointer size: 132 Bytes
  • Size of remote file: 2.93 MB
examples/upright.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
3
+ size 325064
extract.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responsible for extracting text from images and PDFs using OCR engines or other modules.
2
+ """
3
+ from io import BytesIO
4
+ from typing import List
5
+
6
+ import pyocr.tesseract
7
+ import pypdf
8
+ from PIL import Image
9
+
10
+
11
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Read the text layer of a PDF with pypdf.

    Args:
        bytes_stream (BytesIO): The PDF file to extract text from.

    Returns: The text of every page in order, each page followed by a blank
        line ("\\n\\n").
    """
    reader = pypdf.PdfReader(bytes_stream)
    # One chunk per page, separator included, joined once at the end.
    return "".join(page.extract_text() + "\n\n" for page in reader.pages)
25
+
26
+
27
def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
    """Run tesseract (through pyocr) on a single image.

    The image is not closed by this function.

    Args:
        image(PIL.Image.Image): The image to extract text from.

    Returns: The extracted text.

    Raises:
        Exception: If pyocr reports that tesseract is not available.
    """
    if pyocr.tesseract.is_available():
        # English is the only language requested.
        return pyocr.tesseract.image_to_string(image, lang="eng")
    raise Exception("Tesseract is not available.")
39
+
40
+
41
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extracts text from the given images using tesseract via pyocr.

    This function takes ownership of the images: every image is closed, even
    if OCR fails part-way through.

    Args:
        images(List[PIL.Image.Image]): The images to extract text from. They
            are closed by this call.

    Returns: The extracted text of all images in order, each image's text
        followed by a blank line ("\\n\\n").

    Raises:
        Exception: Whatever the single-image extractor raises. The images are
            closed before the error propagates.
    """
    chunks = []
    # A shared iterator lets the outer ``finally`` see which images were never
    # reached when an error interrupts the loop.
    remaining = iter(images)
    try:
        for image in remaining:
            try:
                chunks.append(extract_text_from_image_pyocr_tesseract(image))
                chunks.append("\n\n")
            finally:
                # Close each image as soon as it has been processed.
                image.close()
    finally:
        # Empty on the normal path; on failure this closes the unprocessed rest.
        for image in remaining:
            image.close()
    return "".join(chunks)
55
+
56
if __name__ == "__main__":
    # Manual smoke test, part 1: print the text layer of a sample PDF.
    filename = "examples/upright.pdf"
    with open(filename, "rb") as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)

    print("-" * 25)

    # Part 2: OCR a sample image and print the result.
    filename = "examples/upright.jpeg"
    image = Image.open(filename)
    text = extract_text_from_image_pyocr_tesseract(image)
    print(text)
    image.close()
main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import categories
4
+ import processing
5
+ import extract
6
+ from PIL import Image
7
+ from pydantic import BaseModel
8
+ from io import BytesIO
9
+
10
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorize the text, then run that category's extraction chain on it.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: Whatever ``categories.run_category_chain`` returns for the
        detected category, i.e. the parsed information.
        NOTE(review): that call may yield None when the chain fails - confirm
        and widen the annotation if so.
    """
    detected = categories.categorize_text(text)
    print("Categorized as category", detected)
    return categories.run_category_chain(detected, text)
22
+
23
def process_pdf(filename: Path) -> BaseModel:
    """Processes the given PDF file and extracts information from it.

    The PDF's text layer is tried first. If it holds almost no text, the pages
    are rendered to images and passed through OCR instead.

    Args:
        filename(Path): The PDF file to process.

    Returns: The extracted information.
    """
    # Fewer non-whitespace characters than this counts as "no usable text".
    min_text_chars = 20

    pdf_bytes = Path(filename).read_bytes()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
    # If the encoded text is too short, a pdf scanner probably added a watermark.
    # Only non-whitespace characters are counted: the pypdf extractor appends
    # "\n\n" after every page, so a long scanned PDF with no text layer would
    # otherwise pass a plain length check and skip the OCR fallback.
    if len("".join(text.split())) < min_text_chars:
        # Try to extract text from images
        images = processing.convert_pdf_to_image_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    return categorize_and_parse_text(text)
43
+
44
def process_image(filename: Path) -> BaseModel:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.

    Returns: The extracted information.
    """
    image = Image.open(filename)
    try:
        text = extract.extract_text_from_image_pyocr_tesseract(image)
    finally:
        # Close the image even when OCR raises, so the file handle is not leaked.
        image.close()
    return categorize_and_parse_text(text)
57
+
58
if __name__ == "__main__":
    # Manual smoke test on a bundled example bill.
    filename = Path("examples/example1.pdf")
    result = process_pdf(filename)
    # The category chain returns None when it fails, so guard before
    # serializing instead of crashing on ``None.json``.
    if result is None:
        print("No information could be extracted from", filename)
    else:
        print(result.json(indent=4))
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ poppler-utils
processing.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Responsible for (pre)processing images and PDFs before they are passed to the OCR
2
+ engine and other miscellaneous actions concerning processing.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ # import cv2
9
+ # import numpy as np
10
+ import pyocr
11
+ from pdf2image import pdf2image
12
+ from PIL import Image #, ImageOps
13
+
14
# Resolution used when rendering PDF pages to images (passed as ``dpi`` to
# pdf2image and as ``-density`` to ImageMagick).
PDF_CONVERSION_DPI = 300
# Orientation detection results are only acted upon when their reported
# confidence is strictly greater than this value.
ROTATION_CONFIDENCE_THRESHOLD = 2.0
16
+
17
+ # def rotate_image(image: Image, angle: float):
18
+ # """Rotates the given image by the given angle.
19
+
20
+ # Args:
21
+ # image(PIL.Image.Image): The image to be rotated.
22
+ # angle(float): The angle to rotate the image by.
23
+
24
+ # Returns: The rotated image.
25
+ # """
26
+ # image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
27
+ # height, width, _ = image.shape # Get the image height, width, and channels
28
+ # # Compute the rotation matrix
29
+ # rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
30
+ # # Apply the rotation to the image
31
+ # rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
32
+ # rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
33
+ # return rotated_image
34
+
35
+
36
+ # class PDF_CONVERTER(enum.Enum):
37
+ # PDF2IMAGE = 1
38
+ # IMAGEMAGICK = 2
39
+
40
+
41
def correct_orientation(image: Image.Image) -> Image.Image:
    """Return an upright version of the image based on Tesseract's detection.

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: A new pillow image (rotated, or a plain copy when the detection
        confidence is not above ROTATION_CONFIDENCE_THRESHOLD). The original
        image is not closed.

    Raises:
        Exception: If Tesseract is not available.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    try:
        detection = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as error:
        print("Orientation detection failed: {}".format(error))
        # With no detection data the defaults below apply: angle 0 and
        # confidence 100, i.e. the image is "rotated" by zero degrees.
        detection = {}

    angle = detection.get("angle", 0)
    confidence = detection.get("confidence", 100)

    if confidence > ROTATION_CONFIDENCE_THRESHOLD:
        return image.rotate(angle, expand=True)
    return image.copy()
71
+
72
+
73
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Render every page of an in-memory PDF to an image with pdf2image.

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be converted.

    Returns: A list of pillow images corresponding to each page from the PDF,
        rendered at PDF_CONVERSION_DPI.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
83
+
84
+
85
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to an image using ImageMagick.

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages.
                                   The output target is ``page.jpg`` inside this
                                   folder; for multi-page PDFs ImageMagick writes
                                   numbered variants instead (e.g. page-0.jpg,
                                   page-1.jpg).

    Returns: dest_folder

    Raises:
        FileNotFoundError: If the ``magick`` executable cannot be found.
        subprocess.CalledProcessError: If ImageMagick exits with a non-zero
            status.
    """
    # Imported locally so this block does not depend on a file-level import.
    import subprocess

    # Each argument is a separate list element. The previous implementation
    # glued adjacent f-strings together without spaces, producing an
    # unrunnable command, and interpolated paths into a shell string.
    command = [
        "magick",
        "convert",
        "-density",
        str(PDF_CONVERSION_DPI),
        str(filename),
        "-quality",
        "100",
        str(Path(dest_folder) / "page.jpg"),
    ]
    # check=True surfaces conversion failures instead of silently returning
    # a folder that contains no pages.
    subprocess.run(command, check=True)
    return dest_folder
102
+
103
+
104
def preprocess_image(image: Image.Image) -> Image.Image:
    """Prepare a single image for OCR.

    Steps performed:
        1. Orientation correction

    Note that the image passed in is closed by this function; only the
    returned image remains usable.

    Args:
        image(PIL.Image.Image): The image to be preprocessed.

    Returns: The preprocessed pillow image.
    """
    upright = correct_orientation(image)
    image.close()
    return upright
118
+
119
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Prepare an in-memory PDF for OCR.

    Steps performed:
        1. PDF to image conversion
        2. Orientation correction

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    processed_pages = []
    for page in convert_pdf_to_image_pdf2image(pdf_bytes):
        processed = preprocess_image(page)
        # preprocess_image already closes its input; this extra close is kept
        # from the original code.
        page.close()
        processed_pages.append(processed)
    return processed_pages
137
+
138
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
        1. PDF to image conversion
        2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF,
        in page order.
    """
    # Imported locally so this block does not depend on file-level imports.
    import re
    import tempfile

    def page_number(path: Path) -> int:
        # Numbered outputs end in digits (e.g. "page-10"); sort on that number
        # so page 10 does not come before page 2. A lone "page.jpg" sorts as 0.
        match = re.search(r"(\d+)$", path.stem)
        return int(match.group(1)) if match else 0

    result = []
    # The previous code read ``dest_folder`` before it was ever assigned.
    # Pages are rendered into a temporary folder that is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dest_folder = convert_pdf_to_image_ImageMagick(filename, Path(tmp_dir))
        for page_path in sorted(dest_folder.glob("*.jpg"), key=page_number):
            # glob yields paths, not images: open each file before handing it
            # to preprocess_image. The returned image comes from
            # correct_orientation, documented as a copy — presumably
            # independent of the file once the folder is gone; confirm.
            with Image.open(page_path) as page:
                result.append(preprocess_image(page))
    return result
156
+
157
if __name__ == '__main__':
    # Manual smoke test: display the preprocessed example image, then every
    # preprocessed page of the example PDF.
    image_path = 'examples/upright.jpeg'
    source_image = Image.open(image_path)
    upright_image = preprocess_image(source_image)
    source_image.close()
    upright_image.show()
    upright_image.close()

    pdf_path = 'examples/rotated.pdf'
    with open(pdf_path, 'rb') as pdf_file:
        pdf_data = pdf_file.read()
    for page_image in preprocess_pdf_pdf2image(pdf_data):
        page_image.show()
        page_image.close()
requirements.txt ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiobotocore==2.5.0
2
+ aiofiles==22.1.0
3
+ aiohttp==3.8.3
4
+ aioitertools==0.7.1
5
+ aiosignal==1.2.0
6
+ aiosqlite==0.18.0
7
+ alabaster==0.7.12
8
+ anyio==3.5.0
9
+ appdirs==1.4.4
10
+ argon2-cffi==21.3.0
11
+ argon2-cffi-bindings==21.2.0
12
+ arrow==1.2.3
13
+ astroid==2.14.2
14
+ astropy==5.1
15
+ asttokens==2.2.1
16
+ async-timeout==4.0.2
17
+ atomicwrites==1.4.0
18
+ attrs==22.1.0
19
+ Automat==20.2.0
20
+ autopep8==1.6.0
21
+ Babel==2.11.0
22
+ backcall==0.2.0
23
+ bcrypt==3.2.0
24
+ beautifulsoup4==4.12.2
25
+ binaryornot==0.4.4
26
+ black==0.0
27
+ bleach==4.1.0
28
+ bokeh==3.1.1
29
+ botocore==1.29.76
30
+ Bottleneck==1.3.5
31
+ brotlipy==0.7.0
32
+ certifi==2023.7.22
33
+ cffi==1.15.1
34
+ chardet==4.0.0
35
+ charset-normalizer==2.0.4
36
+ click==8.0.4
37
+ cloudpickle==2.2.1
38
+ colorama==0.4.6
39
+ colorcet==3.0.1
40
+ comm==0.1.3
41
+ constantly==15.1.0
42
+ contourpy==1.0.5
43
+ cookiecutter==1.7.3
44
+ cryptography==39.0.1
45
+ cssselect==1.1.0
46
+ cycler==0.11.0
47
+ cytoolz==0.12.0
48
+ daal4py==2023.1.1
49
+ dask==2023.6.0
50
+ dataclasses-json==0.5.13
51
+ datasets==2.12.0
52
+ datashader==0.15.0
53
+ datashape==0.5.4
54
+ debugpy==1.6.7
55
+ decorator==5.1.1
56
+ defusedxml==0.7.1
57
+ diff-match-patch==20200713
58
+ dill==0.3.6
59
+ distributed==2023.6.0
60
+ docstring-to-markdown==0.11
61
+ docutils==0.18.1
62
+ entrypoints==0.4
63
+ et-xmlfile==1.1.0
64
+ exceptiongroup==1.0.4
65
+ executing==1.2.0
66
+ fastjsonschema==2.16.2
67
+ filelock==3.9.0
68
+ flake8==6.0.0
69
+ Flask==2.2.2
70
+ fonttools==4.25.0
71
+ frozenlist==1.3.3
72
+ fsspec==2023.4.0
73
+ gensim==4.3.0
74
+ greenlet==2.0.1
75
+ h5py==3.7.0
76
+ HeapDict==1.0.1
77
+ holoviews==1.16.2
78
+ huggingface-hub==0.15.1
79
+ hvplot==0.8.4
80
+ hyperlink==21.0.0
81
+ idna==3.4
82
+ imagecodecs==2021.8.26
83
+ imageio==2.26.0
84
+ imagesize==1.4.1
85
+ imbalanced-learn==0.10.1
86
+ importlib-metadata==6.0.0
87
+ incremental==21.3.0
88
+ inflection==0.5.1
89
+ iniconfig==1.1.1
90
+ intake==0.6.8
91
+ intervaltree==3.1.0
92
+ ipykernel==6.22.0
93
+ ipython==8.12.0
94
+ ipython-genutils==0.2.0
95
+ ipywidgets==8.0.4
96
+ iso4217==1.9.20220401
97
+ isort==5.9.3
98
+ itemadapter==0.3.0
99
+ itemloaders==1.0.4
100
+ itsdangerous==2.0.1
101
+ jaraco.classes==3.2.1
102
+ jedi==0.18.2
103
+ jellyfish==0.9.0
104
+ Jinja2==3.1.2
105
+ jinja2-time==0.2.0
106
+ jmespath==0.10.0
107
+ joblib==1.2.0
108
+ json5==0.9.6
109
+ jsonschema==4.17.3
110
+ jupyter==1.0.0
111
+ jupyter_client==8.2.0
112
+ jupyter-console==6.6.3
113
+ jupyter_core==5.3.0
114
+ jupyter-events==0.6.3
115
+ jupyter-server==1.23.6
116
+ jupyter_server_fileid==0.9.0
117
+ jupyter_server_terminals==0.4.4
118
+ jupyter_server_ydoc==0.8.0
119
+ jupyter-ydoc==0.2.4
120
+ jupyterlab==3.6.3
121
+ jupyterlab-pygments==0.1.2
122
+ jupyterlab_server==2.22.0
123
+ jupyterlab-widgets==3.0.5
124
+ keyring==23.13.1
125
+ kiwisolver==1.4.4
126
+ langchain==0.0.245
127
+ langsmith==0.0.15
128
+ lazy_loader==0.2
129
+ lazy-object-proxy==1.6.0
130
+ linkify-it-py==2.0.0
131
+ llvmlite==0.40.0
132
+ lmdb==1.4.1
133
+ locket==1.0.0
134
+ lxml==4.9.2
135
+ lz4==4.3.2
136
+ Markdown==3.4.1
137
+ markdown-it-py==2.2.0
138
+ MarkupSafe==2.1.1
139
+ marshmallow==3.20.1
140
+ matplotlib==3.7.1
141
+ matplotlib-inline==0.1.6
142
+ mccabe==0.7.0
143
+ mdit-py-plugins==0.3.0
144
+ mdurl==0.1.0
145
+ menuinst==1.4.19
146
+ mistune==3.0.0
147
+ mkl-fft==1.3.6
148
+ mkl-random==1.2.2
149
+ mkl-service==2.4.0
150
+ more-itertools==8.12.0
151
+ mpmath==1.2.1
152
+ msgpack==1.0.3
153
+ multidict==6.0.2
154
+ multipledispatch==0.6.0
155
+ multiprocess==0.70.14
156
+ munkres==1.1.4
157
+ mypy-extensions==0.4.3
158
+ nbclassic==0.5.5
159
+ nbclient==0.5.13
160
+ nbconvert==7.7.3
161
+ nbformat==5.7.0
162
+ nest-asyncio==1.5.6
163
+ networkx==2.8.4
164
+ nltk==3.7
165
+ notebook==6.5.4
166
+ notebook_shim==0.2.2
167
+ numba==0.57.0
168
+ numexpr==2.8.4
169
+ numpy==1.24.3
170
+ numpydoc==1.5.0
171
+ openai==0.27.8
172
+ openapi-schema-pydantic==1.2.4
173
+ opencv-python-headless==4.8.0.74
174
+ openpyxl==3.0.10
175
+ packaging==23.0
176
+ pandas==1.5.3
177
+ pandocfilters==1.5.0
178
+ panel==1.1.0
179
+ param==1.13.0
180
+ paramiko==2.8.1
181
+ parsel==1.6.0
182
+ parso==0.8.3
183
+ partd==1.2.0
184
+ pathspec==0.10.3
185
+ patsy==0.5.3
186
+ pdf2image==1.16.3
187
+ pep8==1.7.1
188
+ pexpect==4.8.0
189
+ pickleshare==0.7.5
190
+ Pillow==9.4.0
191
+ pip==23.1.2
192
+ platformdirs==3.5.0
193
+ plotly==5.9.0
194
+ pluggy==1.0.0
195
+ ply==3.11
196
+ pooch==1.4.0
197
+ poyo==0.5.0
198
+ prometheus-client==0.14.1
199
+ prompt-toolkit==3.0.38
200
+ Protego==0.1.16
201
+ psutil==5.9.5
202
+ ptyprocess==0.7.0
203
+ pure-eval==0.2.2
204
+ py-cpuinfo==8.0.0
205
+ pyarrow==11.0.0
206
+ pyasn1==0.4.8
207
+ pyasn1-modules==0.2.8
208
+ pycodestyle==2.10.0
209
+ pycparser==2.21
210
+ pyct==0.5.0
211
+ pycurl==7.45.2
212
+ pydantic==1.10.12
213
+ PyDispatcher==2.0.5
214
+ pydocstyle==6.3.0
215
+ pyerfa==2.0.0
216
+ pyflakes==3.0.1
217
+ Pygments==2.15.1
218
+ pylint==2.16.2
219
+ pylint-venv==2.3.0
220
+ pyls-spyder==0.4.0
221
+ PyNaCl==1.5.0
222
+ pyocr==0.8.3
223
+ pyodbc==4.0.34
224
+ pyOpenSSL==23.0.0
225
+ pyparsing==3.0.9
226
+ pypdf==3.13.0
227
+ PyQt5==5.15.7
228
+ PyQt5-sip==12.11.0
229
+ PyQtWebEngine==5.15.4
230
+ pyrsistent==0.18.0
231
+ PySocks==1.7.1
232
+ pytest==7.3.1
233
+ python-dateutil==2.8.2
234
+ python-json-logger==2.0.7
235
+ python-lsp-black==1.2.1
236
+ python-lsp-jsonrpc==1.0.0
237
+ python-lsp-server==1.7.2
238
+ python-slugify==5.0.2
239
+ python-snappy==0.6.1
240
+ pytoolconfig==1.2.5
241
+ pytz==2022.7
242
+ pyviz-comms==2.3.0
243
+ PyWavelets==1.4.1
244
+ pywin32==305.1
245
+ pywin32-ctypes==0.2.0
246
+ pywinpty==2.0.10
247
+ PyYAML==6.0
248
+ pyzmq==25.0.2
249
+ QDarkStyle==3.0.2
250
+ qstylizer==0.2.2
251
+ QtAwesome==1.2.2
252
+ qtconsole==5.4.2
253
+ QtPy==2.2.0
254
+ queuelib==1.5.0
255
+ regex==2022.7.9
256
+ requests==2.29.0
257
+ requests-file==1.5.1
258
+ responses==0.13.3
259
+ rfc3339-validator==0.1.4
260
+ rfc3986-validator==0.1.1
261
+ rope==1.7.0
262
+ Rtree==1.0.1
263
+ s3fs==2023.4.0
264
+ sacremoses==0.0.43
265
+ scikit-image==0.20.0
266
+ scikit-learn==1.2.2
267
+ scikit-learn-intelex==20230426.121158
268
+ scipy==1.10.1
269
+ Scrapy==2.8.0
270
+ seaborn==0.12.2
271
+ Send2Trash==1.8.0
272
+ service-identity==18.1.0
273
+ setuptools==67.8.0
274
+ sip==6.6.2
275
+ six==1.16.0
276
+ smart-open==5.2.1
277
+ sniffio==1.2.0
278
+ snowballstemmer==2.2.0
279
+ sortedcontainers==2.4.0
280
+ soupsieve==2.4
281
+ Sphinx==5.0.2
282
+ sphinxcontrib-applehelp==1.0.2
283
+ sphinxcontrib-devhelp==1.0.2
284
+ sphinxcontrib-htmlhelp==2.0.0
285
+ sphinxcontrib-jsmath==1.0.1
286
+ sphinxcontrib-qthelp==1.0.3
287
+ sphinxcontrib-serializinghtml==1.1.5
288
+ spyder==5.4.3
289
+ spyder-kernels==2.4.3
290
+ SQLAlchemy==1.4.39
291
+ stack-data==0.6.2
292
+ statsmodels==0.13.5
293
+ sympy==1.11.1
294
+ tables==3.8.0
295
+ tabulate==0.8.10
296
+ TBB==0.2
297
+ tblib==1.7.0
298
+ tenacity==8.2.2
299
+ terminado==0.17.1
300
+ text-unidecode==1.3
301
+ textdistance==4.2.1
302
+ threadpoolctl==2.2.0
303
+ three-merge==0.1.1
304
+ tifffile==2021.7.2
305
+ tinycss2==1.2.1
306
+ tldextract==3.2.0
307
+ tokenizers==0.13.2
308
+ toml==0.10.2
309
+ tomli==2.0.1
310
+ tomlkit==0.11.1
311
+ toolz==0.12.0
312
+ torch==2.0.1
313
+ tornado==6.3.1
314
+ tqdm==4.65.0
315
+ traitlets==5.9.0
316
+ transformers==4.29.2
317
+ Twisted==22.10.0
318
+ twisted-iocpsupport==1.0.2
319
+ typing_extensions==4.6.3
320
+ typing-inspect==0.9.0
321
+ uc-micro-py==1.0.1
322
+ ujson==5.4.0
323
+ Unidecode==1.2.0
324
+ urllib3==1.26.16
325
+ w3lib==1.21.0
326
+ watchdog==2.1.6
327
+ wcwidth==0.2.6
328
+ webencodings==0.5.1
329
+ websocket-client==0.58.0
330
+ Werkzeug==2.2.3
331
+ whatthepatch==1.0.2
332
+ wheel==0.38.4
333
+ widgetsnbextension==4.0.5
334
+ win-inet-pton==1.1.0
335
+ wrapt==1.14.1
336
+ xarray==2022.11.0
337
+ xlwings==0.29.1
338
+ xxhash==2.0.2
339
+ xyzservices==2022.9.0
340
+ y-py==0.5.9
341
+ yapf==0.31.0
342
+ yarl==1.8.1
343
+ ypy-websocket==0.8.2
344
+ zict==2.2.0
345
+ zipp==3.11.0
346
+ zope.interface==5.4.0