# NOTE: scraped Hugging Face Space page header (status lines, commit-hash
# gutter, and line-number gutter) — not part of the application code.
# Space status: Sleeping | File size: 14,715 bytes
from fastapi import FastAPI, Request, BackgroundTasks
import json
import io
from openai import Client
from supabase import create_client
from typing import List, Dict, Any
import asyncio
import logging
from datetime import datetime
import os
import tiktoken
# Initialize logging for the whole service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

# OpenAI client, credentials taken from the environment.
client = Client(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('ORG_ID'))

# Supabase connection settings, also from the environment.
url: str = os.getenv('SUPABASE_URL')
key: str = os.getenv('SUPABASE_KEY')
# BUGFIX: the original annotated this as `Client`, but the only `Client` in
# scope is the OpenAI class imported above — create_client returns a Supabase
# client, so the annotation was wrong and is dropped.
supabase = create_client(url, key)
@app.post("/send/batch_processing")
async def testv1(request: Request, background_tasks: BackgroundTasks):
try:
body_data = await request.json()
print(body_data)
# Add processing to background tasks
background_tasks.add_task(process_batch_job, body_data)
return {'data': 'Batch job is scheduled!'}
except Exception as e:
return {'error': str(e)}
# def receipt_radar_prompt(raw_text:str)->str:
# insurance_response_structure = """
# {
# "insurance_type": "Classify it into 8 categories travel , health , term , vehicle, property,liability, life , buisness only .Try to find the closest possible based on the receipt text, if you don't understand the type classify it as others.",
# "policy_details": {
# "policyholder_name": "",
# "policy_number": "",
# "insurance_start_date": "",
# "insurance_end_date": "",
# "premium_amount": "",
# "payment_frequency": ""
# },
# "coverage_details": {
# "covered_items": {
# "item_type": "",
# "product_company": "",
# "product_model": "",
# "product_manufacturing_year": ""
# },
# "comprehensive_coverage_type_policy": "yes/no"
# }
# }
# """
# travel_response_structure = """
# travel_type(bus,train,airplane,taxi,bike,rickshaw classify in these categories only strictly),travel_company_name , departure_destination , arrival_destination , arrival_city(if you are not able to find the arrival city add the arrival destination into this field strictly. ), departure_date,arrival_date .If the arrival and departure dates are the same from receipt text given to you analyse it properly to check that, then only use the same date in both the fields .if you don't find any field mark it as null.
# """
# hotel_data_points = """ hotel_type(hotel_stay , dine_in , dine_in + stay(use both keyword strictly)), hotel_brand_name , hotel_location , hotel_checkin_date , hotel_checkout_date. if you don't find any field mark it as null """
# coupon_data_points = """
# {
# "brand_name": "Extract the brand_name from where the coupon is sent",
# "validity": {
# "start_date": "Date the coupon is valid from, if provided.other wise mark it as null",
# "expiration_date": "Date the coupon expires. Leave blank if not mentioned."
# },
# "coupon_code": "Extract the unique code found in the email. If unavailable, leave it blank.",
# "description": "Provide the discount details (amount/percentage), minimum purchase, eligible products, retailer, any terms or restrictions, usage limit (single-use/multi-use), and where the coupon applies (in-store/online)."
# }
# """
# system_prompt = f"""Extract information from the following receipt OCR text and return a JSON object with these exact keys: brand, total_cost, location, purchase_category, brand_category, Date, currency, filename, payment_method, metadata.
# Rules:
# 1. For total_cost, use the highest monetary value in the text.
# 2. For brand_category, choose the closest match from: ["Fashion and Apparel", "Jewelry and Watches", "Beauty and Personal Care", "Automobiles", "Real Estate", "Travel(it may contain reciepts of airlines , trains , taxi ,cruise ,etc)", "Hospitality(it will include reciepts of Hotels (stays) , restaurants , cafe's , bar's , Accommodation Services , Beverages Services (don't include food delivery service in hospitality))","Food Delivery Services(like swiggy , zomato,eatsure and any other you can analyse from receipt text)", "Home and Lifestyle", "Technology and Electronics", "Sports and Leisure", "Art and Collectibles", "Health and Wellness", "Stationery and Writing Instruments", "Children and Baby", "Pet Accessories", "Insurance"]
# 3. Format Date as dd-mm-yyyy.Strictly return the date in the format dd-mm-yyyy.
# 4. metadata: For insurance receipts extract the data points given in the JSON and return the JSON with structure: \n """ + insurance_response_structure + """
# 5.metadata : For travel receipts(flight ,bus,train) extract these data points as a JSON object exactly""" + travel_response_structure + """
# 6. metadata : For hotel receipts extract these data points as a JSON object exactly""" + hotel_data_points + f"""
# 7. metadata : For coupon receipts extract these data points as a JSON object exactly""" + coupon_data_points + f"""
# For non-insurance and non-travel , non-hotel receipts, return metadata as null.
# 8. Use currency codes (e.g., USD, EUR) instead of symbols.
# 9. Generate filename as 'PURCHASE_TYPE_BRAND_DATE' (e.g., 'clothing_gucci_20230715').
# 10. If a value is not found, return null.
# 11. If all values are null, return null.
# Ensure the strictly that output is a valid JSON object containing strictly the above keys, without any explanations.
# Here's the OCR text below analyse it and convert into json using keys provided in first line and using the rules provided in rules section:
# Generate a JSON response in the following format without using the ```json block. Ensure the output is properly formatted as plain text JSON.
# {raw_text}
# """
# return system_prompt
def receipt_radar_prompt(raw_text: str) -> str:
    """Build the extraction prompt sent to the model for one receipt.

    The prompt instructs the model to classify the OCR text (purchase /
    insurance / travel / hotel / coupon), extract a fixed set of top-level
    JSON keys, and fill ``metadata`` with a category-specific structure.

    raw_text: OCR text of the receipt, interpolated verbatim at the end.
    Returns the full prompt string.

    NOTE: the string literals below are spliced together with ``+`` between
    f-string and plain-string segments; their exact byte content is the
    behavior of this function, so none of it is altered here. The interior
    lines of the triple-quoted literals are intentionally unindented —
    adding leading whitespace would change the runtime prompt.
    """
    # JSON structure the model must return for insurance receipts.
    insurance_response_structure = """
{
"insurance_type": "Classify it into 8 categories travel , health , term , vehicle, property,liability, life , buisness only .Try to find the closest possible based on the receipt text, if you don't understand the type classify it as others.",
"policy_details": {
"policyholder_name": "",
"policy_number": "",
"insurance_start_date": "",
"insurance_end_date": "",
"premium_amount": "",
"payment_frequency": ""
},
"coverage_details": {
"covered_items": {
"item_type": "",
"product_company": "",
"product_model": "",
"product_manufacturing_year": ""
},
"comprehensive_coverage_type_policy": "yes/no"
}
}
"""
    # Field list (free-form prose, not JSON) for travel receipts.
    travel_response_structure = """
travel_type(bus,train,airplane,taxi,bike,rickshaw classify in these categories only strictly),travel_company_name , departure_destination , arrival_destination , arrival_city(if you are not able to find the arrival city add the arrival destination into this field strictly. ), departure_date,arrival_date .If the arrival and departure dates are the same from receipt text given to you analyse it properly to check that, then only use the same date in both the fields .if you don't find any field mark it as null.
"""
    # Field list for hotel receipts.
    hotel_data_points = """ hotel_type(hotel_stay , dine_in , dine_in + stay(use both keyword strictly)), hotel_brand_name , hotel_location , hotel_checkin_date , hotel_checkout_date. if you don't find any field mark it as null """
    # JSON structure the model must return for coupon emails.
    coupon_data_points = """
{
"brand_name": "Extract the brand_name from where the coupon is sent",
"validity": {
"start_date": "Date the coupon is valid from, if provided.other wise mark it as null",
"expiration_date": "Date the coupon expires. Leave blank if not mentioned."
},
"coupon_code": "Extract the unique code found in the email. If unavailable, leave it blank.",
"description": "Provide the discount details (amount/percentage), minimum purchase, eligible products, retailer, any terms or restrictions, usage limit (single-use/multi-use), and where the coupon applies (in-store/online)."
}
"""
    # Assemble the final prompt. The per-category structures above are spliced
    # in via plain ``+`` concatenation between string segments.
    # NOTE(review): the rules list numbers two consecutive entries as "4." —
    # confirm whether the second should be renumbered (a content change).
    system_prompt = f"""Extract information from the following receipt OCR text,Strictly first analyse if the text actually is actually related to a purchase , insurance receipt , travel receipt , hotel receipt or a text having some coupon codes , If it is not a purchase or contains a coupon code then just return null strictly , and return a JSON object with these exact keys: brand, total_cost, location, purchase_category, brand_category, Date, currency, filename, payment_method, metadata.
Rules:
1. For total_cost, use the highest monetary value in the text.
2. For brand_category, choose the closest match from: ["Fashion and Apparel", "Jewelry and Watches", "Beauty and Personal Care", "Automobiles", "Real Estate", "Travel(it may contain reciepts of airlines , trains , taxi ,cruise ,etc)", "Hospitality(it will include reciepts of Hotels (stays) , restaurants , cafe's , bar's , Accommodation Services , Beverages Services (don't include food delivery service in hospitality))","Food Delivery Services(like swiggy , zomato,eatsure and any other you can analyse from receipt text)", "Home and Lifestyle", "Technology and Electronics", "Sports and Leisure", "Art and Collectibles", "Health and Wellness", "Stationery and Writing Instruments", "Children and Baby", "Pet Accessories","Insurance","Coupons(This may include Discounted Offers,Promo Coupons,Coupon Codes,Voucher Deals,Exclusive Discounts,Special Offers,Couponized Receipts,Promo Receipts,Coupon Tags,Discount Emails)"]
3. Format Date as dd-mm-yyyy.Strictly return the date in the format dd-mm-yyyy.
4. Check if the text includes the purchaser's name, receipt or transaction ID, relevant dates, amount paid, service details (insurance, travel, or hotel), and issuer information to verify if it's a valid insurance, travel, or hotel receipt. If you find that its just a promotion email ,don't assum its a travel , insurance or hotel receipt.
4. metadata: For insurance receipts extract the data points given in the JSON and return the JSON with structure: \n """ + insurance_response_structure + """
5.metadata : For travel receipts(flight ,bus,train) extract these data points as a JSON object exactly""" + travel_response_structure + """
6. metadata : For hotel receipts extract these data points as a JSON object exactly""" + hotel_data_points + f"""
7. metadata : For coupon receipts extract these data points as a JSON object exactly""" + coupon_data_points + f"""
For non-insurance and non-travel , non-hotel , non-coupon receipts, return metadata as null.Also if you don't find a valid coupon code,then return metadata as null directly strictly follow this rule.
8. Use currency codes (e.g., USD, EUR) instead of symbols.
9. Generate filename as 'PURCHASE_TYPE_BRAND_DATE' (e.g., 'clothing_gucci_20230715').
10. If a value is not found, return null.
11. If all values are null, return null.
Ensure the strictly that output is a valid JSON object containing strictly the above keys, without any explanations.
Here's the OCR text below analyse it and convert into json using keys provided in first line and using the rules provided in rules section:
Generate a JSON response in the following format without using the ```json block. Ensure the output is properly formatted as plain text JSON.
{raw_text}
"""
    return system_prompt
# Cache of tiktoken encodings keyed by model name: encoding_for_model builds
# a full tokenizer, which is too expensive to recreate on every request.
_TIKTOKEN_ENCODINGS: Dict[str, Any] = {}


def _encoding_for_model(model: str):
    """Return a (cached) tiktoken encoding for *model*."""
    enc = _TIKTOKEN_ENCODINGS.get(model)
    if enc is None:
        enc = tiktoken.encoding_for_model(model)
        _TIKTOKEN_ENCODINGS[model] = enc
    return enc


def adjust_prompt_tokens_v1(prompt: str) -> str:
    """Trim *prompt* to at most 127,500 gpt-4o-mini tokens.

    Prompts within the limit are returned unchanged. Oversized prompts are
    cut at the token boundary, then backed off to the last full word so the
    trimmed text never ends mid-word.

    prompt: the fully assembled prompt text.
    Returns the (possibly trimmed) prompt string.
    """
    max_tokens = 127500  # headroom below the model's 128k context window
    encoding = _encoding_for_model('gpt-4o-mini')
    tokenized_prompt = encoding.encode(prompt)
    if len(tokenized_prompt) <= max_tokens:
        # Within budget — return the original untouched.
        return prompt
    # Decode only the first max_tokens tokens back to text.
    trimmed_text = encoding.decode(tokenized_prompt[:max_tokens])
    # Avoid ending on a partial word: cut back to the last space, if any.
    last_space = trimmed_text.rfind(' ')
    if last_space != -1:
        trimmed_text = trimmed_text[:last_space]
    return trimmed_text
async def process_batch_job(dataset: Dict[str, Any]):
    """Background task: submit a batch of receipts to the OpenAI Batch API.

    For each entry in ``dataset['data']`` a chat-completion request is built
    (custom_id "<message_id>*<user_id>*<email>"), the requests are serialized
    as JSONL, uploaded as a batch file, and a batch job is created. The batch
    job id is recorded in the Supabase ``batch_processing_details`` table;
    on failure an error row is written instead.
    """
    batch_job = None  # defined up front so the except block can reference it
    try:
        openai_tasks = []
        # Tolerate a missing or null 'data' key instead of crashing the loop.
        for ds in dataset.get('data') or []:
            message_id = ds.get('message_id')
            user_id = ds.get('user_id')
            receipt_text = ds.get('receipt_text')
            email = ds.get('email')
            # Build the prompt and clamp it to the model's token budget.
            text = adjust_prompt_tokens_v1(receipt_radar_prompt(receipt_text))
            openai_tasks.append({
                "custom_id": f"{message_id}*{user_id}*{email}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "temperature": 0.1,
                    "response_format": {"type": "json_object"},
                    "messages": [{"role": "user", "content": text}],
                },
            })
        # Serialize the tasks as JSONL into an in-memory buffer.
        json_obj = io.BytesIO()
        for obj in openai_tasks:
            json_obj.write((json.dumps(obj) + '\n').encode('utf-8'))
        # BUGFIX: rewind before uploading — after the writes the buffer's
        # position is at EOF, so the upload would otherwise read no bytes.
        json_obj.seek(0)
        batch_file = client.files.create(
            file=json_obj,
            purpose="batch"
        )
        # Create the batch job against the uploaded file.
        batch_job = client.batches.create(
            input_file_id=batch_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        # Record the pending job so a poller can pick up its results later.
        supabase.table("batch_processing_details").insert({
            "batch_job_status": False,
            "batch_job_id": batch_job.id
        }).execute()
        logger.info("Batch %s sent for processing", batch_job.id)
    except Exception as e:
        # BUGFIX: the original logged an undefined name `batch_job_id` here,
        # raising a NameError that masked the real failure. Use the job id
        # when the job was created, None when the failure happened earlier.
        job_id = getattr(batch_job, 'id', None)
        logger.error(f"Error processing batch job {job_id}: {str(e)}")
        # Record the failure so it is visible outside the logs.
        supabase.table("batch_processing_details").insert({
            "batch_job_status": False,
            "error": str(e)
        }).execute()
|