expense_parser / extract_and_store_supabase.py
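"""Download an uploaded receipt from Supabase storage, run it through a
Google Document AI processor, and store the raw OCR text and the extracted
entities in Supabase tables."""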
import os
import random
import string
from base64 import urlsafe_b64encode

import requests

from supabase_models import Supabase_Client
from authenticate import get_access_token_v1


def generate_custom_random_string():
    # Length of the generated string (16 characters)
    length = 16
    # Character set: hexadecimal characters (0-9 and a-f)
    characters = string.hexdigits.lower()
    # Generate the random string
    random_string = ''.join(random.choice(characters) for _ in range(length))
    # Prefix the string with "RAN"
    return "RAN" + random_string


async def extract_structure_store_message(filename: str, filename_path: str, user_id: str, email: str):
    message_id = generate_custom_random_string()
    attachment_id = 'UPLOADED_MANUALLY'
    if attachment_id and message_id:
        project_id = os.getenv('PROJECT_ID')
        processor_id = os.getenv('PROCESSOR_ID')
        document_entities = {}
        # Split on the last dot so file names containing dots still yield the extension
        attachment_extension = filename.rsplit('.', 1)[-1].lower()
        print(f"filename: {filename}")
        print(f"attachment extension: {attachment_extension}")
        # file_name = f"{message_id}_{attachment_id}"
        # print(f"file_name: {file_name}")
        print(f"filename_path: {filename_path}")
        supabase = Supabase_Client().instance
        try:
            # Download the uploaded receipt from Supabase storage and base64-encode it
            response = supabase.storage.from_("all_card_assets").download(
                filename_path
            )
            base64_data = urlsafe_b64encode(response).decode('utf-8')
            # Build the Document AI request payload with the correct MIME type
            if attachment_extension == 'pdf':
                payload = {
                    "skipHumanReview": True,
                    "rawDocument": {
                        "mimeType": "application/pdf",
                        "content": base64_data
                    }
                }
            elif attachment_extension in ('jpg', 'png'):
                # 'jpg' files use the standard 'image/jpeg' MIME type
                mime_type = 'image/jpeg' if attachment_extension == 'jpg' else 'image/png'
                payload = {
                    "skipHumanReview": True,
                    "rawDocument": {
                        "mimeType": mime_type,
                        "content": base64_data
                    }
                }
            else:
                raise ValueError(f"Unsupported attachment type: {attachment_extension}")
            access_token = get_access_token_v1()
            headers = {
                'Authorization': f'Bearer {access_token}',
                'Content-Type': 'application/json; charset=utf-8'
            }
            # Send the document to the Google Document AI processor for extraction
            response = requests.post(
                f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
                headers=headers,
                json=payload
            )
            response_json = response.json()
            print(response_json)
            allowed_entities = [
                "credit_card_last_four_digits",
                "currency",
                "end_date",
                "net_amount",
                "payment_type",
                "purchase_time",
                "receipt_date",
                "start_date",
                "supplier_address",
                "supplier_city",
                "supplier_name",
                "tip_amount",
                "total_amount",
                "line_item/quantity",
                "line_item/amount",
                "line_item/unit_price"
            ]
            document = response_json.get('document') or {}
            raw_text = document.get('text')
            entities = document.get('entities')
            document_entities['user_id'] = user_id
            # Store the raw OCR text for this receipt
            insert_ocr_data_response = (
                supabase.table("receipt_ocr_data")
                .insert({
                    'user_id': user_id,
                    'message_id': message_id,
                    'receipt_text': raw_text,
                    'email': email,
                    'file_type': attachment_extension
                })
                .execute()
            )
            print('Printing entities')
            print(entities)
            if entities is not None:
                for ent in entities:
                    entity_type = ent.get('type')
                    if entity_type is not None and entity_type in allowed_entities:
                        mention_text = ent.get('mentionText')
                        normalized_values = ent.get('normalizedValue')
                        # Initialize a list for the entity type if not already present
                        if entity_type not in document_entities:
                            document_entities[entity_type] = []
                        # Append the entity data to the list
                        document_entities[entity_type].append({
                            "mention_text": mention_text,
                            "normalizedValue": normalized_values
                        })
            document_entities['email'] = email
            document_entities['message_id'] = message_id
            print(document_entities)
            insert_data_response = (
                supabase.table("document_ai_entities")
                .insert(document_entities)
                .execute()
            )
            print(insert_data_response)
        except Exception as e:
            print(f"Error processing receipt '{filename_path}': {e}")