# NOTE: page-header residue from the web capture ("Spaces: Sleeping"); not part of the program.
from langchain_google_genai import ChatGoogleGenerativeAI | |
from langchain_core.messages import HumanMessage | |
import os | |
import re | |
import json | |
from dotenv import load_dotenv | |
# Load variables from a .env file (if any) before the lookups below read them.
load_dotenv()

# Deployment settings taken from the environment; each is None when unset.
MONGO_URI = os.environ.get("MONGO_URI")
DB_NAME = os.environ.get("DB_NAME")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
FLASH_API = os.environ.get("FLASH_API")
PINECONE_API = os.environ.get("PINECONE_API")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX")

# Shared chat-model client used for table extraction; authenticated with FLASH_API.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-002",
    temperature=0.2,
    max_tokens=None,
    google_api_key=FLASH_API,
)
# Instruction text sent with every page image. It asks the model for the table
# rows, a table description, per-column summaries and two suggested plot columns,
# all as one JSON object. Nothing is interpolated, so this is a plain string and
# the braces of the example are written literally.
system_prompt_text = """Please extract the table from the image and return the table data in JSON format, with each row represented as an object containing column headers as keys. Ensure that each cell's content corresponds accurately to its column header. If a cell is empty, Keep None as its value.
Go through the data and give a summary of the table, describing what the data is about in description field.
Go through each column and give a column summary telling what each column header means.
Analyze the data to suggest two columns which can be used to plot the best graph for this table.
If a table contains both hindi and english translations for header or cell then only give english translations.
Remember to give the response in correct JSON Format.
Expected output format : {
"table_data": [
{
"column_1": "Value 1-1",
"column_2": "Value 1-2",
"column_3": "Value 1-3"
},
{
"column_1": "Value 2-1",
"column_2": "Value 2-2",
"column_3": "Value 2-3"
}
// Additional rows as needed
],
"description": "Table Description",
"column_summary":{
"column_1" : "column description",
"column_2" : "column description",
"column_3" :"column description"
},
"best_column1" : "Column 1 name",
"best_column2" : "Column 2 name"
}
"""
def _empty_result(page_number):
    """Return the result reported when no table could be extracted for a page."""
    return {
        "page_number": page_number,
        "table_data": None,
        "description": None,
        "column_summary": None,
        "best_col1": None,
        "best_col2": None,
        "has_table_data": False,
    }


def _clean_text(value):
    """Return *value* as stripped text, or None when it is missing or blank."""
    if not value:
        return None
    # The model occasionally returns non-string scalars; coerce instead of
    # letting ``.strip()`` raise and discard an otherwise valid table.
    return str(value).strip() or None


def _parse_json_object(reply_text):
    """Parse the JSON object embedded in a model reply.

    Raises:
        json.JSONDecodeError: if no JSON object can be decoded from the reply.
    """
    # Keep only the outermost {...} span. This drops Markdown code fences, the
    # "json" language tag and any surrounding prose without touching the data
    # itself (a blanket replace of "json" would also edit cell values).
    start = reply_text.find("{")
    end = reply_text.rfind("}")
    payload = reply_text[start:end + 1] if 0 <= start < end else reply_text.strip()

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        # Fallback kept from the previous cleanup: drop literal backslash-n
        # sequences. Only used when strict parsing fails, so well-formed
        # replies keep the escaped newlines inside their string values.
        parsed = json.loads(payload.replace("\\n", ""))

    if not isinstance(parsed, dict):
        raise json.JSONDecodeError("expected a JSON object", payload, 0)
    return parsed


def _build_result(data, page_number):
    """Map the model's parsed JSON object onto the result dict for a page."""
    table_data = data.get("table_data") or None
    return {
        "page_number": page_number,
        "table_data": table_data,
        "description": _clean_text(data.get("description")),
        "column_summary": data.get("column_summary") or None,
        "best_col1": _clean_text(data.get("best_column1")),
        "best_col2": _clean_text(data.get("best_column2")),
        "has_table_data": table_data is not None,
    }


def process_image_using_llm(image, page_number, max_retries=3):
    """Ask the LLM to extract a table from a page image and return it as a dict.

    Args:
        image: Image payload interpolated into a ``data:image/jpeg;base64,`` URL,
            so it is expected to be a base64-encoded JPEG string.
        page_number: Page identifier copied into the result unchanged.
        max_retries: Maximum number of model calls. Only replies that cannot be
            parsed as a JSON object trigger another attempt.

    Returns:
        A dict with the keys ``page_number``, ``table_data``, ``description``,
        ``column_summary``, ``best_col1``, ``best_col2`` and ``has_table_data``.
        Missing or empty fields are ``None``. When every attempt fails, when any
        other exception occurs, or when ``max_retries`` is not positive, all
        fields except ``page_number`` are ``None`` and ``has_table_data`` is
        ``False``. The function never raises.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Send the image and system prompt to the LLM
            message = HumanMessage(
                content=[
                    {"type": "text", "text": system_prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
                ],
            )
            response = model.invoke([message])
            print(response.content)

            try:
                data = _parse_json_object(response.content)
            except json.JSONDecodeError as e:
                print(f"JSON decode error on attempt {attempt} for page {page_number}: {e}")
                continue  # malformed reply: ask the model again

            return _build_result(data, page_number)
        # Handle any other exceptions without retrying
        except Exception as e:
            print(f"Outer exception for page {page_number}: {e}")
            return _empty_result(page_number)

    # Retries exhausted, or max_retries <= 0 so the loop never ran.
    return _empty_result(page_number)