from io import BytesIO
import os
import base64

import fitz
from fastapi.responses import JSONResponse
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.core.llms import ChatMessage
from llama_index.llms.openai import OpenAI

from core.parser import parse_topics_to_dict
from core.prompt import (
    SYSTEM_TOPIC_TEMPLATE,
    USER_TOPIC_TEMPLATE,
    REFINED_GET_TOPIC_TEMPLATE,
)

# from langfuse.openai import openai

class SummarizeGenerator:
    """Extracts topic structure from a PDF table of contents using an LLM."""

    def __init__(self, references):
        self.references = references
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        try:
            content_bytes = content_table.file.read()
            # Open the uploaded bytes as a PDF document
            pdf_document = fitz.open(stream=content_bytes, filetype="pdf")
        except Exception as e:
            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")
        # Collect a base64-encoded rendering of each page
        pix_encoded_combined = []
        for page_number in range(len(pdf_document)):
            try:
                page = pdf_document.load_page(page_number)
                pix_encoded = self._extract_image_as_base64(page)
                pix_encoded_combined.append(pix_encoded)
            except Exception as e:
                print(f"Error processing page {page_number}: {e}")
                continue  # Skip to the next page if there's an error

        if not pix_encoded_combined:
            return JSONResponse(status_code=404, content="No images found in the PDF")

        return pix_encoded_combined

    def extract_content_table(self, content_table):
        try:
            images = self.extract_pages(content_table)
            # extract_pages returns a JSONResponse on failure; propagate it
            # instead of iterating over it
            if isinstance(images, JSONResponse):
                return images

            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image}",
                    },
                }
                for image in images
            ]

            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]

            extractor_output = self.llm.chat(messages)
            print("extractor output:", extractor_output)

            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )
            print("refined extractor output:", str(refined_extractor_output))

            extractor_dicts = dict(parse_topics_to_dict(str(refined_extractor_output)))
            return str(refined_extractor_output), extractor_dicts
        except Exception as e:
            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        # Render the page to a pixmap and return it base64-encoded (PNG by
        # default). Errors propagate so extract_pages can skip the page;
        # returning a JSONResponse here would end up embedded in a data URL.
        pix = page.get_pixmap()
        pix_bytes = pix.tobytes()
        return base64.b64encode(pix_bytes).decode("utf-8")

    def index_summarizer_engine(self, topic, subtopic, index):
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )
        # Build a query engine restricted to nodes matching both filters
        kwargs = {"similarity_top_k": 5, "filters": filters}
        query_engine = index.as_query_engine(**kwargs)
        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        pass

    def prepare_summaries(self):
        pass
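

# A minimal local sketch of how this class might be driven (an illustration,
# not part of the original module): "sample.pdf" is a hypothetical path, and
# SimpleNamespace stands in for FastAPI's UploadFile, which exposes the
# underlying stream via its `.file` attribute, as extract_pages expects.
# Assumes OPENAI_API_KEY is set in the environment.
if __name__ == "__main__":
    from types import SimpleNamespace

    generator = SummarizeGenerator(references=[])
    with open("sample.pdf", "rb") as f:
        upload = SimpleNamespace(file=f)
        topics_text, topics_dict = generator.extract_content_table(upload)
        print(topics_text)
        print(topics_dict)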