dsmultimedika's picture
Improve the code bot development
d57efd6
raw
history blame
4.5 kB
from io import BytesIO
import os
import base64
import fitz
from fastapi.responses import JSONResponse
from llama_index.core.vector_stores import (
MetadataFilter,
MetadataFilters,
FilterCondition,
)
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.llms.openai import OpenAI
from core.parser import parse_topics_to_dict
from llama_index.core.llms import ChatMessage
from core.prompt import (
SYSTEM_TOPIC_TEMPLATE,
USER_TOPIC_TEMPLATE,
REFINED_GET_TOPIC_TEMPLATE,
)
# from langfuse.openai import openai
class SummarizeGenerator:
    """Extracts a topic/subtopic structure from a PDF table of contents
    (via a vision-capable LLM) and builds metadata-filtered query engines
    for per-topic summarization.
    """

    def __init__(self, references):
        # References supplied by the caller; currently only stored.
        self.references = references
        # Deterministic LLM used both for the vision extraction pass and
        # for the follow-up refinement completion.
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        """Render every page of an uploaded PDF as a base64-encoded image.

        Args:
            content_table: upload object exposing a file-like ``.file``
                attribute (e.g. a FastAPI ``UploadFile``) — TODO confirm
                against the calling endpoint.

        Returns:
            list[str]: one base64-encoded PNG per readable page, or a
            ``JSONResponse`` describing the failure. Callers must check
            for the error response before iterating.
        """
        try:
            content_bytes = content_table.file.read()
            # NOTE: do not rebind the parameter — keep the opened document
            # in its own name (the original shadowed `content_table` here).
            pdf_document = fitz.open(stream=content_bytes, filetype="pdf")
        except Exception as e:
            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")

        encoded_pages = []
        for page_number in range(len(pdf_document)):
            try:
                page = pdf_document.load_page(page_number)
                encoded_pages.append(self._extract_image_as_base64(page))
            except Exception as e:
                # Skip unreadable pages rather than failing the whole upload.
                print(f"Error processing page {page_number}: {e}")
                continue

        if not encoded_pages:
            return JSONResponse(status_code=404, content="No images found in the PDF")

        return encoded_pages

    def extract_content_table(self, content_table):
        """Extract a topic dictionary from a PDF table of contents.

        Sends the rendered pages to the LLM with the topic-extraction
        prompts, refines the raw answer, and parses it into a dict.

        Returns:
            tuple[str, dict]: the refined LLM output and its parsed
            topic dictionary, or a ``JSONResponse`` on failure.
        """
        try:
            images = self.extract_pages(content_table)
            # extract_pages signals failure with a JSONResponse; propagate
            # it instead of iterating over it (the original crashed here).
            if isinstance(images, JSONResponse):
                return images

            image_messages = [
                {
                    "type": "image_url",
                    # Pixmap.tobytes() defaults to PNG encoding, so the
                    # data URL must declare image/png (was image/jpeg).
                    "image_url": {"url": f"data:image/png;base64,{image}"},
                }
                for image in images
            ]

            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]

            extractor_output = self.llm.chat(messages)
            # Second pass: normalize the raw chat answer into the expected
            # topic-list format before parsing.
            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )
            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))

            return str(refined_extractor_output), extractor_dics
        except Exception as e:
            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        """Render a PDF page to PNG bytes and return them base64-encoded.

        Raises on failure so extract_pages can skip the bad page. (The
        original returned a JSONResponse from the except branch, which was
        then silently appended to the image list by the caller.)
        """
        pix = page.get_pixmap()
        return base64.b64encode(pix.tobytes()).decode("utf-8")

    def index_summarizer_engine(self, topic, subtopic, index):
        """Build a query engine restricted to one topic/subtopic.

        Args:
            topic: value matched against the ``title`` metadata key.
            subtopic: value matched against the ``category`` metadata key.
            index: a LlamaIndex index exposing ``as_query_engine``.

        Returns:
            A query engine that only retrieves nodes whose metadata
            matches BOTH filters (AND condition), top-5 by similarity.
        """
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )
        return index.as_query_engine(similarity_top_k=5, filters=filters)

    def get_summarizer_engine(self, topic, subtopic):
        # TODO: not implemented yet.
        pass

    def prepare_summaries(self):
        # TODO: not implemented yet.
        pass