Spaces:
Sleeping
Sleeping
File size: 4,497 Bytes
9002555 d57efd6 9002555 d57efd6 9002555 d57efd6 9002555 d57efd6 9002555 d57efd6 9002555 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
from io import BytesIO
import os
import base64
import fitz
from fastapi.responses import JSONResponse
from llama_index.core.vector_stores import (
MetadataFilter,
MetadataFilters,
FilterCondition,
)
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.llms.openai import OpenAI
from core.parser import parse_topics_to_dict
from llama_index.core.llms import ChatMessage
from core.prompt import (
SYSTEM_TOPIC_TEMPLATE,
USER_TOPIC_TEMPLATE,
REFINED_GET_TOPIC_TEMPLATE,
)
# from langfuse.openai import openai
class SummarizeGenerator:
    """Extract a table of contents from an uploaded PDF via a vision LLM.

    Pipeline: render each PDF page to a base64 image, send the images to a
    vision-capable chat model to pull out topics, refine the raw topic text
    with a follow-up completion, and parse the result into a dict.
    """

    def __init__(self, references):
        # Reference material supplied by the caller; only stored here —
        # none of the methods visible in this class read it yet.
        self.references = references
        # temperature=0 for deterministic extraction; gpt-4o-mini accepts
        # image inputs, which extract_content_table relies on.
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        """Render every page of an uploaded PDF as a base64-encoded image.

        Args:
            content_table: upload object exposing a binary file-like ``.file``
                attribute (e.g. a FastAPI ``UploadFile``).

        Returns:
            list[str]: one base64 string per successfully rendered page, or a
            ``JSONResponse`` on failure (400 if the PDF cannot be opened,
            404 if no page could be rendered).
        """
        try:
            content_bytes = content_table.file.read()
            # Distinct name: do not shadow the ``content_table`` parameter
            # with the parsed document (the original code did).
            pdf_doc = fitz.open(stream=content_bytes, filetype="pdf")
        except Exception as e:
            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")

        pix_encoded_combined = []
        try:
            for page_number in range(len(pdf_doc)):
                try:
                    page = pdf_doc.load_page(page_number)
                    pix_encoded_combined.append(self._extract_image_as_base64(page))
                except Exception as e:
                    # Best-effort: log and skip pages that fail to render.
                    print(f"Error processing page {page_number}: {e}")
                    continue
        finally:
            # Always release the document's resources (the original leaked it).
            pdf_doc.close()

        if not pix_encoded_combined:
            return JSONResponse(status_code=404, content="No images found in the PDF")
        return pix_encoded_combined

    def extract_content_table(self, content_table):
        """Extract and parse topics from a PDF's table-of-contents pages.

        Args:
            content_table: upload object with a binary file-like ``.file``.

        Returns:
            tuple[str, dict]: the refined topic text and its parsed dict, or a
            ``JSONResponse`` on failure (propagated from ``extract_pages``,
            or 500 for any other error).
        """
        try:
            images = self.extract_pages(content_table)
            # extract_pages signals failure with a JSONResponse; propagate it
            # instead of iterating over the response object (the original
            # code would have looped over it as if it were a list of images).
            if isinstance(images, JSONResponse):
                return images
            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        # _extract_image_as_base64 produces PNG bytes
                        # (fitz Pixmap.tobytes() defaults to PNG), so the
                        # data URL must declare image/png, not image/jpeg.
                        "url": f"data:image/png;base64,{image}",
                    },
                }
                for image in images
            ]
            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]
            extractor_output = self.llm.chat(messages)
            print("extractor output : ", extractor_output)
            # Second pass: ask the model to refine the raw topic listing into
            # the structured format parse_topics_to_dict expects.
            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )
            print("refined extractor output : ", str(refined_extractor_output))
            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))
            return str(refined_extractor_output), extractor_dics
        except Exception as e:
            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        """Render a single fitz page to base64-encoded PNG bytes.

        Raises on failure so the per-page loop in ``extract_pages`` can skip
        the page. (The original returned a JSONResponse on error, which the
        caller then appended to the image list as if it were a base64 string.)
        """
        pix = page.get_pixmap()
        # tobytes() with no argument emits PNG-format bytes.
        pix_bytes = pix.tobytes()
        return base64.b64encode(pix_bytes).decode("utf-8")

    def index_summarizer_engine(self, topic, subtopic, index):
        """Build a query engine over *index* scoped to one topic/subtopic.

        Args:
            topic: value matched against the ``title`` metadata key.
            subtopic: value matched against the ``category`` metadata key.
            index: a llama_index index exposing ``as_query_engine``.

        Returns:
            A query engine restricted (AND condition) to nodes whose metadata
            matches both filters, retrieving the top 5 similar nodes.
        """
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )
        kwargs = {"similarity_top_k": 5, "filters": filters}
        query_engine = index.as_query_engine(**kwargs)
        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        # TODO: not yet implemented.
        pass

    def prepare_summaries(self):
        # TODO: not yet implemented.
        pass