File size: 4,497 Bytes
9002555
 
 
 
 
d57efd6
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d57efd6
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d57efd6
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d57efd6
9002555
 
 
 
 
 
 
d57efd6
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from io import BytesIO
import os
import base64
import fitz

from fastapi.responses import JSONResponse
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)

from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.llms.openai import OpenAI
from core.parser import parse_topics_to_dict
from llama_index.core.llms import ChatMessage
from core.prompt import (
    SYSTEM_TOPIC_TEMPLATE,
    USER_TOPIC_TEMPLATE,
    REFINED_GET_TOPIC_TEMPLATE,
)

# from langfuse.openai import openai


class SummarizeGenerator:
    """Turn a PDF table-of-contents upload into a topic dictionary via an LLM,
    and build metadata-filtered query engines for per-topic summarization.

    Error convention: public methods return a FastAPI ``JSONResponse`` on
    failure instead of raising, so route handlers can pass the result through.
    """

    def __init__(self, references):
        # Kept for later summary preparation (see prepare_summaries).
        self.references = references
        # temperature=0 for deterministic extraction; 4096 caps the response size.
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        """Render every page of an uploaded PDF as a base64-encoded image.

        Args:
            content_table: an UploadFile-like object whose ``.file`` yields
                the raw PDF bytes.

        Returns:
            list[str] of base64-encoded page images, or a ``JSONResponse``
            (400 unreadable PDF, 404 no pages rendered). Callers must check
            for ``JSONResponse`` before iterating.
        """
        try:
            content_bytes = content_table.file.read()
            # Open the PDF directly from the in-memory bytes.
            document = fitz.open(stream=content_bytes, filetype="pdf")
        except Exception as e:
            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")

        pix_encoded_combined = []
        try:
            for page_number in range(len(document)):
                try:
                    page = document.load_page(page_number)
                    pix_encoded_combined.append(self._extract_image_as_base64(page))
                except Exception as e:
                    # Skip pages that fail to render rather than failing the
                    # whole upload.
                    print(f"Error processing page {page_number}: {e}")
                    continue
        finally:
            # BUGFIX: the document handle was previously never closed.
            document.close()

        if not pix_encoded_combined:
            return JSONResponse(status_code=404, content="No images found in the PDF")

        return pix_encoded_combined

    def extract_content_table(self, content_table):
        """Extract a topic structure from a table-of-contents PDF.

        Sends the rendered pages to the LLM with the topic-extraction
        prompts, refines the raw answer, and parses it into a dict.

        Returns:
            tuple[str, dict]: (refined LLM output, parsed topic dict), or a
            ``JSONResponse`` on error.
        """
        try:
            images = self.extract_pages(content_table)
            # BUGFIX: extract_pages signals errors by returning a
            # JSONResponse; previously this was iterated as if it were a
            # list, converting a 400/404 into a generic 500.
            if isinstance(images, JSONResponse):
                return images

            # NOTE: get_pixmap().tobytes() produces PNG bytes by default,
            # so the data URL is labelled image/png (was mislabelled jpeg).
            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image}",
                    },
                }
                for image in images
            ]

            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]

            extractor_output = self.llm.chat(messages)
            print("extractor output : ", extractor_output)

            # Second pass: normalize the free-form answer into the parseable
            # topic format expected by parse_topics_to_dict.
            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )
            print("refined extractor output : ", str(refined_extractor_output))

            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))

            return str(refined_extractor_output), extractor_dics

        except Exception as e:
            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        """Render *page* to a pixmap and return its PNG bytes base64-encoded.

        Raises on failure so the caller's per-page error handling can skip
        the page. (BUGFIX: previously returned a JSONResponse on error,
        which was then appended to the image list and embedded into a data
        URL as if it were an image.)
        """
        pix = page.get_pixmap()
        return base64.b64encode(pix.tobytes()).decode("utf-8")

    def index_summarizer_engine(self, topic, subtopic, index):
        """Build a query engine over *index* restricted to one topic/subtopic.

        Both metadata filters must match (AND): title == topic and
        category == subtopic.
        """
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )

        # Retrieve the 5 most similar nodes within the filtered subset.
        kwargs = {"similarity_top_k": 5, "filters": filters}
        query_engine = index.as_query_engine(**kwargs)

        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        # TODO: not yet implemented.
        pass

    def prepare_summaries(self):
        # TODO: not yet implemented (expected to consume self.references).
        pass