Spaces:
Sleeping
Sleeping
File size: 4,497 Bytes
9002555 d57efd6 9002555 d57efd6 9002555 d57efd6 9002555 d57efd6 9002555 d57efd6 9002555 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
from io import BytesIO
import os
import base64
import fitz
from fastapi.responses import JSONResponse
from llama_index.core.vector_stores import (
MetadataFilter,
MetadataFilters,
FilterCondition,
)
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.llms.openai import OpenAI
from core.parser import parse_topics_to_dict
from llama_index.core.llms import ChatMessage
from core.prompt import (
SYSTEM_TOPIC_TEMPLATE,
USER_TOPIC_TEMPLATE,
REFINED_GET_TOPIC_TEMPLATE,
)
# from langfuse.openai import openai
class SummarizeGenerator:
    """Extract a table of contents from an uploaded PDF via a vision LLM.

    Pipeline: render each PDF page to a base64 image, send the images to a
    vision-capable chat model to pull out topics, refine the raw topic text
    with a follow-up completion, and parse the result into a dict.
    """

    def __init__(self, references):
        # Reference material supplied by the caller; only stored here —
        # none of the methods visible in this class read it yet.
        self.references = references
        # temperature=0 for deterministic extraction; gpt-4o-mini accepts
        # image inputs, which extract_content_table relies on.
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        """Render every page of an uploaded PDF as a base64-encoded image.

        Args:
            content_table: upload object exposing a binary file-like ``.file``
                attribute (e.g. a FastAPI ``UploadFile``).

        Returns:
            list[str]: one base64 string per successfully rendered page, or a
            ``JSONResponse`` on failure (400 if the PDF cannot be opened,
            404 if no page could be rendered).
        """
        try:
            content_bytes = content_table.file.read()
            # Distinct name: do not shadow the ``content_table`` parameter
            # with the parsed document (the original code did).
            pdf_doc = fitz.open(stream=content_bytes, filetype="pdf")
        except Exception as e:
            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")

        pix_encoded_combined = []
        try:
            for page_number in range(len(pdf_doc)):
                try:
                    page = pdf_doc.load_page(page_number)
                    pix_encoded_combined.append(self._extract_image_as_base64(page))
                except Exception as e:
                    # Best-effort: log and skip pages that fail to render.
                    print(f"Error processing page {page_number}: {e}")
                    continue
        finally:
            # Always release the document's resources (the original leaked it).
            pdf_doc.close()

        if not pix_encoded_combined:
            return JSONResponse(status_code=404, content="No images found in the PDF")
        return pix_encoded_combined

    def extract_content_table(self, content_table):
        """Extract and parse topics from a PDF's table-of-contents pages.

        Args:
            content_table: upload object with a binary file-like ``.file``.

        Returns:
            tuple[str, dict]: the refined topic text and its parsed dict, or a
            ``JSONResponse`` on failure (propagated from ``extract_pages``,
            or 500 for any other error).
        """
        try:
            images = self.extract_pages(content_table)
            # extract_pages signals failure with a JSONResponse; propagate it
            # instead of iterating over the response object (the original
            # code would have looped over it as if it were a list of images).
            if isinstance(images, JSONResponse):
                return images
            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        # _extract_image_as_base64 produces PNG bytes
                        # (fitz Pixmap.tobytes() defaults to PNG), so the
                        # data URL must declare image/png, not image/jpeg.
                        "url": f"data:image/png;base64,{image}",
                    },
                }
                for image in images
            ]
            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]
            extractor_output = self.llm.chat(messages)
            print("extractor output : ", extractor_output)
            # Second pass: ask the model to refine the raw topic listing into
            # the structured format parse_topics_to_dict expects.
            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )
            print("refined extractor output : ", str(refined_extractor_output))
            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))
            return str(refined_extractor_output), extractor_dics
        except Exception as e:
            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        """Render a single fitz page to base64-encoded PNG bytes.

        Raises on failure so the per-page loop in ``extract_pages`` can skip
        the page. (The original returned a JSONResponse on error, which the
        caller then appended to the image list as if it were a base64 string.)
        """
        pix = page.get_pixmap()
        # tobytes() with no argument emits PNG-format bytes.
        pix_bytes = pix.tobytes()
        return base64.b64encode(pix_bytes).decode("utf-8")

    def index_summarizer_engine(self, topic, subtopic, index):
        """Build a query engine over *index* scoped to one topic/subtopic.

        Args:
            topic: value matched against the ``title`` metadata key.
            subtopic: value matched against the ``category`` metadata key.
            index: a llama_index index exposing ``as_query_engine``.

        Returns:
            A query engine restricted (AND condition) to nodes whose metadata
            matches both filters, retrieving the top 5 similar nodes.
        """
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )
        kwargs = {"similarity_top_k": 5, "filters": filters}
        query_engine = index.as_query_engine(**kwargs)
        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        # TODO: not yet implemented.
        pass

    def prepare_summaries(self):
        # TODO: not yet implemented.
        pass