Spaces:
Sleeping
Sleeping
import nest_asyncio | |
import os | |
from dotenv import load_dotenv | |
from jinja2 import Template | |
from pydantic import BaseModel, Field | |
from pymongo.mongo_client import MongoClient | |
from llama_index.program.openai import OpenAIPydanticProgram | |
from llama_index.core.extractors import PydanticProgramExtractor | |
from llama_index.llms.openai import OpenAI | |
from core.prompt import ADD_METADATA_TEMPLATE | |
from core.summarization.summarizer import SummarizeGenerator | |
# Patch asyncio so an already-running event loop can be re-entered
# (e.g. when this module is used from a notebook or another async host).
nest_asyncio.apply()

# Load variables from a local .env file into the process environment;
# extract_topic() reads MONGO_URI from there via os.getenv.
load_dotenv()
# NOTE(review): pydantic publishes the class docstring and each Field
# description as part of the model's JSON schema, which presumably reaches
# the LLM through OpenAIPydanticProgram. Both are therefore kept verbatim;
# the long literals are only split across lines.
class NodeMetadata(BaseModel):
    """Metadata for nodes, capturing topic and subtopic from the book."""

    # Required field (``...`` means "no default"): the broad theme.
    topic: str = Field(
        ...,
        description=(
            "The main subject or category that the node is associated with, "
            "representing a broad theme within the book."
        ),
    )

    # Required field: the narrower section beneath ``topic``.
    subtopic: str = Field(
        ...,
        description=(
            "A more specific aspect or section under the main topic, "
            "refining the context of the node within the book."
        ),
    )
def extract_topic(references, content_table):
    """Persist a book's extracted topics and build a topic metadata extractor.

    The content table is processed by ``SummarizeGenerator``, the resulting
    dictionary is stored (together with the book title) in the
    ``topic_collection`` collection of the ``summarizer`` MongoDB database,
    and the rendered ``ADD_METADATA_TEMPLATE`` prompt is used to configure an
    extractor whose output class is ``NodeMetadata``.

    Args:
        references: Mapping describing the book. Must contain a ``"title"``
            key; the whole mapping is also passed to ``SummarizeGenerator``.
        content_table: Table of contents forwarded to
            ``SummarizeGenerator.extract_content_table``.

    Returns:
        A ``PydanticProgramExtractor`` wrapping an ``OpenAIPydanticProgram``
        configured with the ``gpt-4o-mini`` model.

    Raises:
        KeyError: If ``references`` has no ``"title"`` entry.
        Exception: Errors raised by the summarizer or by ``insert_one`` are
            not caught here and propagate to the caller.
    """
    # NOTE(review): if MONGO_URI is unset, os.getenv returns None and
    # MongoClient falls back to its default host — confirm this is intended.
    uri = os.getenv("MONGO_URI")

    # The client owns a connection pool; the ``with`` block guarantees it is
    # closed even when summarisation or the insert raises.
    with MongoClient(uri) as client:
        try:
            client.admin.command('ping')
        except Exception as e:
            # The ping is a diagnostic only: report the problem and carry on,
            # any real connectivity failure will surface at insert_one.
            print(e)
        else:
            print("Pinged your deployment. You successfully connected to MongoDB!")

        # Database "summarizer", collection "topic_collection".
        collection = client["summarizer"]["topic_collection"]

        generate_content_table = SummarizeGenerator(references)
        extractor_output, extractor_dics = (
            generate_content_table.extract_content_table(content_table)
        )
        print(extractor_output)

        # Unpack extractor_dics after "title": on a key clash the extracted
        # value wins, exactly as in a plain dict literal.
        data_to_insert = {
            "title": references["title"],
            **extractor_dics,
        }
        collection.insert_one(data_to_insert)

    # Template.render already returns a str, so no extra conversion is needed.
    add_metadata_template = Template(ADD_METADATA_TEMPLATE).render(
        extractor_output=extractor_output
    )
    print("add metadata template : ", add_metadata_template)

    llm = OpenAI(temperature=0.1, model="gpt-4o-mini")

    # "{input}" is the prompt placeholder matched by ``input_key`` below.
    openai_program = OpenAIPydanticProgram.from_defaults(
        output_cls=NodeMetadata,
        prompt_template_str="{input}",
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    topic_extractor = PydanticProgramExtractor(
        program=openai_program,
        input_key="input",
        show_progress=True,
        extract_template_str=add_metadata_template,
        llm=llm,
    )
    return topic_extractor