Spaces:
Sleeping
Sleeping
File size: 2,443 Bytes
9002555 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import nest_asyncio
import os
from dotenv import load_dotenv
from jinja2 import Template
from pydantic import BaseModel, Field
from pymongo.mongo_client import MongoClient
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.extractors import PydanticProgramExtractor
from llama_index.llms.openai import OpenAI
from core.prompt import ADD_METADATA_TEMPLATE
from core.summarization.summarizer import SummarizeGenerator
nest_asyncio.apply()
load_dotenv()
class NodeMetadata(BaseModel):
    """Metadata for nodes, capturing topic and subtopic from the book."""

    # NOTE: the class docstring and the field descriptions are kept verbatim;
    # pydantic includes them in the schema generated for this model, so they
    # are effectively runtime text rather than plain comments.

    # Required field (``...`` means "no default"): the broad theme of a node.
    topic: str = Field(
        ...,
        description=(
            "The main subject or category that the node is associated with, "
            "representing a broad theme within the book."
        ),
    )

    # Required field: the narrower section that sits under ``topic``.
    subtopic: str = Field(
        ...,
        description=(
            "A more specific aspect or section under the main topic, "
            "refining the context of the node within the book."
        ),
    )
def extract_topic(references, content_table) -> "PydanticProgramExtractor":
    """Store the extracted content table and build a topic extractor.

    The function performs two steps:

    1. It connects to the MongoDB deployment named by the ``MONGO_URI``
       environment variable, runs
       ``SummarizeGenerator(references).extract_content_table(content_table)``
       and inserts the resulting dictionary, together with
       ``references["title"]``, into the ``topic_collection`` collection of
       the ``summarizer`` database.
    2. It renders ``ADD_METADATA_TEMPLATE`` with the extracted output and
       uses the rendered text to configure an ``OpenAIPydanticProgram``
       (output class ``NodeMetadata``) wrapped in a
       ``PydanticProgramExtractor``.

    Args:
        references: Subscriptable object providing a ``"title"`` entry; it is
            also handed unchanged to ``SummarizeGenerator``.
        content_table: Handed unchanged to ``extract_content_table``.

    Returns:
        The configured ``PydanticProgramExtractor``.

    Raises:
        Any exception raised by the content-table extraction, by the
        ``references["title"]`` lookup, or by the MongoDB insert propagates
        to the caller. Only a failure of the initial ``ping`` is printed and
        ignored.
    """
    uri = os.getenv("MONGO_URI")

    # Scope the client with ``with`` so it is closed once the document has
    # been stored (and also when extraction or the insert raises). Previously
    # a new client was created on every call and never closed.
    with MongoClient(uri) as client:
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            # Best-effort connectivity check: report the problem and carry
            # on; a real outage will surface again at ``insert_one`` below.
            print(e)

        # Database "summarizer", collection "topic_collection".
        collection = client["summarizer"]["topic_collection"]

        generate_content_table = SummarizeGenerator(references)
        extractor_output, extractor_dics = generate_content_table.extract_content_table(content_table)
        print(extractor_output)

        # ``extractor_dics`` is unpacked last, so a "title" key inside it
        # would take precedence over ``references["title"]``.
        data_to_insert = {
            "title": references["title"],
            **extractor_dics,
        }
        collection.insert_one(data_to_insert)

    # The database is no longer needed from here on; only the extracted
    # output is used to build the prompt for the metadata extractor.
    add_metadata_template = str(
        Template(ADD_METADATA_TEMPLATE).render(extractor_output=extractor_output)
    )
    print("add metadata template : ", add_metadata_template)

    llm = OpenAI(temperature=0.1, model="gpt-4o-mini")

    openai_program = OpenAIPydanticProgram.from_defaults(
        output_cls=NodeMetadata,
        prompt_template_str="{input}",
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    topic_extractor = PydanticProgramExtractor(
        program=openai_program,
        input_key="input",
        show_progress=True,
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    return topic_extractor
|