|
from smolagents import Tool |
|
from docling.document_converter import DocumentConverter |
|
from docling.chunking import HierarchicalChunker |
|
from sentence_transformers import SentenceTransformer, util |
|
import torch |
|
|
|
|
|
class ContentRetrieverTool(Tool):
    """Fetch a document and return the markdown chunks most relevant to a query.

    The document at ``url`` is converted with docling, split into hierarchical
    chunks, and ranked against the query using sentence embeddings.  Both the
    chunk body text and its surrounding context (headings etc., as produced by
    ``HierarchicalChunker.contextualize``) are scored separately, so a chunk
    can be selected either because its text matches the query or because its
    structural context does.
    """

    name = "retrieve_content"
    description = """Retrieve the content of a webpage or document in markdown format. Supports PDF, DOCX, XLSX, HTML, images, and more."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL or local path of the webpage or document to retrieve.",
        },
        "query": {
            "type": "string",
            "description": "The subject on the page you are looking for. The shorter the more relevant content is returned.",
        },
    }
    output_type = "string"

    def __init__(
        self,
        model_name: str | None = None,
        threshold: float = 0.2,
        **kwargs,
    ):
        """Initialize the converter, embedding model, and chunker.

        Args:
            model_name: SentenceTransformer model id to use for embeddings.
                Defaults to ``"all-MiniLM-L6-v2"`` when ``None``.
            threshold: Cumulative softmax-probability mass of chunks to keep
                per query term (nucleus-style cutoff); higher values return
                more content.
            **kwargs: Forwarded to ``Tool.__init__``.
        """
        self.threshold = threshold
        self._document_converter = DocumentConverter()
        self._model = SentenceTransformer(
            model_name if model_name is not None else "all-MiniLM-L6-v2"
        )
        self._chunker = HierarchicalChunker()

        super().__init__(**kwargs)

    def forward(self, url: str, query: str) -> str:
        """Retrieve and rank document content relevant to ``query``.

        Args:
            url: URL or local path of the document to retrieve.
            query: Search subject; comma-separated terms are scored
                independently.

        Returns:
            The selected contextualized chunks joined by blank lines, ordered
            so the most relevant chunk comes last, or ``"No content found."``
            when nothing could be selected.
        """
        document = self._document_converter.convert(url).document

        chunks = list(self._chunker.chunk(dl_doc=document))
        if not chunks:
            return "No content found."

        chunk_texts = [chunk.text for chunk in chunks]
        contextualized = [self._chunker.contextualize(chunk) for chunk in chunks]
        # Context alone (headings/captions), with the chunk body stripped out,
        # so structure can be scored independently of the body text.
        contexts = [
            contextualized[i].replace(chunk_texts[i], "").strip()
            for i in range(len(chunks))
        ]

        text_embeddings = self._model.encode(chunk_texts, convert_to_tensor=True)
        context_embeddings = self._model.encode(contexts, convert_to_tensor=True)
        # Each non-empty comma-separated term gets its own embedding row, so
        # every term independently contributes chunks to the selection.
        query_embedding = self._model.encode(
            [term.strip() for term in query.split(",") if term.strip()],
            convert_to_tensor=True,
        )

        selected_indices: list[int] = []
        for embeddings in (context_embeddings, text_embeddings):
            # util.cos_sim supersedes the deprecated pytorch_cos_sim alias;
            # both return the same (num_terms x num_chunks) similarity matrix.
            for cos_scores in util.cos_sim(query_embedding, embeddings):
                probabilities = torch.nn.functional.softmax(cos_scores, dim=0)
                sorted_indices = torch.argsort(probabilities, descending=True)

                # Greedily take best-scoring chunks until `threshold` of the
                # probability mass is covered (nucleus-style selection).
                cumulative = 0.0
                for i in sorted_indices:
                    cumulative += probabilities[i].item()
                    selected_indices.append(i.item())
                    if cumulative >= self.threshold:
                        break

        # Deduplicate keeping first-seen (highest-relevance) order, then
        # reverse so the most relevant chunk ends up last in the output.
        selected_indices = list(dict.fromkeys(selected_indices))[::-1]

        if not selected_indices:
            return "No content found."

        return "\n\n".join(contextualized[idx] for idx in selected_indices)
|
|