Spaces:

Nattyboi
/

resume-api

Running

App Files Files Community

resume-api / Ars /embedDoc.py

Nattyboi

Added ARS (#1)

f8a0c51 verified 7 days ago

raw

history blame contribute delete

3.25 kB

	import os
	import time
	from typing import Dict, List, Optional

	from dotenv import load_dotenv
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from pinecone import Pinecone

	# Load variables from a local .env file (if one exists) into the process
	# environment at import time, so the os.getenv("PINECONE_API_KEY") lookups
	# in the functions below can find the key.
	load_dotenv()
	def chunk_list(lst, chunk_size):
	    """Yield successive chunks of at most `chunk_size` items from `lst`.

	    Args:
	        lst: A sequence supporting ``len()`` and slicing (e.g. a list).
	        chunk_size: Maximum number of items per chunk; must be >= 1.

	    Yields:
	        Consecutive slices of ``lst`` in order. The final slice may be
	        shorter than ``chunk_size``; an empty ``lst`` yields nothing.

	    Raises:
	        ValueError: If ``chunk_size`` is less than 1. Because this is a
	            generator, the error is raised when iteration starts, not
	            when the function is called.
	    """
	    # A negative step would make range() empty and silently drop every
	    # item; a zero step raises an unhelpful error from range() itself.
	    if chunk_size < 1:
	        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
	    for start in range(0, len(lst), chunk_size):
	        yield lst[start:start + chunk_size]

	def upsert_text_with_chunks(
	    text: str,
	    *,
	    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
	    namespace: str = "default",
	    chunk_size: int = 1000,
	    chunk_overlap: int = 200,
	    batch_size: int = 50,
	    pause_seconds: float = 60,
	    id_prefix: str = "chunk",
	) -> None:
	    """
	    Split a long text into overlapping chunks and upsert them as records
	    into a Pinecone index (one that has integrated embedding enabled, so
	    only the raw text is sent).

	    Records are sent in batches, pausing between consecutive batches.
	    No pause follows the final batch.

	    Args:
	        text (str): The full text document to embed.
	        index_host (str): Pinecone index host URL.
	        namespace (str): Pinecone namespace to upsert into.
	        chunk_size (int): Max characters per chunk.
	        chunk_overlap (int): Overlap in characters between chunks.
	        batch_size (int): Number of records sent per upsert call (>= 1).
	        pause_seconds (float): Seconds to wait between consecutive batches;
	            0 disables the pause. Presumably this exists to stay under a
	            service rate limit -- confirm before lowering it.
	        id_prefix (str): Prefix for record ids, which have the form
	            ``"<id_prefix>-<chunk number>"``. With the default, ids are
	            ``chunk-0``, ``chunk-1``, ...; upserting a second document into
	            the same namespace with the same prefix reuses those ids, so
	            pass a per-document prefix to keep documents separate.

	    Raises:
	        ValueError: If ``batch_size`` is less than 1 or ``pause_seconds``
	            is negative.
	        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
	    """
	    if batch_size < 1:
	        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
	    if pause_seconds < 0:
	        raise ValueError(f"pause_seconds must be >= 0, got {pause_seconds}")

	    api_key = os.getenv("PINECONE_API_KEY")
	    if not api_key:
	        raise EnvironmentError("Set PINECONE_API_KEY in environment")

	    pc = Pinecone(api_key=api_key)
	    index = pc.Index(host=index_host)

	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        length_function=len,
	        is_separator_regex=False,
	    )
	    chunks = splitter.split_text(text)
	    if not chunks:
	        print("No chunks generated — exiting.")
	        return

	    records = [
	        {"_id": f"{id_prefix}-{i}", "text": chunk}
	        for i, chunk in enumerate(chunks)
	    ]

	    # Materialize the batches so the last one can be recognized: sleeping
	    # after the final upsert would only delay the caller.
	    batches = list(chunk_list(records, batch_size))
	    last_position = len(batches) - 1
	    for position, batch in enumerate(batches):
	        print("Inserting")
	        index.upsert_records(records=batch, namespace=namespace)
	        if position < last_position and pause_seconds > 0:
	            print("resting")
	            time.sleep(pause_seconds)
	    print(f"✅ Upserted {len(records)} valid chunks (out of {len(chunks)}) into namespace '{namespace}'.")





	from pinecone import Pinecone
	from typing import List, Dict

	def search_pinecone_text(
	    query_text: str,
	    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
	    namespace: str = "default",
	    top_k: int = 2,
	    fields: Optional[List[str]] = None,
	) -> List[Dict]:
	    """
	    Search a Pinecone index using a text query.

	    The API key is read from the ``PINECONE_API_KEY`` environment variable.

	    Args:
	        query_text (str): The input text to search for.
	        index_host (str): The specific index host URL.
	        namespace (str): The namespace to search within.
	        top_k (int): Number of top results to return.
	        fields (Optional[List[str]]): Record fields to request from the
	            index. Defaults to ``["category", "text"]``. ``"text"`` is
	            always requested because every result is built from it; the
	            caller's list is never modified.

	    Returns:
	        List[Dict]: One ``{"text": ..., "score": ...}`` dict per hit, in
	        the order the index returned them.

	    Raises:
	        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
	    """
	    api_key = os.getenv("PINECONE_API_KEY")
	    if not api_key:
	        raise EnvironmentError("Set PINECONE_API_KEY in environment")

	    # Copy instead of sharing a mutable default, and make sure "text" is
	    # present: the result construction below indexes hit["fields"]["text"].
	    requested_fields = ["category", "text"] if fields is None else list(fields)
	    if "text" not in requested_fields:
	        requested_fields.append("text")

	    pc = Pinecone(api_key=api_key)
	    index = pc.Index(host=index_host)

	    results = index.search(
	        namespace=namespace,
	        query={"inputs": {"text": query_text}, "top_k": top_k},
	        fields=requested_fields,
	    )
	    return [
	        {"text": hit["fields"]["text"], "score": hit["_score"]}
	        for hit in results.result["hits"]
	    ]