resume-api / Ars /embedDoc.py
Nattyboi's picture
Added ARS (#1)
f8a0c51 verified
import os
import time
from typing import Dict, List, Optional

from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone
load_dotenv()
def chunk_list(lst, chunk_size):
    """Yield successive slices of at most `chunk_size` items from `lst`.

    Args:
        lst: Any sliceable sequence that supports ``len()`` (list, tuple, str).
        chunk_size (int): Maximum number of items per yielded slice; must be
            a positive integer.

    Yields:
        Consecutive slices of `lst`. Every slice has `chunk_size` items except
        possibly the last, which holds the remainder. An empty `lst` yields
        nothing.

    Raises:
        ValueError: If `chunk_size` is less than 1. Because this is a
            generator, the error is raised when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error, so both
    # are rejected explicitly.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]
def upsert_text_with_chunks(
    text: str,
    *,
    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
    namespace: str = "default",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    batch_size: int = 50,
    batch_pause_seconds: float = 60,
    id_prefix: str = "chunk",
) -> None:
    """
    Split a long text into overlapping chunks and upsert them into a Pinecone
    index that has integrated embedding enabled.

    The text is split with ``RecursiveCharacterTextSplitter``; each chunk
    becomes a record ``{"_id": "<id_prefix>-<i>", "text": <chunk>}`` and the
    records are sent in batches via ``index.upsert_records``.

    Args:
        text (str): The full text document to embed.
        index_host (str): Pinecone index host URL.
        namespace (str): Pinecone namespace to upsert into.
        chunk_size (int): Max characters per chunk.
        chunk_overlap (int): Overlap in characters between chunks.
        batch_size (int): Number of records sent per upsert call.
        batch_pause_seconds (float): Seconds to wait between consecutive
            batches. No pause is made after the final batch.
        id_prefix (str): Prefix for record IDs. IDs depend only on this prefix
            and the chunk position, so upserting a second document with the
            same prefix into the same namespace reuses the IDs of the first
            (presumably replacing those records -- confirm against Pinecone
            upsert semantics). Pass a per-document prefix to keep several
            documents side by side.

    Raises:
        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
        ValueError: If `batch_size` is less than 1.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    pc = Pinecone(api_key=api_key)
    index = pc.Index(host=index_host)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = splitter.split_text(text)
    if not chunks:
        print("No chunks generated — exiting.")
        return

    records = [
        {"_id": f"{id_prefix}-{i}", "text": chunk}
        for i, chunk in enumerate(chunks)
    ]

    # Materialise the batches so the last one can be recognised and the
    # trailing pause skipped.
    batches = list(chunk_list(records, batch_size))
    last_batch_number = len(batches)
    for batch_number, batch in enumerate(batches, start=1):
        print("Inserting")
        index.upsert_records(records=batch, namespace=namespace)
        # NOTE(review): the pause presumably keeps the upserts under a Pinecone
        # rate limit -- confirm. It is only needed when another batch follows.
        if batch_number < last_batch_number and batch_pause_seconds > 0:
            time.sleep(batch_pause_seconds)
            print("resting")

    print(f"✅ Upserted {len(records)} valid chunks (out of {len(chunks)}) into namespace '{namespace}'.")
from pinecone import Pinecone
from typing import List, Dict
def search_pinecone_text(
    query_text: str,
    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
    namespace: str = "default",
    top_k: int = 2,
    fields: Optional[List[str]] = None,
) -> List[Dict]:
    """
    Search a Pinecone index using a text query.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable.

    Args:
        query_text (str): The input text to search for.
        index_host (str): The specific index host URL.
        namespace (str): The namespace to search within.
        top_k (int): Number of top results to return.
        fields (Optional[List[str]]): Record fields to request from the index.
            Defaults to ``["category", "text"]``. The list must include
            ``"text"``, because each hit's ``text`` field is read to build
            the result.

    Returns:
        List[Dict]: One ``{"text": ..., "score": ...}`` dict per hit, in the
        order the index returned them.

    Raises:
        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    # Fail with the same explicit error as the upsert helper instead of
    # handing a missing key to the client.
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")

    # Build the default per call so no list object is shared between calls.
    requested_fields = ["category", "text"] if fields is None else fields

    pc = Pinecone(api_key=api_key)
    index = pc.Index(host=index_host)
    results = index.search(
        namespace=namespace,
        query={"inputs": {"text": query_text}, "top_k": top_k},
        fields=requested_fields,
    )

    hits = results.result['hits']
    return [
        {"text": hit['fields']['text'], "score": hit['_score']}
        for hit in hits
    ]