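# Spaces: Running
# (Non-code page-header text captured along with this source; kept as a
# comment so that the module remains importable.)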
import os
import time
from typing import Dict, List, Optional

from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone
# Load variables from a local .env file (if present) into the process
# environment at import time, so the functions in this module can read
# PINECONE_API_KEY through os.getenv.
load_dotenv()
def chunk_list(lst, chunk_size):
    """Yield successive slices of at most ``chunk_size`` items from ``lst``.

    Args:
        lst: Any sliceable sequence with a length (list, tuple, str, ...).
        chunk_size (int): Maximum number of items per yielded slice.

    Yields:
        Consecutive slices of ``lst``; only the final slice may be shorter
        than ``chunk_size``. Nothing is yielded for an empty sequence.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error, so both
    # are rejected explicitly.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]
def upsert_text_with_chunks(
    text: str,
    *,
    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
    namespace: str = "default",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    batch_size: int = 50,
    pause_seconds: float = 60,
    id_prefix: str = "chunk",
) -> None:
    """Split a long text into overlapping chunks and upsert them into Pinecone.

    Each record carries only an ``_id`` and the raw chunk ``text`` (no
    vectors), so the target index is expected to have integrated embedding
    enabled.

    Args:
        text (str): The full text document to embed.
        index_host (str): Pinecone index host URL.
        namespace (str): Pinecone namespace to upsert into.
        chunk_size (int): Max characters per chunk.
        chunk_overlap (int): Overlap in characters between chunks.
        batch_size (int): Number of records sent per upsert request.
        pause_seconds (float): Seconds to wait between two consecutive
            batches. Values <= 0 disable the pause. No pause follows the
            final batch; callers that upsert several documents back to back
            should throttle between calls themselves.
        id_prefix (str): Prefix of the record IDs, which are built as
            ``f"{id_prefix}-{i}"``. Use a distinct prefix per document when
            several documents share a namespace, otherwise their records
            reuse the same IDs.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
    """
    # Validate cheap arguments first so bad input fails before any client
    # object is created or any request is sent.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = splitter.split_text(text)
    if not chunks:
        print("No chunks generated — exiting.")
        return

    records = [
        {"_id": f"{id_prefix}-{i}", "text": chunk}
        for i, chunk in enumerate(chunks)
    ]

    pc = Pinecone(api_key=api_key)
    index = pc.Index(host=index_host)

    for batch_number, batch in enumerate(chunk_list(records, batch_size)):
        # Pause only *between* batches: waiting after the last one would
        # just block the caller without spacing out any further request.
        if batch_number and pause_seconds > 0:
            print("resting")
            time.sleep(pause_seconds)
        print("Inserting")
        index.upsert_records(records=batch, namespace=namespace)

    print(f"✅ Upserted {len(records)} valid chunks (out of {len(chunks)}) into namespace '{namespace}'.")
from pinecone import Pinecone | |
from typing import List, Dict | |
def search_pinecone_text(
    query_text: str,
    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
    namespace: str = "default",
    top_k: int = 2,
    fields: Optional[List[str]] = None,
) -> List[Dict]:
    """Search a Pinecone index using a text query.

    The API key is read from the ``PINECONE_API_KEY`` environment variable.

    Args:
        query_text (str): The input text to search for.
        index_host (str): The specific index host URL.
        namespace (str): The namespace to search within.
        top_k (int): Number of top results to return.
        fields (List[str] | None): Record fields to request for each hit.
            ``None`` means ``["category", "text"]``. The list must include
            ``"text"``, because every result is built from that field.

    Returns:
        List[Dict]: One ``{"text": ..., "score": ...}`` dict per hit, in the
        order returned by the index. Empty when ``query_text`` is blank.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
        KeyError: If a hit does not contain a ``"text"`` field.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # A blank query cannot match anything meaningful; answer locally instead
    # of spending a request on it.
    if not query_text or not query_text.strip():
        return []

    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")

    # Build a fresh list per call so no default object is shared between
    # invocations and the caller's own list is never handed to the client.
    requested_fields = ["category", "text"] if fields is None else list(fields)

    pc = Pinecone(api_key=api_key)
    index = pc.Index(host=index_host)
    results = index.search(
        namespace=namespace,
        query={"inputs": {"text": query_text}, "top_k": top_k},
        fields=requested_fields,
    )

    return [
        {"text": hit["fields"]["text"], "score": hit["_score"]}
        for hit in results.result["hits"]
    ]