"""Helpers for chunking text into a Pinecone index and searching it by text.

Both entry points read the Pinecone API key from the ``PINECONE_API_KEY``
environment variable (a ``.env`` file is loaded at import time).
"""

import os
import time
from typing import Dict, Iterator, List, Optional

from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone

load_dotenv()

# Default index host shared by the upsert and search entry points.
_DEFAULT_INDEX_HOST = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io"


def chunk_list(lst, chunk_size) -> Iterator:
    """Yield successive chunks of size `chunk_size` from list.

    Args:
        lst: A sliceable sequence (anything supporting ``len`` and slicing).
        chunk_size (int): Maximum number of items per chunk; must be positive.

    Yields:
        Consecutive slices of ``lst``; the last one may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]


def _get_index(index_host: str):
    """Return a Pinecone index client for `index_host`, keyed from the environment.

    Raises:
        EnvironmentError: If ``PINECONE_API_KEY`` is unset or empty.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")
    return Pinecone(api_key=api_key).Index(host=index_host)


def upsert_text_with_chunks(
    text: str,
    *,
    index_host: str = _DEFAULT_INDEX_HOST,
    namespace: str = "default",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    batch_size: int = 50,
    pause_seconds: float = 60,
    id_prefix: str = "chunk",
) -> None:
    """
    Splits a long text into overlapping chunks and upserts them directly
    into a Pinecone index that has integrated embedding enabled.

    Args:
        text (str): The full text document to embed.
        index_host (str): Pinecone index host URL.
        namespace (str): Pinecone namespace to upsert into.
        chunk_size (int): Max characters per chunk.
        chunk_overlap (int): Overlap in characters between chunks.
        batch_size (int): Number of records sent per upsert call.
        pause_seconds (float): Seconds to wait between consecutive batches.
        id_prefix (str): Prefix for record ids, which are formed as
            ``f"{id_prefix}-{i}"``. Use a distinct prefix per document when
            several documents share a namespace, since equal ids collide.

    Raises:
        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
        ValueError: If ``batch_size`` is not positive.
    """
    index = _get_index(index_host)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = splitter.split_text(text)

    if not chunks:
        print("No chunks generated — exiting.")
        return

    records = [
        {"_id": f"{id_prefix}-{i}", "text": chunk}
        for i, chunk in enumerate(chunks)
    ]

    batches = list(chunk_list(records, batch_size))
    for position, batch in enumerate(batches, start=1):
        print("Inserting")
        index.upsert_records(records=batch, namespace=namespace)
        # Pause only BETWEEN batches (presumably to respect service rate
        # limits — confirm); sleeping after the final batch just delays return.
        if position < len(batches):
            time.sleep(pause_seconds)
            print("resting")

    print(f"✅ Upserted {len(records)} valid chunks (out of {len(chunks)}) into namespace '{namespace}'.")


def search_pinecone_text(
    query_text: str,
    index_host: str = _DEFAULT_INDEX_HOST,
    namespace: str = "default",
    top_k: int = 2,
    fields: Optional[List[str]] = None,
) -> List[Dict]:
    """
    Search a Pinecone index using a text query.

    Args:
        query_text (str): The input text to search for.
        index_host (str): The specific index host URL.
        namespace (str): The namespace to search within.
        top_k (int): Number of top results to return.
        fields (List[str]): Record fields to include in the response.
            Defaults to ``["category", "text"]``. Each hit's ``"text"`` field
            is read below, so keep ``"text"`` in this list.

    Returns:
        List[Dict]: One ``{"text": ..., "score": ...}`` dict per hit, in the
        order the service returned them.

    Raises:
        EnvironmentError: If ``PINECONE_API_KEY`` is not set.
    """
    # Build the default per call so no list object is shared between calls.
    if fields is None:
        fields = ["category", "text"]

    index = _get_index(index_host)
    response = index.search(
        namespace=namespace,
        query={"inputs": {"text": query_text}, "top_k": top_k},
        fields=fields,
    )

    return [
        {"text": hit["fields"]["text"], "score": hit["_score"]}
        for hit in response.result["hits"]
    ]