File size: 3,252 Bytes
f8a0c51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from pinecone import Pinecone
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
import time

load_dotenv()
def chunk_list(lst, chunk_size):
    """Yield successive slices of *lst*, each at most ``chunk_size`` items long.

    The final slice may be shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``. An empty list yields nothing.
    """
    start = 0
    total = len(lst)
    while start < total:
        yield lst[start:start + chunk_size]
        start += chunk_size
        
def upsert_text_with_chunks(
    text: str,
    *,
    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
    namespace: str = "default",
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> None:
    """
    Splits a long text into overlapping chunks and upserts them directly into a Pinecone index
    that has integrated embedding enabled.

    Args:
        text (str): The full text document to embed.
        index_host (str): Pinecone index host URL.
        namespace (str): Pinecone namespace to upsert into.
        chunk_size (int): Max characters per chunk.
        chunk_overlap (int): Overlap in characters between chunks.

    Raises:
        EnvironmentError: If PINECONE_API_KEY is not set in the environment.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")

    pc = Pinecone(api_key=api_key)
    index = pc.Index(host=index_host)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False
    )
    chunks = splitter.split_text(text)
    if not chunks:
        print("No chunks generated — exiting.")
        return

    # Records for an integrated-embedding index: Pinecone embeds the "text"
    # field server-side, so no vector values are sent from here.
    records = [
        {
            "_id": f"chunk-{i}",
            "text": chunk
        }
        for i, chunk in enumerate(chunks)
    ]

    # Upsert in batches of 50. Pause 60 s BETWEEN batches only (rate limiting);
    # the original slept after every batch, wasting a full minute at the end.
    batches = list(chunk_list(records, 50))
    for batch_num, batch in enumerate(batches, start=1):
        print(f"Inserting batch {batch_num}/{len(batches)}")
        index.upsert_records(records=batch, namespace=namespace)
        if batch_num < len(batches):
            print("resting")
            time.sleep(60)
    print(f"✅ Upserted {len(records)} chunks into namespace '{namespace}'.")




    
from pinecone import Pinecone
from typing import List, Dict

def search_pinecone_text(
    query_text: str,
    index_host: str = "https://resume-42eo81u.svc.aped-4627-b74a.pinecone.io",
    namespace: str = "default",
    top_k: int = 2,
    fields: "List[str] | None" = None
) -> List[Dict]:
    """
    Search a Pinecone index using a text query.

    The API key is read from the PINECONE_API_KEY environment variable.

    Args:
        query_text (str): The input text to search for.
        index_host (str): The specific index host URL.
        namespace (str): The namespace to search within.
        top_k (int): Number of top results to return.
        fields (List[str] | None): Metadata fields to include in the response.
            Defaults to ["category", "text"] when None.

    Returns:
        List[Dict]: One {"text": ..., "score": ...} dict per hit.

    Raises:
        EnvironmentError: If PINECONE_API_KEY is not set in the environment.
    """
    # Avoid a mutable default argument: build the default list per call.
    if fields is None:
        fields = ["category", "text"]

    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise EnvironmentError("Set PINECONE_API_KEY in environment")

    pc = Pinecone(api_key=api_key)
    index = pc.Index(host=index_host)

    # Integrated-embedding search: Pinecone embeds query_text server-side.
    results = index.search(
        namespace=namespace,
        query={"inputs": {"text": query_text}, "top_k": top_k},
        fields=fields
    )
    hits = results.result['hits']
    return [
        {"text": hit['fields']['text'], "score": hit['_score']}
        for hit in hits
    ]