Update pinecone_embeddings.py

pinecone_embeddings.py  CHANGED  (+147 -152)
@@ -1,153 +1,148 @@

from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import BM25Encoder  # For BM25 sparse vectors
import numpy as np
import re
import hashlib
from typing import List, Dict
import os
import time

from sentence_transformers import SentenceTransformer

# Initialize Pinecone at the module level.
# It's better to initialize it once at the start of your application.
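
# For illustration, a single client created once at startup might look like
# (assuming the key is exported as PINECONE_API_KEY):
#   pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# and would then be passed into get_index() and PineconeVectorStore below.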


def get_index(pinecone: Pinecone, index_name: str):
    # 1. Delete existing index if it exists
    if index_name in pinecone.list_indexes().names():
        print(f"Deleting existing index: {index_name}")
        pinecone.delete_index(index_name)
        print(f"Index {index_name} deleted.")
    else:
        print(f"Index {index_name} does not exist, no deletion necessary.")

    # 2. Create a fresh index
    print(f"Creating new index: {index_name}")
    pinecone.create_index(  # Corrected from create_index_for_model
        name=index_name,
        metric="cosine",
        dimension=768,  # matches the BAAI/bge-base-en-v1.5 embeddings generated at upsert time
        # Note: integrated-embedding indexes (with 'embed'/'field_map') are created via
        # create_index_for_model; this index stores vectors that are embedded locally
        # before upserting (see PineconeVectorStore.overwrite_vectors).
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"Index {index_name} created.")

    # 3. Wait for the index to be ready
    while not pinecone.describe_index(index_name).status['ready']:
        print("Waiting for index to be ready...")
        time.sleep(5)
    print(f"Index {index_name} is now ready.")

    return pinecone.Index(index_name)
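
# For comparison (a sketch only, not used by this module): an index with Pinecone-side
# integrated embedding would instead be created via create_index_for_model, e.g.
#   pinecone.create_index_for_model(
#       name=index_name,
#       cloud="aws",
#       region="us-east-1",
#       embed={"model": "llama-text-embed-v2", "field_map": {"text": "text"}},
#   )
# Here the dense vectors are computed locally with sentence-transformers instead.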

# class EmbeddingEngine:
#     def __init__(self, model_name: str = 'BAAI/bge-base-en-v1.5'):
#         # For now, we are relying on Pinecone's integrated embedding for dense vectors.
#         # This class might be used for other purposes or for local embedding generation if needed later.
#         self.model = SentenceTransformer(model_name)
#         self.model.max_seq_length = 512

#     def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
#         prefixed_texts = [
#             f"query: {text}" if "query:" not in text.lower() else text
#             for text in texts
#         ]
#         return self.model.encode(prefixed_texts, batch_size=batch_size, convert_to_numpy=True)


class PineconeVectorStore:
    def __init__(self, index_name: str, pinecone: Pinecone, dimension: int = 768):  # matches the 768-dim index created in get_index
        self.index_name = index_name
        self.dimension = dimension

        self.index = pinecone.Index(index_name)

        # Initialize BM25 encoder for sparse vectors
        self.bm25_encoder = BM25Encoder()

        # Fit the BM25 encoder on a representative corpus of your data.
        # This is crucial for BM25's effectiveness.
        # For this example, we fit on a small sample; in a real scenario,
        # you'd fit it on a larger corpus of your document chunks.
        print("Fitting BM25Encoder...")
        sample_corpus = [
            "This is a document about machine learning.",
            "Another document discussing natural language processing.",
            "A third document focused on artificial intelligence applications.",
        ]
        self.bm25_encoder.fit(sample_corpus)
        print("BM25Encoder fitted.")
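
    # Illustrative only: in a real pipeline the encoder would typically be fitted on the
    # actual chunk texts rather than the toy sample above, e.g.
    #   self.bm25_encoder.fit([chunk["text"] for chunk in document_chunks])
    # using the same list that is later passed to overwrite_vectors().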

    def overwrite_vectors(self, document_chunks, pdf_filename: str, pinecone: Pinecone):
        """
        Clears all vectors from the index and uploads new ones for the given PDF.
        """
        # ✅ Clear all existing vectors from the index
        print("Deleting all vectors in current index...")
        self.index.delete(delete_all=True)
        print("All vectors deleted.")

        # Step 1: Generate dense embeddings locally with SentenceTransformer
        inputs = [f"query: {chunk['text']}" for chunk in document_chunks]
        model = SentenceTransformer('BAAI/bge-base-en-v1.5')
        embeddings = model.encode(inputs, batch_size=32, convert_to_numpy=True).tolist()

        # Step 2: Prepare vectors for upsert
        records_to_upsert = []
        for i, chunk in enumerate(document_chunks):
            doc_id = hashlib.md5(f"{pdf_filename}-{chunk['text']}".encode('utf-8')).hexdigest()
            sparse_vector = self.bm25_encoder.encode_documents([chunk["text"]])

            records_to_upsert.append({
                "id": doc_id,
                "values": embeddings[i],
                # "sparse_values": sparse_vector[0],  # Optional hybrid retrieval
                "metadata": {
                    "text": chunk['text'],
                    "header": chunk['header'],
                    "page": chunk['page'],
                    "type": chunk['type']
                }
            })

        # Step 3: Upsert to Pinecone in batches
        batch_size = 100
        for i in range(0, len(records_to_upsert), batch_size):
            batch = records_to_upsert[i:i + batch_size]
            self.index.upsert(vectors=batch)

        print(f"✅ Successfully uploaded {len(records_to_upsert)} chunks from {pdf_filename} to Pinecone.")
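
    # Expected shape of each item in document_chunks, inferred from the metadata fields
    # above (the values here are only an example):
    #   {"text": "Self-attention relates tokens to each other.",
    #    "header": "Background", "page": 3, "type": "paragraph"}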

    def retrieve_chunks(self, query_text: str, pinecone: Pinecone, top_k: int = 5):
        """
        Retrieves the top-k chunks for the query (dense search, with optional BM25 hybrid).
        """
        # Generate sparse vector for the query using BM25Encoder
        sparse_query_vector = self.bm25_encoder.encode_queries([query_text])

        model = SentenceTransformer('BAAI/bge-base-en-v1.5')
        embeddings = model.encode(f"query: {query_text}", batch_size=32, convert_to_numpy=True).tolist()

        query_results = self.index.query(
            vector=embeddings,
            # sparse_vector=sparse_query_vector[0],  # Include the sparse vector for hybrid search
            top_k=top_k,
            include_metadata=True,
            include_values=False  # No need to return the vectors themselves for RAG
        )

        retrieved_chunks = []
        for match in query_results['matches']:
            retrieved_chunks.append({
                "id": match['id'],
                "score": match['score'],
                "text": match['metadata']['text'],
                "header": match['metadata']['header'],
                "page": match['metadata']['page'],
                "type": match['metadata']['type']
            })

        return retrieved_chunks
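
A minimal end-to-end sketch of how the updated module might be driven; the index name, the environment variable, and the chunk contents below are illustrative assumptions rather than part of the commit:

    import os
    from pinecone import Pinecone
    from pinecone_embeddings import get_index, PineconeVectorStore

    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # assumed env var
    get_index(pc, "pdf-rag-index")                          # (re)creates the 768-dim index

    store = PineconeVectorStore("pdf-rag-index", pc)
    chunks = [
        {"text": "Transformers use self-attention.", "header": "Background", "page": 1, "type": "paragraph"},
        {"text": "BM25 is a classic sparse retrieval baseline.", "header": "Retrieval", "page": 2, "type": "paragraph"},
    ]
    store.overwrite_vectors(chunks, "example.pdf", pc)

    for hit in store.retrieve_chunks("What is self-attention?", pc, top_k=3):
        print(hit["score"], hit["header"], hit["text"])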