MRP999 committed on
Commit eeff9de · verified · 1 Parent(s): e313bf4

Update pinecone_embeddings.py

Files changed (1)
  1. pinecone_embeddings.py +147 -152
pinecone_embeddings.py CHANGED
@@ -1,153 +1,148 @@
- from pinecone import Pinecone, ServerlessSpec
- from pinecone_text.sparse import BM25Encoder  # For BM25 sparse vectors
- import numpy as np
- import re
- import hashlib
- from typing import List, Dict
- import os
- import time
-
- from sentence_transformers import SentenceTransformer
-
- # Initialize Pinecone at the module level
- # It's better to initialize it once at the start of your application
-
-
- def get_index(pinecone: Pinecone, index_name: str):
-     # 1. Delete existing index if it exists
-     if index_name in pinecone.list_indexes().names():
-         print(f"Deleting existing index: {index_name}")
-         pinecone.delete_index(index_name)
-         print(f"Index {index_name} deleted.")
-     else:
-         print(f"Index {index_name} does not exist, no deletion necessary.")
-
-     # 2. Create fresh index using create_index_for_model for integrated embedding
-     print(f"Creating new index: {index_name} with integrated 'llama-text-embed-v2'")
-     pinecone.create_index(  # Corrected from create_index_for_model
-         name=index_name,
-         metric="cosine",  # llama-text-embed-v2 uses cosine or dotproduct
-         dimension=768,  # default dimension for llama-text-embed-v2
-         # embed parameter should be at the top level of create_index for integrated models,
-         # and 'field_map' is not used directly in create_index for embedded models.
-         # Instead, it's inferred from the text being passed in the 'upsert' method.
-         # We will specify the embedding model when upserting.
-         spec=ServerlessSpec(cloud="aws", region="us-east-1")
-     )
-     print(f"Index {index_name} created.")
-
-     # 3. Wait for the index to be ready
-     while not pinecone.describe_index(index_name).status['ready']:
-         print("Waiting for index to be ready...")
-         time.sleep(5)
-     print(f"Index {index_name} is now ready.")
-
-     return pinecone.Index(index_name)
-
- # class EmbeddingEngine:
- #     def __init__(self, model_name: str = 'BAAI/bge-base-en-v1.5'):
- #         # For now, we are relying on Pinecone's integrated embedding for dense vectors.
- #         # This class might be used for other purposes or for local embedding generation if needed later.
- #         self.model = SentenceTransformer(model_name)
- #         self.model.max_seq_length = 512
-
- #     def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
- #         prefixed_texts = [
- #             f"query: {text}" if "query:" not in text.lower() else text
- #             for text in texts
- #         ]
- #         return self.model.encode(prefixed_texts, batch_size=batch_size, convert_to_numpy=True)
-
- class PineconeVectorStore:
-     def __init__(self, index_name: str, pinecone: Pinecone, dimension: int = 1024):  # default dimension for llama-text-embed-v2
-         self.index_name = index_name
-         self.dimension = dimension
-
-         self.index = get_index(pinecone, index_name)
-
-         # Initialize BM25 encoder for sparse vectors
-         self.bm25_encoder = BM25Encoder()
-
-         # Fit BM25 encoder on a representative corpus of your data.
-         # This is crucial for BM25's effectiveness.
-         # For this example, we'll fit on a small sample. In a real scenario,
-         # you'd fit it on a larger corpus of your document chunks.
-         print("Fitting BM25Encoder...")
-         sample_corpus = ["This is a document about machine learning.", "Another document discussing natural language processing.", "A third document focused on artificial intelligence applications."]
-         self.bm25_encoder.fit(sample_corpus)
-         print("BM25Encoder fitted.")
-
-     def overwrite_vectors(self, document_chunks, pdf_filename: str, pinecone: Pinecone):
-         """
-         Completely replaces all vectors in the index with new data from a PDF.
-         Leverages Pinecone's integrated embedding for dense vectors and BM25 for sparse.
-         """
-         # Ensure the index is recreated before processing each new PDF
-         # self.index = get_index(pinecone, self.index_name)
-
-         inputs = [f"query: {text['text']}" for text in document_chunks]
-
-         # embeddings = pinecone.inference.embed(
-         #     model = 'llama-text-embed-v2',
-         #     inputs = inputs,
-         #     parameters={
-         #         "input_type": "passage",
-         #         "truncate": "END"
-         #     }
-         # )
-
-         model = SentenceTransformer('BAAI/bge-base-en-v1.5')
-         embeddings = model.encode(inputs, batch_size=32, convert_to_numpy=True).tolist()
-
-         records_to_upsert = []
-         for i, chunk_text in enumerate(document_chunks):
-             # Ensure chunk_text is always a string before encoding
-
-             doc_id = hashlib.md5(f"{pdf_filename}-{chunk_text['text']}".encode('utf-8')).hexdigest()
-             sparse_vector = self.bm25_encoder.encode_documents([chunk_text["text"]])
-
-             records_to_upsert.append({
-                 "id": doc_id,
-                 "values": embeddings[i],
-                 # "sparse_values": sparse_vector,
-                 "metadata": {"text": chunk_text['text'], "header": chunk_text['header'], "page": chunk_text['page'], "type": chunk_text['type']}
-             })
-
-         batch_size = 100
-         for i in range(0, len(records_to_upsert), batch_size):
-             batch = records_to_upsert[i:i + batch_size]
-             self.index.upsert(
-                 vectors=batch,
-                 batch_size=batch_size
-             )
-         print(f"Successfully uploaded {len(records_to_upsert)} chunks from {pdf_filename} to Pinecone.")
-
-     def retrieve_chunks(self, query_text: str, pinecone: Pinecone, top_k: int = 5):
-         """
-         Retrieves top-k chunks based on the query using hybrid search.
-         """
-         # Generate sparse vector for the query using BM25Encoder
-         sparse_query_vector = self.bm25_encoder.encode_queries([query_text])
-
-         model = SentenceTransformer('BAAI/bge-base-en-v1.5')
-         embeddings = model.encode(f"query: {query_text}", batch_size=32, convert_to_numpy=True).tolist()
-
-         query_results = self.index.query(
-             vector=embeddings,
-             # sparse_vector=sparse_query_vector,  # Include the sparse vector for hybrid search
-             top_k=top_k,
-             include_metadata=True,
-             include_values=False  # No need to return the vectors themselves for RAG
-         )
-
-         retrieved_chunks = []
-         for match in query_results['matches']:
-             retrieved_chunks.append({
-                 "id": match['id'],
-                 "score": match['score'],
-                 "text": match['metadata']['text'],
-                 "header": match['metadata']['header'],
-                 "page": match['metadata']['page'],
-                 "type": match['metadata']['type']
-             })
  return retrieved_chunks
 
+ from pinecone import Pinecone, ServerlessSpec
+ from pinecone_text.sparse import BM25Encoder  # For BM25 sparse vectors
+ import numpy as np
+ import re
+ import hashlib
+ from typing import List, Dict
+ import os
+ import time
+
+ from sentence_transformers import SentenceTransformer
+
+ # Initialize Pinecone at the module level
+ # It's better to initialize it once at the start of your application
+
+
+ def get_index(pinecone: Pinecone, index_name: str):
+     # 1. Delete existing index if it exists
+     if index_name in pinecone.list_indexes().names():
+         print(f"Deleting existing index: {index_name}")
+         pinecone.delete_index(index_name)
+         print(f"Index {index_name} deleted.")
+     else:
+         print(f"Index {index_name} does not exist, no deletion necessary.")
+
+     # 2. Create fresh index using create_index_for_model for integrated embedding
+     print(f"Creating new index: {index_name} with integrated 'llama-text-embed-v2'")
+     pinecone.create_index(  # Corrected from create_index_for_model
+         name=index_name,
+         metric="cosine",  # llama-text-embed-v2 uses cosine or dotproduct
+         dimension=768,  # default dimension for llama-text-embed-v2
+         # embed parameter should be at the top level of create_index for integrated models,
+         # and 'field_map' is not used directly in create_index for embedded models.
+         # Instead, it's inferred from the text being passed in the 'upsert' method.
+         # We will specify the embedding model when upserting.
+         spec=ServerlessSpec(cloud="aws", region="us-east-1")
+     )
+     print(f"Index {index_name} created.")
+
+     # 3. Wait for the index to be ready
+     while not pinecone.describe_index(index_name).status['ready']:
+         print("Waiting for index to be ready...")
+         time.sleep(5)
+     print(f"Index {index_name} is now ready.")
+
+     return pinecone.Index(index_name)
+
+ # class EmbeddingEngine:
+ #     def __init__(self, model_name: str = 'BAAI/bge-base-en-v1.5'):
+ #         # For now, we are relying on Pinecone's integrated embedding for dense vectors.
+ #         # This class might be used for other purposes or for local embedding generation if needed later.
+ #         self.model = SentenceTransformer(model_name)
+ #         self.model.max_seq_length = 512
+
+ #     def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
+ #         prefixed_texts = [
+ #             f"query: {text}" if "query:" not in text.lower() else text
+ #             for text in texts
+ #         ]
+ #         return self.model.encode(prefixed_texts, batch_size=batch_size, convert_to_numpy=True)
+
+ class PineconeVectorStore:
+     def __init__(self, index_name: str, pinecone: Pinecone, dimension: int = 1024):  # default dimension for llama-text-embed-v2
+         self.index_name = index_name
+         self.dimension = dimension
+
+         self.index = pinecone.Index(index_name)
+
+         # Initialize BM25 encoder for sparse vectors
+         self.bm25_encoder = BM25Encoder()
+
+         # Fit BM25 encoder on a representative corpus of your data.
+         # This is crucial for BM25's effectiveness.
+         # For this example, we'll fit on a small sample. In a real scenario,
+         # you'd fit it on a larger corpus of your document chunks.
+         print("Fitting BM25Encoder...")
+         sample_corpus = ["This is a document about machine learning.", "Another document discussing natural language processing.", "A third document focused on artificial intelligence applications."]
+         self.bm25_encoder.fit(sample_corpus)
+         print("BM25Encoder fitted.")
+
+     def overwrite_vectors(self, document_chunks, pdf_filename: str, pinecone: Pinecone):
+         """
+         Clears all vectors from the index and uploads new ones for the given PDF.
+         """
+         # ✅ Clear all existing vectors from the index
+         print("Deleting all vectors in current index...")
+         self.index.delete(delete_all=True)
+         print("All vectors deleted.")
+
+         # Step 1: Generate embeddings using SentenceTransformer
+         inputs = [f"query: {text['text']}" for text in document_chunks]
+         model = SentenceTransformer('BAAI/bge-base-en-v1.5')
+         embeddings = model.encode(inputs, batch_size=32, convert_to_numpy=True).tolist()
+
+         # Step 2: Prepare vectors for upsert
+         records_to_upsert = []
+         for i, chunk_text in enumerate(document_chunks):
+             doc_id = hashlib.md5(f"{pdf_filename}-{chunk_text['text']}".encode('utf-8')).hexdigest()
+             sparse_vector = self.bm25_encoder.encode_documents([chunk_text["text"]])
+
+             records_to_upsert.append({
+                 "id": doc_id,
+                 "values": embeddings[i],
+                 # "sparse_values": sparse_vector,  # Optional hybrid retrieval
+                 "metadata": {
+                     "text": chunk_text['text'],
+                     "header": chunk_text['header'],
+                     "page": chunk_text['page'],
+                     "type": chunk_text['type']
+                 }
+             })
+
+         # Step 3: Upsert to Pinecone in batches
+         batch_size = 100
+         for i in range(0, len(records_to_upsert), batch_size):
+             batch = records_to_upsert[i:i + batch_size]
+             self.index.upsert(vectors=batch, batch_size=batch_size)
+
+         print(f"✅ Successfully uploaded {len(records_to_upsert)} chunks from {pdf_filename} to Pinecone.")
+
+     def retrieve_chunks(self, query_text: str, pinecone: Pinecone, top_k: int = 5):
+         """
+         Retrieves top-k chunks based on the query using hybrid search.
+         """
+         # Generate sparse vector for the query using BM25Encoder
+         sparse_query_vector = self.bm25_encoder.encode_queries([query_text])
+
+         model = SentenceTransformer('BAAI/bge-base-en-v1.5')
+         embeddings = model.encode(f"query: {query_text}", batch_size=32, convert_to_numpy=True).tolist()
+
+         query_results = self.index.query(
+             vector=embeddings,
+             # sparse_vector=sparse_query_vector,  # Include the sparse vector for hybrid search
+             top_k=top_k,
+             include_metadata=True,
+             include_values=False  # No need to return the vectors themselves for RAG
+         )
+
+         retrieved_chunks = []
+         for match in query_results['matches']:
+             retrieved_chunks.append({
+                 "id": match['id'],
+                 "score": match['score'],
+                 "text": match['metadata']['text'],
+                 "header": match['metadata']['header'],
+                 "page": match['metadata']['page'],
+                 "type": match['metadata']['type']
+             })
+
  return retrieved_chunks
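
For context, a minimal usage sketch of the class as it stands after this commit (not part of the commit itself). It assumes a Pinecone API key and an already-created 768-dimension serverless index, since the updated `__init__` only opens an index with `pinecone.Index(...)` rather than creating one; the index name, file name, query, and chunk contents below are illustrative placeholders.

# Hypothetical wiring of PineconeVectorStore; the API key, index name, and
# chunk data are assumptions for illustration, not values from the commit.
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_API_KEY")

# BAAI/bge-base-en-v1.5 produces 768-dimensional vectors, so the pre-created
# index should use dimension=768 (the constructor's 1024 default would not match).
store = PineconeVectorStore(index_name="pdf-chunks", pinecone=pc, dimension=768)

# Chunks must carry the keys read in overwrite_vectors: 'text', 'header', 'page', 'type'.
chunks = [
    {"text": "Payments are due within 30 days.", "header": "Terms", "page": 2, "type": "paragraph"},
]
store.overwrite_vectors(chunks, pdf_filename="contract.pdf", pinecone=pc)

results = store.retrieve_chunks("When are payments due?", pinecone=pc, top_k=3)
for r in results:
    print(r["score"], r["header"], r["text"][:80])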