MRP999 committed on
Commit
5ffe411
·
verified ·
1 Parent(s): cdc38bb

Update pinecone_embeddings.py

Browse files
Files changed (1) hide show
  1. pinecone_embeddings.py +4 -4
pinecone_embeddings.py CHANGED
@@ -1,5 +1,5 @@
1
  from pinecone import Pinecone, ServerlessSpec
2
- from pinecone_text.sparse import BM25Encoder # For BM25 sparse vectors
3
  import numpy as np
4
  import re
5
  import hashlib
@@ -66,7 +66,7 @@ class PineconeVectorStore:
66
  self.index = pinecone.Index(index_name)
67
 
68
  # Initialize BM25 encoder for sparse vectors
69
- self.bm25_encoder = BM25Encoder()
70
 
71
  # Fit BM25 encoder on a representative corpus of your data.
72
  # This is crucial for BM25's effectiveness.
@@ -95,7 +95,7 @@ class PineconeVectorStore:
95
  records_to_upsert = []
96
  for i, chunk_text in enumerate(document_chunks):
97
  doc_id = hashlib.md5(f"{pdf_filename}-{chunk_text['text']}".encode('utf-8')).hexdigest()
98
- sparse_vector = self.bm25_encoder.encode_documents([chunk_text["text"]])
99
 
100
  records_to_upsert.append({
101
  "id": doc_id,
@@ -122,7 +122,7 @@ class PineconeVectorStore:
122
  Retrieves top-k chunks based on the query using hybrid search.
123
  """
124
  # Generate sparse vector for the query using BM25Encoder
125
- sparse_query_vector = self.bm25_encoder.encode_queries([query_text])
126
 
127
  model = SentenceTransformer('BAAI/bge-base-en-v1.5')
128
  embeddings = model.encode(f"query: {query_text}", batch_size=32, convert_to_numpy=True).tolist()
 
1
  from pinecone import Pinecone, ServerlessSpec
2
+ # from pinecone_text.sparse import BM25Encoder # For BM25 sparse vectors
3
  import numpy as np
4
  import re
5
  import hashlib
 
66
  self.index = pinecone.Index(index_name)
67
 
68
  # Initialize BM25 encoder for sparse vectors
69
+ # self.bm25_encoder = BM25Encoder()
70
 
71
  # Fit BM25 encoder on a representative corpus of your data.
72
  # This is crucial for BM25's effectiveness.
 
95
  records_to_upsert = []
96
  for i, chunk_text in enumerate(document_chunks):
97
  doc_id = hashlib.md5(f"{pdf_filename}-{chunk_text['text']}".encode('utf-8')).hexdigest()
98
+ # sparse_vector = self.bm25_encoder.encode_documents([chunk_text["text"]])
99
 
100
  records_to_upsert.append({
101
  "id": doc_id,
 
122
  Retrieves top-k chunks based on the query using hybrid search.
123
  """
124
  # Generate sparse vector for the query using BM25Encoder
125
+ # sparse_query_vector = self.bm25_encoder.encode_queries([query_text])
126
 
127
  model = SentenceTransformer('BAAI/bge-base-en-v1.5')
128
  embeddings = model.encode(f"query: {query_text}", batch_size=32, convert_to_numpy=True).tolist()