Update pinecone_embeddings.py
pinecone_embeddings.py (CHANGED, +4 -4)
@@ -1,5 +1,5 @@
 from pinecone import Pinecone, ServerlessSpec
-from pinecone_text.sparse import BM25Encoder  # For BM25 sparse vectors
+# from pinecone_text.sparse import BM25Encoder  # For BM25 sparse vectors
 import numpy as np
 import re
 import hashlib
@@ -66,7 +66,7 @@ class PineconeVectorStore:
         self.index = pinecone.Index(index_name)
 
         # Initialize BM25 encoder for sparse vectors
-        self.bm25_encoder = BM25Encoder()
+        # self.bm25_encoder = BM25Encoder()
 
         # Fit BM25 encoder on a representative corpus of your data.
         # This is crucial for BM25's effectiveness.
@@ -95,7 +95,7 @@ class PineconeVectorStore:
         records_to_upsert = []
         for i, chunk_text in enumerate(document_chunks):
             doc_id = hashlib.md5(f"{pdf_filename}-{chunk_text['text']}".encode('utf-8')).hexdigest()
-            sparse_vector = self.bm25_encoder.encode_documents([chunk_text["text"]])
+            # sparse_vector = self.bm25_encoder.encode_documents([chunk_text["text"]])
 
             records_to_upsert.append({
                 "id": doc_id,
@@ -122,7 +122,7 @@ class PineconeVectorStore:
         Retrieves top-k chunks based on the query using hybrid search.
         """
         # Generate sparse vector for the query using BM25Encoder
-        sparse_query_vector = self.bm25_encoder.encode_queries([query_text])
+        # sparse_query_vector = self.bm25_encoder.encode_queries([query_text])
 
         model = SentenceTransformer('BAAI/bge-base-en-v1.5')
         embeddings = model.encode(f"query: {query_text}", batch_size=32, convert_to_numpy=True).tolist()
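With the sparse encoding disabled, retrieval is dense-only. A hypothetical sketch of that remaining path, shown here with the SentenceTransformer loaded once rather than inside the retrieval call as in the diff; the class and method names are illustrative only:

from sentence_transformers import SentenceTransformer

class DenseRetriever:
    """Hypothetical dense-only retriever mirroring the post-commit code path."""

    def __init__(self, index):
        self.index = index
        # Load the model once; constructing it per query, as the diff does,
        # re-reads the model weights on every call.
        self.model = SentenceTransformer("BAAI/bge-base-en-v1.5")

    def retrieve(self, query_text, top_k=5):
        # Note: bge-en-v1.5 documents the query prefix
        # "Represent this sentence for searching relevant passages: ";
        # the "query: " prefix used in the diff is the E5-family convention.
        dense = self.model.encode(query_text, convert_to_numpy=True).tolist()
        return self.index.query(vector=dense, top_k=top_k, include_metadata=True)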