Update app.py
app.py CHANGED
@@ -4,12 +4,12 @@ import h5py
 import faiss
 import json
 from transformers import AutoTokenizer, AutoModel
-from sentence_transformers import SentenceTransformer, models
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import re
 from collections import Counter
 import spacy
+import torch
 
 # Load Spacy model for advanced NLP
 try:
@@ -51,19 +51,24 @@ embeddings, patent_numbers, metadata, texts = load_data()
 try:
     tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
     bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
-    word_embedding_model = models.Transformer(model_name='anferico/bert-for-patents', tokenizer=tokenizer, auto_model=bert_model)
-    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
-    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
 except Exception as e:
     print(f"Error loading anferico/bert-for-patents: {e}")
     print("Falling back to a general-purpose model.")
-    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    bert_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+def encode_texts(texts):
+    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        outputs = bert_model(**inputs)
+    embeddings = outputs.last_hidden_state.mean(dim=1)
+    return embeddings.numpy()
 
 # Check if the embedding dimensions match
-if embeddings.shape[1] != model.get_sentence_embedding_dimension():
+if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
     print("Embedding dimensions do not match. Rebuilding FAISS index.")
     # Rebuild embeddings using the new model
-    embeddings = model.encode(texts)
+    embeddings = encode_texts(texts)
     embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
 
 # Normalize embeddings for cosine similarity
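Note on the new encode_texts: it mean-pools last_hidden_state over every token position, so in a padded batch the pad tokens are averaged in and shorter texts get diluted embeddings, and a single call over the whole corpus can exhaust memory. Below is a minimal mask-aware, batched variant, assuming the same tokenizer and bert_model objects as in the diff; the function name and batch_size are illustrative, not part of app.py.

import numpy as np
import torch

def encode_texts_masked(texts, batch_size=32):
    """Mean-pool over real tokens only, and batch to bound memory use."""
    chunks = []
    for i in range(0, len(texts), batch_size):
        inputs = tokenizer(texts[i:i + batch_size], padding=True,
                           truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = bert_model(**inputs)
        hidden = outputs.last_hidden_state                     # (B, T, H)
        mask = inputs['attention_mask'].unsqueeze(-1).float()  # (B, T, 1)
        summed = (hidden * mask).sum(dim=1)                    # pad positions zeroed
        counts = mask.sum(dim=1).clamp(min=1e-9)               # real-token counts
        chunks.append((summed / counts).numpy())
    return np.concatenate(chunks, axis=0)

For a single text there is no padding, so this agrees with the diff's encode_texts; the two only diverge on padded batches.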
@@ -97,7 +102,7 @@ def hybrid_search(query, top_k=5):
     query_features = extract_key_features(query)
 
     # Encode the query using the transformer model
-    query_embedding = model.encode(query)
+    query_embedding = encode_texts([query])[0]
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
 
     # Perform semantic similarity search