Update app.py
app.py CHANGED
@@ -47,13 +47,6 @@ def load_data():
 
 embeddings, patent_numbers, metadata, texts = load_data()
 
-# Normalize embeddings for cosine similarity
-embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
-
-# Create FAISS index for cosine similarity
-index = faiss.IndexFlatIP(embeddings.shape[1])
-index.add(embeddings)
-
 # Load BERT model for encoding search queries
 try:
     bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
@@ -66,6 +59,20 @@ except Exception as e:
     print("Falling back to a general-purpose model.")
     model = SentenceTransformer('all-MiniLM-L6-v2')
 
+# Check if the embedding dimensions match
+if embeddings.shape[1] != model.get_sentence_embedding_dimension():
+    print("Embedding dimensions do not match. Rebuilding FAISS index.")
+    # Rebuild embeddings using the new model
+    embeddings = np.array([model.encode(text) for text in texts])
+    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+# Normalize embeddings for cosine similarity
+embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+# Create FAISS index for cosine similarity
+index = faiss.IndexFlatIP(embeddings.shape[1])
+index.add(embeddings)
+
 # Create TF-IDF vectorizer
 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
 tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
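The change above moves the FAISS setup after model loading: document embeddings are L2-normalized and added to an inner-product index, so a search amounts to encoding the query the same way, normalizing it, and calling index.search. The sketch below (not part of this commit) illustrates that flow, assuming the SentenceTransformer `model` is the active encoder and reusing `index` and `patent_numbers` from app.py; `query` and `top_k` are illustrative placeholders.

import numpy as np

query = "method for aligning semiconductor wafers"  # hypothetical query text
top_k = 5

# Encode the query with the same model used for the document embeddings and
# L2-normalize it, so inner-product search over normalized vectors behaves
# like cosine similarity.
query_vec = model.encode(query)
query_vec = query_vec / np.linalg.norm(query_vec)

# FAISS expects a 2D float32 array of shape (n_queries, dim).
scores, ids = index.search(np.asarray([query_vec], dtype="float32"), top_k)

for score, i in zip(scores[0], ids[0]):
    print(patent_numbers[i], round(float(score), 4))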