bhlewis committed on
Commit
cf6f1f1
·
verified ·
1 Parent(s): f7dbdf9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -47,13 +47,6 @@ def load_data():
47
 
48
  embeddings, patent_numbers, metadata, texts = load_data()
49
 
50
- # Normalize embeddings for cosine similarity
51
- embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
52
-
53
- # Create FAISS index for cosine similarity
54
- index = faiss.IndexFlatIP(embeddings.shape[1])
55
- index.add(embeddings)
56
-
57
  # Load BERT model for encoding search queries
58
  try:
59
  bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
@@ -66,6 +59,20 @@ except Exception as e:
66
  print("Falling back to a general-purpose model.")
67
  model = SentenceTransformer('all-MiniLM-L6-v2')
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # Create TF-IDF vectorizer
70
  tfidf_vectorizer = TfidfVectorizer(stop_words='english')
71
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
 
47
 
48
  embeddings, patent_numbers, metadata, texts = load_data()
49
 
 
 
 
 
 
 
 
50
  # Load BERT model for encoding search queries
51
  try:
52
  bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
 
59
  print("Falling back to a general-purpose model.")
60
  model = SentenceTransformer('all-MiniLM-L6-v2')
61
 
62
+ # Check if the embedding dimensions match
63
+ if embeddings.shape[1] != model.get_sentence_embedding_dimension():
64
+ print("Embedding dimensions do not match. Rebuilding FAISS index.")
65
+ # Rebuild embeddings using the new model
66
+ embeddings = np.array([model.encode(text) for text in texts])
67
+ embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
68
+
69
+ # Normalize embeddings for cosine similarity
70
+ embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
71
+
72
+ # Create FAISS index for cosine similarity
73
+ index = faiss.IndexFlatIP(embeddings.shape[1])
74
+ index.add(embeddings)
75
+
76
  # Create TF-IDF vectorizer
77
  tfidf_vectorizer = TfidfVectorizer(stop_words='english')
78
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)