bhlewis committed on
Commit bc26c5d · verified · 1 Parent(s): 5f63506

Update app.py

Files changed (1)
  1. app.py +13 -8
app.py CHANGED
@@ -4,12 +4,12 @@ import h5py
 import faiss
 import json
 from transformers import AutoTokenizer, AutoModel
-from sentence_transformers import SentenceTransformer, models
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import re
 from collections import Counter
 import spacy
+import torch
 
 # Load Spacy model for advanced NLP
 try:
@@ -51,19 +51,24 @@ embeddings, patent_numbers, metadata, texts = load_data()
 try:
     tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
     bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
-    word_embedding_model = models.Transformer(model_name='anferico/bert-for-patents', tokenizer=tokenizer, auto_model=bert_model)
-    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
-    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
 except Exception as e:
     print(f"Error loading anferico/bert-for-patents: {e}")
     print("Falling back to a general-purpose model.")
-    model = SentenceTransformer('all-MiniLM-L6-v2')
+    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    bert_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+def encode_texts(texts):
+    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        outputs = bert_model(**inputs)
+    embeddings = outputs.last_hidden_state.mean(dim=1)
+    return embeddings.numpy()
 
 # Check if the embedding dimensions match
-if embeddings.shape[1] != model.get_sentence_embedding_dimension():
+if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
     print("Embedding dimensions do not match. Rebuilding FAISS index.")
     # Rebuild embeddings using the new model
-    embeddings = np.array([model.encode(text) for text in texts])
+    embeddings = encode_texts(texts)
     embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
 
 # Normalize embeddings for cosine similarity
@@ -97,7 +102,7 @@ def hybrid_search(query, top_k=5):
     query_features = extract_key_features(query)
 
     # Encode the query using the transformer model
-    query_embedding = model.encode([query])[0]
+    query_embedding = encode_texts([query])[0]
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
 
     # Perform semantic similarity search
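
Note on the new pooling: encode_texts mean-pools last_hidden_state over every position, so padding-token vectors get averaged in whenever a batch mixes texts of different lengths. A minimal sketch of mask-aware mean pooling (what sentence-transformers' default Pooling module computes), assuming the same tokenizer and bert_model loaded above; this is not part of the commit:

# Sketch only: mask-aware mean pooling. Plain last_hidden_state.mean(dim=1)
# also averages the padding positions, which skews vectors for short texts
# in a padded batch.
import torch

def encode_texts_masked(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = bert_model(**inputs)
    hidden = outputs.last_hidden_state                     # (batch, seq, dim)
    mask = inputs['attention_mask'].unsqueeze(-1).float()  # (batch, seq, 1)
    summed = (hidden * mask).sum(dim=1)                    # zero out padding
    counts = mask.sum(dim=1).clamp(min=1e-9)               # real-token counts
    return (summed / counts).numpy()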
 
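The hunks above print "Rebuilding FAISS index" but do not show the rebuild itself. A minimal sketch of that step, assuming an exact inner-product index (with L2-normalized vectors, inner product equals cosine similarity); texts and encode_texts come from the diff, the query string is illustrative only:

# Sketch only, assuming texts and encode_texts from the diff above.
import faiss
import numpy as np

embeddings = encode_texts(texts).astype('float32')   # faiss expects float32
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

index = faiss.IndexFlatIP(embeddings.shape[1])       # inner product == cosine here
index.add(embeddings)

query = encode_texts(["example query"])[0].astype('float32')
query /= np.linalg.norm(query)
scores, ids = index.search(query.reshape(1, -1), 5)  # top-5 nearest patents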