bhlewis committed on
Commit
7935863
1 Parent(s): c884348

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -4
app.py CHANGED
@@ -10,7 +10,6 @@ from sklearn.metrics.pairwise import cosine_similarity
10
  import re
11
  from collections import Counter
12
  import spacy
13
- import joblib
14
 
15
  # Load Spacy model for advanced NLP
16
  try:
@@ -74,10 +73,9 @@ embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
74
  index = faiss.IndexFlatIP(embeddings.shape[1])
75
  index.add(embeddings)
76
 
77
- # Create and save TF-IDF vectorizer
78
  tfidf_vectorizer = TfidfVectorizer(stop_words='english')
79
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
80
- joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
81
 
82
  def extract_key_features(text):
83
  # Use Spacy to extract noun phrases and key phrases
@@ -103,4 +101,59 @@ def hybrid_search(query, top_k=5):
103
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
104
 
105
  # Perform semantic similarity search
106
- semantic_distances, semantic
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import re
11
  from collections import Counter
12
  import spacy
 
13
 
14
  # Load Spacy model for advanced NLP
15
  try:
 
73
  index = faiss.IndexFlatIP(embeddings.shape[1])
74
  index.add(embeddings)
75
 
76
+ # Create TF-IDF vectorizer
77
  tfidf_vectorizer = TfidfVectorizer(stop_words='english')
78
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
 
79
 
80
  def extract_key_features(text):
81
  # Use Spacy to extract noun phrases and key phrases
 
101
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
102
 
103
  # Perform semantic similarity search
104
+ semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
105
+
106
+ # Perform TF-IDF based search
107
+ query_tfidf = tfidf_vectorizer.transform([query])
108
+ tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
109
+ tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
110
+
111
+ # Combine and rank results
112
+ combined_results = {}
113
+ for i, idx in enumerate(semantic_indices[0]):
114
+ patent_number = patent_numbers[idx].decode('utf-8')
115
+ text = metadata[patent_number]['text']
116
+ patent_features = extract_key_features(text)
117
+ common_features, feature_similarity = compare_features(query_features, patent_features)
118
+ combined_results[patent_number] = {
119
+ 'score': semantic_distances[0][i] * 1.5 + feature_similarity,
120
+ 'common_features': common_features,
121
+ 'text': text
122
+ }
123
+
124
+ for idx in tfidf_indices:
125
+ patent_number = patent_numbers[idx].decode('utf-8')
126
+ if patent_number not in combined_results:
127
+ text = metadata[patent_number]['text']
128
+ patent_features = extract_key_features(text)
129
+ common_features, feature_similarity = compare_features(query_features, patent_features)
130
+ combined_results[patent_number] = {
131
+ 'score': tfidf_similarities[idx] + feature_similarity,
132
+ 'common_features': common_features,
133
+ 'text': text
134
+ }
135
+
136
+ # Sort and get top results
137
+ top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]
138
+
139
+ results = []
140
+ for patent_number, data in top_results:
141
+ result = f"Patent Number: {patent_number}\n"
142
+ result += f"Text: {data['text'][:200]}...\n"
143
+ result += f"Combined Score: {data['score']:.4f}\n"
144
+ result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
145
+ results.append(result)
146
+
147
+ return "\n".join(results)
148
+
149
+ # Create Gradio interface
150
+ iface = gr.Interface(
151
+ fn=hybrid_search,
152
+ inputs=gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
153
+ outputs=gr.Textbox(lines=10, label="Search Results"),
154
+ title="Patent Similarity Search",
155
+ description="Enter a patent description to find similar patents based on key features."
156
+ )
157
+
158
+ if __name__ == "__main__":
159
+ iface.launch()