Update app.py
Browse files
app.py
CHANGED
@@ -10,11 +10,6 @@ import re
|
|
10 |
from collections import Counter
|
11 |
import spacy
|
12 |
import torch
|
13 |
-
from nltk.corpus import wordnet
|
14 |
-
import nltk
|
15 |
-
|
16 |
-
# Download WordNet data
|
17 |
-
nltk.download('wordnet')
|
18 |
|
19 |
# Load Spacy model for advanced NLP
|
20 |
try:
|
@@ -82,12 +77,16 @@ tfidf_vectorizer = TfidfVectorizer(stop_words='english')
|
|
82 |
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
|
83 |
|
84 |
def extract_key_features(text):
|
85 |
-
# Use Spacy to extract
|
86 |
doc = nlp(text)
|
|
|
|
|
|
|
|
|
87 |
noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
|
88 |
-
feature_phrases = [sent.text.lower() for sent in doc.sents if re.search(r'(comprising|including|consisting of)', sent.text, re.IGNORECASE)]
|
89 |
|
90 |
-
all_features = noun_phrases + feature_phrases
|
91 |
return list(set(all_features))
|
92 |
|
93 |
def compare_features(query_features, patent_features):
|
@@ -95,33 +94,20 @@ def compare_features(query_features, patent_features):
|
|
95 |
similarity_score = len(common_features) / max(len(query_features), len(patent_features))
|
96 |
return common_features, similarity_score
|
97 |
|
98 |
-
def expand_query(query):
|
99 |
-
expanded_query = query
|
100 |
-
for word in query.split():
|
101 |
-
synonyms = wordnet.synsets(word)
|
102 |
-
for syn in synonyms:
|
103 |
-
for lemma in syn.lemmas():
|
104 |
-
expanded_query += " " + lemma.name()
|
105 |
-
return expanded_query
|
106 |
-
|
107 |
def hybrid_search(query, top_k=5):
|
108 |
print(f"Original query: {query}")
|
109 |
|
110 |
-
|
111 |
-
expanded_query = expand_query(query)
|
112 |
-
print(f"Expanded query: {expanded_query}")
|
113 |
-
|
114 |
-
query_features = extract_key_features(expanded_query)
|
115 |
|
116 |
# Encode the query using the transformer model
|
117 |
-
query_embedding = encode_texts([expanded_query])[0]
|
118 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
119 |
|
120 |
# Perform semantic similarity search
|
121 |
semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
|
122 |
|
123 |
# Perform TF-IDF based search
|
124 |
-
query_tfidf = tfidf_vectorizer.transform([expanded_query])
|
125 |
tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
|
126 |
tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
|
127 |
|
@@ -161,7 +147,7 @@ def hybrid_search(query, top_k=5):
|
|
161 |
result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
|
162 |
results.append(result)
|
163 |
|
164 |
-
return "\n
|
165 |
|
166 |
# Create Gradio interface with additional input fields
|
167 |
iface = gr.Interface(
|
|
|
10 |
from collections import Counter
|
11 |
import spacy
|
12 |
import torch
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Load Spacy model for advanced NLP
|
15 |
try:
|
|
|
77 |
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
|
78 |
|
79 |
def extract_key_features(text):
|
80 |
+
# Use Spacy to extract technical terms and phrases
|
81 |
doc = nlp(text)
|
82 |
+
technical_terms = []
|
83 |
+
for token in doc:
|
84 |
+
if token.dep_ in ('amod', 'compound') or token.ent_type_ in ('PRODUCT', 'ORG', 'GPE', 'NORP'):
|
85 |
+
technical_terms.append(token.text.lower())
|
86 |
noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
|
87 |
+
feature_phrases = [sent.text.lower() for sent in doc.sents if re.search(r'(comprising|including|consisting of|deformable|insulation|heat-resistant|memory foam|high-temperature)', sent.text, re.IGNORECASE)]
|
88 |
|
89 |
+
all_features = technical_terms + noun_phrases + feature_phrases
|
90 |
return list(set(all_features))
|
91 |
|
92 |
def compare_features(query_features, patent_features):
|
|
|
94 |
similarity_score = len(common_features) / max(len(query_features), len(patent_features))
|
95 |
return common_features, similarity_score
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
def hybrid_search(query, top_k=5):
|
98 |
print(f"Original query: {query}")
|
99 |
|
100 |
+
query_features = extract_key_features(query)
|
|
|
|
|
|
|
|
|
101 |
|
102 |
# Encode the query using the transformer model
|
103 |
+
query_embedding = encode_texts([query])[0]
|
104 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
105 |
|
106 |
# Perform semantic similarity search
|
107 |
semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
|
108 |
|
109 |
# Perform TF-IDF based search
|
110 |
+
query_tfidf = tfidf_vectorizer.transform([query])
|
111 |
tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
|
112 |
tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
|
113 |
|
|
|
147 |
result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
|
148 |
results.append(result)
|
149 |
|
150 |
+
return "\n".join(results)
|
151 |
|
152 |
# Create Gradio interface with additional input fields
|
153 |
iface = gr.Interface(
|