File size: 8,002 Bytes
663d1ad 8e3cac5 663d1ad 8e3cac5 2a70269 8e3cac5 1af801b 7f07411 8e3cac5 1af801b eaf6036 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 1af801b 8e3cac5 663d1ad 8e3cac5 6744e1a 8e3cac5 6744e1a 8e3cac5 6744e1a 8e3cac5 6744e1a 8e3cac5 6744e1a 8e3cac5 1af801b 8e3cac5 7935863 8e3cac5 e503f85 8e3cac5 7935863 8e3cac5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
import gradio as gr
import numpy as np
import h5py
import faiss
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources used by query preprocessing (the English stopword
# list and the punkt tokenizer data); quiet=True silences the download log.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource, quiet=True)

# Masked-language-model BERT used by bert_lemmatize(); it is moved to the GPU
# when CUDA is available and stays on the CPU otherwise.
bert_lemma_model_name = "bert-base-uncased"
bert_lemma_tokenizer = AutoTokenizer.from_pretrained(bert_lemma_model_name)
bert_lemma_model = AutoModelForMaskedLM.from_pretrained(bert_lemma_model_name)
_lemma_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_lemma_model = bert_lemma_model.to(_lemma_device)

# Encoder model and tokenizer used by encode_texts() to embed search queries.
bert_encode_model_name = 'anferico/bert-for-patents'
bert_encode_tokenizer = AutoTokenizer.from_pretrained(bert_encode_model_name)
bert_encode_model = AutoModel.from_pretrained(bert_encode_model_name)
def bert_lemmatize(text, max_tokens=512):
    """Rewrite *text* token by token using the masked-LM BERT model.

    The text is split into WordPiece tokens (no special tokens are added),
    the tokens are fed to ``bert_lemma_model``, and every position is replaced
    by the model's highest-scoring vocabulary token.  The predicted tokens are
    joined with single spaces, so WordPiece continuation pieces are kept as
    separate items.

    Args:
        text: Input string.
        max_tokens: Maximum number of tokens sent to the model in one forward
            pass.  Longer inputs are processed in consecutive chunks so they
            do not exceed the model's position limit.

    Returns:
        The space-joined predicted tokens, or ``''`` for empty or
        whitespace-only input.
    """
    # An empty token list would produce an empty float tensor, which the
    # embedding layer rejects; there is nothing to predict anyway.
    if not text or not text.strip():
        return ''
    tokens = bert_lemma_tokenizer.tokenize(text)
    if not tokens:
        return ''

    input_ids = bert_lemma_tokenizer.convert_tokens_to_ids(tokens)
    max_tokens = max(1, int(max_tokens))
    # Run on whatever device the model was placed on at load time.
    device = next(bert_lemma_model.parameters()).device
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

    lemmatized_tokens = []
    with torch.no_grad():
        for start in range(0, len(input_ids), max_tokens):
            chunk = input_ids[start:start + max_tokens]
            input_tensor = torch.tensor([chunk], dtype=torch.long, device=device)
            outputs = bert_lemma_model(input_tensor)
            predictions = outputs.logits.argmax(dim=-1)
            lemmatized_tokens.extend(
                bert_lemma_tokenizer.convert_ids_to_tokens(predictions[0].tolist())
            )
    return ' '.join(
        token for token in lemmatized_tokens if token not in special_tokens
    )
def preprocess_query(text):
    """Clean *text* and return its BERT-lemmatized form.

    Steps, in order: lowercase; strip HTML-like tags; drop every character
    that is not alphanumeric, whitespace, hyphen, period or comma; tokenize
    with NLTK; remove English stopwords; pass the remaining words through
    ``bert_lemmatize``.
    """
    cleaned = re.sub('<.*?>', '', text.lower())
    cleaned = re.sub(r'[^a-zA-Z0-9\s\-\.\,]', '', cleaned)

    words = word_tokenize(cleaned)
    english_stopwords = set(stopwords.words('english'))
    kept_words = filter(lambda word: word not in english_stopwords, words)

    return bert_lemmatize(' '.join(kept_words))
def extract_key_features(text):
    """Return the distinct terms of the preprocessed *text* in first-seen order.

    The text is run through ``preprocess_query`` and split on whitespace;
    repeated terms are dropped after their first occurrence.
    """
    seen_terms = set()
    unique_terms = []
    for term in preprocess_query(text).split():
        if term in seen_terms:
            continue
        seen_terms.add(term)
        unique_terms.append(term)
    return unique_terms
def encode_texts(texts, max_length=512, batch_size=32):
    """Embed *texts* with the encoder model using masked mean pooling.

    Each text is tokenized (truncated to *max_length* tokens) and passed
    through ``bert_encode_model``.  The token vectors of the last hidden state
    are averaged using the attention mask, so padding added to shorter texts
    in a batch does not dilute their embedding.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        max_length: Truncation length handed to the tokenizer.
        batch_size: Number of texts encoded per forward pass; keeps memory
            bounded when a large corpus is encoded.

    Returns:
        A NumPy array with one row per input text.  An empty input yields an
        array with zero rows.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        hidden_size = bert_encode_model.config.hidden_size
        return np.empty((0, hidden_size), dtype=np.float32)

    batch_size = max(1, int(batch_size))
    device = next(bert_encode_model.parameters()).device
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = bert_encode_tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            hidden = bert_encode_model(**inputs).last_hidden_state
            # Weight each token by its attention-mask value so that padding
            # positions contribute nothing to the average.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((token_sum / token_count).cpu().numpy())
    return np.concatenate(pooled_batches, axis=0)
def load_data():
    """Load patent embeddings, patent numbers, metadata and texts from disk.

    Reads the ``embeddings`` and ``patent_numbers`` datasets from
    ``patent_embeddings.h5`` and one JSON object per line from
    ``patent_metadata.jsonl``.  Each JSON object must carry ``patent_number``
    and ``text`` keys.

    Returns:
        A tuple ``(embeddings, patent_numbers, metadata, texts)`` where
        ``metadata`` maps patent number to its JSON record and ``texts`` lists
        the ``text`` field of every record in file order.

    Raises:
        FileNotFoundError: If either input file is missing (re-raised after
            printing a message).
        Exception: Any other loading error is printed and re-raised.
    """
    try:
        with h5py.File('patent_embeddings.h5', 'r') as f:
            embeddings = f['embeddings'][:]
            patent_numbers = f['patent_numbers'][:]

        metadata = {}
        texts = []
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open('patent_metadata.jsonl', 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                metadata[data['patent_number']] = data
                texts.append(data['text'])

        print(f"Embedding shape: {embeddings.shape}")
        print(f"Number of patent numbers: {len(patent_numbers)}")
        print(f"Number of metadata entries: {len(metadata)}")
        # Code that addresses texts and embeddings by the same row position
        # relies on both having the same length and order.
        if len(texts) != len(patent_numbers):
            print(
                f"Warning: {len(texts)} metadata texts but "
                f"{len(patent_numbers)} patent numbers; rows may be misaligned."
            )
        return embeddings, patent_numbers, metadata, texts
    except FileNotFoundError as e:
        print(f"Error: Could not find file. {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise
def compare_features(query_features, patent_features):
    """Measure the overlap between two feature lists.

    Args:
        query_features: Sequence of feature terms extracted from the query.
        patent_features: Sequence of feature terms extracted from a patent.

    Returns:
        A tuple ``(common_features, similarity_score)``: the set of terms
        present in both inputs, and the size of that set divided by the
        length of the longer input.  When both inputs are empty the score
        is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    common_features = set(query_features) & set(patent_features)
    longest = max(len(query_features), len(patent_features))
    if longest == 0:
        # Nothing to compare on either side: no overlap by definition.
        return common_features, 0.0
    similarity_score = len(common_features) / longest
    return common_features, similarity_score
def hybrid_search(query, top_k=5):
    """Search patents by combining embedding, TF-IDF and feature-overlap scores.

    The query is preprocessed once.  Candidates come from two sources: the
    FAISS index (``top_k * 2`` nearest embeddings) and the TF-IDF matrix
    (``top_k * 2`` highest cosine similarities).  Each candidate also gets a
    feature-overlap score from ``compare_features``.

    Scoring:
        * FAISS candidates: ``semantic + 0.5 * tfidf + feature_overlap``
        * TF-IDF-only candidates: ``tfidf + feature_overlap``

    Args:
        query: Free-text query.
        top_k: Number of results to return.  Converted to ``int`` because UI
            sliders may deliver a float.

    Returns:
        A formatted multi-line string describing the top results, or a short
        message when the query is empty.
    """
    print(f"Original query: {query}")
    # FAISS' k argument and slice bounds must be integers.
    top_k = max(1, int(top_k))
    if not query or not query.strip():
        return "No results: the query is empty."

    processed_query = preprocess_query(query)
    if not processed_query.strip():
        return "No results: the query is empty."
    # Build the query features from the text that was preprocessed once, the
    # same way patent features are built, instead of preprocessing it again.
    query_features = list(dict.fromkeys(processed_query.split()))

    # Encode the processed query using the transformer model
    query_embedding = encode_texts([processed_query])[0]
    query_norm = np.linalg.norm(query_embedding)
    if query_norm > 0:
        query_embedding = query_embedding / query_norm

    # Perform semantic similarity search
    semantic_distances, semantic_indices = index.search(
        np.array([query_embedding]).astype('float32'), top_k * 2
    )

    # Perform TF-IDF based search
    query_tfidf = tfidf_vectorizer.transform([processed_query])
    tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]

    def build_entry(patent_number, base_score):
        # Attach the feature-overlap score and the shared terms to a candidate.
        text = metadata[patent_number]['text']
        patent_features = extract_key_features(text)
        common_features, feature_similarity = compare_features(
            query_features, patent_features
        )
        return {
            'score': base_score + feature_similarity,
            'common_features': common_features,
            'text': text,
        }

    # Combine and rank results
    combined_results = {}
    for rank, idx in enumerate(semantic_indices[0]):
        # FAISS pads with -1 when the index holds fewer vectors than requested.
        if idx < 0:
            continue
        patent_number = patent_numbers[idx].decode('utf-8')
        base_score = semantic_distances[0][rank] * 1.0 + tfidf_similarities[idx] * 0.5
        combined_results[patent_number] = build_entry(patent_number, base_score)

    for idx in tfidf_indices:
        patent_number = patent_numbers[idx].decode('utf-8')
        if patent_number not in combined_results:
            combined_results[patent_number] = build_entry(
                patent_number, tfidf_similarities[idx] * 1.0
            )

    # Sort and get top results
    top_results = sorted(
        combined_results.items(), key=lambda item: item[1]['score'], reverse=True
    )[:top_k]

    results = []
    for patent_number, data in top_results:
        # Sort the shared terms so the output is stable between runs.
        common_terms = ', '.join(sorted(data['common_features']))
        result = f"Patent Number: {patent_number}\n"
        result += f"Text: {data['text'][:200]}...\n"
        result += f"Combined Score: {data['score']:.4f}\n"
        result += f"Common Key Features: {common_terms}\n\n"
        results.append(result)
    return "\n".join(results)
# Load data and prepare the FAISS index
embeddings, patent_numbers, metadata, texts = load_data()

# The stored vectors can only be searched with query vectors of the same
# dimensionality; otherwise re-encode every text with the current model.
if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
    print("Embedding dimensions do not match. Rebuilding FAISS index.")
    embeddings = encode_texts(texts)

# Normalize embeddings once so that inner product equals cosine similarity.
# FAISS requires contiguous float32 input; zero-length rows are left as zeros
# instead of being turned into NaN by a division by zero.
embeddings = np.ascontiguousarray(embeddings, dtype='float32')
_row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
_row_norms[_row_norms == 0] = 1.0
embeddings = embeddings / _row_norms

# Create FAISS index for cosine similarity
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Create TF-IDF vectorizer fitted on the patent texts
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Create Gradio interface: a query box and a slider for the result count
iface = gr.Interface(
    fn=hybrid_search,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
        gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Top K Results"),
    ],
    outputs=gr.Textbox(lines=10, label="Search Results"),
    title="Patent Similarity Search",
    description="Enter a patent description to find similar patents based on key features."
)

if __name__ == "__main__":
    iface.launch()