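"""Patent similarity search demo (Gradio app).

Hybrid retrieval over a patent corpus: dense BERT embeddings searched with a FAISS
inner-product index, combined with TF-IDF cosine similarity and key-feature overlap.
Expects `patent_embeddings.h5` and `patent_metadata.jsonl` in the working directory.
"""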
import gradio as gr
import numpy as np
import h5py
import faiss
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
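# Newer NLTK releases may also require the 'punkt_tab' resource for word_tokenize;
# downloading it is harmless on versions that do not use it.
nltk.download('punkt_tab', quiet=True)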

# Device for the lemmatization model (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BERT model for lemmatization
bert_lemma_model_name = "bert-base-uncased"
bert_lemma_tokenizer = AutoTokenizer.from_pretrained(bert_lemma_model_name)
bert_lemma_model = AutoModelForMaskedLM.from_pretrained(bert_lemma_model_name).to(device)

# Load BERT model for encoding search queries
bert_encode_model_name = 'anferico/bert-for-patents'
bert_encode_tokenizer = AutoTokenizer.from_pretrained(bert_encode_model_name)
bert_encode_model = AutoModel.from_pretrained(bert_encode_model_name)

def bert_lemmatize(text):
    # Run the text through the masked-LM head and keep the argmax token at each
    # position as a rough normalization pass (no tokens are actually masked here).
    tokens = bert_lemma_tokenizer.tokenize(text)
    input_ids = bert_lemma_tokenizer.convert_tokens_to_ids(tokens)
    input_tensor = torch.tensor([input_ids]).to(device)
    with torch.no_grad():
        outputs = bert_lemma_model(input_tensor)
    predictions = outputs.logits.argmax(dim=-1)
    lemmatized_tokens = bert_lemma_tokenizer.convert_ids_to_tokens(predictions[0])
    return ' '.join([token for token in lemmatized_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']])

def preprocess_query(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove any HTML tags (if present)
    text = re.sub('<.*?>', '', text)
    
    # Remove special characters, but keep hyphens, periods, and commas
    text = re.sub(r'[^a-zA-Z0-9\s\-\.\,]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords, but keep all other words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a string
    processed_text = ' '.join(tokens)
    
    # Apply BERT lemmatization
    processed_text = bert_lemmatize(processed_text)
    
    return processed_text

def extract_key_features(text):
    # For queries, we'll just preprocess and return all non-stopword terms
    processed_text = preprocess_query(text)
    
    # Split the processed text into individual terms
    features = processed_text.split()
    
    # Remove duplicates while preserving order
    features = list(dict.fromkeys(features))
    
    return features

def encode_texts(texts, max_length=512):
    inputs = bert_encode_tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    with torch.no_grad():
        outputs = bert_encode_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy()
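# A minimal batched variant (hypothetical helper, not wired in above): re-encoding the
# whole corpus in a single call can exhaust memory, so this splits the texts into chunks.
def encode_texts_batched(texts, batch_size=32, max_length=512):
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    return np.vstack([encode_texts(batch, max_length=max_length) for batch in batches])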

def load_data():
    try:
        with h5py.File('patent_embeddings.h5', 'r') as f:
            embeddings = f['embeddings'][:]
            patent_numbers = f['patent_numbers'][:]
        
        metadata = {}
        texts = []
        with open('patent_metadata.jsonl', 'r') as f:
            for line in f:
                data = json.loads(line)
                metadata[data['patent_number']] = data
                texts.append(data['text'])
        
        print(f"Embedding shape: {embeddings.shape}")
        print(f"Number of patent numbers: {len(patent_numbers)}")
        print(f"Number of metadata entries: {len(metadata)}")
        
        return embeddings, patent_numbers, metadata, texts
    except FileNotFoundError as e:
        print(f"Error: Could not find file. {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise

def compare_features(query_features, patent_features):
    common_features = set(query_features) & set(patent_features)
    similarity_score = len(common_features) / max(len(query_features), len(patent_features))
    return common_features, similarity_score
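# Example: compare_features(['laser', 'diode'], ['laser', 'optical'])
# -> ({'laser'}, 0.5): one shared term divided by max(2, 2).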

def hybrid_search(query, top_k=5):
    print(f"Original query: {query}")
    
    processed_query = preprocess_query(query)
    # The query is already preprocessed, so derive key features from it directly
    # rather than running preprocess_query a second time via extract_key_features.
    query_features = list(dict.fromkeys(processed_query.split()))
    
    # Encode the processed query using the transformer model
    query_embedding = encode_texts([processed_query])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    
    # Perform semantic similarity search
    semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
    
    # Perform TF-IDF based search
    query_tfidf = tfidf_vectorizer.transform([processed_query])
    tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
    
    # Combine and rank results
    combined_results = {}
    for i, idx in enumerate(semantic_indices[0]):
        patent_number = patent_numbers[idx].decode('utf-8')
        text = metadata[patent_number]['text']
        patent_features = extract_key_features(text)
        common_features, feature_similarity = compare_features(query_features, patent_features)
        combined_results[patent_number] = {
            # Weighted blend: semantic similarity (1.0) + TF-IDF similarity (0.5) + feature overlap (1.0)
            'score': semantic_distances[0][i] * 1.0 + tfidf_similarities[idx] * 0.5 + feature_similarity,
            'common_features': common_features,
            'text': text
        }
    
    for idx in tfidf_indices:
        patent_number = patent_numbers[idx].decode('utf-8')
        if patent_number not in combined_results:
            text = metadata[patent_number]['text']
            patent_features = extract_key_features(text)
            common_features, feature_similarity = compare_features(query_features, patent_features)
            combined_results[patent_number] = {
                'score': tfidf_similarities[idx] * 1.0 + feature_similarity,
                'common_features': common_features,
                'text': text
            }
    
    # Sort and get top results
    top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]
    
    results = []
    for patent_number, data in top_results:
        result = f"Patent Number: {patent_number}\n"
        result += f"Text: {data['text'][:200]}...\n"
        result += f"Combined Score: {data['score']:.4f}\n"
        result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
        results.append(result)
    
    return "\n".join(results)

# Load data and prepare the FAISS index
embeddings, patent_numbers, metadata, texts = load_data()

# If the stored embeddings were produced by a different model, re-encode the corpus
if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
    print("Embedding dimensions do not match. Re-encoding texts with the current model.")
    # Note: for large corpora, the batched helper above (encode_texts_batched) may be preferable.
    embeddings = encode_texts(texts)

# Normalize embeddings so that inner product equals cosine similarity
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Create FAISS index for cosine similarity (FAISS expects a contiguous float32 array)
embeddings = np.ascontiguousarray(embeddings, dtype='float32')
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Create Gradio interface: a query textbox plus a slider for the number of results
iface = gr.Interface(
    fn=hybrid_search,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your patent query here...", label="Patent Query"),
        gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Top K Results"),
    ],
    outputs=gr.Textbox(lines=10, label="Search Results"),
    title="Patent Similarity Search",
    description="Enter a patent description to find similar patents based on key features."
)

if __name__ == "__main__":
    iface.launch()