import gradio as gr
import pandas as pd
import numpy as np
import h5py
import json
import os
import tempfile
import re
import time
import logging
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import torch
from sklearn.feature_extraction.text import CountVectorizer
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure the NLTK data used by preprocess_text (stopword list and tokenizer) is available
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained model from Hugging Face
logging.info("Loading SentenceTransformer model...")
model = SentenceTransformer('anferico/bert-for-patents').to(device)
logging.info("SentenceTransformer model loaded successfully.")
def preprocess_text(text):
    """Clean a patent abstract or claims string before embedding.

    The text is stripped of "[EN]" labels and leading claim numbers,
    lower-cased (except all-uppercase words and number+unit tokens),
    stripped of most punctuation, tokenized, filtered against NLTK's
    English stopword list, and finally has number/unit pairs joined with
    underscores.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Remove "[EN]" label and claim numbers
    text = re.sub(r'\[EN\]\s*', '', text)
    text = re.sub(r'^\d+\.\s*', '', text, flags=re.MULTILINE)

    # Convert to lowercase while preserving acronyms and units
    words = text.split()
    text = ' '.join(
        word if word.isupper() or re.match(r'^\d+(\.\d+)?[a-zA-Z]+$', word) else word.lower()
        for word in words
    )

    # Remove special characters except hyphens and periods in numbers
    text = re.sub(r'[^\w\s\-.]', ' ', text)
    text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)  # Remove periods not in numbers

    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Join tokens back into text
    text = ' '.join(tokens)

    # Preserve numerical values with units (e.g. "10mm" -> "10_mm")
    text = re.sub(r'(\d+(\.\d+)?)([a-zA-Z]+)', r'\1_\3', text)

    # Handle ranges and measurements
    # NOTE(review): "to", "between" and "and" are presumably already removed
    # by the stopword filter above, so these two patterns may never match --
    # confirm whether they should run before stopword removal.
    text = re.sub(r'(\d+(\.\d+)?)(\s*to\s*)(\d+(\.\d+)?)(\s*[a-zA-Z]+)', r'\1_to_\4_\6', text)
    text = re.sub(r'between\s*(\d+(\.\d+)?)(\s*and\s*)(\d+(\.\d+)?)\s*([a-zA-Z]+)', r'between_\1_and_\4_\5', text)

    # Preserve chemical formulas: strip any spaces inside a matched formula.
    # The pattern itself cannot match whitespace, so matches pass through
    # unchanged.
    text = re.sub(r'\b([A-Z][a-z]?\d*)+\b', lambda m: m.group(0).replace(' ', ''), text)

    return text
def filter_common_terms(texts, threshold=0.10):
    """Drop terms that occur in more than a given fraction of the documents.

    Document frequencies are computed with a default ``CountVectorizer``;
    every term whose document frequency divided by the number of documents
    exceeds ``threshold`` is removed from each text.

    Args:
        texts: Sequence of document strings.
        threshold: Maximum allowed fraction of documents a term may appear
            in before it is removed.

    Returns:
        A tuple ``(filtered_texts, removed_words)`` where ``filtered_texts``
        is a list parallel to ``texts`` and ``removed_words`` maps each
        removed term to the number of documents containing it.
    """
    # Nothing to vectorize; avoid CountVectorizer's empty-vocabulary error
    if len(texts) == 0:
        return [], {}

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(texts)
    num_documents = X.shape[0]

    # Number of documents containing each term, computed on the sparse
    # matrix so the full dense count matrix is never materialized
    document_frequencies = np.asarray((X > 0).sum(axis=0)).ravel()

    removed_words = {}
    for term, doc_freq in zip(vectorizer.get_feature_names_out(), document_frequencies):
        if doc_freq / num_documents > threshold:
            removed_words[term] = int(doc_freq)
    common_terms = set(removed_words)

    # NOTE(review): words are compared exactly as they appear after a
    # whitespace split, while the vectorizer builds its vocabulary with its
    # own tokenization -- confirm that mixed-case or hyphenated words are
    # meant to survive this filter.
    filtered_texts = [
        ' '.join(word for word in text.split() if word not in common_terms)
        for text in texts
    ]
    return filtered_texts, removed_words
def encode_texts(texts, progress=gr.Progress(), batch_size=64):
    """Encode texts in batches with the module-level model and L2-normalize.

    Args:
        texts: Sequence of texts; each item is converted with ``str()``
            before encoding.
        progress: Progress callback, called after every batch with the
            completed fraction and a status message.
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        A 2-D NumPy array with one unit-length row per input text. Rows
        whose embedding has zero norm are left as zeros. An empty input
        yields an array of shape ``(0, 0)``.
    """
    num_texts = len(texts)
    if num_texts == 0:
        return np.empty((0, 0))

    total_batches = (num_texts + batch_size - 1) // batch_size
    embeddings = []
    for batch_number, start in enumerate(range(0, num_texts, batch_size), start=1):
        batch_texts = [str(text) for text in texts[start:start + batch_size]]
        batch_embeddings = model.encode(batch_texts, show_progress_bar=True)
        embeddings.extend(batch_embeddings)
        progress(batch_number / total_batches, f"Processing batch {batch_number}/{total_batches}")

    embeddings = np.array(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Avoid dividing by zero (which would produce NaN rows)
    norms[norms == 0] = 1.0
    return embeddings / norms
def _reserve_temp_path(suffix):
    """Create an empty temporary file ending in *suffix* and return its path."""
    # Only the path is needed; close the handle immediately so it is not
    # leaked and the file can be reopened by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        return tmp.name


def process_file(file, progress=gr.Progress()):
    """Turn an uploaded patent CSV into embeddings and metadata files.

    The CSV must contain the columns 'Master Patent Number', 'Abstract' and
    'Claims'. For every row, the abstract and claims are preprocessed and
    concatenated; rows with neither are skipped, and rows that raise are
    recorded in ``error_log.txt``. Common terms are then filtered out, the
    remaining texts are encoded, and the results are written to disk.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or the path itself.
        progress: Progress callback used for per-row and per-batch updates.

    Returns:
        A tuple ``(embeddings_file, metadata_file, removed_words_file,
        status_message)``. On failure the three paths are ``None`` and the
        message describes the error.
    """
    try:
        start_time = time.time()

        if file is None:
            return None, None, None, "Error: No file uploaded."

        # Read CSV file (accept both a file wrapper and a plain path)
        csv_path = getattr(file, 'name', file)
        df = pd.read_csv(csv_path, encoding='utf-8')
        logging.info(f"CSV file read successfully. Shape: {df.shape}")

        required_columns = ['Master Patent Number', 'Abstract', 'Claims']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            return None, None, None, f"Error: Missing columns: {', '.join(missing_columns)}"

        valid_texts = []
        valid_patent_numbers = []
        skipped_rows = []
        error_rows = []
        total_rows = len(df)

        # Use the positional counter rather than the index label so the
        # progress fraction stays correct for any DataFrame index.
        for index, (_, row) in enumerate(df.iterrows()):
            try:
                progress((index + 1) / total_rows, f"Processing row {index + 1}/{total_rows}")
                logging.info(f"Processing row {index + 1}/{total_rows}")

                abstract = row['Abstract'] if pd.notna(row['Abstract']) else ''
                claims = row['Claims'] if pd.notna(row['Claims']) else ''
                if not abstract and not claims:
                    skipped_rows.append(row['Master Patent Number'])
                    continue

                # Preprocess the abstract and claims separately
                preprocessed_abstract = preprocess_text(abstract)
                preprocessed_claims = preprocess_text(claims)

                # Combine preprocessed abstract and claims
                combined_text = preprocessed_abstract + ' ' + preprocessed_claims

                # Append both together so the two lists stay aligned
                valid_texts.append(combined_text)
                valid_patent_numbers.append(str(row['Master Patent Number']))
            except Exception as e:
                # Best effort: record the failure and keep processing other rows
                error_message = f"Error processing row {index + 1}: {str(e)}"
                logging.error(error_message)
                error_rows.append((index, row['Master Patent Number'], error_message))
                continue

        logging.info(f"Preprocessed abstracts and claims. Number of valid texts: {len(valid_texts)}")
        if skipped_rows:
            logging.warning(f"Skipped {len(skipped_rows)} rows due to missing abstract and claims.")
        if error_rows:
            logging.warning(f"Encountered errors in {len(error_rows)} rows.")

        # Save error log before the expensive steps so row errors are kept
        # even if filtering or encoding fails later.
        error_log_file = 'error_log.txt'
        with open(error_log_file, 'w', encoding='utf-8') as f:
            for row in error_rows:
                f.write(f"Row {row[0]}, Patent {row[1]}: {row[2]}\n")

        if not valid_texts:
            return None, None, None, (
                f"Error: No rows with usable text. Skipped {len(skipped_rows)} patents due to "
                f"missing data. Errors in {len(error_rows)} rows. See error_log.txt for details."
            )

        # Filter out common terms
        logging.info("Filtering common terms...")
        filtered_texts, removed_words = filter_common_terms(valid_texts, threshold=0.10)

        # Generate removed words file, most frequent terms first
        removed_words_file = 'removed_words.txt'
        with open(removed_words_file, 'w', encoding='utf-8') as f:
            for word, count in sorted(removed_words.items(), key=lambda x: x[1], reverse=True):
                f.write(f"{word}: {count}\n")

        logging.info("Encoding texts...")
        embeddings = encode_texts(filtered_texts, progress)
        logging.info("Texts encoded successfully.")

        # Save embeddings and metadata
        embeddings_file = _reserve_temp_path('.h5')
        with h5py.File(embeddings_file, 'w') as f:
            f.create_dataset('embeddings', data=embeddings)
            # Explicit variable-length UTF-8 dtype: h5py cannot store NumPy
            # unicode ('U') arrays directly.
            f.create_dataset(
                'patent_numbers',
                data=valid_patent_numbers,
                dtype=h5py.string_dtype(encoding='utf-8'),
            )

        metadata_file = _reserve_temp_path('.jsonl')
        with open(metadata_file, 'w', encoding='utf-8') as f:
            for index, (patent_number, text) in enumerate(zip(valid_patent_numbers, filtered_texts)):
                json.dump({
                    'index': index,
                    'patent_number': patent_number,
                    'text': text,
                    'embedding_index': index
                }, f, ensure_ascii=False)
                f.write('\n')  # one JSON object per line

        end_time = time.time()
        total_time = end_time - start_time
        logging.info(f"Processing completed in {total_time:.2f} seconds.")

        return embeddings_file, metadata_file, removed_words_file, f"Processing complete. Encoded {len(filtered_texts)} patents. Skipped {len(skipped_rows)} patents due to missing data. Errors in {len(error_rows)} rows. See error_log.txt for details."
    except Exception as e:
        # Top-level boundary for the UI: log with traceback, report as status
        logging.exception(f"An error occurred: {e}")
        return None, None, None, f"An error occurred: {str(e)}"
# Gradio UI: one CSV upload in; three downloadable files plus a status
# message out, in the same order as process_file's return tuple.
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(label="Upload a CSV file with patent data"),
    outputs=[
        gr.File(label="Patent Embeddings (HDF5)"),
        gr.File(label="Patent Metadata (JSONL)"),
        gr.File(label="Removed Words List (TXT)"),
        gr.Textbox(label="Processing Status"),
    ],
    title="Patent Text Encoder",
    description="Upload a CSV file containing patent data (must include 'Master Patent Number', 'Abstract', and 'Claims' columns). The app will generate embeddings and save them along with metadata as downloadable files.",
)

if __name__ == "__main__":
    iface.launch()