Spaces:

talhasarit41
/

classification-models-comparison

Sleeping

classification-models-comparison / app.py

SeeknnDestroy

download models from hub

cf9e3cb unverified 6 months ago

13.8 kB

	import gradio as gr
	import pickle
	import fasttext
	import numpy as np
	import os
	import torch
	import time
	from transformers import AutoTokenizer, AutoModel
	import torch.nn.functional as F
	from openai import AzureOpenAI
	from huggingface_hub import hf_hub_download

	# Download the two FastText model files from the Hugging Face Hub at import
	# time. The returned values are later passed to fasttext.load_model() in
	# load_models().
	model_path_fasttext_raw = hf_hub_download(repo_id="talhasarit41/fasttext", filename="fasttext_raw.bin")
	model_path_fasttext_preprocessed = hf_hub_download(repo_id="talhasarit41/fasttext", filename="fasttext_preprocessed.bin")

	# Azure OpenAI Configuration: API version handed to the client below.
	AZURE_API_VERSION = "2024-02-01"

	# Directory that load_models() reads the pickled classifiers from.
	MODEL_DIR = "models"

	# Initialize the Azure OpenAI client used by get_azure_embedding().
	# The API key and endpoint are read from environment variables; os.getenv
	# yields None for a variable that is not set.
	azure_client = AzureOpenAI(
	api_key=os.getenv("AZURE_OPENAI_API_KEY"),
	api_version=AZURE_API_VERSION,
	azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT")
	)

	def generate_e5_embedding(text, model_name='intfloat/multilingual-e5-large'):
	    """Embed one text with an E5 model and time the whole call.

	    The tokenizer and model are loaded from ``model_name`` on every call, so
	    the reported duration covers loading as well as the forward pass.

	    Args:
	        text: Text to embed; ``"query: "`` is prepended before tokenizing.
	        model_name: Identifier handed to ``from_pretrained``.

	    Returns:
	        An ``(embedding, seconds)`` tuple: the first row of the mean-pooled,
	        L2-normalized output converted to NumPy, and the elapsed wall-clock
	        time in seconds.
	    """
	    began = time.time()

	    e5_tokenizer = AutoTokenizer.from_pretrained(model_name)
	    e5_model = AutoModel.from_pretrained(model_name)

	    # Add prefix for E5 models
	    prefixed = "query: {}".format(text)
	    encoded = e5_tokenizer(prefixed, padding=True, truncation=True, return_tensors="pt")

	    with torch.no_grad():
	        hidden = e5_model(**encoded).last_hidden_state

	    # Average the token vectors with padding masked out, then rescale each
	    # row to unit L2 norm.
	    pooled = mean_pooling(hidden, encoded['attention_mask'])
	    unit = F.normalize(pooled, p=2, dim=1)

	    elapsed = time.time() - began
	    return unit[0].numpy(), elapsed

	def generate_e5_instruct_embedding(text, model_name='intfloat/multilingual-e5-large-instruct'):
	    """Generate E5-instruct embeddings for a single text.

	    The steps are the same as in generate_e5_embedding() ("query: " prefix,
	    mean pooling, L2 normalization, timing of the whole call); only the
	    default checkpoint differs, so this delegates instead of repeating them.

	    Args:
	        text: Text to embed.
	        model_name: Identifier handed to ``from_pretrained``.

	    Returns:
	        An ``(embedding, seconds)`` tuple, exactly as returned by
	        generate_e5_embedding().
	    """
	    # NOTE(review): the -instruct model card documents an
	    # "Instruct: ...\nQuery: ..." prompt; the "query: " prefix is kept here
	    # because that is what this app has always sent - confirm it matches how
	    # the downstream classifier was trained.
	    return generate_e5_embedding(text, model_name=model_name)

	def generate_modernbert_embedding(text, model_name="answerdotai/ModernBERT-base"):
	    """Embed one text with ModernBERT and time the whole call.

	    The tokenizer and model are loaded from ``model_name`` on every call, so
	    the reported duration covers loading as well as the forward pass.

	    Args:
	        text: Text to embed (no prefix is added).
	        model_name: Identifier handed to ``from_pretrained``.

	    Returns:
	        An ``(embedding, seconds)`` tuple: the hidden state at sequence
	        position 0 of the first row, converted to NumPy, and the elapsed
	        wall-clock time in seconds.
	    """
	    began = time.time()

	    bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
	    bert_model = AutoModel.from_pretrained(model_name)

	    encoded = bert_tokenizer(text, padding=True, truncation=True, return_tensors="pt")
	    with torch.no_grad():
	        hidden = bert_model(**encoded).last_hidden_state

	    # Take [CLS] token embedding (position 0); it is not normalized here.
	    cls_vectors = hidden[:, 0, :]

	    elapsed = time.time() - began
	    return cls_vectors[0].numpy(), elapsed

	def mean_pooling(token_embeddings, attention_mask):
	    """Average token embeddings along dimension 1, skipping masked positions.

	    Args:
	        token_embeddings: Tensor of per-token vectors.
	        attention_mask: Mask tensor; it gains a trailing axis and is expanded
	            to the size of ``token_embeddings`` before use.

	    Returns:
	        The mask-weighted sum over dimension 1 divided by the mask total,
	        with the divisor clamped to at least 1e-9 so a fully masked row
	        yields zeros instead of a division by zero.
	    """
	    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
	    weighted_total = (token_embeddings * weights).sum(1)
	    token_counts = weights.sum(1).clamp(min=1e-9)
	    return weighted_total / token_counts

	def get_azure_embedding(text):
	    """Request an embedding for ``text`` from Azure OpenAI and time the request.

	    Uses the module-level ``azure_client`` with the
	    ``text-embedding-3-large`` model.

	    Returns:
	        An ``(embedding, seconds)`` tuple: the first returned embedding as a
	        NumPy array and the wall-clock duration of the API call in seconds.
	    """
	    began = time.time()
	    reply = azure_client.embeddings.create(input=text, model="text-embedding-3-large")
	    # Stop the clock before the NumPy conversion so only the request is timed.
	    elapsed = time.time() - began

	    vector = np.array(reply.data[0].embedding)
	    return vector, elapsed

	# Load models
	def load_models():
	    """Load every classifier the app compares and return them keyed by display name.

	    The six pickled classifiers are read from ``MODEL_DIR``; the two FastText
	    models are loaded from the paths stored in ``model_path_fasttext_raw``
	    and ``model_path_fasttext_preprocessed``.

	    Returns:
	        dict mapping display name to loaded model, with the pickled
	        classifiers inserted first and the FastText models last.
	    """
	    # (display name, pickle file name) in insertion order.
	    pickled_classifiers = (
	        ('E5 Classifier', 'e5_classifier.pkl'),
	        ('E5-Instruct Classifier', 'e5_large_instruct_classifier.pkl'),
	        ('Azure Classifier', 'azure_classifier.pkl'),
	        ('Azure KNN Classifier', 'azure_knn_classifier.pkl'),
	        ('ModernBERT RF Classifier', 'modernbert_rf_classifier.pkl'),
	        ('GTE Classifier', 'gte_classifier.pkl'),
	    )

	    models = {}
	    for display_name, file_name in pickled_classifiers:
	        with open(os.path.join(MODEL_DIR, file_name), 'rb') as handle:
	            models[display_name] = pickle.load(handle)

	    # Load FastText models
	    models['FastText Raw'] = fasttext.load_model(model_path_fasttext_raw)
	    models['FastText Preprocessed'] = fasttext.load_model(model_path_fasttext_preprocessed)

	    return models

	def format_results(results):
	    """Render the collected predictions as an HTML table.

	    Args:
	        results: Iterable of dicts with the keys ``'model'``,
	            ``'prediction'``, ``'confidence'`` and ``'time'``.

	    Returns:
	        One HTML string: a wrapper div holding a table with a header row and
	        one body row per result. The confidence cell is colored via
	        get_confidence_color(); confidence and time are shown with four
	        decimals.
	    """
	    # Collect the fragments in order and join once at the end.
	    fragments = [
	        "<div style='font-family: monospace; padding: 10px 20px;'>",
	        "<table style='width: 100%; border-collapse: collapse; background-color: #1a1a1a; color: #ffffff; margin-bottom: 0;'>",
	        "<tr style='background-color: #2c3e50;'>",
	        "<th style='padding: 12px; text-align: left; border: 1px solid #34495e;'>Model</th>",
	        "<th style='padding: 12px; text-align: left; border: 1px solid #34495e;'>Prediction</th>",
	        "<th style='padding: 12px; text-align: left; border: 1px solid #34495e;'>Confidence</th>",
	        "<th style='padding: 12px; text-align: left; border: 1px solid #34495e;'>Time (sec)</th>",
	        "</tr>",
	    ]

	    for entry in results:
	        shade = get_confidence_color(entry['confidence'])
	        fragments.extend((
	            "<tr style='background-color: #2d2d2d; border-bottom: 1px solid #404040;'>",
	            f"<td style='padding: 12px; border: 1px solid #404040;'>{entry['model']}</td>",
	            f"<td style='padding: 12px; border: 1px solid #404040;'>{entry['prediction']}</td>",
	            f"<td style='padding: 12px; border: 1px solid #404040;'><span style='color: {shade}; font-weight: bold;'>{entry['confidence']:.4f}</span></td>",
	            f"<td style='padding: 12px; border: 1px solid #404040;'>{entry['time']:.4f}</td>",
	            "</tr>",
	        ))

	    fragments.append("</table></div>")
	    return "".join(fragments)

	def format_progress(progress_value, desc):
	    """Build the HTML for the progress bar.

	    Args:
	        progress_value: Progress in percent; used as the bar width and shown
	            with one decimal.
	        desc: Caption displayed above the bar.

	    Returns:
	        The progress-bar markup, or an empty string once ``progress_value``
	        reaches 100 so the bar disappears when the work is done.
	    """
	    # Nothing left to draw when complete.
	    if progress_value >= 100:
	        return ""

	    return f"""
	<div style='width: 100%; background-color: #1a1a1a; padding: 10px; border-radius: 5px; margin-bottom: 10px;'>
	<div style='color: white; margin-bottom: 5px;'>{desc}</div>
	<div style='background-color: #2d2d2d; border-radius: 3px;'>
	<div style='background-color: #6b46c1; width: {progress_value}%; height: 20px; border-radius: 3px; transition: width 0.3s ease;'></div>
	</div>
	<div style='color: white; text-align: right; margin-top: 5px;'>{progress_value:.1f}%</div>
	</div>
	"""

	def get_confidence_color(confidence):
	    """Map a confidence score to the hex color used in the results table.

	    Returns bright green for scores of at least 0.8, bright orange for scores
	    of at least 0.5, and bright red for everything else.
	    """
	    # (lower bound, color), checked from the highest bound down.
	    tiers = (
	        (0.8, "#00ff00"),
	        (0.5, "#ffa500"),
	    )
	    for lower_bound, color in tiers:
	        if confidence >= lower_bound:
	            return color
	    return "#ff4444"

	# GTE embedding generation
	def generate_gte_embedding(text, model_name='Alibaba-NLP/gte-multilingual-base', trust_remote_code=True):
	    """Generate GTE embeddings for a single text and time the whole call.

	    The tokenizer and model are loaded from ``model_name`` on every call, so
	    the reported duration covers loading as well as the forward pass.

	    Args:
	        text: Text to embed (no prefix is added).
	        model_name: Identifier handed to ``from_pretrained``.
	        trust_remote_code: Forwarded to ``AutoModel.from_pretrained``. The
	            default checkpoint's model card loads the model with
	            ``trust_remote_code=True``, so it defaults to True. This runs
	            code from the model repository; pass False when pointing
	            ``model_name`` at a repository you do not trust.

	    Returns:
	        An ``(embedding, seconds)`` tuple: the L2-normalized hidden state at
	        sequence position 0 of the first row, converted to NumPy, and the
	        elapsed wall-clock time in seconds.
	    """
	    start_time = time.time()
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = AutoModel.from_pretrained(model_name, trust_remote_code=trust_remote_code)

	    # Tokenize and generate embedding
	    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
	    with torch.no_grad():
	        outputs = model(**inputs)

	    embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
	    embeddings = F.normalize(embeddings, p=2, dim=1)  # unit L2 norm per row

	    inference_time = time.time() - start_time
	    return embeddings[0].numpy(), inference_time

	def _classify_embedding(model, embedding):
	    """Run one fitted classifier on a single embedding vector.

	    Returns ``(prediction, confidence, seconds)`` where ``confidence`` is the
	    largest value from ``predict_proba`` and ``seconds`` covers only the
	    ``predict``/``predict_proba`` calls made here.
	    """
	    started = time.time()
	    features = embedding.reshape(1, -1)  # one-row 2-D input for predict()
	    prediction = model.predict(features)[0]
	    confidence = max(model.predict_proba(features)[0])
	    return prediction, confidence, time.time() - started


	# Make predictions (streaming version)
	def predict_text_streaming(text):
	    """Classify ``text`` with every model, streaming partial results.

	    This is a generator. Each item is a ``(progress_html, results_html)``
	    pair built by format_progress() and format_results(); a new pair is
	    yielded before and after each model runs so the table fills in as
	    results arrive. The reported time per embedding-based model is the
	    embedding time plus that classifier's own prediction time.

	    Any exception is caught and reported as a final pair whose progress part
	    is empty and whose results part is a red error message.

	    Args:
	        text: The text entered by the user.
	    """
	    try:
	        results = []

	        # Show the empty table and progress bar before the slow model load,
	        # not after it.
	        yield format_progress(0, "Loading models..."), format_results(results)
	        models = load_models()

	        # fastText's predict() raises ValueError on input containing '\n',
	        # and the input textbox is multi-line, so flatten newlines for it.
	        fasttext_text = text.replace("\n", " ")

	        # Process FastText models first (they're fastest as they don't need embeddings)
	        for model_name, model in models.items():
	            if not isinstance(model, fasttext.FastText._FastText):
	                continue
	            yield format_progress(10, f"Processing {model_name}..."), format_results(results)
	            start_time = time.time()
	            prediction = model.predict(fasttext_text)
	            label = prediction[0][0].replace('__label__', '')
	            confidence = float(prediction[1][0])
	            inference_time = time.time() - start_time

	            results.append({
	                'model': model_name,
	                'prediction': label,
	                'confidence': confidence,
	                'time': inference_time
	            })
	            yield format_progress(20, f"Completed {model_name}"), format_results(results)

	        # Process E5 models. Each classifier gets the embedding from its own
	        # generator: the instruct classifier is fed the instruct model's
	        # embedding rather than the plain E5 one.
	        e5_stages = (
	            ('E5 Classifier', generate_e5_embedding, 30, 35),
	            ('E5-Instruct Classifier', generate_e5_instruct_embedding, 35, 40),
	        )
	        for model_name, embed, before, after in e5_stages:
	            yield format_progress(before, f"Processing {model_name}..."), format_results(results)
	            embedding, embed_time = embed(text)
	            prediction, confidence, inference_time = _classify_embedding(models[model_name], embedding)
	            results.append({
	                'model': model_name,
	                'prediction': prediction,
	                'confidence': confidence,
	                'time': inference_time + embed_time
	            })
	            yield format_progress(after, f"Completed {model_name}"), format_results(results)

	        # Process Azure models (one API embedding shared by both classifiers)
	        yield format_progress(50, "Processing Azure Embeddings..."), format_results(results)
	        azure_embedding, embed_time = get_azure_embedding(text)
	        for model_name in ['Azure Classifier', 'Azure KNN Classifier']:
	            prediction, confidence, inference_time = _classify_embedding(models[model_name], azure_embedding)
	            results.append({
	                'model': model_name,
	                'prediction': prediction,
	                'confidence': confidence,
	                'time': inference_time + embed_time
	            })
	            yield format_progress(70, f"Completed {model_name}"), format_results(results)

	        # Process ModernBERT and GTE models. The classifier time is measured
	        # inside _classify_embedding, so it never includes time spent in an
	        # earlier stage.
	        single_stages = (
	            ('ModernBERT RF Classifier', generate_modernbert_embedding, 80, 90),
	            ('GTE Classifier', generate_gte_embedding, 95, None),
	        )
	        for model_name, embed, before, after in single_stages:
	            yield format_progress(before, f"Processing {model_name}..."), format_results(results)
	            embedding, embed_time = embed(text)
	            prediction, confidence, inference_time = _classify_embedding(models[model_name], embedding)
	            results.append({
	                'model': model_name,
	                'prediction': prediction,
	                'confidence': confidence,
	                'time': inference_time + embed_time
	            })
	            if after is not None:
	                yield format_progress(after, f"Completed {model_name}"), format_results(results)

	        # Final frame: format_progress returns "" at 100, hiding the bar.
	        yield format_progress(100, "Completed!"), format_results(results)

	    except Exception as e:
	        # Top-level UI boundary: surface the failure in the results pane.
	        yield "", f"<div style='color: red; padding: 20px;'>Error occurred: {str(e)}</div>"

	# Create Gradio interface with custom CSS.
	# The rules below remove the gap on .main/.contain and the vertical margins
	# on .feedback.
	css = """
	.main {
	gap: 0 !important;
	}
	.contain {
	gap: 0 !important;
	}
	.feedback {
	margin-top: 0 !important;
	margin-bottom: 0 !important;
	}
	"""

	# One text input and two HTML outputs; the outputs correspond, in order, to
	# the (progress, results) pairs yielded by predict_text_streaming.
	iface = gr.Interface(
	fn=predict_text_streaming,
	inputs=gr.Textbox(label="Enter text to classify", lines=3),
	outputs=[
	gr.HTML(label="Progress"),
	gr.HTML(label="Model Predictions")
	],
	title="Text Classification Model Comparison",
	description="Compare predictions from different text classification models (Results stream as they become available)",
	theme=gr.themes.Soft(),
	css=css
	)

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	iface.launch(debug=True)