Spaces:

talhasarit41
/

classification-models-comparison

Sleeping

App Files Files Community

SeeknnDestroy committed on Jan 9

Commit

51efdcf

unverified ·

1 Parent(s): 7a7432d

update models

Browse files

Files changed (4) hide show

.gitignore +1 -0
__pycache__/config.cpython-310.pyc +0 -0
app.py +95 -70
config.py +52 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

__pycache__/config.cpython-310.pyc ADDED Viewed

Binary file (2.04 kB). View file

app.py CHANGED Viewed

@@ -8,11 +8,10 @@ import time
 from transformers import AutoTokenizer, AutoModel
 import torch.nn.functional as F
 from openai import AzureOpenAI
-from huggingface_hub import hf_hub_download
-# Download the FastText model from Hugging Face
-model_path_fasttext_raw = hf_hub_download(repo_id="talhasarit41/fasttext", filename="fasttext_raw.bin")
-model_path_fasttext_preprocessed = hf_hub_download(repo_id="talhasarit41/fasttext", filename="fasttext_preprocessed.bin")
 # Azure OpenAI Configuration
 AZURE_API_VERSION = "2024-02-01"
@@ -92,25 +91,33 @@ def get_azure_embedding(text):
 def load_models():
     models = {}
-    # Load pickle models
-    with open(os.path.join(MODEL_DIR, 'e5_classifier.pkl'), 'rb') as f:
-        models['E5 Classifier'] = pickle.load(f)
-    with open(os.path.join(MODEL_DIR, 'e5_large_instruct_classifier.pkl'), 'rb') as f:
-        models['E5-Instruct Classifier'] = pickle.load(f)
-    with open(os.path.join(MODEL_DIR, 'azure_classifier.pkl'), 'rb') as f:
-        models['Azure Classifier'] = pickle.load(f)
-    with open(os.path.join(MODEL_DIR, 'azure_knn_classifier.pkl'), 'rb') as f:
-        models['Azure KNN Classifier'] = pickle.load(f)
-    with open(os.path.join(MODEL_DIR, 'gte_classifier.pkl'), 'rb') as f:
-        models['GTE Classifier'] = pickle.load(f)
     # Load FastText models
-    models['FastText Raw'] = fasttext.load_model(model_path_fasttext_raw)
-    models['FastText Preprocessed'] = fasttext.load_model(model_path_fasttext_preprocessed)
     return models
@@ -185,13 +192,20 @@ def predict_text_streaming(text):
         models = load_models()
         results = []
         # First yield empty table and progress bar
-        yield format_progress(0, "Loading models..."), format_results(results)
-        # Process FastText models first (they're fastest as they don't need embeddings)
         for model_name, model in models.items():
             if isinstance(model, fasttext.FastText._FastText):
-                yield format_progress(10, f"Processing {model_name}..."), format_results(results)
                 start_time = time.time()
                 prediction = model.predict(text)
                 label = prediction[0][0].replace('__label__', '')
@@ -204,65 +218,76 @@ def predict_text_streaming(text):
                     'confidence': confidence,
                     'time': inference_time
                 })
-                yield format_progress(20, f"Completed {model_name}"), format_results(results)
-        # Process E5 models
-        yield format_progress(30, "Processing E5 Classifier..."), format_results(results)
-        e5_embedding, embed_time = generate_e5_embedding(text)
-        for model_name in ['E5 Classifier', 'E5-Instruct Classifier']:
-            start_time = time.time()
-            model = models[model_name]
-            embedding_2d = e5_embedding.reshape(1, -1)
-            prediction = model.predict(embedding_2d)[0]
-            probabilities = model.predict_proba(embedding_2d)[0]
-            confidence = max(probabilities)
-            inference_time = time.time() - start_time
-            results.append({
-                'model': model_name,
-                'prediction': prediction,
-                'confidence': confidence,
-                'time': inference_time + embed_time
-            })
-            yield format_progress(40, f"Completed {model_name}"), format_results(results)
-        # Process Azure models
-        yield format_progress(50, "Processing Azure Embeddings..."), format_results(results)
-        azure_embedding, embed_time = get_azure_embedding(text)
-        for model_name in ['Azure Classifier', 'Azure KNN Classifier']:
-            start_time = time.time()
-            model = models[model_name]
-            embedding_2d = azure_embedding.reshape(1, -1)
             prediction = model.predict(embedding_2d)[0]
             probabilities = model.predict_proba(embedding_2d)[0]
             confidence = max(probabilities)
             inference_time = time.time() - start_time
             results.append({
-                'model': model_name,
                 'prediction': prediction,
                 'confidence': confidence,
                 'time': inference_time + embed_time
             })
-            yield format_progress(70, f"Completed {model_name}"), format_results(results)
-        # Process GTE model
-        yield format_progress(90, "Processing GTE Classifier..."), format_results(results)
-        gte_embedding, embed_time = generate_gte_embedding(text)
-        model = models['GTE Classifier']
-        embedding_2d = gte_embedding.reshape(1, -1)
-        prediction = model.predict(embedding_2d)[0]
-        probabilities = model.predict_proba(embedding_2d)[0]
-        confidence = max(probabilities)
-        inference_time = time.time() - start_time
-        results.append({
-            'model': 'GTE Classifier',
-            'prediction': prediction,
-            'confidence': confidence,
-            'time': inference_time + embed_time
-        })
-        yield format_progress(100, "Completed!"), format_results(results)
     except Exception as e:
         yield "", f"<div style='color: red; padding: 20px;'>Error occurred: {str(e)}</div>"

 from transformers import AutoTokenizer, AutoModel
 import torch.nn.functional as F
 from openai import AzureOpenAI
+from dotenv import load_dotenv
+from config import get_fasttext_path, is_model_enabled
+load_dotenv()
 # Azure OpenAI Configuration
 AZURE_API_VERSION = "2024-02-01"
 def load_models():
+    """Load every classifier that is enabled in the configuration.
+
+    Returns:
+        dict: Display name -> loaded model object. Pickled classifiers are
+        inserted first, then FastText models, each group in the order of
+        its table below. The dict is empty when no model is enabled.
+    """
     models = {}
+    # Load pickle models only if enabled
+    pickle_models = {
+        'E5 Classifier': 'e5_classifier.pkl',
+        'E5-Instruct Classifier': 'e5_large_instruct_classifier.pkl',
+        'Azure Classifier': 'azure_classifier.pkl',
+        'Azure KNN Classifier': 'azure_knn_classifier.pkl',
+        'GTE Classifier': 'gte_classifier.pkl'
+    }
+    for model_name, filename in pickle_models.items():
+        if is_model_enabled(model_name):
+            # NOTE: pickle.load can execute code embedded in the file; only
+            # trusted .pkl files should be placed in MODEL_DIR.
+            with open(os.path.join(MODEL_DIR, filename), 'rb') as f:
+                models[model_name] = pickle.load(f)
     # Load FastText models
+    # Display name -> key passed to get_fasttext_path(). One table replaces
+    # the repeated if/load pairs so each display name is written only once.
+    fasttext_models = {
+        'FastText Default': 'fasttext_default',
+        'FastText Preprocessed': 'fasttext_preprocessed',
+        'Fasttext WordnNGram 1': 'word_n_gram_1',
+        'Fasttext WordnNGram 2': 'word_n_gram_2',
+        'Fasttext WordnNGram 3': 'word_n_gram_3',
+        'Fasttext Low Overfit': 'low_overfit',
+    }
+    for model_name, path_key in fasttext_models.items():
+        if is_model_enabled(model_name):
+            models[model_name] = fasttext.load_model(get_fasttext_path(path_key))
     return models
         models = load_models()
         results = []
+        if not models:
+            return "", "<div style='color: red; padding: 20px;'>No models are enabled in the configuration.</div>"
+        # Calculate progress step based on number of enabled models
+        progress_step = 100.0 / len(models)
+        current_progress = 0
         # First yield empty table and progress bar
+        yield format_progress(current_progress, "Loading models..."), format_results(results)
+        # Process FastText models first (they're fastest)
         for model_name, model in models.items():
             if isinstance(model, fasttext.FastText._FastText):
+                yield format_progress(current_progress, f"Processing {model_name}..."), format_results(results)
                 start_time = time.time()
                 prediction = model.predict(text)
                 label = prediction[0][0].replace('__label__', '')
                     'confidence': confidence,
                     'time': inference_time
                 })
+                current_progress += progress_step
+                yield format_progress(current_progress, f"Completed {model_name}"), format_results(results)
+        # Process E5-based models
+        e5_embedding = None
+        for model_name, model in models.items():
+            if model_name in ['E5 Classifier', 'E5-Instruct Classifier']:
+                if e5_embedding is None:  # Generate embedding only once
+                    yield format_progress(current_progress, f"Generating E5 embeddings..."), format_results(results)
+                    e5_embedding, embed_time = generate_e5_embedding(text)
+                start_time = time.time()
+                embedding_2d = e5_embedding.reshape(1, -1)
+                prediction = model.predict(embedding_2d)[0]
+                probabilities = model.predict_proba(embedding_2d)[0]
+                confidence = max(probabilities)
+                inference_time = time.time() - start_time
+                results.append({
+                    'model': model_name,
+                    'prediction': prediction,
+                    'confidence': confidence,
+                    'time': inference_time + embed_time
+                })
+                current_progress += progress_step
+                yield format_progress(current_progress, f"Completed {model_name}"), format_results(results)
+        # Process Azure-based models
+        azure_embedding = None
+        for model_name, model in models.items():
+            if model_name in ['Azure Classifier', 'Azure KNN Classifier']:
+                if azure_embedding is None:  # Generate embedding only once
+                    yield format_progress(current_progress, "Generating Azure embeddings..."), format_results(results)
+                    azure_embedding, embed_time = get_azure_embedding(text)
+                start_time = time.time()
+                embedding_2d = azure_embedding.reshape(1, -1)
+                prediction = model.predict(embedding_2d)[0]
+                probabilities = model.predict_proba(embedding_2d)[0]
+                confidence = max(probabilities)
+                inference_time = time.time() - start_time
+                results.append({
+                    'model': model_name,
+                    'prediction': prediction,
+                    'confidence': confidence,
+                    'time': inference_time + embed_time
+                })
+                current_progress += progress_step
+                yield format_progress(current_progress, f"Completed {model_name}"), format_results(results)
+        # Process GTE model
+        if 'GTE Classifier' in models:
+            yield format_progress(current_progress, "Processing GTE Classifier..."), format_results(results)
+            gte_embedding, embed_time = generate_gte_embedding(text)
+            model = models['GTE Classifier']
+            embedding_2d = gte_embedding.reshape(1, -1)
             prediction = model.predict(embedding_2d)[0]
             probabilities = model.predict_proba(embedding_2d)[0]
             confidence = max(probabilities)
             inference_time = time.time() - start_time
             results.append({
+                'model': 'GTE Classifier',
                 'prediction': prediction,
                 'confidence': confidence,
                 'time': inference_time + embed_time
             })
+            current_progress = 100
+            yield format_progress(current_progress, "Completed!"), format_results(results)
     except Exception as e:
         yield "", f"<div style='color: red; padding: 20px;'>Error occurred: {str(e)}</div>"

config.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from typing import Dict, Any
+# FastText model file locations, keyed by the short identifier that
+# get_fasttext_path() receives.
+FASTTEXT_CONFIG = {
+    # True: get_fasttext_path() fetches the file from "repo_id" through
+    # hf_hub_download (the setting intended for production).
+    # False: get_fasttext_path() returns the "local_paths" entry unchanged.
+    "use_huggingface": True,
+    "repo_id": "talhasarit41/fasttext",  # Hugging Face repository ID
+    # File names inside the Hugging Face repository.
+    "huggingface_paths": {
+        "fasttext_default": "fasttext_raw.bin",
+        "fasttext_preprocessed": "fasttext_preprocessed.bin",
+        "word_n_gram_1": "word_n_gram_1.bin",
+        "word_n_gram_2": "word_n_gram_2.bin",
+        "word_n_gram_3": "word_n_gram_3.bin",
+        "low_overfit": "low_overfit.bin"
+    },
+    # Only read when "use_huggingface" is False.
+    # NOTE(review): these look like absolute paths on one developer's
+    # machine - confirm whether they should be made configurable.
+    "local_paths": {
+        "fasttext_default": "/home/seeknndestroy/jetlink/bitbucket/fasttext_related/saved_models/fasttext_raw.bin",
+        "fasttext_preprocessed": "/home/seeknndestroy/jetlink/bitbucket/fasttext_related/saved_models/fasttext_preprocessed.bin",
+        "word_n_gram_1": "/home/seeknndestroy/jetlink/bitbucket/fasttext_related/saved_models/manual_configs/word_n_gram_1.bin",
+        "word_n_gram_2": "/home/seeknndestroy/jetlink/bitbucket/fasttext_related/saved_models/manual_configs/word_n_gram_2.bin",
+        "word_n_gram_3": "/home/seeknndestroy/jetlink/bitbucket/fasttext_related/saved_models/manual_configs/word_n_gram_3.bin",
+        "low_overfit": "/home/seeknndestroy/jetlink/bitbucket/fasttext_related/saved_models/manual_configs/low_overfit.bin"
+    }
+}
+# Model enablement configuration.
+# Keys are the display names passed to is_model_enabled(). The lookup is an
+# exact, case-sensitive match, and a name that is missing here is treated
+# as disabled.
+MODEL_CONFIG = {
+    "FastText Default": True,
+    # load_models() in app.py asks for this name; it was absent from the
+    # table and therefore implicitly disabled. Listed explicitly so the
+    # switch is visible - the value keeps the previous behaviour.
+    "FastText Preprocessed": False,
+    "Fasttext Low Overfit": True,
+    "Fasttext WordnNGram 1": True,
+    "Fasttext WordnNGram 2": True,
+    "Fasttext WordnNGram 3": True,
+    "E5 Classifier": False,
+    "E5-Instruct Classifier": False,
+    "Azure Classifier": False,
+    "Azure KNN Classifier": False,
+    "GTE Classifier": False
+}
+def get_fasttext_path(model_name: str) -> str:
+    """Return a filesystem path to the FastText model file for ``model_name``.
+
+    Args:
+        model_name: Short identifier used as a key in ``FASTTEXT_CONFIG``
+            (for example ``"fasttext_default"``).
+
+    Returns:
+        The path returned by ``hf_hub_download`` when
+        ``FASTTEXT_CONFIG["use_huggingface"]`` is true, otherwise the
+        matching entry of ``FASTTEXT_CONFIG["local_paths"]``.
+
+    Raises:
+        KeyError: If ``model_name`` has no entry in the active path table.
+            The message lists the identifiers that are configured.
+    """
+    use_hub = FASTTEXT_CONFIG["use_huggingface"]
+    table = FASTTEXT_CONFIG["huggingface_paths" if use_hub else "local_paths"]
+    try:
+        location = table[model_name]
+    except KeyError:
+        known = ", ".join(sorted(table))
+        raise KeyError(
+            f"Unknown FastText model {model_name!r}; expected one of: {known}"
+        ) from None
+    if not use_hub:
+        return location
+    # Imported only on this branch, so huggingface_hub is not needed when
+    # local paths are in use.
+    from huggingface_hub import hf_hub_download
+    return hf_hub_download(
+        repo_id=FASTTEXT_CONFIG["repo_id"],
+        filename=location
+    )
+def is_model_enabled(model_name: str) -> bool:
+    """Report the ``MODEL_CONFIG`` switch stored for ``model_name``.
+
+    A name that has no entry in ``MODEL_CONFIG`` is reported as disabled.
+    """
+    try:
+        return MODEL_CONFIG[model_name]
+    except KeyError:
+        return False