camillebrl committed
Commit a66df45 · verified · 1 Parent(s): 5518620

Update tasks/text.py

Files changed (1):
  1. tasks/text.py +65 -12
tasks/text.py CHANGED
@@ -4,10 +4,18 @@ from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import random
 from transformers import pipeline, AutoConfig
+import os
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Dict
+import numpy as np
+import torch

 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info

+# Disable torch compile
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+
 router = APIRouter()

 DESCRIPTION = "Random Baseline"
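Note on the hunk above: TORCH_COMPILE_DISABLE has to be set before any model code triggers compilation, likely because the transformers implementation of ModernBERT can compile parts of the model by default (its reference_compile option), which is problematic on CPU-only hardware. A minimal sketch of the intended effect, not part of the commit, assuming PyTorch 2.x (where torch._dynamo reads this variable):

import os

# Assumption: PyTorch 2.x honours TORCH_COMPILE_DISABLE; setting it before
# torch is imported makes torch.compile effectively a no-op wrapper.
os.environ["TORCH_COMPILE_DISABLE"] = "1"

import torch

def double(x):
    return x * 2

compiled = torch.compile(double)  # runs eagerly while compilation is disabled
print(compiled(torch.ones(2)))    # tensor([2., 2.])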
@@ -61,19 +69,64 @@ async def evaluate_text(request: TextEvaluationRequest):
     true_labels = test_dataset["label"]
     config = AutoConfig.from_pretrained("camillebrl/ModernBERT-envclaims-overfit")
     label2id = config.label2id
-    classifier = pipeline(
-        "text-classification",
-        "camillebrl/ModernBERT-envclaims-overfit",
-        device="cpu"
-    )
-    print("len dataset : ", len(test_dataset["quote"]))
-    predictions = []
-    for batch in range(0, len(test_dataset["quote"]), 32):  # Adjust the batch size
-        batch_quotes = test_dataset["quote"][batch:batch + 32]
-        batch_predictions = classifier(batch_quotes)
-        predictions.extend([label2id[pred["label"]] for pred in batch_predictions])
-        print(predictions)
-    print("final predictions : ", predictions)
+    # classifier = pipeline(
+    #     "text-classification",
+    #     "camillebrl/ModernBERT-envclaims-overfit",
+    #     device="cpu"
+    # )
+    # print("len dataset : ", len(test_dataset["quote"]))
+    # predictions = []
+    # for batch in range(0, len(test_dataset["quote"]), 32):  # Adjust the batch size
+    #     batch_quotes = test_dataset["quote"][batch:batch + 32]
+    #     batch_predictions = classifier(batch_quotes)
+    #     predictions.extend([label2id[pred["label"]] for pred in batch_predictions])
+    #     print(predictions)
+    # print("final predictions : ", predictions)
+    # Initialize the model once
+    classifier = pipeline(
+        "text-classification",
+        "camillebrl/ModernBERT-envclaims-overfit",
+        device="cpu",   # Explicitly set device
+        batch_size=16   # Set batch size for pipeline
+    )
+
+    # Prepare batches
+    batch_size = 32
+    quotes = test_dataset["quote"]
+    num_batches = len(quotes) // batch_size + (1 if len(quotes) % batch_size != 0 else 0)
+    batches = [
+        quotes[i * batch_size:(i + 1) * batch_size]
+        for i in range(num_batches)
+    ]
+
+    # Process batches in parallel
+    max_workers = min(os.cpu_count(), 4)  # Limit to 4 workers or CPU count
+    print(f"Processing with {max_workers} workers")
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all batches for processing
+        future_to_batch = {
+            executor.submit(
+                process_batch,
+                batch,
+                classifier,
+                label2id
+            ): i for i, batch in enumerate(batches)
+        }
+
+        # Collect results in order
+        batch_predictions = [[] for _ in range(len(batches))]
+        for future in future_to_batch:
+            batch_idx = future_to_batch[future]
+            try:
+                batch_predictions[batch_idx] = future.result()
+            except Exception as e:
+                print(f"Batch {batch_idx} generated an exception: {e}")
+                batch_predictions[batch_idx] = []
+
+    # Flatten predictions
+    predictions = [pred for batch in batch_predictions for pred in batch]
+
     #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
     #--------------------------------------------------------------------------------------------
 
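Note on the hunk above: the parallel path submits each batch to process_batch, which is not defined in either hunk of this commit; unless it already exists elsewhere in tasks/text.py, the new code would raise NameError at runtime. A minimal sketch consistent with the call site (the helper name and argument order come from the diff; the body is an assumption mirroring the commented-out sequential loop):

from typing import Dict, List

def process_batch(batch: List[str], classifier, label2id: Dict[str, int]) -> List[int]:
    # Run the shared text-classification pipeline on one batch of quotes
    # and map each predicted label string back to its integer id.
    outputs = classifier(batch)
    return [label2id[out["label"]] for out in outputs]

Two details of the collection step are worth noting: iterating future_to_batch directly follows dict insertion (submission) order and results are written by batch index, so predictions stay aligned with the quote order; but a failed batch is replaced by an empty list, which shortens predictions relative to true_labels and would make accuracy_score raise on mismatched lengths. Also, the pipeline's batch_size=16 is independent of the outer batch_size=32: each submitted chunk of 32 quotes is processed internally in two sub-batches of 16.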