camillebrl committed
Commit a66df45 · verified · 1 Parent(s): 5518620

Update tasks/text.py

Files changed (1):
  1. tasks/text.py +65 -12
tasks/text.py CHANGED
@@ -4,10 +4,18 @@ from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import random
 from transformers import pipeline, AutoConfig
+import os
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Dict
+import numpy as np
+import torch

 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info

+# Disable torch compile
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+
 router = APIRouter()

 DESCRIPTION = "Random Baseline"
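Note on the hunk above: TORCH_COMPILE_DISABLE has to be set before any model code triggers compilation, likely because the transformers implementation of ModernBERT can compile parts of the model by default (its reference_compile option), which is problematic on CPU-only hardware. A minimal sketch of the intended effect, not part of the commit, assuming PyTorch 2.x (where torch._dynamo reads this variable):

import os

# Assumption: PyTorch 2.x honours TORCH_COMPILE_DISABLE; setting it before
# torch is imported makes torch.compile effectively a no-op wrapper.
os.environ["TORCH_COMPILE_DISABLE"] = "1"

import torch

def double(x):
    return x * 2

compiled = torch.compile(double)  # runs eagerly while compilation is disabled
print(compiled(torch.ones(2)))    # tensor([2., 2.])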
@@ -61,19 +69,64 @@ async def evaluate_text(request: TextEvaluationRequest):
     true_labels = test_dataset["label"]
     config = AutoConfig.from_pretrained("camillebrl/ModernBERT-envclaims-overfit")
     label2id = config.label2id
-    classifier = pipeline(
-        "text-classification",
-        "camillebrl/ModernBERT-envclaims-overfit",
-        device="cpu"
-    )
-    print("len dataset : ", len(test_dataset["quote"]))
-    predictions = []
-    for batch in range(0, len(test_dataset["quote"]), 32):  # Adjust the batch size
-        batch_quotes = test_dataset["quote"][batch:batch + 32]
-        batch_predictions = classifier(batch_quotes)
-        predictions.extend([label2id[pred["label"]] for pred in batch_predictions])
-        print(predictions)
-    print("final predictions : ", predictions)
+    # classifier = pipeline(
+    #     "text-classification",
+    #     "camillebrl/ModernBERT-envclaims-overfit",
+    #     device="cpu"
+    # )
+    # print("len dataset : ", len(test_dataset["quote"]))
+    # predictions = []
+    # for batch in range(0, len(test_dataset["quote"]), 32):  # Adjust the batch size
+    #     batch_quotes = test_dataset["quote"][batch:batch + 32]
+    #     batch_predictions = classifier(batch_quotes)
+    #     predictions.extend([label2id[pred["label"]] for pred in batch_predictions])
+    #     print(predictions)
+    # print("final predictions : ", predictions)
+    # Initialize the model once
+    classifier = pipeline(
+        "text-classification",
+        "camillebrl/ModernBERT-envclaims-overfit",
+        device="cpu",   # Explicitly set device
+        batch_size=16   # Set batch size for pipeline
+    )
+
+    # Prepare batches
+    batch_size = 32
+    quotes = test_dataset["quote"]
+    num_batches = len(quotes) // batch_size + (1 if len(quotes) % batch_size != 0 else 0)
+    batches = [
+        quotes[i * batch_size:(i + 1) * batch_size]
+        for i in range(num_batches)
+    ]
+
+    # Process batches in parallel
+    max_workers = min(os.cpu_count(), 4)  # Limit to 4 workers or CPU count
+    print(f"Processing with {max_workers} workers")
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all batches for processing
+        future_to_batch = {
+            executor.submit(
+                process_batch,
+                batch,
+                classifier,
+                label2id
+            ): i for i, batch in enumerate(batches)
+        }
+
+        # Collect results in order
+        batch_predictions = [[] for _ in range(len(batches))]
+        for future in future_to_batch:
+            batch_idx = future_to_batch[future]
+            try:
+                batch_predictions[batch_idx] = future.result()
+            except Exception as e:
+                print(f"Batch {batch_idx} generated an exception: {e}")
+                batch_predictions[batch_idx] = []
+
+    # Flatten predictions
+    predictions = [pred for batch in batch_predictions for pred in batch]
+
     #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
     #--------------------------------------------------------------------------------------------
 
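Note on the hunk above: the parallel path submits each batch to process_batch, which is not defined in either hunk of this commit; unless it already exists elsewhere in tasks/text.py, the new code would raise NameError at runtime. A minimal sketch consistent with the call site (the helper name and argument order come from the diff; the body is an assumption mirroring the commented-out sequential loop):

from typing import Dict, List

def process_batch(batch: List[str], classifier, label2id: Dict[str, int]) -> List[int]:
    # Run the shared text-classification pipeline on one batch of quotes
    # and map each predicted label string back to its integer id.
    outputs = classifier(batch)
    return [label2id[out["label"]] for out in outputs]

Two details of the collection step are worth noting: iterating future_to_batch directly follows dict insertion (submission) order and results are written by batch index, so predictions stay aligned with the quote order; but a failed batch is replaced by an empty list, which shortens predictions relative to true_labels and would make accuracy_score raise on mismatched lengths. Also, the pipeline's batch_size=16 is independent of the outer batch_size=32: each submitted chunk of 32 quotes is processed internally in two sub-batches of 16.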