JenetGhumman committed on
Commit
39aa6d2
·
verified ·
1 Parent(s): 860f09c

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +45 -101
tasks/text.py CHANGED
@@ -1,29 +1,31 @@
1
  from fastapi import APIRouter
2
  from datetime import datetime
3
  from datasets import load_dataset
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- from sklearn.naive_bayes import MultinomialNB
6
- from sklearn.svm import SVC
7
  from sklearn.metrics import accuracy_score
 
8
 
9
  from .utils.evaluation import TextEvaluationRequest
10
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
11
 
12
- # Define the router for text tasks
13
  router = APIRouter()
14
 
15
- DESCRIPTION_NAIVE_BAYES = "Naive Bayes Text Classifier"
16
- DESCRIPTION_SVM = "SVM Text Classifier with TF-IDF"
17
 
18
- # Naive Bayes Endpoint
19
- @router.post("/text", tags=["Text Task"], description=DESCRIPTION_NAIVE_BAYES)
20
  async def evaluate_text(request: TextEvaluationRequest):
21
  """
22
- Evaluate text classification using Naive Bayes.
 
 
 
 
23
  """
 
24
  username, space_url = get_space_info()
25
 
26
- # Label Mapping
27
  LABEL_MAPPING = {
28
  "0_not_relevant": 0,
29
  "1_not_happening": 1,
@@ -35,114 +37,56 @@ async def evaluate_text(request: TextEvaluationRequest):
35
  "7_fossil_fuels_needed": 7
36
  }
37
 
38
- # Load and prepare dataset
39
  dataset = load_dataset(request.dataset_name)
40
- dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
41
-
42
- # Train-Test Split
43
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
44
- train_texts = [x["text"] for x in train_test["train"]]
45
- train_labels = [x["label"] for x in train_test["train"]]
46
- test_texts = [x["text"] for x in train_test["test"]]
47
- test_labels = [x["label"] for x in train_test["test"]]
48
-
49
- # TF-IDF Vectorization
50
- vectorizer = TfidfVectorizer(max_features=5000)
51
- train_vectors = vectorizer.fit_transform(train_texts)
52
- test_vectors = vectorizer.transform(test_texts)
53
-
54
- # Train Naive Bayes Classifier
55
- model = MultinomialNB()
56
- model.fit(train_vectors, train_labels)
57
-
58
- # Track emissions
59
- tracker.start()
60
- tracker.start_task("inference")
61
- predictions = model.predict(test_vectors)
62
- emissions_data = tracker.stop_task()
63
-
64
- # Calculate Accuracy
65
- accuracy = accuracy_score(test_labels, predictions)
66
-
67
- return {
68
- "username": username,
69
- "space_url": space_url,
70
- "submission_timestamp": datetime.now().isoformat(),
71
- "model_description": DESCRIPTION_NAIVE_BAYES,
72
- "accuracy": float(accuracy),
73
- "energy_consumed_wh": emissions_data.energy_consumed * 1000,
74
- "emissions_gco2eq": emissions_data.emissions * 1000,
75
- "emissions_data": clean_emissions_data(emissions_data),
76
- "api_route": "/text",
77
- "dataset_config": {
78
- "dataset_name": request.dataset_name,
79
- "test_size": request.test_size,
80
- "test_seed": request.test_seed
81
- }
82
- }
83
 
84
- # SVM Endpoint
85
- @router.post("/text_svm", tags=["Text Task"], description=DESCRIPTION_SVM)
86
- async def evaluate_text_svm(request: TextEvaluationRequest):
87
- """
88
- Evaluate text classification using SVM.
89
- """
90
- username, space_url = get_space_info()
91
-
92
- # Label Mapping
93
- LABEL_MAPPING = {
94
- "0_not_relevant": 0,
95
- "1_not_happening": 1,
96
- "2_not_human": 2,
97
- "3_not_bad": 3,
98
- "4_solutions_harmful_unnecessary": 4,
99
- "5_science_unreliable": 5,
100
- "6_proponents_biased": 6,
101
- "7_fossil_fuels_needed": 7
102
- }
103
-
104
- # Load and prepare dataset
105
- dataset = load_dataset(request.dataset_name)
106
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
107
 
108
- # Train-Test Split
109
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
110
- train_texts = [x["text"] for x in train_test["train"]]
111
- train_labels = [x["label"] for x in train_test["train"]]
112
- test_texts = [x["text"] for x in train_test["test"]]
113
- test_labels = [x["label"] for x in train_test["test"]]
114
-
115
- # TF-IDF Vectorization
116
- vectorizer = TfidfVectorizer(max_features=5000)
117
- train_vectors = vectorizer.fit_transform(train_texts)
118
- test_vectors = vectorizer.transform(test_texts)
119
-
120
- # Train SVM Classifier
121
- model = SVC(kernel="linear", probability=True)
122
- model.fit(train_vectors, train_labels)
123
-
124
- # Track emissions
125
  tracker.start()
126
  tracker.start_task("inference")
127
- predictions = model.predict(test_vectors)
128
- emissions_data = tracker.stop_task()
129
 
130
- # Calculate Accuracy
131
- accuracy = accuracy_score(test_labels, predictions)
132
-
133
- return {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  "username": username,
135
  "space_url": space_url,
136
  "submission_timestamp": datetime.now().isoformat(),
137
- "model_description": DESCRIPTION_SVM,
138
  "accuracy": float(accuracy),
139
  "energy_consumed_wh": emissions_data.energy_consumed * 1000,
140
  "emissions_gco2eq": emissions_data.emissions * 1000,
141
  "emissions_data": clean_emissions_data(emissions_data),
142
- "api_route": "/text_svm",
143
  "dataset_config": {
144
  "dataset_name": request.dataset_name,
145
  "test_size": request.test_size,
146
  "test_seed": request.test_seed
147
  }
148
  }
 
 
 
1
  from fastapi import APIRouter
2
  from datetime import datetime
3
  from datasets import load_dataset
 
 
 
4
  from sklearn.metrics import accuracy_score
5
+ import random
6
 
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
9
 
 
10
  router = APIRouter()
11
 
12
+ DESCRIPTION = "Random Baseline"
13
+ ROUTE = "/text"
14
 
15
+ @router.post(ROUTE, tags=["Text Task"],
16
+ description=DESCRIPTION)
17
  async def evaluate_text(request: TextEvaluationRequest):
18
  """
19
+ Evaluate text classification for climate disinformation detection.
20
+
21
+ Current Model: Random Baseline
22
+ - Makes random predictions from the label space (0-7)
23
+ - Used as a baseline for comparison
24
  """
25
+ # Get space info
26
  username, space_url = get_space_info()
27
 
28
+ # Define the label mapping
29
  LABEL_MAPPING = {
30
  "0_not_relevant": 0,
31
  "1_not_happening": 1,
 
37
  "7_fossil_fuels_needed": 7
38
  }
39
 
40
+ # Load and prepare the dataset
41
  dataset = load_dataset(request.dataset_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ # Convert string labels to integers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
45
 
46
+ # Split dataset
47
+ train_test = dataset["train"]
48
+ test_dataset = dataset["test"]
49
+
50
+ # Start tracking emissions
 
 
 
 
 
 
 
 
 
 
 
 
51
  tracker.start()
52
  tracker.start_task("inference")
 
 
53
 
54
+ #--------------------------------------------------------------------------------------------
55
+ # YOUR MODEL INFERENCE CODE HERE
56
+ # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
57
+ #--------------------------------------------------------------------------------------------
58
+
59
+ # Make random predictions (placeholder for actual model inference)
60
+ true_labels = test_dataset["label"]
61
+ predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
62
+
63
+ #--------------------------------------------------------------------------------------------
64
+ # YOUR MODEL INFERENCE STOPS HERE
65
+ #--------------------------------------------------------------------------------------------
66
+
67
+
68
+ # Stop tracking emissions
69
+ emissions_data = tracker.stop_task()
70
+
71
+ # Calculate accuracy
72
+ accuracy = accuracy_score(true_labels, predictions)
73
+
74
+ # Prepare results dictionary
75
+ results = {
76
  "username": username,
77
  "space_url": space_url,
78
  "submission_timestamp": datetime.now().isoformat(),
79
+ "model_description": DESCRIPTION,
80
  "accuracy": float(accuracy),
81
  "energy_consumed_wh": emissions_data.energy_consumed * 1000,
82
  "emissions_gco2eq": emissions_data.emissions * 1000,
83
  "emissions_data": clean_emissions_data(emissions_data),
84
+ "api_route": ROUTE,
85
  "dataset_config": {
86
  "dataset_name": request.dataset_name,
87
  "test_size": request.test_size,
88
  "test_seed": request.test_seed
89
  }
90
  }
91
+
92
+ return results