File size: 3,769 Bytes
f4e2c1f
4d6e8c2
f4e2c1f
4d6e8c2
f4e2c1f
4d6e8c2
 
f4e2c1f
4d6e8c2
 
 
 
98fe162
1c33274
70f5f26
f4e2c1f
 
4d6e8c2
 
70f5f26
f4e2c1f
70f5f26
 
 
4d6e8c2
 
 
 
 
 
 
 
 
 
 
 
 
f4e2c1f
4d6e8c2
 
 
 
 
 
 
 
 
 
 
f4e2c1f
4d6e8c2
 
 
70f5f26
f4e2c1f
70f5f26
 
f4e2c1f
 
4d6e8c2
 
64079b0
 
ff046db
64079b0
 
 
 
 
 
 
 
 
 
 
b299c15
 
64079b0
70f5f26
f4e2c1f
70f5f26
f4e2c1f
70f5f26
4d6e8c2
 
f4e2c1f
4d6e8c2
 
f4e2c1f
4d6e8c2
 
 
 
 
70f5f26
4d6e8c2
 
 
 
1c33274
4d6e8c2
 
 
f4e2c1f
 
4d6e8c2
f4e2c1f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import random
from datetime import datetime

from datasets import load_dataset
from fastapi import APIRouter
from sklearn.metrics import accuracy_score

from .utils.emissions import clean_emissions_data, get_space_info, tracker
from .utils.evaluation import TextEvaluationRequest

# Router on which the evaluate_text endpoint below is registered.
router = APIRouter()

# Passed as the endpoint description and echoed as "model_description" in the results.
DESCRIPTION = "RF overfitted like crazy"
# URL path of the endpoint; also echoed as "api_route" in the results.
ROUTE = "/text"


@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification for climate disinformation detection.

    Current Model: classifier over sentence embeddings
    - Each quote is embedded with the "all-MiniLM-L6-v2" SentenceTransformer
    - A fitted classifier loaded from "models/random_forest_model.pkl"
      (a random forest, judging by the file name) predicts the integer label

    Args:
        request: Evaluation parameters; ``dataset_name``, ``test_size`` and
            ``test_seed`` are read from it.

    Returns:
        A dict with the space info, the submission timestamp, the accuracy on
        the held-out split, the energy/emissions figures reported by the
        tracker for the inference task, and the dataset configuration.
    """
    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping (dataset string labels -> integer classes 0-7)
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }

    # Load and prepare the dataset
    dataset = load_dataset(request.dataset_name)

    # Convert string labels to integers
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    # Split dataset
    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
    test_dataset = train_test["test"]

    # Start tracking emissions
    tracker.start()
    tracker.start_task("inference")

    # The tracker is a module-level object shared by every request, so the task
    # is closed in a ``finally`` block: a failure while loading the models or
    # predicting must not leave the "inference" task open for the next call.
    try:
        # ----------------------------------------------------------------------------------------
        # MODEL INFERENCE STARTS HERE (everything in this section is tracked)
        # ----------------------------------------------------------------------------------------

        # Heavy dependencies are imported lazily, inside the tracked section.
        import joblib
        import numpy as np
        from sentence_transformers import SentenceTransformer

        true_labels = test_dataset["label"]

        # NOTE(review): both models are reloaded on every request; caching
        # them would speed up repeated calls but would also change what the
        # tracker measures - confirm before changing.
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

        def embed_quotes(batch):
            # With batched=True, batch["quote"] is a list of strings, so a
            # single encode() call embeds the whole batch.
            batch["quote_embedding"] = embedding_model.encode(batch["quote"]).tolist()
            return batch

        test_dataset = test_dataset.map(embed_quotes, batched=True)

        # NOTE: joblib.load unpickles the file and can therefore execute
        # arbitrary code; the model file must come from a trusted source.
        rf_loaded = joblib.load("models/random_forest_model.pkl")

        predictions = rf_loaded.predict(np.array(test_dataset["quote_embedding"]))

        # ----------------------------------------------------------------------------------------
        # MODEL INFERENCE STOPS HERE
        # ----------------------------------------------------------------------------------------
    finally:
        # Stop tracking emissions
        emissions_data = tracker.stop_task()

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        # Scaled by 1000 to match the unit named in each key
        # (presumably kWh -> Wh and kg -> g; confirm against the tracker).
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed,
        },
    }

    return results