from datetime import datetime

from datasets import load_dataset
from fastapi import APIRouter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from .utils.emissions import tracker, clean_emissions_data, get_space_info
from .utils.evaluation import TextEvaluationRequest

# Define the router for text tasks
router = APIRouter()

DESCRIPTION_NAIVE_BAYES = "Naive Bayes Text Classifier"
DESCRIPTION_SVM = "SVM Text Classifier with TF-IDF"

# Maps the dataset's string labels to the integer class ids used for training.
# Shared by every endpoint in this module.
LABEL_MAPPING = {
    "0_not_relevant": 0,
    "1_not_happening": 1,
    "2_not_human": 2,
    "3_not_bad": 3,
    "4_solutions_harmful_unnecessary": 4,
    "5_science_unreliable": 5,
    "6_proponents_biased": 6,
    "7_fossil_fuels_needed": 7,
}

# Vocabulary cap passed to TfidfVectorizer(max_features=...).
MAX_TFIDF_FEATURES = 5000


def _texts_and_labels(split):
    """Return ``(texts, labels)`` lists read from the rows of *split*.

    Each row is indexed with the ``"text"`` and ``"label"`` keys.
    """
    texts = [row["text"] for row in split]
    labels = [row["label"] for row in split]
    return texts, labels


def _evaluate_classifier(request, model, description, api_route):
    """Train *model* on TF-IDF features and report test accuracy and emissions.

    The steps run in this order: fetch space info, load the dataset named by
    ``request.dataset_name``, map string labels through ``LABEL_MAPPING``,
    split the ``"train"`` split with ``request.test_size`` and
    ``request.test_seed``, fit a TF-IDF vectorizer and *model* on the training
    part, then predict on the test part while the emissions tracker task
    ``"inference"`` is active.

    Args:
        request: Evaluation request; ``dataset_name``, ``test_size`` and
            ``test_seed`` are read from it.
        model: Unfitted classifier exposing ``fit`` and ``predict``.
        description: Text reported under ``"model_description"``.
        api_route: Route string reported under ``"api_route"``.

    Returns:
        A dict with the submission metadata, accuracy, energy/emissions
        figures and the dataset configuration echoed back.

    Raises:
        KeyError: If a dataset row carries a label missing from
            ``LABEL_MAPPING``.
    """
    username, space_url = get_space_info()

    # Load and prepare dataset
    dataset = load_dataset(request.dataset_name)
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    # Train-test split
    train_test = dataset["train"].train_test_split(
        test_size=request.test_size,
        seed=request.test_seed,
    )
    train_texts, train_labels = _texts_and_labels(train_test["train"])
    test_texts, test_labels = _texts_and_labels(train_test["test"])

    # TF-IDF vectorization: fit on the training texts only, then reuse the
    # fitted vocabulary for the test texts.
    vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
    train_vectors = vectorizer.fit_transform(train_texts)
    test_vectors = vectorizer.transform(test_texts)

    # Training happens before the tracker task starts, so only prediction is
    # measured.
    model.fit(train_vectors, train_labels)

    # Track emissions
    # NOTE(review): the tracker is started on every request and only the task
    # is stopped here; confirm the shared tracker tolerates repeated start().
    tracker.start()
    tracker.start_task("inference")
    predictions = model.predict(test_vectors)
    emissions_data = tracker.stop_task()

    # Calculate accuracy
    accuracy = accuracy_score(test_labels, predictions)

    return {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": description,
        "accuracy": float(accuracy),
        # The x1000 factors presumably convert kWh -> Wh and kg -> g, going by
        # the key names; verify against the tracker's units.
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": api_route,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed,
        },
    }


# NOTE(review): both endpoints are ``async def`` but perform blocking dataset
# loading and model training; consider whether they should be plain ``def``.


# Naive Bayes Endpoint
@router.post("/text", tags=["Text Task"], description=DESCRIPTION_NAIVE_BAYES)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification using Naive Bayes.
    """
    return _evaluate_classifier(
        request,
        MultinomialNB(),
        DESCRIPTION_NAIVE_BAYES,
        "/text",
    )


# SVM Endpoint
@router.post("/text_svm", tags=["Text Task"], description=DESCRIPTION_SVM)
async def evaluate_text_svm(request: TextEvaluationRequest):
    """
    Evaluate text classification using SVM.
    """
    # NOTE(review): probability=True is kept from the original, but only
    # predict() is called on the model; confirm whether probability estimates
    # are needed, since enabling them makes fitting slower.
    return _evaluate_classifier(
        request,
        SVC(kernel="linear", probability=True),
        DESCRIPTION_SVM,
        "/text_svm",
    )