clemdesr committed
Commit 64079b0 · 1 Parent(s): b42f4fc

feat overfitted random forest

models/random_forest_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92976e94be7fe8d676e038f85d4ba89082f3a6455d78771720c8ba9e24b1cfa7
+size 12571609
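
The three added lines are a standard Git LFS pointer, not the model itself: the spec version, the sha256 oid of the real blob, and its size in bytes (about 12.5 MB). As a minimal sketch, a fetched copy can be checked against the recorded oid like this, assuming git lfs pull has already replaced the pointer with the actual file:

import hashlib

# Hash the LFS-resolved artifact and compare it to the sha256 oid
# recorded in the pointer file above.
digest = hashlib.sha256()
with open("models/random_forest_model.pkl", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
print(digest.hexdigest() == "92976e94be7fe8d676e038f85d4ba89082f3a6455d78771720c8ba9e24b1cfa7")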
requirements.txt CHANGED
@@ -11,4 +11,5 @@ librosa==0.10.2.post1
 llvmlite == 0.43.0
 transformers
 torch
-transformers[torch]
+transformers[torch]
+sentence-transformers
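
The only genuinely new dependency is sentence-transformers; transformers[torch] was already present and is simply kept. It supplies the encoder used by the new inference path in tasks/text.py below. A quick sketch of what it yields (the example strings are made up):

from sentence_transformers import SentenceTransformer

# all-MiniLM-L6-v2 maps each input string to a 384-dimensional vector.
model = SentenceTransformer("all-MiniLM-L6-v2")
vectors = model.encode(["example quote one", "example quote two"])
print(vectors.shape)  # (2, 384)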
tasks/text.py CHANGED
@@ -59,52 +59,23 @@ async def evaluate_text(request: TextEvaluationRequest):
 
     # Make random predictions (placeholder for actual model inference)
     true_labels = test_dataset["label"]
+    import joblib
+    from sentence_transformers import SentenceTransformer
 
-    # import torch
-    # from transformers import (
-    #     AutoModelForSequenceClassification,
-    #     AutoTokenizer,
-    #     Trainer,
-    #     TrainingArguments,
-    # )
-
-    # model_name = "clementdesroches/distilbert_climate_ai"
-    # tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(LABEL_MAPPING))
-
-    # # Tokenize the datasets
-    # def tokenize_function(examples):
-    #     return tokenizer(examples["quote"], padding="max_length", truncation=True)
-
-    # tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
-
-    # # Set training arguments
-    # training_args = TrainingArguments(
-    #     output_dir="./bert_classification_results",
-    #     evaluation_strategy="epoch",
-    #     save_strategy="epoch",
-    #     learning_rate=2e-5,
-    #     per_device_train_batch_size=8,
-    #     per_device_eval_batch_size=8,
-    #     num_train_epochs=30,
-    #     weight_decay=0.01,
-    #     load_best_model_at_end=True,
-    # )
-
-    # # Initialize the Trainer
-    # trainer = Trainer(
-    #     model=model,
-    #     args=training_args,
-    #     eval_dataset=tokenized_test_dataset,
-    #     tokenizer=tokenizer,
-    # )
-
-    # import numpy as np
-
-    # preds = trainer.predict(tokenized_test_dataset)
+    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+    # Convert "quote" key into embeddings
+    def embed_quote(example):
+        example["quote_embedding"] = embedding_model.encode(example["quote"]).tolist()
+        return example
+
+    test_dataset = test_dataset.map(embed_quote, batched=True)
+    # Load the model from the file
+    rf_loaded = joblib.load("models/random_forest_model.pkl")
+    import numpy as np
 
     # predictions = np.array([np.argmax(x) for x in preds[0]])
-    predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
+    predictions = rf_loaded.predict(np.array(test_dataset["quote_embedding"]))
 
     # --------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
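
The diff swaps the random placeholder (and the commented-out DistilBERT fine-tuning path) for a two-stage pipeline: embed each quote with all-MiniLM-L6-v2, then classify the 384-dimensional embeddings with the pickled random forest. The training side is not part of this commit; as a hedged sketch under assumptions (the dataset variables, n_estimators, and random_state below are illustrative, not taken from the repo), the model was presumably produced along these lines:

import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier

# Hypothetical training script: embed the training quotes with the same
# MiniLM encoder used at inference time, then fit a forest on the vectors.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

train_quotes = ["..."]  # placeholder: the training split's "quote" column
train_labels = [0]      # placeholder: the matching integer labels (0-7)

X_train = embedding_model.encode(train_quotes)  # shape (n_samples, 384)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, np.array(train_labels))

# Serialize so tasks/text.py can joblib.load() it for inference.
joblib.dump(rf, "models/random_forest_model.pkl")

With no max_depth cap, the trees grow until their leaves are pure and the forest memorizes the training embeddings, which is consistent with the "overfitted" label in the commit message.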