Canstralian committed on
Commit
55d61a2
·
verified ·
1 Parent(s): 13766c2

Create training_and_evaluation.py

Browse files
Files changed (1) hide show
  1. training_and_evaluation.py +100 -0
training_and_evaluation.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

import numpy as np
import torch
from datasets import load_dataset, load_metric
from dotenv import load_dotenv
from transformers import AutoAdapterModel, AutoTokenizer, TrainingArguments, Trainer

# Load environment variables from the .env file before any of them are read.
load_dotenv()

# Read the credentials once at import time; each name is None when the
# corresponding environment variable is not set.
# NOTE(review): Weights & Biases conventionally reads WANDB_API_KEY; confirm
# that WAND_API_KEY is the intended variable name.
(
    GOOGLE_API_KEY,
    HF_TOKEN,
    OPENAI_API_KEY,
    WAND_API_KEY,
) = (
    os.environ.get(variable_name)
    for variable_name in (
        "GOOGLE_API_KEY",
        "HF_TOKEN",
        "OPENAI_API_KEY",
        "WAND_API_KEY",
    )
)

# Use these variables as needed in your code

# Load datasets
# NOTE(review): only the pentesting dataset is passed to the Trainer in this
# script; confirm the RedPajama corpus is really needed, since loading and
# tokenizing it is presumably very expensive.
dataset_pentesting = load_dataset(
    "canstralian/pentesting-ai",
)
dataset_redpajama = load_dataset(
    "togethercomputer/RedPajama-Data-1T",
)

# Tokenizer shared by every tokenization step in this script.
tokenizer = AutoTokenizer.from_pretrained(
    "canstralian/rabbitredeux",
)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that provides the raw text under the ``'text'`` key
            (a batch of strings when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer's output, padded to the maximum length and truncated.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize datasets (batched, so tokenize_function receives lists of texts).
tokenized_dataset_pentesting = dataset_pentesting.map(
    tokenize_function,
    batched=True,
)
tokenized_dataset_redpajama = dataset_redpajama.map(
    tokenize_function,
    batched=True,
)

# Prepare datasets
# NOTE(review): assumes the pentesting dataset exposes both a "train" and a
# "validation" split - confirm against the dataset card.
train_dataset_pentesting = tokenized_dataset_pentesting["train"]
validation_dataset_pentesting = tokenized_dataset_pentesting["validation"]

# Load the base model, then attach the adapter and make it the active one.
model = AutoAdapterModel.from_pretrained("canstralian/rabbitredeux")
model.load_adapter(
    "Canstralian/RabbitRedux",
    set_active=True,
)

# Load metric (accuracy)
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm the pinned version.
metric = load_metric("accuracy")

def _compute_accuracy(eval_prediction):
    """Score predictions with the module-level metric.

    The predicted class is the arg-max over axis 1 of the predictions; the
    references are the label ids carried by the same object.
    """
    predicted_classes = np.argmax(eval_prediction.predictions, axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=eval_prediction.label_ids,
    )


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Trainer setup: train on the pentesting train split, evaluate on its
# validation split, and report accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_pentesting,
    eval_dataset=validation_dataset_pentesting,
    compute_metrics=_compute_accuracy,
)

# Run fine-tuning with the configured Trainer.
trainer.train()

# Evaluate the model on the evaluation dataset and report the results.
eval_results = trainer.evaluate()
print("Evaluation Results: ", eval_results)

# Persist the fine-tuned model to a local directory.
model.save_pretrained("./fine_tuned_model")

# Test model on new data
new_data = """
I love the ocean. It is so peaceful and serene.
"""

# Tokenize new data (a batch containing a single example).
tokenized_new_data = tokenize_function({"text": [new_data]})
input_ids = tokenized_new_data["input_ids"][0]
attention_mask = tokenized_new_data["attention_mask"][0]

# The model is a PyTorch module, so it must be fed torch tensors rather than
# NumPy arrays, and they must live on the same device as the model's weights
# (the Trainer may have placed the model on an accelerator).
model_device = next(model.parameters()).device
input_ids_tensor = torch.tensor([input_ids], device=model_device)
attention_mask_tensor = torch.tensor([attention_mask], device=model_device)

# Prediction: inference only, so switch to eval mode and skip gradient
# tracking (this also keeps the logits detached from the autograd graph).
model.eval()
with torch.no_grad():
    outputs = model(
        input_ids=input_ids_tensor,
        attention_mask=attention_mask_tensor,
    )
prediction_scores = outputs.logits[0]  # Getting logits for the first sample

# Get predicted label as a plain Python int so it can be printed and passed
# to the metric regardless of which device the logits are on.
predicted_label = int(torch.argmax(prediction_scores).item())

print(f"The predicted label is: {predicted_label}")

# Evaluate predictions (using some assumed correct label)
actual_label = 1  # Replace with the actual label if known

accuracy = metric.compute(predictions=[predicted_label], references=[actual_label])

print(f"Accuracy on new data: {accuracy}")