Merge pull request #12 from riddhibhagwat/main
Evaluation Pipeline Updates & Refined Implementation
- ml/{eval/data_transform_pipeline.py → dataset_transformer.py} +0 -0
- ml/dpo_pipeline.py +44 -0
- ml/eval/.reward_eval.py.swp +0 -0
- ml/eval/alpaca.py +43 -0
- ml/eval/bt.py +36 -93
- ml/eval/evaluate.py +0 -185
- ml/eval/evaluate_arguments.py +1 -1
- ml/eval/evaluation_pipeline.py +58 -0
- ml/eval/generate.py +3 -10
- ml/eval/generate_sanity_check.py +1 -1
- ml/eval/reward_eval.py +123 -0
- ml/feel.yaml +176 -0
ml/{eval/data_transform_pipeline.py → dataset_transformer.py}
RENAMED
File without changes
ml/dpo_pipeline.py
ADDED
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch

def train_dpo_model(model, dataset, learning_rate=5e-5, num_train_epochs=3, per_device_train_batch_size=16):
    """
    Trains a model using Direct Preference Optimization (DPO).

    Args:
        model: The language model to be trained.
        dataset: The dataset used for training, should be in Hugging Face Dataset format.
        learning_rate: Learning rate for the optimizer.
        num_train_epochs: Number of epochs to train.
        per_device_train_batch_size: Batch size per device during training.
    """
    model.train()

    training_args = TrainingArguments(
        output_dir="./dpo_model",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=2,
        push_to_hub=False,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset.get("validation", None),
    )

    trainer.train()

    return model
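For orientation, a minimal, hedged sketch of how train_dpo_model might be driven. The tiny model name and the toy token IDs below are placeholders for illustration only; they are not part of this PR.

# Hypothetical usage sketch (not part of the PR); model and data are toy placeholders.
from transformers import AutoModelForCausalLM
from datasets import Dataset, DatasetDict

model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")  # tiny model, illustration only
toy = Dataset.from_list([{"input_ids": [1, 2, 3, 4], "labels": [1, 2, 3, 4]}] * 8)
dataset = DatasetDict({"train": toy, "validation": toy})

trained_model = train_dpo_model(model, dataset, num_train_epochs=1, per_device_train_batch_size=2)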
ml/eval/.reward_eval.py.swp
ADDED
Binary file (20.5 kB)
ml/eval/alpaca.py
CHANGED
import openai
import os
from alpaca_eval import run_evaluation

def judge_responses(response1, response2, prompt):
    """
    Use OpenAI GPT-4 API to judge two model responses.
    Returns: "A" if response1 is better, "B" if response2 is better, or "tie".
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")

    prompt_text = f"""
    Given the user prompt: "{prompt}"

    Response A: "{response1}"
    Response B: "{response2}"

    Which response is better? Reply with 'A', 'B', or 'tie'.
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "system", "content": "You are an expert evaluator."},
                      {"role": "user", "content": prompt_text}],
            max_tokens=5
        )
        result = response["choices"][0]["message"]["content"].strip().lower()
        return result if result in ["a", "b", "tie"] else "tie"
    except Exception as e:
        print(f"Error in OpenAI API call: {e}")
        return "tie"


def alpaca_evaluator(model_name, num_samples=200):
    results = run_evaluation(
        model=model_name,
        num_samples=num_samples,      # fewer samples for quick testing
        reference_model="gpt-4",      # compare against GPT-4 (optional)
    )
    return results
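As a quick, hedged illustration of the judge helper, the call below uses invented strings (OPENAI_API_KEY must be set in the environment for a real verdict; on API errors the function falls back to "tie").

# Hypothetical call; the prompt and responses are made up for illustration.
winner = judge_responses(
    response1="Paris is the capital of France.",
    response2="It might be Lyon.",
    prompt="What is the capital of France?",
)
print(winner)  # "a", "b", or "tie"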
ml/eval/bt.py
CHANGED
@@ -11,9 +11,9 @@ class ScriptArguments:
    """
    Arguments for the Bradley-Terry evaluation script.
    """
    old_generations_file: str
    new_generations_file: str
    output_file: str = 'bt_results.json'


####################################

@@ -34,63 +34,63 @@ def load_rewards(file_path):
        return json.load(f)


def bradley_terry_comparison(old_rewards, new_rewards):
    """
    Perform Bradley-Terry comparison between two sets of model generations.

    Args:
        old_rewards (list): List of dictionaries for the OLD model's generations and rewards.
        new_rewards (list): List of dictionaries for the NEW model's generations and rewards.

    Returns:
        list: Comparison results including preferred outputs and probabilities.
        dict: Metrics summary including percentage preferred and average probabilities.
    """
    results = []
    new_preferred_count = 0
    old_preferred_count = 0
    probabilities = []

    for ix in range(len(old_rewards)):
        old = old_rewards[ix]
        new = new_rewards[ix]

        # Ensure prompts match
        assert old['prompt'] == new['prompt'], f"ERROR: Prompts at index {ix} do not match."

        # Compute Bradley-Terry probability
        new_reward = torch.tensor(new['reward'], dtype=torch.float32)
        old_reward = torch.tensor(old['reward'], dtype=torch.float32)
        prob_new_preferred = torch.sigmoid(new_reward - old_reward).item()

        probabilities.append(prob_new_preferred)
        preferred_model = 'new' if prob_new_preferred > 0.5 else 'old'

        # Count preferences
        if preferred_model == 'new':
            new_preferred_count += 1
        else:
            old_preferred_count += 1

        # Log results
        bt_result = {
            'prompt': old['prompt'],
            'old_output': old['output'],
            'new_output': new['output'],
            'old_reward': old['reward'],
            'new_reward': new['reward'],
            'preferred': preferred_model,
            'prob_new_preferred': prob_new_preferred
        }
        results.append(bt_result)

    # Calculate metrics
    total_examples = len(old_rewards)
    metrics = {
        'total_examples': total_examples,
        'new_preferred_percentage': 100 * new_preferred_count / total_examples,
        'old_preferred_percentage': 100 * old_preferred_count / total_examples,
        'avg_probability_new_preferred': sum(probabilities) / total_examples
    }

    return results, metrics

@@ -118,9 +118,9 @@ def print_metrics(metrics):
    """
    print("\nEVALUATION METRICS:")
    print(f"Total examples: {metrics['total_examples']}")
    print(f"Percentage preferred - KTO model: {metrics['new_preferred_percentage']:.2f}%")
    print(f"Percentage preferred - SFT model: {metrics['old_preferred_percentage']:.2f}%")
    print(f"Average probability of KTO model being preferred: {metrics['avg_probability_new_preferred']:.4f}")

@@ -128,22 +128,17 @@ def print_metrics(metrics):
####################################

def main():
    args = ScriptArguments()

    print("Loading data...")
    old_rewards = load_rewards(args.old_generations_file)
    new_rewards = load_rewards(args.new_generations_file)

    # Perform Bradley-Terry comparison
    print("Performing Bradley-Terry comparison...")
    results, metrics = bradley_terry_comparison(old_rewards, new_rewards)

    save_results(results, args.output_file)
    print_metrics(metrics)

@@ -152,55 +147,3 @@ if __name__ == "__main__":
(This hunk deletes the commented-out legacy Bradley-Terry script that previously sat below the __main__ guard: a hard-coded version that read the reference and fine-tuned reward JSON files, computed sigmoid(finetuned_reward - ref_reward) per example, counted which model was preferred, and dumped the results to bt_results.json.)
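To make the comparison rule concrete, here is a small worked example with made-up reward values (not data from this PR): the preference probability is the logistic sigmoid of the reward gap.

import torch

# Toy rewards: the new model's output scores higher on this example.
old_reward = torch.tensor(0.2, dtype=torch.float32)
new_reward = torch.tensor(1.0, dtype=torch.float32)

# P(new preferred) = sigmoid(new_reward - old_reward) = sigmoid(0.8) ≈ 0.69,
# so this example counts as a win for the 'new' model.
prob_new_preferred = torch.sigmoid(new_reward - old_reward).item()
print(round(prob_new_preferred, 3))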
ml/eval/evaluate.py
DELETED
@@ -1,185 +0,0 @@
The previous reward-evaluation script is removed in full; its functionality is reworked in ml/eval/reward_eval.py below. The deleted file:
- imported torch, transformers, accelerate, and trl, plus the local eval.utils (jload, jdump) and eval.evaluate_arguments (EvalArguments);
- loaded the Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 reward model via AutoModelForSequenceClassification (bfloat16, flash_attention_2) and its tokenizer, wrapped in an Accelerator with bf16/fp16 mixed precision;
- in main(), parsed EvalArguments with HfArgumentParser, loaded LM generations from a JSON file or a directory of JSON files, scored each file with evaluate_data(), and saved the results with jdump;
- defined get_reward_output_fn() to map logits to a reward score (raw, softmax index 0 or 1, or the '1-0' difference, optionally passed through a sigmoid);
- defined evaluate_data() to batch prompt+output pairs, tokenize them, run the reward model under torch.inference_mode(), attach 'reward' and 'reward_model' fields to every example, and print the mean and std of the reward scores.
ml/eval/evaluate_arguments.py
CHANGED
@@ -3,7 +3,7 @@ from dataclasses import dataclass, field
@dataclass
class EvalArguments:
    model_name_or_path: str = field(
        default="CohereForAI/aya-expanse-8b", metadata={"help": "Name to a huggingface native pretrained model or path to a model on disk."})
    model_pretrained_lora_weights: str = field(
        default=None, metadata={"help": "Path to a checkpoint directory."})
    output_filepath: str = field(
(Only the model_name_or_path default changes in this hunk.)
ml/eval/evaluation_pipeline.py
ADDED
###########
# IMPORTS #
###########
from reward_eval import process_evaluation
from generate import generate_files
from alpaca import alpaca_evaluator, judge_responses
from bt import bradley_terry_comparison, load_rewards
from evaluate_arguments import EvalArguments
import pandas as pd
import numpy as np

#############
# EVALUATOR #
#############
'''
Evaluation Pipeline

Parameters:
    eval_dataset: list of dictionaries that contain the prompt and response in the same form as below:
        [{"prompt": "How are you?", "output": "I'm doing great!"}, {"prompt": "What's your name?", "output": "Assistant"}]
    reward_output_filepath: string (must end in .json) that represents the path of the output of the reward score evaluation
    model: base model that is being evaluated (defaults to starter base model - Aya-23-8B)
    all_responses: path to a csv file that has all the model's responses and their corresponding prompts with the following
        format: response1 --> col 1, response2 --> col 2, prompt --> col 3
    language: which language is being used for this model (needs to be a valid FeeLLanguage object once FeeLLanguage class is updated)
'''
def evaluator_master_fn(eval_dataset: list[dict],
                        reward_output_filepath: str,
                        all_responses: str,
                        language: str,
                        new_model,
                        old_model="CohereForAI/aya-expanse-8b"):
    # language is a string for now; it will become an object once the FeeLLanguage class is defined with
    # language-specific functionality (it will also store the latest model and be much easier to work with)

    # 1. Reward score evaluation
    args = EvalArguments(bfloat16=True,
                         reward_output_fmt='1-0',
                         apply_sigmoid_to_reward=False,
                         per_device_batch_size=8,
                         output_filepath="new_evaluation",
                         result_filename=None,
                         model_name_or_path=new_model)
    reward_score_result = process_evaluation(args, model_name=new_model, eval_data_list_dict=eval_dataset)

    # 2. Alpaca Eval - judging responses
    judge_df = pd.read_csv(all_responses)
    judge_df["winner"] = judge_df.apply(lambda r: judge_responses(r["response1"], r["response2"], r["prompt"]), axis=1)  # axis=1 -- apply row-wise

    # 3. Alpaca Eval - model comparison
    alpaca_results = alpaca_evaluator(new_model, num_samples=200)  # adjust num_samples as needed, potentially based on language

    # 4. Bradley-Terry evaluation
    bt_results = bradley_terry_comparison(load_rewards(old_model), load_rewards(new_model))

    return reward_score_result, judge_df, alpaca_results, bt_results
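A hedged sketch of how the pipeline entry point might be called; the file paths, language tag, and checkpoint name below are placeholders, not values taken from this PR.

# Hypothetical invocation; paths and model names are illustrative only.
eval_dataset = [
    {"prompt": "How are you?", "output": "I'm doing great!"},
    {"prompt": "What's your name?", "output": "Assistant"},
]

rewards, judged_df, alpaca_results, bt_results = evaluator_master_fn(
    eval_dataset=eval_dataset,
    reward_output_filepath="reward_scores.json",
    all_responses="responses.csv",             # CSV with response1, response2, prompt columns
    language="en",
    new_model="path/to/finetuned-checkpoint",  # assumed fine-tuned model
)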
ml/eval/generate.py
CHANGED
@@ -1,6 +1,4 @@
-import torch
 from dataclasses import dataclass
-from accelerate import PartialState
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from trl import ModelConfig, maybe_unpair_preference_dataset, setup_chat_format
@@ -8,15 +6,10 @@ from tqdm import tqdm
 import json
 import os
 import sys
-from pdb import set_trace as st
-
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.dirname(SCRIPT_DIR))
 
-from dataloaders.data_loader import get_oasst
-
-
 ####################################
 # CONFIGURATION
 ####################################
@@ -96,7 +89,7 @@ def load_oasst_test_dataset():
     """Load and prepare the dataset."""
 
     # Load oasst test dataset
-    test_dataset = ...
+    test_dataset = load_dataset(split='test')
     return test_dataset
@@ -141,7 +134,7 @@ def save_results(results, output_file):
 # MAIN SCRIPT
 ####################################
 
-def main():
+def generate_files():
     # Load model and tokenizer
     print("Loading kto fine-tuned model...")
     kto_model, kto_tokenizer = load_model_and_tokenizer(script_args.kto_model_path, use_auth_token=True)
@@ -166,4 +159,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    generate_files()
ml/eval/generate_sanity_check.py
CHANGED
@@ -45,7 +45,7 @@ ref_model = AutoModelForCausalLM.from_pretrained(
 ).to("cuda")
 print(f'loaded reference model')
 
-# load a
+# load a tokenizer
 ref_tokenizer = AutoTokenizer.from_pretrained(
     ref_model_args.model_name_or_path, trust_remote_code=ref_model_args.trust_remote_code
 )
ml/eval/reward_eval.py
ADDED
import sys
import os
from typing import Any, Dict, List
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, CohereConfig, AutoModel
from accelerate import Accelerator
from tqdm import tqdm

# Add script directory to system path for importing local modules
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(SCRIPT_DIR))

from eval.utils import jload, jdump
from eval.evaluate_arguments import EvalArguments


# set `device` to "cuda" if a GPU is available. otherwise, defaults to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

def create_model(model_name: str):
    """
    Loads a pre-trained reward model and moves it onto the device.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", num_labels=1).to("cuda")
    return model


def create_tokenizer(model_name):
    # loads the tokenizer that pairs with the model for encoding the text data
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    return tokenizer


def MyAccelerator(mixed_precision: str):
    """
    Accelerator initialization (wrapper) for handling mixed precision.
    """
    return Accelerator(mixed_precision=mixed_precision)

def get_reward_output_fn(reward_output_format: str, sigmoid: bool):
    def default(x):
        return x.squeeze().cpu().detach().numpy().tolist()
    reward_fn_map = {
        '0': lambda x: x.squeeze().cpu().detach().softmax(dim=-1).numpy()[0].tolist(),
        '1': lambda x: x.squeeze().cpu().detach().softmax(dim=-1).numpy()[1].tolist(),
        '1-0': lambda x: (x.squeeze().cpu().detach().softmax(dim=-1).numpy()[1] - x.squeeze().cpu().detach().softmax(dim=-1).numpy()[0]).tolist()
    }
    reward_output_fn = reward_fn_map.get(reward_output_format, default)
    if sigmoid:
        return lambda x: torch.sigmoid(torch.tensor(x)).numpy().tolist()
    return reward_output_fn

def evaluate_data(args, model, tokenizer, eval_data_list_dict) -> List[Dict[str, Any]]:
    """
    Evaluate the dataset using the reward model.
    """
    reward_output_fn = get_reward_output_fn(args.reward_output_fmt, args.apply_sigmoid_to_reward)
    pbar = tqdm(total=len(eval_data_list_dict), desc="Evaluating Rewards")
    rewards_list = []

    for idx in range(0, len(eval_data_list_dict), args.per_device_batch_size):
        batch_list_dict = eval_data_list_dict[idx:idx+args.per_device_batch_size]

        # Create prompt-response pairs
        batch_full_outputs = [
            f"{l['prompt']} {l['output']}" for l in batch_list_dict
        ] if 'prompt' in batch_list_dict[0] else [f"Below is an instruction: {l['instruction']} Response: {l['output']}" for l in batch_list_dict]

        # Tokenize responses and send to device
        encoded_full_responses = tokenizer(batch_full_outputs, return_tensors="pt", padding=True, truncation=True)
        encoded_full_responses = encoded_full_responses.to(model.device)

        # Generate rewards
        with torch.inference_mode():
            reward_outputs = model(**encoded_full_responses)
            rewards = reward_output_fn(reward_outputs.logits)
            rewards_list.extend(rewards)

        pbar.update(len(batch_list_dict))

    # Add reward scores to the original data
    for i, data in enumerate(eval_data_list_dict):
        data['reward'] = rewards_list[i]

    return eval_data_list_dict

def process_evaluation(args, model_name: str, eval_data_list_dict) -> List[Dict[str, Any]]:
    """
    Main function for processing evaluation, takes model name as input.
    """
    # mixed_precision = 'bf16' if args.bfloat16 else 'fp16'

    # Initialize accelerator and model
    # accelerator = MyAccelerator(mixed_precision)
    model = create_model(model_name)
    tokenizer = create_tokenizer(model_name)

    model.eval()

    eval_data = evaluate_data(args, model, tokenizer, eval_data_list_dict)

    result_filename = args.result_filename or f"{os.path.basename(args.output_filepath).split('.')[0]}_reward_results.json"
    with open(result_filename, "w") as f:
        json.dump(eval_data, f)

    return eval_data


# ONLY FOR TESTING:
if __name__ == '__main__':
    args = EvalArguments(bfloat16=True,
                         reward_output_fmt='1-0',
                         apply_sigmoid_to_reward=False,
                         per_device_batch_size=8,
                         output_filepath='/path/to/your/data.json',
                         result_filename=None,
                         model_name_or_path="CohereForAI/aya-expanse-8b")

    eval_data_list_dict = [{"prompt": "How are you?", "output": "I'm doing great!"}, {"prompt": "What's your name?", "output": "Assistant"}]

    process_evaluation(args, model_name="CohereForAI/aya-expanse-8b", eval_data_list_dict=eval_data_list_dict)
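To make the reward_output_fmt options concrete, here is a small hedged example of what the '1-0' formatter computes on a fake two-logit output; the values are invented.

import torch

# Fake logits from a two-class reward head for a single example.
logits = torch.tensor([[0.5, 1.5]])

fn = get_reward_output_fn('1-0', sigmoid=False)
# softmax([0.5, 1.5]) ≈ [0.269, 0.731]; '1-0' returns their difference ≈ 0.462
print(fn(logits))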
ml/feel.yaml
ADDED
name: feel
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - accelerate=1.4.0=pyhd8ed1ab_0
  - aiohappyeyeballs=2.4.6=pyhd8ed1ab_0
  - aiohttp=3.11.13=py313ha9b7d5b_0
  - aiosignal=1.3.2=pyhd8ed1ab_0
  - annotated-types=0.7.0=pyhd8ed1ab_1
  - anyio=4.8.0=pyhd8ed1ab_0
  - attrs=25.1.0=pyh71513ae_0
  - aws-c-auth=0.8.1=hfc2798a_0
  - aws-c-cal=0.8.1=hc8a0bd2_3
  - aws-c-common=0.10.6=h5505292_0
  - aws-c-compression=0.3.0=hc8a0bd2_5
  - aws-c-event-stream=0.5.0=h54f970a_11
  - aws-c-http=0.9.2=h96aa502_4
  - aws-c-io=0.15.3=haba67d1_6
  - aws-c-mqtt=0.11.0=h24f418c_12
  - aws-c-s3=0.7.9=hf37e03c_1
  - aws-c-sdkutils=0.2.2=hc8a0bd2_0
  - aws-checksums=0.2.2=hc8a0bd2_4
  - aws-crt-cpp=0.29.9=ha81f72f_2
  - aws-sdk-cpp=1.11.489=h0e5014b_0
  - azure-core-cpp=1.14.0=hd50102c_0
  - azure-identity-cpp=1.10.0=hc602bab_0
  - azure-storage-blobs-cpp=12.13.0=h7585a09_1
  - azure-storage-common-cpp=12.8.0=h9ca1f76_1
  - azure-storage-files-datalake-cpp=12.12.0=hcdd55da_1
  - brotli-python=1.1.0=py313h3579c5c_2
  - bzip2=1.0.8=h99b78c6_7
  - c-ares=1.34.4=h5505292_0
  - ca-certificates=2025.1.31=hf0a4a13_0
  - certifi=2025.1.31=pyhd8ed1ab_0
  - charset-normalizer=3.4.1=pyhd8ed1ab_0
  - colorama=0.4.6=pyhd8ed1ab_1
  - cpython=3.13.2=py313hd8ed1ab_101
  - datasets=3.3.2=pyhd8ed1ab_0
  - dill=0.3.8=pyhd8ed1ab_0
  - distro=1.9.0=pyhd8ed1ab_1
  - docstring_parser=0.16=pyhd8ed1ab_0
  - eval_type_backport=0.2.2=pyha770c72_0
  - exceptiongroup=1.2.2=pyhd8ed1ab_1
  - filelock=3.17.0=pyhd8ed1ab_0
  - frozendict=2.4.6=py313h63a2874_0
  - frozenlist=1.5.0=py313ha9b7d5b_1
  - fsspec=2024.12.0=pyhd8ed1ab_0
  - gflags=2.2.2=hf9b8971_1005
  - glog=0.7.1=heb240a5_0
  - gmp=6.3.0=h7bae524_2
  - gmpy2=2.1.5=py313h2cdc120_3
  - h11=0.14.0=pyhd8ed1ab_1
  - h2=4.2.0=pyhd8ed1ab_0
  - hpack=4.1.0=pyhd8ed1ab_0
  - httpcore=1.0.7=pyh29332c3_1
  - httpx=0.28.1=pyhd8ed1ab_0
  - huggingface_hub=0.29.1=pyhd8ed1ab_0
  - hyperframe=6.1.0=pyhd8ed1ab_0
  - idna=3.10=pyhd8ed1ab_1
  - jinja2=3.1.5=pyhd8ed1ab_0
  - jiter=0.8.2=py313hdde674f_0
  - krb5=1.21.3=h237132a_0
  - libabseil=20240722.0=cxx17_h07bc746_4
  - libarrow=19.0.1=h0945df6_0_cpu
  - libarrow-acero=19.0.1=hf07054f_0_cpu
  - libarrow-dataset=19.0.1=hf07054f_0_cpu
  - libarrow-substrait=19.0.1=h4239455_0_cpu
  - libblas=3.9.0=31_h10e41b3_openblas
  - libbrotlicommon=1.1.0=hd74edd7_2
  - libbrotlidec=1.1.0=hd74edd7_2
  - libbrotlienc=1.1.0=hd74edd7_2
  - libcblas=3.9.0=31_hb3479ef_openblas
  - libcrc32c=1.1.2=hbdafb3b_0
  - libcurl=8.12.1=h73640d1_0
  - libcxx=19.1.7=ha82da77_0
  - libedit=3.1.20250104=pl5321hafb1f1b_0
  - libev=4.33=h93a5062_2
  - libevent=2.1.12=h2757513_1
  - libexpat=2.6.4=h286801f_0
  - libffi=3.4.2=h3422bc3_5
  - libgfortran=5.0.0=13_2_0_hd922786_3
  - libgfortran5=13.2.0=hf226fd6_3
  - libgoogle-cloud=2.35.0=hdbe95d5_0
  - libgoogle-cloud-storage=2.35.0=h7081f7f_0
  - libgrpc=1.67.1=h0a426d6_2
  - libiconv=1.18=hfe07756_1
  - liblapack=3.9.0=31_hc9a63f6_openblas
  - liblzma=5.6.4=h39f12f2_0
  - libmpdec=4.0.0=h99b78c6_0
  - libnghttp2=1.64.0=h6d7220d_0
  - libopenblas=0.3.29=openmp_hf332438_0
  - libopentelemetry-cpp=1.18.0=h0c05b2d_1
  - libopentelemetry-cpp-headers=1.18.0=hce30654_1
  - libparquet=19.0.1=h636d7b7_0_cpu
  - libprotobuf=5.28.3=h3bd63a1_1
  - libre2-11=2024.07.02=h07bc746_2
  - libsqlite=3.49.1=h3f77e49_1
  - libssh2=1.11.1=h9cc3647_0
  - libthrift=0.21.0=h64651cc_0
  - libtorch=2.6.0=cpu_generic_h6adcabc_0
  - libutf8proc=2.10.0=hda25de7_0
  - libuv=1.50.0=h5505292_0
  - libxml2=2.13.6=hce475f1_0
  - libzlib=1.3.1=h8359307_2
  - llvm-openmp=19.1.7=hdb05f8b_0
  - lz4-c=1.10.0=h286801f_1
  - markdown-it-py=3.0.0=pyhd8ed1ab_1
  - markupsafe=3.0.2=py313ha9b7d5b_1
  - mdurl=0.1.2=pyhd8ed1ab_1
  - mpc=1.3.1=h8f1351a_1
  - mpfr=4.2.1=hb693164_3
  - mpmath=1.3.0=pyhd8ed1ab_1
  - multidict=6.1.0=py313h6347b5a_1
  - multiprocess=0.70.16=py313h20a7fcf_1
  - ncurses=6.5=h5e97a16_3
  - networkx=3.4.2=pyh267e887_2
  - nlohmann_json=3.11.3=h00cdb27_1
  - nomkl=1.0=h5ca1d4c_0
  - numpy=2.2.3=py313h41a2e72_0
  - openai=1.65.2=pyhd8ed1ab_0
  - openssl=3.4.1=h81ee809_0
  - optree=0.14.1=py313h0ebd0e5_0
  - orc=2.0.3=h0ff2369_2
  - packaging=24.2=pyhd8ed1ab_2
  - pandas=2.2.3=py313h47b39a6_1
  - pip=25.0.1=pyh145f28c_0
  - prometheus-cpp=1.3.0=h0967b3e_0
  - propcache=0.2.1=py313ha9b7d5b_1
  - psutil=7.0.0=py313h90d716c_0
  - pyarrow=19.0.1=py313h39782a4_0
  - pyarrow-core=19.0.1=py313hf9431ad_0_cpu
  - pybind11=2.13.6=pyh1ec8472_2
  - pybind11-global=2.13.6=pyh415d2e4_2
  - pydantic=2.10.6=pyh3cfb1c2_0
  - pydantic-core=2.27.2=py313hdde674f_0
  - pygments=2.19.1=pyhd8ed1ab_0
  - pysocks=1.7.1=pyha55dd90_7
  - python=3.13.2=h81fe080_101_cp313
  - python-dateutil=2.9.0.post0=pyhff2d567_1
  - python-tzdata=2025.1=pyhd8ed1ab_0
  - python-xxhash=3.5.0=py313h90d716c_2
  - python_abi=3.13=5_cp313
  - pytorch=2.6.0=cpu_generic_py313_h2e75435_0
  - pytz=2024.1=pyhd8ed1ab_0
  - pyyaml=6.0.2=py313ha9b7d5b_2
  - re2=2024.07.02=h6589ca4_2
  - readline=8.2=h1d1bf99_2
  - regex=2024.11.6=py313h90d716c_0
  - requests=2.32.3=pyhd8ed1ab_1
  - rich=13.9.4=pyhd8ed1ab_1
  - safetensors=0.5.3=py313hdde674f_0
  - setuptools=75.8.2=pyhff2d567_0
  - shtab=1.7.1=pyhd8ed1ab_1
  - six=1.17.0=pyhd8ed1ab_0
  - sleef=3.8=h8391f65_0
  - snappy=1.2.1=h98b9ce2_1
  - sniffio=1.3.1=pyhd8ed1ab_1
  - sympy=1.13.3=pyh2585a3b_105
  - tk=8.6.13=h5083fa2_1
  - tokenizers=0.21.0=py313h9a4dfeb_0
  - tqdm=4.67.1=pyhd8ed1ab_1
  - transformers=4.49.0=pyhd8ed1ab_0
  - trl=0.15.2=pyhd8ed1ab_0
  - typing-extensions=4.12.2=hd8ed1ab_1
  - typing_extensions=4.12.2=pyha770c72_1
  - tyro=0.9.1=pyhff2d567_0
  - tzdata=2025a=h78e105d_0
  - urllib3=2.2.2=pyhd8ed1ab_0
  - xxhash=0.8.3=h5505292_0
  - yaml=0.2.5=h3422bc3_2
  - yarl=1.18.3=py313ha9b7d5b_1
  - zlib=1.3.1=h8359307_2
  - zstd=1.5.7=h6491c7d_1
prefix: /opt/anaconda3/envs/hf-rlhf
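Assuming a standard conda installation, this environment can typically be recreated with: conda env create -f ml/feel.yaml (conda substitutes its own local path for the prefix line).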