from datasets import Dataset, load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from transformers import AutoTokenizer
from enum import Enum


class SupportedLanguages(str, Enum):
    """Enumeration of supported languages."""
    ENGLISH = "English"
    DUTCH = "Dutch"
    ITALIAN = "Italian"
    SPANISH = "Spanish"
    FRENCH = "French"
    GERMAN = "German"
    PORTUGUESE = "Portuguese"
    RUSSIAN = "Russian"
    CHINESE = "Chinese"
    JAPANESE = "Japanese"
    KOREAN = "Korean"
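

# Shape of a raw dataset entry, inferred from the fields this script accesses
# (an orientation sketch, not an authoritative schema; only assistant messages
# are assumed to carry a "rating" key, with 1 = positive and -1 = negative):
#
#   {
#       "conversation": [
#           {"role": "user", "content": "..."},
#           {"role": "assistant", "content": "...", "rating": 1},
#           ...
#       ],
#       "timestamp": "...",
#       "session_id": "...",
#       "conversation_id": "...",
#       "language": "English",
#   }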
def transform_conversation(
    entry: dict,
    model_name: str,
    max_history_turns: int = 10,
    max_history_tokens: int = 4000
) -> list:
    """Transform a single conversation entry into KTO-format data points with history."""
    data_points = []
    conversation = entry["conversation"]
    # from_pretrained hits the local Hugging Face cache after the first call,
    # so instantiating the tokenizer per entry is slow but does not re-download.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    for i, message in enumerate(conversation):
        # Only create data points for assistant messages that have ratings
        if message["role"] != "assistant" or message["rating"] not in [1, -1]:
            continue

        # Collect previous messages up to the configured limits
        formatted_history = []
        formatted_prompt = ""
        tokens = 0
        pairs = 0  # Count complete user/assistant pairs

        # Start from the message before the rated response and work backwards
        current_idx = i - 1
        while current_idx >= 0 and pairs < max_history_turns:
            # We need both a user and an assistant message to form a pair
            if (
                current_idx > 0
                and conversation[current_idx]["role"] == "assistant"
                and conversation[current_idx - 1]["role"] == "user"
            ):
                # Prepend the pair so history stays in chronological order
                formatted_history.insert(0, conversation[current_idx - 1])  # user
                formatted_history.insert(1, conversation[current_idx])      # assistant

                # Check the token limit on the re-rendered history
                try:
                    current_formatted = tokenizer.apply_chat_template(formatted_history, tokenize=False)
                    current_tokens = len(tokenizer.encode(current_formatted))
                    if current_tokens > max_history_tokens:
                        formatted_history = formatted_history[2:]  # Remove the oldest pair, which overflowed
                        break
                    formatted_prompt = current_formatted
                    tokens = current_tokens
                    pairs += 1
                    current_idx -= 2
                except Exception:
                    # If template application fails, drop the pair just added
                    formatted_history = formatted_history[2:]
                    break
            else:
                current_idx -= 1
        # Add the final user message that prompted the rated response
        if i > 0 and conversation[i - 1]["role"] == "user":
            last_history = formatted_history + [conversation[i - 1]]
            try:
                formatted_prompt = tokenizer.apply_chat_template(last_history, tokenize=False)
            except Exception:
                # If template application fails, fall back to the last valid prompt
                pass

        data_points.append({
            "prompt": formatted_prompt.strip(),
            "completion": message["content"].strip(),
            "label": message["rating"] == 1,
            "timestamp": entry["timestamp"],
            "session_id": entry["session_id"],
            "conversation_id": entry["conversation_id"],
            "language": entry["language"]
        })
    return data_points
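

# A single record emitted by transform_conversation looks roughly like this
# (illustrative values; the prompt string depends on the model's chat template):
#
#   {
#       "prompt": "<chat-template-rendered history ending with the user turn>",
#       "completion": "the assistant reply that was rated",
#       "label": True,  # rating == 1 -> desirable; rating == -1 -> undesirable
#       "timestamp": "...",
#       "session_id": "...",
#       "conversation_id": "...",
#       "language": "English",
#   }
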
def process_feel_dataset(
    language: str,
    model_name: str = "CohereForAI/aya-expanse-8b",
    max_history_turns: int = 10,
    max_history_tokens: int = 4000
):
    """
    Processes the FEEL dataset into a format suitable for KTO training with TRL.

    Args:
        language: Language to filter the dataset for (must be one of SupportedLanguages)
        model_name: Name of the model whose chat template is used for formatting
        max_history_turns: Maximum number of previous turns to include in history
        max_history_tokens: Maximum number of tokens allowed in history

    Returns:
        dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format

    Raises:
        ValueError: If language is not provided or not in SupportedLanguages
    """
    # Validate language
    if not language:
        raise ValueError("Language parameter is required")
    try:
        # Validate that it's a supported language
        SupportedLanguages(language)
    except ValueError:
        supported_langs = "\n- ".join([lang.value for lang in SupportedLanguages])
        raise ValueError(
            f"Invalid language: '{language}'\n"
            f"Supported languages are:\n- {supported_langs}"
        )
    # Load the FEEL dataset from the Hugging Face Hub
    feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]

    # Filter dataset by language
    feel_dataset = feel_dataset.filter(lambda x: x["language"] == language)
    if len(feel_dataset) == 0:
        raise ValueError(f"No data found for language: {language}")

    kto_data = []
    # Process all conversations in the filtered dataset
    for entry in feel_dataset:
        kto_data.extend(transform_conversation(
            entry,
            model_name,
            max_history_turns,
            max_history_tokens
        ))
    if len(kto_data) == 0:
        raise ValueError(f"No valid training examples found for language: {language}")

    # Convert to DataFrame
    kto_df = pd.DataFrame(kto_data)

    # Split into train and test sets (70% train, 30% test)
    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)

    # Reset the index so Dataset.from_pandas does not add '__index_level_0__'
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Convert to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    print(f"Processed {len(kto_data)} examples for language: {language}")
    print(f"Train set size: {len(train_dataset)}")
    print(f"Test set size: {len(test_dataset)}")

    return {"train": train_dataset, "test": test_dataset}


if __name__ == "__main__":
    # Process the dataset
    datasets = process_feel_dataset("English")

    # Print the distribution of positive/negative labels
    train_labels = datasets['train']['label']
    test_labels = datasets['test']['label']

    print("\nLabel Distribution:")
    print("Train set:")
    print(f"Positive feedback: {sum(train_labels)}")
    print(f"Negative feedback: {len(train_labels) - sum(train_labels)}")
    print(f"Positive ratio: {sum(train_labels)/len(train_labels):.2%}")
    print("\nTest set:")
    print(f"Positive feedback: {sum(test_labels)}")
    print(f"Negative feedback: {len(test_labels) - sum(test_labels)}")
    print(f"Positive ratio: {sum(test_labels)/len(test_labels):.2%}")
    # Load the original FEEL dataset
    feel_dataset = load_dataset("feel-fl/feel-feedback", split="train")

    # Print one original conversation
    print("\nOriginal conversation from FEEL dataset:")
    print(json.dumps(feel_dataset[0], indent=2))

    # Print sample entries from the processed dataset
    print("\nSample entries from processed KTO dataset:")
    print("\n" + "=" * 80 + "\nTRAIN SET SAMPLES\n" + "=" * 80)
    for example in datasets['train'].select(range(min(3, len(datasets['train'])))):
        print(json.dumps(example, indent=2))
    # Export datasets to CSV
    train_df = datasets['train'].to_pandas()
    test_df = datasets['test'].to_pandas()
    train_df.to_csv('kto_train_dataset.csv', index=False)
    test_df.to_csv('kto_test_dataset.csv', index=False)
    print("\nDatasets exported to 'kto_train_dataset.csv' and 'kto_test_dataset.csv'")