"""Convert the FEEL feedback dataset into KTO-style training examples.

Each rated assistant message in a conversation becomes one example made of a
chat-template-formatted prompt (bounded history plus the preceding user
message), the assistant completion, and a boolean label derived from the
rating.
"""

import json
import logging
from enum import Enum
from functools import lru_cache

import pandas as pd
from datasets import Dataset, load_dataset
from ipdb import set_trace as st  # noqa: F401  (debugging helper, kept on purpose)
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


class SupportedLanguages(str, Enum):
    """Enumeration of supported languages"""

    ENGLISH = "English"
    DUTCH = "Dutch"
    ITALIAN = "Italian"
    SPANISH = "Spanish"
    FRENCH = "French"
    GERMAN = "German"
    PORTUGUESE = "Portuguese"
    RUSSIAN = "Russian"
    CHINESE = "Chinese"
    JAPANESE = "Japanese"
    KOREAN = "Korean"


@lru_cache(maxsize=4)
def _load_tokenizer(model_name: str):
    """Load the tokenizer for *model_name* once and reuse it across calls."""
    # NOTE(review): trust_remote_code=True runs code shipped with the model
    # repository; only use model names you trust.
    return AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


def _collect_history(
    conversation: list,
    rated_idx: int,
    tokenizer,
    max_history_turns: int,
    max_history_tokens: int,
) -> tuple:
    """Gather the user/assistant pairs that precede ``conversation[rated_idx]``.

    Walks backwards from the message just before the rated one and prepends
    every adjacent (user, assistant) pair until ``max_history_turns`` pairs
    were collected, the formatted history would exceed ``max_history_tokens``
    tokens, or formatting/tokenizing fails.

    Returns:
        tuple: ``(history, formatted)`` where ``history`` is the list of
        messages in chronological order and ``formatted`` is the chat-template
        rendering of that list (``""`` when no pair was accepted).
    """
    history = []
    formatted = ""
    pairs = 0
    idx = rated_idx - 1

    while idx >= 0 and pairs < max_history_turns:
        is_pair = (
            idx > 0
            and conversation[idx]["role"] == "assistant"
            and conversation[idx - 1]["role"] == "user"
        )
        if not is_pair:
            # Not the end of a complete pair; step back one message.
            idx -= 1
            continue

        # Build the candidate first so a rejected pair never pollutes history.
        candidate = [conversation[idx - 1], conversation[idx], *history]
        try:
            candidate_text = tokenizer.apply_chat_template(candidate, tokenize=False)
            candidate_tokens = len(tokenizer.encode(candidate_text))
        except Exception:
            # Best effort: keep the history accepted so far and stop extending.
            logger.warning(
                "Could not format history ending at message %d; "
                "keeping %d earlier pair(s)",
                idx,
                pairs,
                exc_info=True,
            )
            break

        if candidate_tokens > max_history_tokens:
            break

        history = candidate
        formatted = candidate_text
        pairs += 1
        idx -= 2

    return history, formatted


def transform_conversation(
    entry: dict,
    model_name: str,
    max_history_turns: int = 10,
    max_history_tokens: int = 4000
) -> list:
    """Transform conversation into KTO format with history

    Args:
        entry: Dataset row. Read keys: ``conversation`` (list of message dicts
            with ``role``, ``content`` and, for rated messages, ``rating``),
            ``timestamp``, ``session_id``, ``conversation_id`` and ``language``.
        model_name: Name of the model whose tokenizer/chat template is used.
        max_history_turns: Maximum number of previous user/assistant pairs to
            include in the prompt history.
        max_history_tokens: Maximum number of tokens the formatted history
            (excluding the final user message) may contain.

    Returns:
        list: One dict per assistant message rated ``1`` or ``-1`` with the
        keys ``prompt``, ``completion``, ``label`` (``True`` for rating ``1``),
        ``timestamp``, ``session_id``, ``conversation_id`` and ``language``.
    """
    data_points = []
    conversation = entry["conversation"]
    # Cached: loading the tokenizer again for every conversation is expensive.
    tokenizer = _load_tokenizer(model_name)

    for i, message in enumerate(conversation):
        # Only create data points for assistant messages that have ratings.
        # ``.get`` treats a missing rating as "unrated" instead of raising.
        if message["role"] != "assistant" or message.get("rating") not in (1, -1):
            continue

        history, formatted_prompt = _collect_history(
            conversation, i, tokenizer, max_history_turns, max_history_tokens
        )

        # Add the final user message that prompted the rated response
        if i > 0 and conversation[i - 1]["role"] == "user":
            try:
                # NOTE(review): rendered without add_generation_prompt=True, so
                # the prompt ends after the user turn -- confirm this is the
                # format the downstream trainer expects.
                formatted_prompt = tokenizer.apply_chat_template(
                    history + [conversation[i - 1]], tokenize=False
                )
            except Exception:
                # If template application fails, use the previous valid prompt
                logger.warning(
                    "Could not format prompt for message %d; "
                    "falling back to history-only prompt",
                    i,
                    exc_info=True,
                )

        data_points.append({
            "prompt": formatted_prompt.strip(),
            "completion": message["content"].strip(),
            "label": message["rating"] == 1,
            "timestamp": entry["timestamp"],
            "session_id": entry["session_id"],
            "conversation_id": entry["conversation_id"],
            "language": entry["language"],
        })

    return data_points


def process_feel_dataset(
    language: str,
    model_name: str = "CohereForAI/aya-expanse-8b",
    max_history_turns: int = 10,
    max_history_tokens: int = 4000
):
    """
    Processes the feel dataset into a format suitable for KTO training using TRL.

    Args:
        language: Language to filter the dataset for (must be one of SupportedLanguages)
        model_name: Name of the model to format for
        max_history_turns: Maximum number of previous turns to include in history
        max_history_tokens: Maximum number of tokens allowed in history

    Returns:
        dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format

    Raises:
        ValueError: If language is not provided or not in SupportedLanguages,
            if no rows or no rated examples exist for the language, or if
            there are too few examples to build both splits
    """
    # Validate language
    if not language:
        raise ValueError("Language parameter is required")

    try:
        # Normalise to the plain string value so that enum members and plain
        # strings are formatted and compared identically below.
        language = SupportedLanguages(language).value
    except ValueError:
        supported_langs = "\n- ".join(lang.value for lang in SupportedLanguages)
        raise ValueError(
            f"Invalid language: '{language}'\n"
            f"Supported languages are:\n- {supported_langs}"
        ) from None

    # Load feel dataset from HuggingFace
    feel_dataset = load_dataset("feel-fl/feel-feedback", split="train")

    # Filter dataset by language
    feel_dataset = feel_dataset.filter(lambda row: row["language"] == language)

    if len(feel_dataset) == 0:
        raise ValueError(f"No data found for language: {language}")

    # Process all conversations in the filtered dataset
    kto_data = []
    for entry in feel_dataset:
        kto_data.extend(transform_conversation(
            entry,
            model_name,
            max_history_turns,
            max_history_tokens
        ))

    if len(kto_data) == 0:
        raise ValueError(f"No valid training examples found for language: {language}")

    if len(kto_data) < 2:
        # A 70/30 split of a single example would leave the train set empty.
        raise ValueError(
            f"Need at least 2 examples to build train/test splits, "
            f"found {len(kto_data)} for language: {language}"
        )

    # Convert to DataFrame
    kto_df = pd.DataFrame(kto_data)

    # Split into train and test sets (70% train, 30% test)
    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)

    # Reset index to remove '__index_level_0__'
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    print(f"Processed {len(kto_data)} examples for language: {language}")
    print(f"Train set size: {len(train_dataset)}")
    print(f"Test set size: {len(test_dataset)}")

    return {"train": train_dataset, "test": test_dataset}


def _print_label_distribution(title: str, labels: list) -> None:
    """Print positive/negative counts and the positive ratio for *labels*."""
    positives = sum(labels)
    print(f"{title}:")
    print(f"Positive feedback: {positives}")
    print(f"Negative feedback: {len(labels) - positives}")
    print(f"Positive ratio: {positives / len(labels):.2%}")


def main() -> None:
    """Build the English KTO splits, print a summary and export them to CSV."""
    # Process the dataset
    splits = process_feel_dataset("English")

    # Print distribution of positive/negative labels
    print("\nLabel Distribution:")
    _print_label_distribution("Train set", list(splits["train"]["label"]))
    _print_label_distribution("\nTest set", list(splits["test"]["label"]))

    # Load original FEEL dataset
    feel_dataset = load_dataset("feel-fl/feel-feedback", split="train")

    # Print one original conversation.  Display-only dump: keep non-ASCII text
    # readable and stringify values json cannot encode natively.
    print("\nOriginal conversation from FEEL dataset:")
    print(json.dumps(feel_dataset[0], indent=2, ensure_ascii=False, default=str))

    # Print sample entries from processed dataset
    print("\nSample entries from processed KTO dataset:")
    print("\n" + "="*80 + "\nTRAIN SET SAMPLES\n" + "="*80)
    train_split = splits["train"]
    for idx in range(min(3, len(train_split))):
        print(json.dumps(train_split[idx], indent=2, ensure_ascii=False, default=str))

    # Export datasets to CSV
    train_df = splits["train"].to_pandas()
    test_df = splits["test"].to_pandas()

    train_df.to_csv('kto_train_dataset.csv', index=False)
    test_df.to_csv('kto_test_dataset.csv', index=False)

    print("\nDatasets exported to 'kto_train_dataset.csv' and 'kto_test_dataset.csv'")


if __name__ == "__main__":
    main()