Spaces:

feel-fl
/

open-human-feedback-chat

Running on Zero

open-human-feedback-chat / ml /kto_dataset_processor.py

jenbenarye

fixed convo processing logic

1353a4a 4 months ago

7.69 kB

	from datasets import Dataset, load_dataset
	import pandas as pd
	from sklearn.model_selection import train_test_split
	import json
	from ipdb import set_trace as st
	from transformers import AutoTokenizer
	from enum import Enum

	class SupportedLanguages(str, Enum):
	    """Closed set of language names the dataset can be filtered by.

	    Every member also subclasses ``str``, so a member compares equal to its
	    plain string value, and ``SupportedLanguages("English")`` resolves a
	    member from that value. Lookup by value is case-sensitive.
	    """

	    # Declaration order is the order produced when iterating the enum.
	    ENGLISH = "English"
	    DUTCH = "Dutch"
	    ITALIAN = "Italian"
	    SPANISH = "Spanish"
	    FRENCH = "French"
	    GERMAN = "German"
	    PORTUGUESE = "Portuguese"
	    RUSSIAN = "Russian"
	    CHINESE = "Chinese"
	    JAPANESE = "Japanese"
	    KOREAN = "Korean"

	# Loading a tokenizer is expensive, so keep one instance per model name for
	# the lifetime of the process instead of reloading it for every conversation.
	_TOKENIZER_CACHE = {}


	def _get_tokenizer(model_name: str):
	    """Return the tokenizer for *model_name*, loading it on first use only."""
	    if model_name not in _TOKENIZER_CACHE:
	        # NOTE: trust_remote_code=True lets the model repository run its own
	        # Python code locally; only use model names from trusted sources.
	        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(
	            model_name, trust_remote_code=True
	        )
	    return _TOKENIZER_CACHE[model_name]


	def _collect_history(
	    conversation: list,
	    rated_idx: int,
	    tokenizer,
	    max_history_turns: int,
	    max_history_tokens: int,
	) -> tuple:
	    """Gather the user/assistant pairs that precede message *rated_idx*.

	    Walks backwards from ``rated_idx - 1`` and prepends every adjacent
	    (user, assistant) pair it meets until ``max_history_turns`` pairs are
	    collected, the rendered history exceeds ``max_history_tokens`` tokens, or
	    the chat template cannot be applied.

	    Returns:
	        ``(history, rendered)``: the kept messages in chronological order and
	        the chat-template rendering of exactly those messages (``""`` when no
	        pair was kept).
	    """
	    history = []
	    rendered_history = ""
	    pairs = 0  # complete user/assistant pairs kept so far
	    idx = rated_idx - 1

	    while idx >= 0 and pairs < max_history_turns:
	        closes_pair = (
	            idx > 0
	            and conversation[idx]["role"] == "assistant"
	            and conversation[idx - 1]["role"] == "user"
	        )
	        if not closes_pair:
	            # Not the assistant half of a pair (e.g. the user message that
	            # prompted the rated reply): step over it.
	            idx -= 1
	            continue

	        # Build the extended history separately so that rejecting the pair
	        # leaves the previously accepted history untouched.
	        candidate = [conversation[idx - 1], conversation[idx]] + history
	        try:
	            rendered = tokenizer.apply_chat_template(candidate, tokenize=False)
	            token_count = len(tokenizer.encode(rendered))
	        except Exception:
	            # Best effort: if this pair cannot be rendered, keep what we have.
	            break

	        if token_count > max_history_tokens:
	            # The older pair would push the history over budget; stop here.
	            break

	        history = candidate
	        rendered_history = rendered
	        pairs += 1
	        idx -= 2

	    return history, rendered_history


	def transform_conversation(
	    entry: dict,
	    model_name: str,
	    max_history_turns: int = 10,
	    max_history_tokens: int = 4000
	) -> list:
	    """Transform one conversation into KTO data points that include history.

	    A data point is produced for every assistant message whose ``rating`` is
	    ``1`` or ``-1``. Its prompt is the chat-template rendering of the preceding
	    user/assistant pairs (bounded by the two limits below) followed by the
	    user message directly before the rated reply.

	    Args:
	        entry: Record providing ``conversation`` (a list of messages with
	            ``role``, ``content`` and optionally ``rating``) plus
	            ``timestamp``, ``session_id``, ``conversation_id`` and
	            ``language``, which are copied into every data point.
	        model_name: Name of the model whose tokenizer and chat template are
	            used. The tokenizer is loaded only if the conversation contains
	            a rated assistant message, and is reused across calls.
	        max_history_turns: Maximum number of earlier user/assistant pairs to
	            include.
	        max_history_tokens: Token budget for the rendered earlier pairs; the
	            final user message is not counted against it.

	    Returns:
	        A list of dicts with the keys ``prompt``, ``completion``, ``label``
	        (``True`` when the rating is ``1``), ``timestamp``, ``session_id``,
	        ``conversation_id`` and ``language``.
	    """
	    conversation = entry["conversation"]
	    data_points = []
	    tokenizer = None  # resolved lazily, only once a rated reply is found

	    for i, message in enumerate(conversation):
	        # Only rated assistant messages become data points. ``.get`` keeps
	        # messages without a ``rating`` key from raising KeyError.
	        if message["role"] != "assistant" or message.get("rating") not in (1, -1):
	            continue

	        if tokenizer is None:
	            tokenizer = _get_tokenizer(model_name)

	        history, formatted_prompt = _collect_history(
	            conversation, i, tokenizer, max_history_turns, max_history_tokens
	        )

	        # Append the user message that prompted the rated response.
	        # NOTE(review): the prompt is rendered without a generation prompt;
	        # confirm this is what the downstream KTO trainer expects.
	        if i > 0 and conversation[i - 1]["role"] == "user":
	            try:
	                formatted_prompt = tokenizer.apply_chat_template(
	                    history + [conversation[i - 1]], tokenize=False
	                )
	            except Exception:
	                # Best effort: fall back to the rendering of the history alone.
	                pass

	        data_points.append({
	            "prompt": formatted_prompt.strip(),
	            "completion": message["content"].strip(),
	            "label": message["rating"] == 1,
	            "timestamp": entry["timestamp"],
	            "session_id": entry["session_id"],
	            "conversation_id": entry["conversation_id"],
	            "language": entry["language"]
	        })

	    return data_points

	def process_feel_dataset(
	    language: str,
	    model_name: str = "CohereForAI/aya-expanse-8b",
	    max_history_turns: int = 10,
	    max_history_tokens: int = 4000
	):
	    """
	    Processes the feel dataset into a format suitable for KTO training using TRL.

	    Loads the ``train`` split of ``feel-fl/feel-feedback``, keeps the entries
	    whose ``language`` equals the requested one, converts every conversation
	    with ``transform_conversation`` and splits the resulting examples 70/30
	    into train and test sets (fixed ``random_state=42``).

	    Args:
	        language: Language to filter the dataset for (must be one of SupportedLanguages)
	        model_name: Name of the model to format for
	        max_history_turns: Maximum number of previous turns to include in history
	        max_history_tokens: Maximum number of tokens allowed in history

	    Returns:
	        dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format

	    Raises:
	        ValueError: If language is not provided or not in SupportedLanguages,
	            if the dataset has no entries for the language, or if fewer than
	            two examples are produced (a train/test split needs at least two)
	    """
	    # Validate language
	    if not language:
	        raise ValueError("Language parameter is required")

	    try:
	        # Normalise to the plain string value so that enum members and strings
	        # behave identically in the filter and in the messages below.
	        language = SupportedLanguages(language).value
	    except ValueError:
	        supported_langs = "\n- ".join(lang.value for lang in SupportedLanguages)
	        # ``from None``: the enum's own lookup error adds nothing to this message.
	        raise ValueError(
	            f"Invalid language: '{language}'\n"
	            f"Supported languages are:\n- {supported_langs}"
	        ) from None

	    # Load feel dataset from HuggingFace
	    feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]

	    # Filter dataset by language
	    feel_dataset = feel_dataset.filter(lambda x: x["language"] == language)

	    if len(feel_dataset) == 0:
	        raise ValueError(f"No data found for language: {language}")

	    kto_data = []

	    # Process all conversations in the filtered dataset
	    for entry in feel_dataset:
	        kto_data.extend(transform_conversation(
	            entry,
	            model_name,
	            max_history_turns,
	            max_history_tokens
	        ))

	    if not kto_data:
	        raise ValueError(f"No valid training examples found for language: {language}")

	    if len(kto_data) < 2:
	        # train_test_split cannot build two non-empty splits from one example;
	        # fail here with a message that names the actual problem.
	        raise ValueError(
	            f"Need at least 2 examples to create train/test splits for "
	            f"language: {language} (found {len(kto_data)})"
	        )

	    # Convert to DataFrame
	    kto_df = pd.DataFrame(kto_data)

	    # Split into train and test sets (70% train, 30% test).
	    # NOTE(review): the split is per example, so examples from the same
	    # conversation can land in both splits; confirm that is acceptable.
	    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)

	    # Reset index to remove '__index_level_0__'
	    train_df = train_df.reset_index(drop=True)
	    test_df = test_df.reset_index(drop=True)

	    # Convert to Hugging Face Dataset
	    train_dataset = Dataset.from_pandas(train_df)
	    test_dataset = Dataset.from_pandas(test_df)

	    print(f"Processed {len(kto_data)} examples for language: {language}")
	    print(f"Train set size: {len(train_dataset)}")
	    print(f"Test set size: {len(test_dataset)}")

	    return {"train": train_dataset, "test": test_dataset}

	if __name__ == "__main__":
	    # Script entry point: build the English splits, report their label
	    # balance, show one raw record and a few processed samples, then export
	    # both splits to CSV in the current working directory.
	    datasets = process_feel_dataset("English")

	    # Print distribution of positive/negative labels
	    print("\nLabel Distribution:")
	    for heading, split_name in (("Train set:", "train"), ("\nTest set:", "test")):
	        labels = datasets[split_name]['label']
	        positives = sum(labels)  # labels are booleans, so this counts the True ones
	        print(heading)
	        print(f"Positive feedback: {positives}")
	        print(f"Negative feedback: {len(labels) - positives}")
	        print(f"Positive ratio: {positives/len(labels):.2%}")

	    # Load original FEEL dataset
	    feel_dataset = load_dataset("feel-fl/feel-feedback", split="train")

	    # Print one original conversation
	    print("\nOriginal conversation from FEEL dataset:")
	    print(json.dumps(feel_dataset[0], indent=2))

	    # Print sample entries from processed dataset
	    separator = "=" * 80
	    print("\nSample entries from processed KTO dataset:")
	    print("\n" + separator + "\nTRAIN SET SAMPLES\n" + separator)
	    # Show the first few processed examples announced by the header above.
	    for idx in range(min(3, len(datasets['train']))):
	        sample = datasets['train'][idx]
	        print(f"\nSample {idx + 1}:")
	        print(f"Prompt: {sample['prompt']}")
	        print(f"Completion: {sample['completion']}")
	        print(f"Label: {sample['label']}")

	    # Export datasets to CSV
	    train_df = datasets['train'].to_pandas()
	    test_df = datasets['test'].to_pandas()

	    train_df.to_csv('kto_train_dataset.csv', index=False)
	    test_df.to_csv('kto_test_dataset.csv', index=False)

	    print("\nDatasets exported to 'kto_train_dataset.csv' and 'kto_test_dataset.csv'")