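"""Prepare the FEEL feedback dataset (feel-fl/feel-feedback) for KTO training with TRL.

The script filters the dataset by language, turns each rated assistant message
into a (prompt, completion, label) record with bounded chat history, and
returns Hugging Face train/test splits.
"""
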
from datasets import Dataset, load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from enum import Enum

class SupportedLanguages(str, Enum):
    """Enumeration of supported languages"""
    ENGLISH = "English"
    DUTCH = "Dutch"
    ITALIAN = "Italian"
    SPANISH = "Spanish"
    FRENCH = "French"
    GERMAN = "German"
    PORTUGUESE = "Portuguese"
    RUSSIAN = "Russian"
    CHINESE = "Chinese"
    JAPANESE = "Japanese"
    KOREAN = "Korean"

def transform_conversation(
    entry: dict,
    tokenizer: PreTrainedTokenizerBase,
    max_history_turns: int = 10,
    max_history_tokens: int = 4000
) -> list:
    """Transform a conversation into KTO format with history.

    The tokenizer is passed in rather than loaded here, so callers can load it
    once and reuse it across every conversation in the dataset.
    """
    data_points = []
    conversation = entry["conversation"]

    for i, message in enumerate(conversation):
        # Only create data points for assistant messages that carry a rating;
        # .get() avoids a KeyError on unrated assistant messages
        if message["role"] != "assistant" or message.get("rating") not in (1, -1):
            continue

        # Collect previous messages, up to the turn and token limits
        formatted_history = []
        formatted_prompt = ""
        pairs = 0  # Count complete user/assistant pairs

        # Start from the current message and work backwards
        current_idx = i - 1
        while current_idx >= 0 and pairs < max_history_turns:
            # We need both a user and an assistant message to form a pair
            if (current_idx > 0
                    and conversation[current_idx]["role"] == "assistant"
                    and conversation[current_idx - 1]["role"] == "user"):
                # Prepend the pair to the history (we are walking backwards)
                formatted_history.insert(0, conversation[current_idx - 1])  # user
                formatted_history.insert(1, conversation[current_idx])      # assistant

                # Check the token limit
                try:
                    current_formatted = tokenizer.apply_chat_template(formatted_history, tokenize=False)
                    current_tokens = len(tokenizer.encode(current_formatted))

                    if current_tokens > max_history_tokens:
                        formatted_history = formatted_history[2:]  # Drop the pair we just added
                        break

                    formatted_prompt = current_formatted
                    pairs += 1
                    current_idx -= 2
                except Exception:
                    # If template application fails, drop the pair we just added
                    formatted_history = formatted_history[2:]
                    break
            else:
                current_idx -= 1

        # Add the final user message that prompted the rated response
        if i > 0 and conversation[i-1]["role"] == "user":
            last_history = formatted_history + [conversation[i-1]]
            try:
                formatted_prompt = tokenizer.apply_chat_template(last_history, tokenize=False)
            except Exception:
                # If template application fails, fall back to the history-only
                # prompt built above
                pass

        # Skip messages for which no prompt could be constructed
        if not formatted_prompt.strip():
            continue

        data_points.append({
            "prompt": formatted_prompt.strip(),
            "completion": message["content"].strip(),
            "label": message["rating"] == 1,
            "timestamp": entry["timestamp"],
            "session_id": entry["session_id"],
            "conversation_id": entry["conversation_id"],
            "language": entry["language"]
        })

    return data_points
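
# A minimal sketch of the record shape transform_conversation emits, kept here
# as a reference; the field values are illustrative placeholders, not real
# feel-fl/feel-feedback data. TRL's KTOTrainer consumes the "prompt",
# "completion" and boolean "label" columns.
def _example_kto_record() -> dict:
    return {
        "prompt": "<chat-template-formatted history ending in the user turn>",
        "completion": "<the rated assistant reply>",
        "label": True,  # True when rating == 1, False when rating == -1
        "timestamp": "2024-01-01T00:00:00Z",  # illustrative
        "session_id": "s1",                   # illustrative
        "conversation_id": "c1",              # illustrative
        "language": "English",
    }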

def process_feel_dataset(
    language: str,
    model_name: str = "CohereForAI/aya-expanse-8b",
    max_history_turns: int = 10,
    max_history_tokens: int = 4000
):
    """
    Processes the feel dataset into a format suitable for KTO training using TRL.

    Args:
        language: Language to filter the dataset for (must be one of SupportedLanguages)
        model_name: Name of the model to format for
        max_history_turns: Maximum number of previous turns to include in history
        max_history_tokens: Maximum number of tokens allowed in history

    Returns:
        dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format

    Raises:
        ValueError: If language is not provided or not in SupportedLanguages
    """
    # Validate language
    if not language:
        raise ValueError("Language parameter is required")

    try:
        # Validate that it's a supported language
        SupportedLanguages(language)
    except ValueError:
        supported_langs = "\n- ".join([lang.value for lang in SupportedLanguages])
        raise ValueError(
            f"Invalid language: '{language}'\n"
            f"Supported languages are:\n- {supported_langs}"
        )

    # Load feel dataset from HuggingFace
    feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]

    # Filter dataset by language
    feel_dataset = feel_dataset.filter(lambda x: x["language"] == language)

    if len(feel_dataset) == 0:
        raise ValueError(f"No data found for language: {language}")

    # Load the tokenizer once and reuse it across all conversations
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    kto_data = []

    # Process all conversations in the filtered dataset
    for entry in feel_dataset:
        kto_data.extend(transform_conversation(
            entry,
            tokenizer,
            max_history_turns,
            max_history_tokens
        ))

    if len(kto_data) == 0:
        raise ValueError(f"No valid training examples found for language: {language}")

    # Convert to DataFrame
    kto_df = pd.DataFrame(kto_data)

    # Split into train and test sets (70% train, 30% test)
    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)

    # Reset index to remove '__index_level_0__'
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    print(f"Processed {len(kto_data)} examples for language: {language}")
    print(f"Train set size: {len(train_dataset)}")
    print(f"Test set size: {len(test_dataset)}")

    return {"train": train_dataset, "test": test_dataset}

if __name__ == "__main__":
    # Process the dataset
    datasets = process_feel_dataset("English")

    # Print distribution of positive/negative labels
    train_labels = datasets['train']['label']
    test_labels = datasets['test']['label']

    print("\nLabel Distribution:")
    print("Train set:")
    print(f"Positive feedback: {sum(train_labels)}")
    print(f"Negative feedback: {len(train_labels) - sum(train_labels)}")
    print(f"Positive ratio: {sum(train_labels)/len(train_labels):.2%}")

    print("\nTest set:")
    print(f"Positive feedback: {sum(test_labels)}")
    print(f"Negative feedback: {len(test_labels) - sum(test_labels)}")
    print(f"Positive ratio: {sum(test_labels)/len(test_labels):.2%}")

    # Load original FEEL dataset
    feel_dataset = load_dataset("feel-fl/feel-feedback", split="train")

    # Print one original conversation
    print("\nOriginal conversation from FEEL dataset:")
    print(json.dumps(feel_dataset[0], indent=2))

    # Print sample entries from the processed dataset
    print("\nSample entries from processed KTO dataset:")
    print("\n" + "="*80 + "\nTRAIN SET SAMPLES\n" + "="*80)
    for sample in datasets['train'].select(range(min(2, len(datasets['train'])))):
        print(json.dumps(sample, indent=2))

    # Export datasets to CSV
    train_df = datasets['train'].to_pandas()
    test_df = datasets['test'].to_pandas()

    train_df.to_csv('kto_train_dataset.csv', index=False)
    test_df.to_csv('kto_test_dataset.csv', index=False)

    print("\nDatasets exported to 'kto_train_dataset.csv' and 'kto_test_dataset.csv'")