File size: 6,674 Bytes
f053717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9eff11a
f053717
 
 
 
 
 
 
 
9eff11a
f053717
 
 
9eff11a
f053717
 
 
 
 
 
 
 
 
 
9eff11a
f053717
 
 
 
 
 
 
 
 
9eff11a
 
 
f053717
 
9eff11a
 
 
f053717
9eff11a
 
 
f053717
 
 
 
9eff11a
 
f053717
9eff11a
f053717
9eff11a
 
 
 
 
 
f053717
9eff11a
 
 
 
 
 
 
 
 
 
 
 
f053717
9eff11a
 
 
 
 
 
 
 
 
 
f053717
9eff11a
f053717
9eff11a
 
 
 
f053717
9eff11a
 
 
 
 
f053717
 
 
 
 
 
9eff11a
f053717
 
9eff11a
 
 
 
f053717
 
9eff11a
f053717
9eff11a
f053717
9eff11a
f053717
 
9eff11a
 
 
f053717
 
9eff11a
 
 
 
 
 
 
 
 
 
 
 
 
f053717
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
import pandas as pd
import os
import uuid
import datetime
import logging
from huggingface_hub import hf_hub_download, upload_file, list_repo_tree
from dotenv import load_dotenv

load_dotenv()

# Configuration, taken from the process environment. Every value is None when
# its variable is not set.
# -- input: the Hub dataset file to label and the columns to read from it
HF_INPUT_DATASET = os.environ.get("HF_INPUT_DATASET")  # dataset repo id
HF_INPUT_DATASET_PATH = os.environ.get("HF_INPUT_DATASET_PATH")  # file inside that repo
HF_INPUT_DATASET_ID_COLUMN = os.environ.get("HF_INPUT_DATASET_ID_COLUMN")  # item identifier column
HF_INPUT_DATASET_COLUMN_A = os.environ.get("HF_INPUT_DATASET_COLUMN_A")  # text shown as output A
HF_INPUT_DATASET_COLUMN_B = os.environ.get("HF_INPUT_DATASET_COLUMN_B")  # text shown as output B
# -- output: the Hub dataset repo and directory that receive the result files
HF_OUTPUT_DATASET = os.environ.get("HF_OUTPUT_DATASET")
HF_OUTPUT_DATASET_DIR = os.environ.get("HF_OUTPUT_DATASET_DIR")

# Markdown rendered at the top of the page. The quoted choices must match the
# button captions in the UI ("A is better", "B is better", "Tie",
# "Can't choose"); the pair is only loaded once the user ID box is submitted.
INSTRUCTIONS = """
# Pairwise Model Output Labeling
Enter your user ID and press Enter to load the first pair.
Please compare the two model outputs shown below and select which one you think is better.
- Choose "A is better" if Model Output A (left) is superior
- Choose "B is better" if Model Output B (right) is superior
- Choose "Tie" if they are equally good or bad
- Choose "Can't choose" if you cannot make a determination
"""
 
class PairwiseLabeler:
    """Serve pairs of model outputs for comparison and record labelers' votes.

    The input table is loaded once, at construction time. Votes are kept in
    memory per user ID (``self.results``), and a user's complete vote list is
    re-uploaded to the output dataset after every judgment.
    """

    # Column names used when the matching HF_INPUT_DATASET_* setting is unset.
    # Without distinct fallbacks the three settings are all ``None`` and would
    # collapse onto a single dict key / DataFrame column.
    _DEFAULT_ID_COLUMN = "id"
    _DEFAULT_COLUMN_A = "generation_a"
    _DEFAULT_COLUMN_B = "generation_b"

    # Punctuation kept in a results file name in addition to letters/digits.
    _SAFE_FILENAME_CHARS = "._-@"

    def __init__(self):
        self.df = self.read_hf_dataset()
        # Maps user ID -> list of vote records (dicts), in submission order.
        self.results = {}

    def __len__(self):
        """Return the number of pairs available for labeling."""
        return len(self.df)

    @classmethod
    def _column_names(cls):
        """Return the (id, output A, output B) column names, with fallbacks."""
        return (
            HF_INPUT_DATASET_ID_COLUMN or cls._DEFAULT_ID_COLUMN,
            HF_INPUT_DATASET_COLUMN_A or cls._DEFAULT_COLUMN_A,
            HF_INPUT_DATASET_COLUMN_B or cls._DEFAULT_COLUMN_B,
        )

    @classmethod
    def _results_filename(cls, user_id) -> str:
        """Return the results file name for ``user_id``.

        The ID is typed by the user, so every character other than letters,
        digits and ``._-@`` is replaced with ``_``. With no path separator
        left, the name cannot point outside the output directory. IDs made
        only of allowed characters keep their original file name.
        """
        safe_id = "".join(
            ch if ch.isalnum() or ch in cls._SAFE_FILENAME_CHARS else "_"
            for ch in str(user_id)
        )
        return f"results_{safe_id}.jsonl"

    def read_hf_dataset(self) -> pd.DataFrame:
        """Download the configured dataset file and load it as a DataFrame.

        Supported extensions are ``.json``, ``.jsonl``, ``.csv`` and
        ``.parquet``. On any failure (download error, unsupported extension,
        parse error) the error is logged with its traceback and a five-row
        sample table is returned instead, so the app can still start.
        """
        try:
            local_file = hf_hub_download(
                repo_id=HF_INPUT_DATASET,
                repo_type="dataset",
                filename=HF_INPUT_DATASET_PATH,
            )
            if local_file.endswith(".json"):
                return pd.read_json(local_file)
            elif local_file.endswith(".jsonl"):
                return pd.read_json(local_file, orient="records", lines=True)
            elif local_file.endswith(".csv"):
                return pd.read_csv(local_file)
            elif local_file.endswith(".parquet"):
                return pd.read_parquet(local_file)
            else:
                raise ValueError(f"Unsupported file type: {local_file}")
        except Exception:
            # Deliberate best-effort fallback; logging.exception keeps the
            # cause visible instead of silently discarding it.
            logging.exception(
                f"Couldn't read HF dataset from {HF_INPUT_DATASET_PATH}. Using sample data instead."
            )
            id_column, column_a, column_b = self._column_names()
            sample_data = {
                id_column: [f"sample_{i}" for i in range(5)],
                column_a: [f"This is sample generation A {i}" for i in range(5)],
                column_b: [f"This is sample generation B {i}" for i in range(5)],
            }
            return pd.DataFrame(sample_data)

    def get_current_pair(self, user_id, user_index):
        """Return ``(item_id, left_text, right_text)`` for row ``user_index``.

        Returns ``(None, None, None)`` when the index is outside the table.
        Negative indices are rejected as well, because ``iloc`` would
        otherwise wrap around to the end of the table. ``user_id`` is
        accepted for interface compatibility and is not used here.
        """
        if not 0 <= user_index < len(self.df):
            return None, None, None

        id_column, column_a, column_b = self._column_names()
        item = self.df.iloc[user_index]
        item_id = item.get(id_column, f"item_{user_index}")
        left_text = item.get(column_a, "")
        right_text = item.get(column_b, "")

        return item_id, left_text, right_text

    def submit_judgment(self, user_id, user_index, item_id, left_text, right_text, choice):
        """Record ``choice`` for ``item_id`` and advance to the next pair.

        Returns ``(next_item_id, next_left, next_right, next_index)``.

        * With no current item (``item_id`` is ``None`` or an empty string)
          nothing is recorded and ``(None, None, None, user_index)`` is
          returned.
        * With a blank ``user_id`` nothing is recorded and the current pair
          is returned unchanged, so anonymous votes are never stored.
        * A repeated vote by the same user on the same item overwrites the
          earlier judgment and timestamp instead of adding a second record.
        """
        # NOTE(review): the hidden item-id textbox may hand back "" rather
        # than None once it has been cleared, hence both checks - confirm.
        if item_id is None or item_id == "":
            return None, None, None, user_index

        if not user_id or not str(user_id).strip():
            return item_id, left_text, right_text, user_index

        # Store user votes uniquely
        user_votes = self.results.setdefault(user_id, [])

        # Check if user already voted for this item
        existing_vote = next((r for r in user_votes if r["item_id"] == item_id), None)
        timestamp = datetime.datetime.now().isoformat()

        if existing_vote:
            existing_vote["judgment"] = choice
            existing_vote["timestamp"] = timestamp
        else:
            user_votes.append({
                "item_id": item_id,
                "generation_a": left_text,
                "generation_b": right_text,
                "judgment": choice,
                "timestamp": timestamp,
                "labeler_id": user_id,
            })

        # Save immediately
        self.save_results(user_id)

        # Move to the next item
        user_index += 1
        next_id, next_left, next_right = self.get_current_pair(user_id, user_index)
        return next_id, next_left, next_right, user_index

    def save_results(self, user_id):
        """Upload all votes recorded for ``user_id`` as one JSON Lines file.

        Does nothing when the user has no votes. The file content is built in
        memory and passed to ``upload_file`` as bytes, so no temporary file is
        created in the working directory. Failures are logged with their
        traceback and not re-raised, so a failed upload does not interrupt
        labeling.
        """
        votes = self.results.get(user_id)
        if not votes:
            return

        try:
            filename = self._results_filename(user_id)
            payload = pd.DataFrame(votes).to_json(orient="records", lines=True)

            # Repo paths use "/" on every platform; an unset output directory
            # places the file at the repo root.
            if HF_OUTPUT_DATASET_DIR:
                path_in_repo = f"{HF_OUTPUT_DATASET_DIR.rstrip('/')}/{filename}"
            else:
                path_in_repo = filename

            # Push to Hugging Face Hub
            upload_file(
                repo_id=HF_OUTPUT_DATASET,
                repo_type="dataset",
                path_in_repo=path_in_repo,
                path_or_fileobj=payload.encode("utf-8"),
            )
        except Exception as e:
            logging.exception(f"Error saving results: {e}")

# Single shared labeler: loads the dataset once and collects every user's votes
labeler = PairwiseLabeler()

# Gradio UI
with gr.Blocks() as app:
    gr.Markdown(INSTRUCTIONS)

    user_id = gr.Textbox(label="Enter your user ID", interactive=True)
    user_index = gr.State(0)  # Track each user's progress

    # The two candidate outputs, shown side by side and read-only
    with gr.Row():
        with gr.Column():
            left_output = gr.Textbox(label="Model Output A", lines=10, interactive=False)
        with gr.Column():
            right_output = gr.Textbox(label="Model Output B", lines=10, interactive=False)

    # Hidden field that carries the ID of the pair currently on screen
    item_id = gr.Textbox(visible=False)

    with gr.Row():
        left_btn = gr.Button("⬅️ A is better")
        right_btn = gr.Button("➡️ B is better")
        tie_btn = gr.Button("🤝 Tie")
        cant_choose_btn = gr.Button("🤔 Can't choose")

    def load_first_pair(user_id):
        """Return the first pair plus index 0, or blanks when no user ID was given."""
        if user_id:
            return labeler.get_current_pair(user_id, 0) + (0,)
        return None, None, None, 0

    def judge(choice, user_id, user_index, item_id, left_text, right_text):
        """Forward one button press to the labeler and return the next pair."""
        return labeler.submit_judgment(
            user_id=user_id,
            user_index=user_index,
            item_id=item_id,
            left_text=left_text,
            right_text=right_text,
            choice=choice,
        )

    # Submitting the user ID box loads the first pair
    user_id.submit(
        load_first_pair,
        inputs=[user_id],
        outputs=[item_id, left_output, right_output, user_index],
    )

    # Every verdict button uses the same handler; only the recorded label differs
    for button, verdict in (
        (left_btn, "A is better"),
        (right_btn, "B is better"),
        (tie_btn, "Tie"),
        (cant_choose_btn, "Can't choose"),
    ):
        button.click(
            judge,
            inputs=[gr.State(verdict), user_id, user_index, item_id, left_output, right_output],
            outputs=[item_id, left_output, right_output, user_index],
        )

# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    app.launch()