Spaces:
Sleeping
Sleeping
initial app interface
Browse files
app.ipynb
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "3bae1d7d-a2be-444d-97cc-d1cbf8843bf1",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Invisible RAG Pilot Demo App"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": 3,
|
14 |
+
"id": "2a8e18f7-cc88-4bbf-a6e1-095237ed7714",
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"name": "stdout",
|
19 |
+
"output_type": "stream",
|
20 |
+
"text": [
|
21 |
+
"Running on local URL: http://127.0.0.1:7861\n",
|
22 |
+
"\n",
|
23 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"data": {
|
28 |
+
"text/html": [
|
29 |
+
"<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
30 |
+
],
|
31 |
+
"text/plain": [
|
32 |
+
"<IPython.core.display.HTML object>"
|
33 |
+
]
|
34 |
+
},
|
35 |
+
"metadata": {},
|
36 |
+
"output_type": "display_data"
|
37 |
+
}
|
38 |
+
],
|
39 |
+
"source": [
|
40 |
+
"import json\n",
|
41 |
+
"import gradio as gr\n",
|
42 |
+
"\n",
|
43 |
+
"\n",
|
44 |
+
"class RAGInterface:\n",
|
45 |
+
" \"\"\"\n",
|
46 |
+
" Set up the gradio app for loading/saving/synchronizing the mockup A/B evaluation RAG tasks.\n",
|
47 |
+
" The app is deployed on Hugging Face spaces at https://huggingface.co/spaces/sukiboo/invisible-rag-demo\n",
|
48 |
+
" \"\"\"\n",
|
49 |
+
"\n",
|
50 |
+
" def __init__(self):\n",
|
51 |
+
" self.setup_interface()\n",
|
52 |
+
" self.launch_interface()\n",
|
53 |
+
"\n",
|
54 |
+
" def setup_interface(self):\n",
|
55 |
+
" \"\"\"Configure the A/B Evaluation RAG task interface.\"\"\"\n",
|
56 |
+
" with gr.Blocks(title='AB Evaluate RAG') as self.interface:\n",
|
57 |
+
"\n",
|
58 |
+
" # protected fields\n",
|
59 |
+
" _task_id = gr.Textbox(label='Task ID', interactive=False, visible=False)\n",
|
60 |
+
"\n",
|
61 |
+
" # task id, load button, chat history, search results\n",
|
62 |
+
" with gr.Row():\n",
|
63 |
+
" task_id = gr.Textbox(container=False, placeholder='Task ID', scale=9)\n",
|
64 |
+
" load_button = gr.Button('Load Task', scale=1)\n",
|
65 |
+
" chat = gr.Chatbot(height=700, layout='bubble', label='Chat History')\n",
|
66 |
+
" sources = gr.Markdown()\n",
|
67 |
+
"\n",
|
68 |
+
" # model completions for answers 1 and 2\n",
|
69 |
+
" with gr.Row():\n",
|
70 |
+
" with gr.Column():\n",
|
71 |
+
" answer1 = gr.Textbox(label='Answer 1', max_lines=50)\n",
|
72 |
+
" with gr.Column():\n",
|
73 |
+
" answer2 = gr.Textbox(label='Answer 2', max_lines=50)\n",
|
74 |
+
"\n",
|
75 |
+
" # individual ratings for answers 1 and 2\n",
|
76 |
+
" with gr.Row():\n",
|
77 |
+
" with gr.Column():\n",
|
78 |
+
" groundedness1 = gr.Radio(label='Groundedness', choices=['Bad', 'Good', 'Perfect'])\n",
|
79 |
+
" fluency1 = gr.Radio(label='Fluency', choices=['Bad', 'Good', 'Perfect'])\n",
|
80 |
+
" utility1 = gr.Radio(label='Utility', choices=['Catastrophic', 'Bad', 'Good', 'Perfect'])\n",
|
81 |
+
" notes1 = gr.Textbox(label='Notes', placeholder='N/A')\n",
|
82 |
+
" with gr.Column():\n",
|
83 |
+
" groundedness2 = gr.Radio(label='Groundedness', choices=['Bad', 'Good', 'Perfect'])\n",
|
84 |
+
" fluency2 = gr.Radio(label='Fluency', choices=['Bad', 'Good', 'Perfect'])\n",
|
85 |
+
" utility2 = gr.Radio(label='Utility', choices=['Catastrophic', 'Bad', 'Good', 'Perfect'])\n",
|
86 |
+
" notes2 = gr.Textbox(label='Notes', placeholder='N/A')\n",
|
87 |
+
"\n",
|
88 |
+
" # overall rating\n",
|
89 |
+
" overall = gr.Radio(label='Overall Rating', choices=['#1 Better', 'Equally Bad', 'Equally Good', '#2 Better'])\n",
|
90 |
+
" notes = gr.Textbox(label='Notes', placeholder='A brief justification for the overall rating')\n",
|
91 |
+
"\n",
|
92 |
+
" # save button\n",
|
93 |
+
" save_button = gr.Button('Save Task')\n",
|
94 |
+
"\n",
|
95 |
+
" # input/output fields\n",
|
96 |
+
" answers = (answer1, answer2)\n",
|
97 |
+
" ratings1 = (groundedness1, fluency1, utility1, notes1)\n",
|
98 |
+
" ratings2 = (groundedness2, fluency2, utility2, notes2)\n",
|
99 |
+
" ratings = (*ratings1, *ratings2, overall, notes)\n",
|
100 |
+
"\n",
|
101 |
+
" # button clicks\n",
|
102 |
+
" load_button.click(self.load_task, inputs=[task_id], outputs=[_task_id, chat, sources, *answers, *ratings])\n",
|
103 |
+
" save_button.click(self.save_task, inputs=[_task_id, *ratings], outputs=None)\n",
|
104 |
+
"\n",
|
105 |
+
" def load_task(self, task_id):\n",
|
106 |
+
" \"\"\"Load the task and parse the info.\"\"\"\n",
|
107 |
+
" task = self.read_task(task_id)\n",
|
108 |
+
" try:\n",
|
109 |
+
" id = task['id']\n",
|
110 |
+
" chat = task['chat_history'] + [[task['question'], task['search_query']]]\n",
|
111 |
+
" answers = [task['answer_1'], task['answer_2']]\n",
|
112 |
+
" sources = self.load_sources(task)\n",
|
113 |
+
" ratings = self.load_ratings(task)\n",
|
114 |
+
" gr.Info(f'Task {task_id} is loaded!')\n",
|
115 |
+
" return id, chat, sources, *answers, *ratings\n",
|
116 |
+
" except:\n",
|
117 |
+
" raise gr.Error(f'Could not load the task {task_id} :(')\n",
|
118 |
+
"\n",
|
119 |
+
" def read_task(self, task_id):\n",
|
120 |
+
" \"\"\"Read the json task file.\"\"\"\n",
|
121 |
+
" try:\n",
|
122 |
+
" with open(f'./data/{task_id}.json') as task_file:\n",
|
123 |
+
" task = json.load(task_file)\n",
|
124 |
+
" return task\n",
|
125 |
+
" except FileNotFoundError:\n",
|
126 |
+
" raise gr.Error(f'Task {task_id} is not found :(')\n",
|
127 |
+
"\n",
|
128 |
+
" def load_sources(self, task):\n",
|
129 |
+
" \"\"\"Parse the search results.\"\"\"\n",
|
130 |
+
" sources = ['## Search Results']\n",
|
131 |
+
" for idx, source in enumerate(task['search_results']):\n",
|
132 |
+
" sources.append(f'### {idx+1}. {source.replace(\"<\", f\"{chr(92)}<\")}')\n",
|
133 |
+
" return '\\n\\n---\\n\\n'.join(sources + [''])\n",
|
134 |
+
"\n",
|
135 |
+
" def load_ratings(self, task):\n",
|
136 |
+
" \"\"\"Parse the ratings for each answer.\"\"\"\n",
|
137 |
+
" # load ratings for answer 1\n",
|
138 |
+
" ratings1 = (task['ratings_1']['groundedness'],\n",
|
139 |
+
" task['ratings_1']['fluency'],\n",
|
140 |
+
" task['ratings_1']['utility'],\n",
|
141 |
+
" task['ratings_1']['notes'])\n",
|
142 |
+
" # load ratings for answer 2\n",
|
143 |
+
" ratings2 = (task['ratings_2']['groundedness'],\n",
|
144 |
+
" task['ratings_2']['fluency'],\n",
|
145 |
+
" task['ratings_2']['utility'],\n",
|
146 |
+
" task['ratings_2']['notes'])\n",
|
147 |
+
" # load overall ratings\n",
|
148 |
+
" overall = task['overall']\n",
|
149 |
+
" notes = task['notes']\n",
|
150 |
+
" return (*ratings1, *ratings2, overall, notes)\n",
|
151 |
+
"\n",
|
152 |
+
" def save_task(self, task_id, *ratings):\n",
|
153 |
+
" \"\"\"Save the updated ratings back to the same task json file.\"\"\"\n",
|
154 |
+
" # load the original task\n",
|
155 |
+
" with open(f'./data/{task_id}.json') as task_file:\n",
|
156 |
+
" task = json.load(task_file)\n",
|
157 |
+
" # parse the ratings\n",
|
158 |
+
" groundedness1, fluency1, utility1, notes1, \\\n",
|
159 |
+
" groundedness2, fluency2, utility2, notes2, \\\n",
|
160 |
+
" overall, notes = ratings\n",
|
161 |
+
" # update the ratings for answer 1\n",
|
162 |
+
" task['ratings_1']['groundedness'] = groundedness1\n",
|
163 |
+
" task['ratings_1']['fluency'] = fluency1\n",
|
164 |
+
" task['ratings_1']['utility'] = utility1\n",
|
165 |
+
" task['ratings_1']['notes'] = notes1\n",
|
166 |
+
" # update the ratings for answer 2\n",
|
167 |
+
" task['ratings_2']['groundedness'] = groundedness2\n",
|
168 |
+
" task['ratings_2']['fluency'] = fluency2\n",
|
169 |
+
" task['ratings_2']['utility'] = utility2\n",
|
170 |
+
" task['ratings_2']['notes'] = notes2\n",
|
171 |
+
" # update overall ratings\n",
|
172 |
+
" task['overall'] = overall\n",
|
173 |
+
" task['notes'] = notes\n",
|
174 |
+
" # save the task to json file\n",
|
175 |
+
" try:\n",
|
176 |
+
" with open(f'./data/{task_id}.json', 'w', encoding='utf-8') as task_file:\n",
|
177 |
+
" json.dump(task, task_file, ensure_ascii=False, indent=4)\n",
|
178 |
+
" gr.Info(f'Task {task_id} is saved!')\n",
|
179 |
+
" except:\n",
|
180 |
+
" raise gr.Error(f'Could not save the task {task_id} :(')\n",
|
181 |
+
"\n",
|
182 |
+
" def launch_interface(self):\n",
|
183 |
+
" \"\"\"Launch the A/B Evaluation RAG task interface.\"\"\"\n",
|
184 |
+
" gr.close_all()\n",
|
185 |
+
" self.interface.queue(default_concurrency_limit=None)\n",
|
186 |
+
" self.interface.launch()\n",
|
187 |
+
"\n",
|
188 |
+
"\n",
|
189 |
+
"rag = RAGInterface()"
|
190 |
+
]
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"cell_type": "code",
|
194 |
+
"execution_count": null,
|
195 |
+
"id": "ade1097d-35ce-4f7a-a689-1b51973cbc70",
|
196 |
+
"metadata": {},
|
197 |
+
"outputs": [],
|
198 |
+
"source": []
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "code",
|
202 |
+
"execution_count": 4,
|
203 |
+
"id": "6707866e-8f1b-4bda-9b12-0008e289ab77",
|
204 |
+
"metadata": {},
|
205 |
+
"outputs": [],
|
206 |
+
"source": [
|
207 |
+
"# create placeholder tasks\n",
|
208 |
+
"import os\n",
|
209 |
+
"import json\n",
|
210 |
+
"\n",
|
211 |
+
"os.makedirs('./data/', exist_ok=True)\n",
|
212 |
+
"for idx in range(3):\n",
|
213 |
+
" task = {\n",
|
214 |
+
" 'id': f'demo_task_{idx+1}',\n",
|
215 |
+
" 'chat_history': [['user message 1', 'bot message 1'], ['user message 2', 'bot message 2']],\n",
|
216 |
+
" 'question': 'question',\n",
|
217 |
+
" 'search_query': 'search query',\n",
|
218 |
+
" 'search_results': ['source 1', 'source 2', 'source 3'],\n",
|
219 |
+
" 'answer_1': 'answer 1',\n",
|
220 |
+
" 'answer_2': 'answer 2',\n",
|
221 |
+
" 'ratings_1': {'groundedness': 'null', 'utility': 'null', 'fluency': 'null', 'notes': ''},\n",
|
222 |
+
" 'ratings_2': {'groundedness': 'null', 'utility': 'null', 'fluency': 'null', 'notes': ''},\n",
|
223 |
+
" 'overall': 'null',\n",
|
224 |
+
" 'notes': ''\n",
|
225 |
+
" }\n",
|
226 |
+
" with open(f'./data/demo_task_{idx+1}.json', 'w', encoding='utf-8') as task_file:\n",
|
227 |
+
" json.dump(task, task_file, ensure_ascii=False, indent=4)\n"
|
228 |
+
]
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"cell_type": "code",
|
232 |
+
"execution_count": null,
|
233 |
+
"id": "d5023979-626b-4135-8805-3de1a846586e",
|
234 |
+
"metadata": {},
|
235 |
+
"outputs": [],
|
236 |
+
"source": []
|
237 |
+
}
|
238 |
+
],
|
239 |
+
"metadata": {
|
240 |
+
"kernelspec": {
|
241 |
+
"display_name": "Python 3 (ipykernel)",
|
242 |
+
"language": "python",
|
243 |
+
"name": "python3"
|
244 |
+
},
|
245 |
+
"language_info": {
|
246 |
+
"codemirror_mode": {
|
247 |
+
"name": "ipython",
|
248 |
+
"version": 3
|
249 |
+
},
|
250 |
+
"file_extension": ".py",
|
251 |
+
"mimetype": "text/x-python",
|
252 |
+
"name": "python",
|
253 |
+
"nbconvert_exporter": "python",
|
254 |
+
"pygments_lexer": "ipython3",
|
255 |
+
"version": "3.12.1"
|
256 |
+
}
|
257 |
+
},
|
258 |
+
"nbformat": 4,
|
259 |
+
"nbformat_minor": 5
|
260 |
+
}
|