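"""Gradio app for a human preference study on room spatial audio rendering.

Participants watch pairs of videos with generated spatial audio, compare them
against a reference, and submit answers that are appended to a Hugging Face
dataset repository.
"""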
import json
import os
import random
import subprocess
from datetime import datetime

import gradio as gr
from huggingface_hub import Repository

# Fix the seed so every session sees the same pair ordering and shuffles.
random.seed(20240128)
# Set Git user information
subprocess.run(["git", "config", "--global", "user.email", "[email protected]"])
subprocess.run(["git", "config", "--global", "user.name", "yiduohao"])
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    print("Warning: HF_TOKEN environment variable is not set.")
# Clone the dataset repo locally; responses are appended to it and pushed back.
DATASET_REPO_URL = "https://huggingface.co/datasets/Scherzando/RIR-Resound-User-Study-Response"
repo = Repository(
    local_dir="user_responses",
    clone_from=DATASET_REPO_URL,
    use_auth_token=hf_token,
)
def prepare_test_cases():
    """Load the test videos and randomly order each ours/baseline pair."""
    json_path = "rir/rir.json"
    with open(json_path, "r") as f:
        video_dict = json.load(f)
    for video_id in video_dict:
        # Randomize which method appears as "Video 1" vs. "Video 2".
        video_list = [video_dict[video_id]["ours"], video_dict[video_id]["baseline"]]
        random.shuffle(video_list)
        video_dict[video_id]["Video 1"] = video_list[0]
        video_dict[video_id]["Video 2"] = video_list[1]
        video_dict[video_id]["Ground Truth"] = video_dict[video_id]["groundtruth"]
    return video_dict
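# A sketch of the rir/rir.json layout assumed by prepare_test_cases(), inferred
# from the keys accessed above (the example id and paths are illustrative):
#
# {
#     "room_0001": {
#         "ours": "rir/room_0001_ours.mp4",
#         "baseline": "rir/room_0001_baseline.mp4",
#         "groundtruth": "rir/room_0001_gt.mp4"
#     }
# }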
video_dict = prepare_test_cases()
video_ids = list(video_dict.keys())
random.shuffle(video_ids)
questions = [
    "Between Video 1 and Video 2, which one's audio conveys changes in volume more accurately compared to the Reference?",
    "Between Video 1 and Video 2, which one's audio makes it easier to identify the direction of the sound source?",
    "Between Video 1 and Video 2, which one's audio aligns better with the Reference overall?",
]
submissions_file = "user_responses/response.jsonl"

def has_already_submitted(user_id):
    """Return True if this user (session hash) has already submitted."""
    if os.path.exists(submissions_file):
        with open(submissions_file, "r") as f:
            for line in f:
                submission = json.loads(line)
                if submission.get("u_id") == user_id:
                    return True
    return False
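# Each line of response.jsonl holds one JSON object as built in save_responses()
# below; the values here are illustrative only:
# {"u_id": "<session hash>", "timestamp": "2024-01-28T12:00:00",
#  "responses": [{"<video id>": {"distance": "<video path>",
#                                "direction": "<video path>",
#                                "overall": "<video path>"}}]}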
# Save responses
def save_responses(unique_submission, *responses):
    timestamp = datetime.now().isoformat()
    # The last input is the hidden session-info JSON; the rest are the radios.
    info = responses[-1]
    responses = responses[:-1]
    user_id = str(info["session_id"])
    # Check for a duplicate submission
    if unique_submission and has_already_submitted(user_id):
        return "You have already submitted responses. Thank you for participating!"
    # Initialize the result dictionary
    result = {
        "u_id": user_id,
        "timestamp": timestamp,
        "responses": [],
    }
    for index in range(len(video_ids)):
        # Each video pair contributes len(questions) consecutive answers.
        start_idx = index * len(questions)
        end_idx = start_idx + len(questions)
        response = responses[start_idx:end_idx]
        if any(r is None for r in response):
            return "Please answer all questions before submitting."
        video_id = video_ids[index]
        # Map the "Video 1"/"Video 2" choices back to the underlying videos.
        pair_response = {
            video_id: {
                "distance": video_dict[video_id][response[0]],
                "direction": video_dict[video_id][response[1]],
                "overall": video_dict[video_id][response[2]],
            }
        }
        result["responses"].append(pair_response)
    # Sort by video id; each entry has exactly one key.
    result["responses"] = sorted(result["responses"], key=lambda x: next(iter(x)))
    # Save the response locally, then push it to the Hugging Face dataset repo.
    with open(submissions_file, "a") as f:
        f.write(json.dumps(result) + "\n")
    repo.push_to_hub()
    return "All responses saved! Thank you for participating!"
def create_interface(unique_submission=False):
    with gr.Blocks() as demo:
        gr.Markdown("# Human Preference Study: Room Spatial Audio Rendering")
        gr.Markdown("""
        Before starting the study, please make sure you are in a quiet environment and wearing headphones, and read the following guidance carefully.
        - In this study, you will be presented with pairs of videos **with spatial audio**.
        - Each pair consists of a reference (marked **Reference**) and two videos with generated spatial audio (marked **Video 1** and **Video 2**).
        - Please watch and **listen** to each row of videos carefully and answer the three associated questions.
        - For each video, the left-hand side is the camera (head) view, and the right-hand side is the corresponding bird's-eye view of the room with the **speaker (blue)** and **head poses (red)**.

        **Binaural headphones are required!**
        """)
        # Display video pairs and questions
        responses = []
        for index, video_id in enumerate(video_ids):
            video_gt = video_dict[video_id]["groundtruth"]
            video1 = video_dict[video_id]["Video 1"]
            video2 = video_dict[video_id]["Video 2"]
            gr.Markdown(f"### Video Pair {index + 1}")
            with gr.Row():
                gr.Video(video_gt, label="Reference")
                gr.Video(video1, label="Video 1")
                gr.Video(video2, label="Video 2")
            # One radio per question, in the same order as `questions`.
            for question in questions:
                with gr.Row():
                    responses.append(gr.Radio(["Video 1", "Video 2"], label=question, value=None))
            gr.Markdown("---")
        # Hidden component filled with per-session request info on page load.
        info = gr.JSON(visible=False)
        demo.load(predict, None, info)
        submit_btn = gr.Button("Submit")
        result_message = gr.Textbox(label="Message (please only submit once)", interactive=False)
        submit_btn.click(
            fn=lambda *args: save_responses(unique_submission, *args),
            inputs=responses + [info],
            outputs=result_message,
        )
    return demo
def predict(request: gr.Request):
    """Collect request metadata; the session hash identifies the submission."""
    return {
        "ip": request.client.host,
        "user_agent": request.headers["user-agent"],
        "headers": dict(request.headers),
        "session_id": request.session_hash,
    }
if __name__ == "__main__":
    # Enforce one submission per session.
    demo = create_interface(unique_submission=True)
    demo.launch(share=True)