Spaces:

deepsync
/

data-gathering

Sleeping

App Files Files Community

data-gathering / app.py

deepsync

Update app.py

5f1d785 verified over 1 year ago

raw

history blame contribute delete

8.21 kB

	import os
	import json
	import random
	import pymongo
	import requests
	import gradio as gr
	from collections import defaultdict
	from uuid import uuid4


	# Shared MongoDB client; the connection string is read from the DB_URL
	# environment variable.
	DB_CONN = pymongo.MongoClient(os.environ.get("DB_URL"))

	# Lines that are not yet flagged as done, keyed by their source_id.
	line_details = {
	    doc["source_id"]: doc
	    for doc in DB_CONN['vo_data_dump']['qa_video_dump'].find()
	    if doc.get("done") is not True
	}

	# The same line documents, grouped by the video they belong to.
	videos_list = defaultdict(list)
	for line_detail in line_details.values():
	    videos_list[line_detail["video_id"]].append(line_detail)


	# Functions
	def render_video_details(video_id):
	    """Return the widget values describing the selected video.

	    Video-level fields are read from the first pending line of the video.
	    A link starting with ``gs://`` is exchanged for a streamable URL by
	    POSTing it to the endpoint named by the ``CF_URL`` environment variable
	    (the response body is used as the URL); any other link is rendered as
	    an embedded iframe, rewriting ``watch?v=`` to ``embed/``.

	    Args:
	        video_id: Key into ``videos_list``; may be ``None`` or unknown when
	            the dropdown has been cleared.

	    Returns:
	        A 7-tuple of (video title, source language, target language,
	        video duration, line-id dropdown update, iframe HTML or ``None``,
	        streamable URL or ``None``). When the video is unknown or has no
	        lines, blank values and an empty dropdown are returned.

	    Raises:
	        requests.RequestException: If the URL-exchange request fails,
	            times out or returns an HTTP error status.
	    """
	    # .get() instead of indexing: videos_list is a defaultdict, so indexing
	    # with an unknown id would silently insert an empty entry.
	    video_detail = videos_list.get(video_id)
	    if not video_detail:
	        return "", "", "", "", gr.Dropdown.update(choices=[], value=None), None, None

	    first_line = video_detail[0]
	    video_title = first_line["video_title"]
	    source_language = first_line["source_language"]
	    target_language = first_line["target_language"]
	    video_duration = first_line["video_duration"]
	    video_link = first_line["video_link"]
	    line_ids = [v["source_id"] for v in video_detail if v.get("done") is not True]

	    if video_link.startswith("gs://"):
	        # A timeout keeps the UI callback from hanging forever, and the
	        # status check keeps an error page from being used as a video URL.
	        response = requests.post(os.environ["CF_URL"], params={"url": video_link}, timeout=30)
	        response.raise_for_status()
	        video_html_embed, video_streamable_url = None, response.text
	    else:
	        embed_url = video_link.replace('watch?v=', 'embed/')
	        video_html_embed = f"""<iframe width="560" height="315" src="{embed_url}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>"""
	        video_streamable_url = None

	    line_dropdown = gr.Dropdown.update(choices=line_ids, value=line_ids[0] if line_ids else None)
	    return video_title, source_language, target_language, video_duration, line_dropdown, video_html_embed, video_streamable_url

	def render_line_details(line_id):
	    """Return the audio bytes, transcript and translation of a line.

	    The line's ``source_audio_url`` is POSTed to the endpoint named by the
	    ``CF_URL`` environment variable; the response body is then fetched as
	    the download URL (presumably a signed URL - confirm with the endpoint).

	    Args:
	        line_id: Key into ``line_details``; may be ``None`` or unknown when
	            the dropdown has been cleared.

	    Returns:
	        A 3-tuple of (audio content as bytes, source transcript,
	        translated text). ``(None, "", "")`` when the line is unknown.

	    Raises:
	        requests.RequestException: If either HTTP request fails, times out
	            or returns an HTTP error status.
	    """
	    line_detail = line_details.get(line_id)
	    if line_detail is None:
	        # Nothing selected (e.g. an emptied dropdown): clear the widgets
	        # rather than failing with a KeyError.
	        return None, "", ""

	    source_transcript = line_detail["source_transcript"]
	    translation = line_detail["translator_translated_text"]

	    # Timeouts keep the callback from blocking indefinitely; the status
	    # checks stop an error body from being treated as a URL or as audio.
	    response = requests.post(os.environ["CF_URL"], params={"url": line_detail["source_audio_url"]}, timeout=30)
	    response.raise_for_status()
	    signed_uri = response.text

	    audio_response = requests.get(signed_uri, timeout=30)
	    audio_response.raise_for_status()
	    return audio_response.content, source_transcript, translation

	def dump_data_db(line_id, video_type, video_subject, video_topics, video_tone, gender, demography, total_time, source_language, target_language, comments, accent, source_transcript, translation, speaker_gender):
	    """Persist the annotations of one line and advance the selection.

	    In the ``vo_data_dump`` database this upserts the video-level fields
	    into ``video_data``, the line-level fields into ``line_data`` and sets
	    ``done`` on the line in ``qa_video_dump``. The line is then removed
	    from the in-memory ``videos_list`` and the next line (or, when the
	    video has no lines left, a random remaining video) is selected.

	    Args:
	        line_id: ``source_id`` of the line being submitted.
	        video_type, video_subject, video_topics, video_tone, gender,
	        demography, comments: Video-level annotations stored as given.
	        total_time: Video duration; converted with ``float()``.
	        source_language, target_language: Language fields of the video.
	        accent, speaker_gender: Line-level speaker annotations.
	        source_transcript: Transcript of the line, stored as ``src_text``.
	        translation: Translated text, stored as ``tgt_text`` and used to
	            compute the speaking rate (words per unit of line duration).

	    Returns:
	        A 2-tuple of (line-id dropdown update, video-id dropdown value or
	        update). Both dropdowns are emptied when no pending video remains.

	    Raises:
	        KeyError: If ``line_id`` is not present in ``line_details``.
	        ValueError: If ``total_time`` cannot be converted to ``float``.
	        ZeroDivisionError: If the line's ``line_duration`` is 0.
	    """
	    line_detail = line_details[line_id]
	    video_id = line_detail["video_id"]
	    line_duration = line_detail["line_duration"]
	    line_data = {
	        "line_id": line_id,
	        "video_id": video_id,
	        "src_text": source_transcript,
	        "intermediate_text": line_detail["english_transcript"],
	        "tgt_text": translation,
	        "accent": accent,
	        # NOTE(review): divides by the stored duration - confirm upstream
	        # guarantees a positive line_duration.
	        "speaking_rate": round(len(translation.split()) / line_duration, 3),
	        "src_time": line_duration,
	        "gender": speaker_gender,
	        "version": "v1",
	        "done": True
	    }
	    video_data = {
	        "video_id": video_id,
	        "type": video_type,
	        "src_lang": source_language,
	        "tgt_lang": target_language,
	        "subject": video_subject,
	        "topics": video_topics,
	        "tone": video_tone,
	        "gender": gender,
	        "demography": demography,
	        "total_time": float(total_time),
	        "comments": comments
	    }
	    database = DB_CONN['vo_data_dump']
	    database['video_data'].update_one({'video_id': video_id}, {"$set": video_data}, upsert=True)
	    database['line_data'].update_one({'line_id': line_id}, {"$set": line_data}, upsert=True)
	    database['qa_video_dump'].update_one({'source_id': line_id}, {"$set": {"done": True}}, upsert=True)

	    # Drop the submitted line from the in-memory list of pending lines.
	    # .get() avoids re-creating an entry in the defaultdict if the video
	    # has already been removed.
	    remaining = videos_list.get(video_id, [])
	    removed_at = next(
	        (idx for idx, entry in enumerate(remaining) if entry["source_id"] == line_id),
	        None,
	    )
	    if removed_at is not None:
	        del remaining[removed_at]

	    if remaining:
	        source_ids = [v["source_id"] for v in remaining]
	        last_index = len(source_ids) - 1
	        # Select the line that slid into the removed slot. Clamp to the
	        # last index: when the removed line was the final one, its old
	        # position is past the end of the shortened list.
	        next_index = last_index if removed_at is None else min(removed_at, last_index)
	        return gr.Dropdown.update(choices=source_ids, value=source_ids[next_index]), video_id

	    # The video has no pending lines left: forget it and move on.
	    videos_list.pop(video_id, None)
	    pending_video_ids = [vid for vid, entries in videos_list.items() if entries]
	    if not pending_video_ids:
	        # Everything has been annotated; empty both dropdowns instead of
	        # calling random.choice() on an empty sequence.
	        return gr.Dropdown.update(choices=[], value=None), gr.Dropdown.update(choices=[], value=None)

	    new_video_id = random.choice(pending_video_ids)
	    source_ids = [v["source_id"] for v in videos_list[new_video_id] if v.get('done') is not True]
	    return (
	        gr.Dropdown.update(choices=source_ids, value=source_ids[0] if source_ids else None),
	        gr.Dropdown.update(choices=pending_video_ids, value=new_video_id),
	    )


	# UI
	with gr.Blocks() as demo:
	    # Hint shown on the dropdowns that accept user-defined values.
	    custom_value_hint = "Feel free to add new item if none of them is right."
	    video_type_choices = ["learning", "audiobook", "podcast", "vlog", "news", "cooking", "review"]
	    video_subject_choices = ["tech", "science", "lifestyle", "cooking", "travel", "finance", "politics"]

	    gr.Markdown("## Data Aggregation")

	    # Video selection and preview.
	    with gr.Row():
	        video_id_dropdown = gr.Dropdown(list(videos_list), label="Video IDs")
	        video_title_text = gr.Textbox(label="Video Title", interactive=False)
	    with gr.Row():
	        video_html = gr.HTML(label="Video")
	        video_playable = gr.Video(label="Video")

	    gr.Markdown("""Please ensure to fill these sections and do not leave them empty. An example of how to update `Video Type`, `Video Subject` and `Video Topics`:
	- If the video is of a `Biology lecture`, Video Type: _learning_, Video Subject: _science_, Video Topics: _biology_
	- If the video is of `Cooking`, Video Type: _cooking_, Video Subject: _cooking_, Video Topics: _indian recipe_""")

	    # Video-level annotations.
	    with gr.Row():
	        video_type_dropdown = gr.Dropdown(
	            video_type_choices,
	            label="Video Type",
	            value="learning",
	            allow_custom_value=True,
	            info=custom_value_hint,
	        )
	        video_subject_dropdown = gr.Dropdown(
	            video_subject_choices,
	            label="Video Subject",
	            value="tech",
	            allow_custom_value=True,
	            info=custom_value_hint,
	        )
	        video_topics_text = gr.Textbox(label="Video Topics")
	    with gr.Row():
	        video_tone_dropdown = gr.Dropdown(
	            ["casual", "semi-formal", "formal"],
	            label="Video Tone",
	            value="semi-formal",
	            info="Tone used in the video.",
	        )
	        gender_dropdown = gr.Dropdown(
	            ["male", "female", "non-binary"],
	            label="Gender",
	            value="female",
	            info="Gender used by the speaker in the video as whole.",
	        )
	        demography_text = gr.Textbox(
	            label="Demography",
	            info="Demography like normal, old, young, toddler, etc.",
	        )
	    # Hidden fields that carry the language pair through to the submit handler.
	    with gr.Row():
	        source_language_text = gr.Textbox(label="Source Language", interactive=False, visible=False)
	        target_language_text = gr.Textbox(label="Target Language", interactive=False, visible=False)
	    with gr.Row():
	        total_time_text = gr.Textbox(label="Video Duration", interactive=False)
	        comments_text = gr.Textbox(
	            label="Comments",
	            info="Any extra comments regarding the nature of video.",
	        )

	    # Line-level annotations.
	    gr.Markdown("Video Line Information")
	    line_id_dropdown = gr.Dropdown([], label="Line IDs")
	    with gr.Row():
	        audio_item = gr.Audio(label="Source Audio", type="filepath")
	        accent_text = gr.Textbox(
	            label="Accent",
	            value="indian",
	            info="Accent of the person speaking. For example, indian, american, british.",
	        )
	        speaker_gender_dropdown = gr.Dropdown(
	            ["female", "male"],
	            label="Audio Gender",
	            value="female",
	            info="Gender of the speaker",
	        )
	    with gr.Row():
	        source_transcript_text = gr.Textbox(
	            label="Source Transcript",
	            info="Transcription of the above rendered audio. Please ensure correct punctuations based on the audio.",
	        )
	        translation_text = gr.Textbox(label="Translation", interactive=False, visible=False)
	    update_button = gr.Button("Update")

	    # Actions: choosing a video fills the video widgets and the line list,
	    # choosing a line loads its audio and text, and "Update" stores the
	    # annotations and moves the selection on.
	    video_id_dropdown.change(
	        fn=render_video_details,
	        inputs=video_id_dropdown,
	        outputs=[
	            video_title_text,
	            source_language_text,
	            target_language_text,
	            total_time_text,
	            line_id_dropdown,
	            video_html,
	            video_playable,
	        ],
	    )
	    line_id_dropdown.change(
	        fn=render_line_details,
	        inputs=line_id_dropdown,
	        outputs=[audio_item, source_transcript_text, translation_text],
	    )
	    update_button.click(
	        fn=dump_data_db,
	        inputs=[
	            line_id_dropdown,
	            video_type_dropdown,
	            video_subject_dropdown,
	            video_topics_text,
	            video_tone_dropdown,
	            gender_dropdown,
	            demography_text,
	            total_time_text,
	            source_language_text,
	            target_language_text,
	            comments_text,
	            accent_text,
	            source_transcript_text,
	            translation_text,
	            speaker_gender_dropdown,
	        ],
	        outputs=[line_id_dropdown, video_id_dropdown],
	    )


	if __name__ == "__main__":
	    # Enable request queuing, then serve the app behind the login read
	    # from the USERNAME and PASSWORD environment variables.
	    queued_app = demo.queue()
	    login = (os.environ.get("USERNAME"), os.environ.get("PASSWORD"))
	    queued_app.launch(auth=login)