deepsync committed on
Commit
d5710aa
·
verified ·
1 Parent(s): 213bb12

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import pymongo
5
+ import requests
6
+ import gradio as gr
7
+ from collections import defaultdict
8
+ from uuid import uuid4
9
+
10
# MongoDB client built from the DB_URL environment variable.
DB_CONN = pymongo.MongoClient(os.environ.get("DB_URL"))

# Every QA line not yet flagged as done, keyed by its source_id.
_qa_dump_collection = DB_CONN['vo_data_dump']['qa_video_dump']
line_details = {
    doc["source_id"]: doc
    for doc in _qa_dump_collection.find()
    if doc.get("done") is not True
}

# The same line documents, grouped by the video they belong to.
videos_list = defaultdict(list)
for line_detail in line_details.values():
    videos_list[line_detail["video_id"]].append(line_detail)
16
+
17
+
18
+ # Functions
19
def render_video_details(video_id):
    """Return the widget values describing the selected video.

    Args:
        video_id: Key into the module-level ``videos_list`` mapping; may be
            ``None`` or unknown when the dropdown has been cleared.

    Returns:
        A 7-tuple ``(title, source_language, target_language, duration,
        line_dropdown_update, html_embed, streamable_url)``. Exactly one of
        ``html_embed`` / ``streamable_url`` is set for a known video: links
        starting with ``gs://`` are exchanged for a URL via the ``CF_URL``
        service, any other link is embedded as an iframe. For an unknown
        video every field is empty.

    Raises:
        requests.HTTPError: If the ``CF_URL`` service answers with an error
            status.
    """
    # ``videos_list`` is a defaultdict, so plain indexing with an unknown id
    # would insert an empty list as a side effect; ``.get`` avoids that.
    video_detail = videos_list.get(video_id)
    if not video_detail:
        empty_lines = gr.Dropdown.update(choices=[], value=None)
        return "", "", "", "", empty_lines, None, None

    # Video-level fields are read from the first line of the video.
    first_line = video_detail[0]
    video_title = first_line["video_title"]
    source_language = first_line["source_language"]
    target_language = first_line["target_language"]
    video_duration = first_line["video_duration"]
    video_link = first_line["video_link"]
    line_ids = [v["source_id"] for v in video_detail if v.get("done") is not True]

    if video_link.startswith("gs://"):
        response = requests.post(
            os.environ["CF_URL"], params={"url": video_link}, timeout=30
        )
        # Fail loudly instead of handing an error page to the video player.
        response.raise_for_status()
        video_html_embed, video_streamable_url = None, response.text
    else:
        embed_url = video_link.replace('watch?v=', 'embed/')
        video_html_embed = f"""<iframe width="560" height="315" src="{embed_url}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>"""
        video_streamable_url = None

    first_line_id = line_ids[0] if line_ids else None
    return (
        video_title,
        source_language,
        target_language,
        video_duration,
        gr.Dropdown.update(choices=line_ids, value=first_line_id),
        video_html_embed,
        video_streamable_url,
    )
33
+
34
def render_line_details(line_id):
    """Download the audio of a line and return it with its texts.

    Args:
        line_id: Key into the module-level ``line_details`` mapping; may be
            ``None`` or unknown when the dropdown has been cleared.

    Returns:
        A 3-tuple ``(audio_path, source_transcript, translation)`` where
        ``audio_path`` is a freshly written file under ``temp_audios``.
        For an unknown line the tuple is ``(None, "", "")``.

    Raises:
        requests.HTTPError: If the ``CF_URL`` service or the audio download
            answers with an error status.
    """
    line_detail = line_details.get(line_id)
    if line_detail is None:
        return None, "", ""

    source_transcript = line_detail["source_transcript"]
    translation = line_detail["translator_translated_text"]

    # The CF_URL service responds with the URL the audio is fetched from.
    response = requests.post(
        os.environ["CF_URL"],
        params={"url": line_detail["source_audio_url"]},
        timeout=30,
    )
    response.raise_for_status()
    signed_uri = response.text

    audio_response = requests.get(signed_uri, timeout=60)
    # Without this check an error page would be saved as the audio file.
    audio_response.raise_for_status()

    # The download directory is not guaranteed to exist on a fresh deployment.
    os.makedirs("temp_audios", exist_ok=True)
    path = os.path.join("temp_audios", f"{uuid4()}.mp3")
    with open(path, "wb") as f:
        f.write(audio_response.content)
    return path, source_transcript, translation
45
+
46
def dump_data_db(line_id, video_type, video_subject, video_topics, video_tone, gender, demography, total_time, source_language, target_language, comments, accent, source_transcript, translation):
    """Persist the annotations for one line and advance to the next line.

    Upserts the video-level record into ``video_data`` and the line-level
    record into ``line_data``, flags the line as done in ``qa_video_dump``,
    and removes it from the in-memory ``videos_list``.

    Args:
        line_id: ``source_id`` of the line being saved; must be a key of the
            module-level ``line_details`` mapping.
        video_type, video_subject, video_topics, video_tone, gender,
        demography, comments: Video-level annotations stored as given.
        total_time: Video duration; converted with ``float``.
        source_language, target_language: Language labels for the video.
        accent: Accent label stored on the line.
        source_transcript: Source text stored as ``src_text``.
        translation: Target text stored as ``tgt_text``; its word count
            divided by the line duration gives ``speaking_rate``.

    Returns:
        A 2-tuple ``(line_dropdown_update, video_dropdown_value)``. While the
        current video still has lines, the line that took the saved line's
        position is selected (or the last line, if the saved one was last)
        and the video id is returned unchanged. Otherwise a random remaining
        video is selected. When no video is left, both dropdowns are emptied.
    """
    line_detail = line_details[line_id]
    video_id = line_detail["video_id"]
    line_data = {
        "line_id": line_id,
        "video_id": video_id,
        "src_text": source_transcript,
        "intermediate_text": line_detail["english_transcript"],
        "tgt_text": translation,
        "accent": accent,
        "speaking_rate": round(len(translation.split()) / line_detail["line_duration"], 3),
        "src_time": line_detail["line_duration"],
        "version": "v1",
        "done": True
    }
    video_data = {
        "video_id": video_id,
        "type": video_type,
        "src_lang": source_language,
        "tgt_lang": target_language,
        "subject": video_subject,
        "topics": video_topics,
        "tone": video_tone,
        "gender": gender,
        "demography": demography,
        "total_time": float(total_time),
        "comments": comments
    }
    video_data_collection = DB_CONN['vo_data_dump']['video_data']
    video_data_collection.update_one({'video_id': video_id}, {"$set": video_data}, upsert=True)

    line_data_collection = DB_CONN['vo_data_dump']['line_data']
    line_data_collection.update_one({'line_id': line_id}, {"$set": line_data}, upsert=True)

    DB_CONN['vo_data_dump']['qa_video_dump'].update_one({'source_id': line_id}, {"$set": {"done": True}}, upsert=True)

    # Remove the saved line from the in-memory list of pending lines.
    remaining = videos_list[video_id]
    removed_at = next(
        (idx for idx, v in enumerate(remaining) if v["source_id"] == line_id),
        None,
    )
    if removed_at is not None:
        del remaining[removed_at]

    if remaining:
        source_ids = [v["source_id"] for v in remaining]
        last_index = len(source_ids) - 1
        # After the deletion the following line sits at ``removed_at``; clamp
        # so that saving the final line of the list does not index past the end.
        next_index = last_index if removed_at is None else min(removed_at, last_index)
        return gr.Dropdown.update(choices=source_ids, value=source_ids[next_index]), video_id

    # The video has no pending lines left: drop it and move to another one.
    videos_list.pop(video_id, None)
    # Ignore ids whose list is empty so the pick below always has a line.
    candidates = [vid for vid, lines in videos_list.items() if lines]
    if not candidates:
        return (
            gr.Dropdown.update(choices=[], value=None),
            gr.Dropdown.update(choices=[], value=None),
        )
    new_video_id = random.choice(candidates)
    source_ids = [v["source_id"] for v in videos_list[new_video_id] if v.get('done') is not True]
    first_line_id = source_ids[0] if source_ids else None
    return (
        gr.Dropdown.update(choices=source_ids, value=first_line_id),
        gr.Dropdown.update(choices=list(videos_list.keys()), value=new_video_id),
    )
96
+
97
+
98
+ # UI
99
with gr.Blocks() as demo:
    gr.Markdown("## Data Aggregation")

    # Video selection and title.
    with gr.Row():
        video_id_dropdown = gr.Dropdown(list(videos_list.keys()), label="Video IDs")
        video_title_text = gr.Textbox(label="Video Title", interactive=False)

    # Two players: an HTML embed and a native video widget.
    with gr.Row():
        video_html = gr.HTML(label="Video")
        video_playable = gr.Video(label="Video")

    # Video-level annotations.
    with gr.Row():
        video_type_choices = ["learning", "audiobook", "podcast", "vlog", "news", "cooking", "review"]
        video_subject_choices = ["tech", "science", "lifestyle", "cooking", "travel", "finance", "politics"]
        video_type_dropdown = gr.Dropdown(video_type_choices, label="Video Type", value="learning")
        video_subject_dropdown = gr.Dropdown(video_subject_choices, label="Video Subject", value="tech")
        video_topics_text = gr.Textbox(label="Video Topics")
    with gr.Row():
        video_tone_choices = ["casual", "semi-formal", "formal"]
        gender_choices = ["male", "female", "non-binary"]
        video_tone_dropdown = gr.Dropdown(video_tone_choices, label="Video Tone", value="semi-formal")
        gender_dropdown = gr.Dropdown(gender_choices, label="Gender", value="female")
        demography_text = gr.Textbox(label="Demography")
    with gr.Row():
        source_language_text = gr.Textbox(label="Source Language")
        target_language_text = gr.Textbox(label="Target Language")
    with gr.Row():
        total_time_text = gr.Textbox(label="Video Duration")
        comments_text = gr.Textbox(label="Comments")

    # Line-level widgets.
    gr.Markdown("Video Line Information")
    line_id_dropdown = gr.Dropdown([], label="Line IDs")
    with gr.Row():
        audio_item = gr.Audio(label="Source Audio", type="filepath")
        accent_text = gr.Textbox(label="Accent", value="indian")
    with gr.Row():
        source_transcript_text = gr.Textbox(label="Source Transcript")
        translation_text = gr.Textbox(label="Translation", interactive=False)
    update_button = gr.Button("Update")

    # Actions
    video_detail_outputs = [
        video_title_text,
        source_language_text,
        target_language_text,
        total_time_text,
        line_id_dropdown,
        video_html,
        video_playable,
    ]
    video_id_dropdown.change(
        fn=render_video_details,
        inputs=video_id_dropdown,
        outputs=video_detail_outputs,
    )

    line_detail_outputs = [audio_item, source_transcript_text, translation_text]
    line_id_dropdown.change(
        fn=render_line_details,
        inputs=line_id_dropdown,
        outputs=line_detail_outputs,
    )

    # Order must match the parameter order of dump_data_db.
    save_inputs = [
        line_id_dropdown,
        video_type_dropdown,
        video_subject_dropdown,
        video_topics_text,
        video_tone_dropdown,
        gender_dropdown,
        demography_text,
        total_time_text,
        source_language_text,
        target_language_text,
        comments_text,
        accent_text,
        source_transcript_text,
        translation_text,
    ]
    update_button.click(
        fn=dump_data_db,
        inputs=save_inputs,
        outputs=[line_id_dropdown, video_id_dropdown],
    )
135
+
136
+
137
if __name__ == "__main__":
    # Login credentials are read from the USERNAME and PASSWORD environment variables.
    login_user = os.environ.get("USERNAME")
    login_password = os.environ.get("PASSWORD")
    demo.queue().launch(auth=(login_user, login_password))