vsrinivas committed
Commit dcab4e1 · verified · 1 Parent(s): 9ce00ec

Create app.py

Files changed (1)
  app.py +289 -0
app.py ADDED
@@ -0,0 +1,289 @@
+ from typing import Optional
+
+ from elevenlabs.client import ElevenLabs
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+ import whisper
+ from ai71 import AI71
+ from datetime import datetime
+ import os
+ import time
+ from pydub import AudioSegment
+ import gradio as gr
+ import concurrent.futures
+
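+ # Both API keys are read from environment variables (e.g. Hugging Face Space
+ # secrets) and must be set before the app starts.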
+ AI71_API_KEY = os.environ.get('AI71_API_KEY')
+ XI_API_KEY = os.environ.get('ELEVEN_LABS_API_KEY')
+ client = ElevenLabs(api_key=XI_API_KEY)
+
+ # Load the translation, tokenization and transcription models once at startup.
+ model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
+ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
+ transcriber = whisper.load_model("turbo")
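+ # Note: m2m100_1.2B and the Whisper "turbo" checkpoint are sizeable downloads;
+ # a GPU is assumed here for reasonable latency, though CPU works for short clips.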
+
+ language_codes = {"English": "en", "Hindi": "hi", "Portuguese": "pt", "Chinese": "zh", "Spanish": "es",
+                   "French": "fr", "German": "de", "Japanese": "ja", "Arabic": "ar", "Russian": "ru",
+                   "Korean": "ko", "Indonesian": "id", "Italian": "it", "Dutch": "nl", "Turkish": "tr",
+                   "Polish": "pl", "Swedish": "sv", "Filipino": "fil", "Malay": "ms", "Romanian": "ro",
+                   "Ukrainian": "uk", "Greek": "el", "Czech": "cs", "Danish": "da", "Finnish": "fi",
+                   "Bulgarian": "bg", "Croatian": "hr", "Slovak": "sk"}
+
+ meeting_texts = []
+ n_participants = 4  # adjust to the number of people on the call
+ language_choices = ["English", "Polish", "Hindi", "Arabic"]
+
+
+ def wait_for_dubbing_completion(dubbing_id: str) -> bool:
+     """
+     Waits for the dubbing process to complete by periodically checking the status.
+
+     Args:
+         dubbing_id (str): The dubbing project id.
+
+     Returns:
+         bool: True if the dubbing is successful, False otherwise.
+     """
+     MAX_ATTEMPTS = 120
+     CHECK_INTERVAL = 10  # In seconds
+
+     for _ in range(MAX_ATTEMPTS):
+         metadata = client.dubbing.get_dubbing_project_metadata(dubbing_id)
+         if metadata.status == "dubbed":
+             return True
+         elif metadata.status == "dubbing":
+             print("Dubbing in progress... Will check status again in", CHECK_INTERVAL, "seconds.")
+             time.sleep(CHECK_INTERVAL)
+         else:
+             print("Dubbing failed:", metadata.error_message)
+             return False
+
+     print("Dubbing timed out")
+     return False
+
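+ # Polling above gives up after MAX_ATTEMPTS * CHECK_INTERVAL seconds (20 minutes);
+ # finished dubs are streamed chunk-by-chunk into data/<dubbing_id>/<language_code>.mp4.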
+ def download_dubbed_file(dubbing_id: str, language_code: str) -> str:
+     """
+     Downloads the dubbed file for a given dubbing ID and language code.
+
+     Args:
+         dubbing_id: The ID of the dubbing project.
+         language_code: The language code for the dubbing.
+
+     Returns:
+         The file path to the downloaded dubbed file.
+     """
+     dir_path = f"data/{dubbing_id}"
+     os.makedirs(dir_path, exist_ok=True)
+
+     file_path = f"{dir_path}/{language_code}.mp4"
+     with open(file_path, "wb") as file:
+         for chunk in client.dubbing.get_dubbed_file(dubbing_id, language_code):
+             file.write(chunk)
+
+     return file_path
+
+ def create_dub_from_file(
+     input_file_path: str,
+     file_format: str,
+     source_language: str,
+     target_language: str,
+ ) -> Optional[str]:
+     """
+     Dubs an audio or video file from one language to another and saves the output.
+
+     Args:
+         input_file_path (str): The file path of the audio or video to dub.
+         file_format (str): The file format of the input file.
+         source_language (str): The language of the input file.
+         target_language (str): The target language to dub into.
+
+     Returns:
+         Optional[str]: The file path of the dubbed file or None if the operation failed.
+     """
+     if not os.path.isfile(input_file_path):
+         raise FileNotFoundError(f"The input file does not exist: {input_file_path}")
+
+     with open(input_file_path, "rb") as audio_file:
+         response = client.dubbing.dub_a_video_or_an_audio_file(
+             file=(os.path.basename(input_file_path), audio_file, file_format),
+             target_lang=target_language,  # language to dub the content into
+             source_lang=source_language,  # language of the input file
+             num_speakers=1,               # number of speakers in the recording
+             watermark=True,               # watermark the output video
+         )
+
+     dubbing_id = response.dubbing_id
+     if wait_for_dubbing_completion(dubbing_id):
+         return download_dubbed_file(dubbing_id, target_language)
+     return None
+
+
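+ # Falcon-180B (via the AI71 API) turns the accumulated speaker-wise transcript
+ # into minutes of meeting; the completion is streamed chunk by chunk.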
+ def summarize(meeting_texts=meeting_texts):
+     """Summarizes the speaker-wise conversation into minutes of meeting."""
+     meeting_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     meeting_conversation_processed = '\n'.join(
+         f"{speaker}: {text}" for entry in meeting_texts for speaker, text in entry.items())
+
+     minutes_of_meeting = ""
+     for chunk in AI71(AI71_API_KEY.strip()).chat.completions.create(
+         model="tiiuae/falcon-180b-chat",
+         messages=[
+             {"role": "system", "content": f"""You are an experienced secretary who can summarize meeting discussions into minutes of meeting.
+ Summarize the meeting discussion provided as a speaker-wise conversation. Title it 'Minutes of Meeting held on {meeting_date_time}', shown in bold letters, and present the summary in a readable format."""},
+             {"role": "user", "content": meeting_conversation_processed},
+         ],
+         stream=True,
+     ):
+         if chunk.choices[0].delta.content:
+             minutes_of_meeting += chunk.choices[0].delta.content
+     minutes_of_meeting = minutes_of_meeting.replace('User:', '').strip()
+     print("\n")
+     print(minutes_of_meeting)
+     return minutes_of_meeting
+
+
+ # Speech-to-text: extract the audio track from the uploaded video and
+ # transcribe it with Whisper.
+ def speech_to_text(video):
+     print('Started transcribing')
+     audio = AudioSegment.from_file(video, format="mp4")
+     audio.export('temp.mp3', format="mp3")
+     transcript = transcriber.transcribe('temp.mp3')['text']
+     print('transcript:', transcript)
+     return transcript
+
+ # Translation: M2M100 translates the transcript between the two language codes.
+ def translate_text(text, source_language, target_language):
+     tokenizer.src_lang = source_language
+     encoded_ln = tokenizer(text, return_tensors="pt")
+     generated_tokens = model.generate(**encoded_ln, forced_bos_token_id=tokenizer.get_lang_id(target_language))
+     translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+     print('translated_text:', translated_text)
+     return translated_text
+
+ # Dubbing (text-to-speech in another language) via the ElevenLabs dubbing API.
+ def synthesize_speech(video, source_language, target_language):
+     print('Started dubbing')
+     dub_video = create_dub_from_file(input_file_path=video,
+                                      file_format='audio/mpeg',
+                                      source_language=source_language,
+                                      target_language=target_language)
+     return dub_video
+
+ # Handles the processing whenever a participant speaks: transcribe once, then
+ # translate and dub for every listener in parallel.
+ def process_speaker(video, speaker_idx, n_participants, *language_list):
+     transcript = speech_to_text(video)
+
+     outputs = []
+     global meeting_texts
+
+     def process_translation_dubbing(i):
+         if i != speaker_idx:
+             participant_language = language_codes[language_list[i]]
+             speaker_language = language_codes[language_list[speaker_idx]]
+             translated_text = translate_text(transcript, speaker_language, participant_language)
+             dubbed_video = synthesize_speech(video, speaker_language, participant_language)
+             return translated_text, dubbed_video
+         return None, None
+
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         futures = [executor.submit(process_translation_dubbing, i) for i in range(n_participants)]
+         results = [f.result() for f in futures]
+
+     for i, (translated_text, dubbed_video) in enumerate(results):
+         if i == speaker_idx:
+             outputs.insert(0, transcript)  # the speaker's own transcript always ends up first
+         else:
+             outputs.append(translated_text)
+             outputs.append(dubbed_video)
+
+     # Record the speaker's own words for the minutes of meeting.
+     meeting_texts.append({f"Speaker_{speaker_idx+1}": outputs[0]})
+
+     print(len(outputs))
+     print(outputs)
+     print('meeting_texts: ', meeting_texts)
+     return outputs
+
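+ # UI: each participant gets a row with a video upload, a language picker, a
+ # transcript box, and slots for the current speaker's translated text and
+ # dubbed video.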
+ def create_participant_row(i, language_choices):
+     """Creates the UI for a single participant."""
+     with gr.Row():
+         video_input = gr.Video(label=f"Participant {i+1} Video", interactive=True)
+         language_dropdown = gr.Dropdown(choices=language_choices, label=f"Participant {i+1} Language", value=language_choices[i])
+         transcript_output = gr.Textbox(label=f"Participant {i+1} Transcript")
+         translated_text = gr.Textbox(label="Speaker's Translated Text")
+         dubbed_video = gr.Video(label="Speaker's Dubbed Video")
+     return video_input, language_dropdown, transcript_output, translated_text, dubbed_video
+
+ # Main dynamic Gradio interface
+ def create_gradio_interface(n_participants, language_choices):
+     with gr.Blocks() as demo:
+         gr.Markdown("# Multilingual Conference Call Simulation")
+
+         video_inputs = []
+         language_dropdowns = []
+         transcript_outputs = []
+         translated_texts = []
+         dubbed_videos = []
+
+         # Create a row for each participant
+         for i in range(n_participants):
+             video_input, language_dropdown, transcript_output, translated_text, dubbed_video = create_participant_row(i, language_choices)
+             video_inputs.append(video_input)
+             language_dropdowns.append(language_dropdown)
+             transcript_outputs.append(transcript_output)
+             translated_texts.append(translated_text)
+             dubbed_videos.append(dubbed_video)
+
+         # One submit button per participant; the output order mirrors
+         # process_speaker's return value: the speaker's transcript first, then
+         # (translated text, dubbed video) pairs for every other participant.
+         for i in range(n_participants):
+             gr.Button(f"Submit Speaker {i+1}'s Speech").click(
+                 process_speaker,
+                 [video_inputs[i], gr.State(i), gr.State(n_participants)] + [language_dropdowns[j] for j in range(n_participants)],
+                 [transcript_outputs[i]] + [k for j in zip(translated_texts[:i] + translated_texts[i+1:], dubbed_videos[:i] + dubbed_videos[i+1:]) for k in j]
+             )
+
+         minutes = gr.Textbox(label="Minutes of Meeting")
+         gr.Button("Generate Minutes of Meeting").click(summarize, None, minutes)
+
+     # Launch with .queue() so the app keeps serving requests (works in Jupyter too)
+     demo.queue().launch(debug=True, share=True)
+
+
+ create_gradio_interface(n_participants, language_choices)