Create app.py
app.py
ADDED
@@ -0,0 +1,289 @@
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import whisper
from ai71 import AI71
from datetime import datetime
from typing import Optional
import os
import time
from pydub import AudioSegment
from IPython.display import Audio, display, Video, HTML
# import assemblyai as aai
from base64 import b64encode
import gradio as gr
import concurrent.futures
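# Dependency note (a best-guess list, not pinned): this file assumes elevenlabs,
# transformers plus torch, openai-whisper, ai71, pydub, and gradio are installed,
# and that ffmpeg is on the PATH, since pydub shells out to it for the
# mp4 -> mp3 export in speech_to_text below.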
AI71_API_KEY = os.environ.get('AI71_API_KEY')
XI_API_KEY = os.environ.get('ELEVEN_LABS_API_KEY')
client = ElevenLabs(api_key=XI_API_KEY)
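# Both keys are read from the environment, so something like
#   export AI71_API_KEY=...  and  export ELEVEN_LABS_API_KEY=...
# (or the equivalent Space secrets) is assumed before launch. Neither client
# validates its key here, so a missing key only surfaces on the first API call.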
# Translation model (M2M100 1.2B) and Whisper for transcription
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
transcriber = whisper.load_model("turbo")
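# The first run downloads several GB of weights (M2M100-1.2B from the Hugging
# Face hub, plus Whisper's "turbo" checkpoint), so expect a slow cold start.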
language_codes = {"English": "en", "Hindi": "hi", "Portuguese": "pt", "Chinese": "zh", "Spanish": "es",
                  "French": "fr", "German": "de", "Japanese": "ja", "Arabic": "ar", "Russian": "ru",
                  "Korean": "ko", "Indonesian": "id", "Italian": "it", "Dutch": "nl", "Turkish": "tr",
                  "Polish": "pl", "Swedish": "sv", "Filipino": "fil", "Malay": "ms", "Romanian": "ro",
                  "Ukrainian": "uk", "Greek": "el", "Czech": "cs", "Danish": "da", "Finnish": "fi",
                  "Bulgarian": "bg", "Croatian": "hr", "Slovak": "sk"}
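# Caveat: one code table is shared by both backends. That mostly works because
# ElevenLabs dubbing and M2M100 both accept ISO-639-1 codes, but not for every
# entry; M2M100 has no "fil" (it uses "tl" for Tagalog), so some languages would
# need a per-backend mapping before being offered in language_choices.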
meeting_texts = []
n_participants = 4  # Adjust to match the number of people in the call
language_choices = ["English", "Polish", "Hindi", "Arabic"]

def wait_for_dubbing_completion(dubbing_id: str) -> bool:
    """
    Waits for the dubbing process to complete by periodically checking the status.

    Args:
        dubbing_id (str): The dubbing project id.

    Returns:
        bool: True if the dubbing is successful, False otherwise.
    """
    MAX_ATTEMPTS = 120
    CHECK_INTERVAL = 10  # In seconds

    for _ in range(MAX_ATTEMPTS):
        metadata = client.dubbing.get_dubbing_project_metadata(dubbing_id)
        if metadata.status == "dubbed":
            return True
        elif metadata.status == "dubbing":
            print("Dubbing in progress... Will check status again in", CHECK_INTERVAL, "seconds.")
            time.sleep(CHECK_INTERVAL)
        else:
            print("Dubbing failed:", metadata.error_message)
            return False

    print("Dubbing timed out")
    return False

def download_dubbed_file(dubbing_id: str, language_code: str) -> str:
    """
    Downloads the dubbed file for a given dubbing ID and language code.

    Args:
        dubbing_id: The ID of the dubbing project.
        language_code: The language code for the dubbing.

    Returns:
        The file path to the downloaded dubbed file.
    """
    dir_path = f"data/{dubbing_id}"
    os.makedirs(dir_path, exist_ok=True)

    file_path = f"{dir_path}/{language_code}.mp4"
    with open(file_path, "wb") as file:
        for chunk in client.dubbing.get_dubbed_file(dubbing_id, language_code):
            file.write(chunk)

    return file_path

def create_dub_from_file(
    input_file_path: str,
    file_format: str,
    source_language: str,
    target_language: str,
) -> Optional[str]:
    """
    Dubs an audio or video file from one language to another and saves the output.

    Args:
        input_file_path (str): The file path of the audio or video to dub.
        file_format (str): The MIME type of the input file, e.g. "audio/mpeg".
        source_language (str): The language of the input file.
        target_language (str): The target language to dub into.

    Returns:
        Optional[str]: The file path of the dubbed file, or None if the operation failed.
    """
    if not os.path.isfile(input_file_path):
        raise FileNotFoundError(f"The input file does not exist: {input_file_path}")

    with open(input_file_path, "rb") as audio_file:
        response = client.dubbing.dub_a_video_or_an_audio_file(
            file=(os.path.basename(input_file_path), audio_file, file_format),
            target_lang=target_language,  # Language to dub the content into
            # mode="automatic",  # automatic or manual
            source_lang=source_language,
            num_speakers=1,  # Number of speakers to use for the dubbing
            watermark=True,  # Whether to apply a watermark to the output video
        )

    # Poll until the job finishes, then download the result
    dubbing_id = response.dubbing_id
    if wait_for_dubbing_completion(dubbing_id):
        output_file_path = download_dubbed_file(dubbing_id, target_language)
        return output_file_path
    else:
        return None

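# Example with hypothetical paths: create_dub_from_file("data/clip.mp4",
# "audio/mpeg", "en", "pl") blocks for up to MAX_ATTEMPTS * CHECK_INTERVAL
# (20 minutes) and, on success, returns "data/<dubbing_id>/pl.mp4".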
def summarize(meeting_texts=meeting_texts):
    # meeting_texts is a list of single-entry dicts, e.g. [{"Speaker_1": "..."}, ...]
    mt = [f"{k}: {v}" for entry in meeting_texts for k, v in entry.items()]
    meeting_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    meeting_conversation_processed = '\n'.join(mt)
    # print("M:", meeting_conversation_processed)

    minutes_of_meeting = ""
    for chunk in AI71(AI71_API_KEY.strip()).chat.completions.create(
        model="tiiuae/falcon-180b-chat",
        messages=[
            {"role": "system", "content": f"""You are an experienced secretary who can summarize meeting discussions into minutes of meeting.
            Summarize the meeting discussion provided as speaker-wise conversation. Ensure to mention the title as 'Minutes of Meeting held on {meeting_date_time}' and present the summary in a readable format with the title in bold letters."""},
            {"role": "user", "content": meeting_conversation_processed},
        ],
        stream=True,
    ):
        if chunk.choices[0].delta.content:
            summary = chunk.choices[0].delta.content
            minutes_of_meeting += summary
    minutes_of_meeting = minutes_of_meeting.replace('User:', '').strip()
    print("\n")
    print(minutes_of_meeting)
    return minutes_of_meeting

# Speech-to-text conversion via Whisper
def speech_to_text(video):
    print('Started transcribing')
    # Extract the audio track first; Whisper then transcribes the mp3 directly
    audio = AudioSegment.from_file(video, format="mp4")
    audio.export('temp.mp3', format="mp3")
    transcript = transcriber.transcribe('temp.mp3')['text']
    print('transcript:', transcript)
    return transcript

# Text-to-text translation with M2M100
def translate_text(text, source_language, target_language):
    tokenizer.src_lang = source_language
    encoded_ln = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_ln, forced_bos_token_id=tokenizer.get_lang_id(target_language))
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    print('translated_text:', translated_text)
    return translated_text

# Dubbing (speech in another language) via ElevenLabs
def synthesize_speech(video, source_language, target_language):
    print('Started dubbing')
    dub_video = create_dub_from_file(input_file_path=video,
                                     file_format='audio/mpeg',
                                     source_language=source_language,
                                     target_language=target_language)
    return dub_video

# Handles the processing when any participant speaks
def process_speaker(video, speaker_idx, n_participants, *language_list):
    transcript = speech_to_text(video)

    # Collect outputs for each participant
    outputs = []
    global meeting_texts

    def process_translation_dubbing(i):
        if i != speaker_idx:
            participant_language = language_codes[language_list[i]]
            speaker_language = language_codes[language_list[speaker_idx]]
            translated_text = translate_text(transcript, speaker_language, participant_language)
            dubbed_video = synthesize_speech(video, speaker_language, participant_language)
            return translated_text, dubbed_video
        return None, None

    # Translate and dub for all listeners in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_translation_dubbing, i) for i in range(n_participants)]
        results = [f.result() for f in futures]

    # The speaker's transcript always ends up at index 0; each listener then
    # contributes a (translated_text, dubbed_video) pair in participant order
    for i, (translated_text, dubbed_video) in enumerate(results):
        if i == speaker_idx:
            outputs.insert(0, transcript)
        else:
            outputs.append(translated_text)
            outputs.append(dubbed_video)

    # Log what was said: the raw transcript when Speaker 1 talks, otherwise the
    # translation into Participant 1's language, keeping the minutes in one language
    if speaker_idx == 0:
        meeting_texts.append({f"Speaker_{speaker_idx+1}": outputs[0]})
    else:
        meeting_texts.append({f"Speaker_{speaker_idx+1}": outputs[1]})

    print(len(outputs))
    print(outputs)
    print('meeting_texts: ', meeting_texts)
    return outputs

+
def create_participant_row(i, language_choices):
|
246 |
+
"""Creates the UI for a single participant."""
|
247 |
+
with gr.Row():
|
248 |
+
video_input = gr.Video(label=f"Participant {i+1} Video", interactive=True)
|
249 |
+
language_dropdown = gr.Dropdown(choices=language_choices, label=f"Participant {i+1} Language", value=language_choices[i])
|
250 |
+
transcript_output = gr.Textbox(label=f"Participant {i+1} Transcript")
|
251 |
+
translated_text = gr.Textbox(label="Speaker's Translated Text")
|
252 |
+
dubbed_video = gr.Video(label="Speaker's Dubbed Video")
|
253 |
+
return video_input, language_dropdown, transcript_output, translated_text, dubbed_video
|
254 |
+
|
255 |
+
# Main dynamic Gradio interface
|
256 |
+
def create_gradio_interface(n_participants, language_choices):
|
257 |
+
with gr.Blocks() as demo:
|
258 |
+
gr.Markdown("# Multilingual Conference Call Simulation")
|
259 |
+
|
260 |
+
video_inputs = []
|
261 |
+
language_dropdowns = []
|
262 |
+
transcript_outputs = []
|
263 |
+
translated_texts = []
|
264 |
+
dubbed_videos = []
|
265 |
+
|
266 |
+
# Create a row for each participant
|
267 |
+
for i in range(n_participants):
|
268 |
+
video_input, language_dropdown, transcript_output, translated_text, dubbed_video = create_participant_row(i, language_choices)
|
269 |
+
video_inputs.append(video_input)
|
270 |
+
language_dropdowns.append(language_dropdown)
|
271 |
+
transcript_outputs.append(transcript_output)
|
272 |
+
translated_texts.append(translated_text)
|
273 |
+
dubbed_videos.append(dubbed_video)
|
274 |
+
|
275 |
+
# Create dynamic processing buttons for each participant
|
276 |
+
for i in range(n_participants):
|
277 |
+
gr.Button(f"Submit Speaker {i+1}'s Speech").click(
|
278 |
+
process_speaker,
|
279 |
+
[video_inputs[i], gr.State(i), gr.State(n_participants)] + [language_dropdowns[j] for j in range(n_participants)],
|
280 |
+
[transcript_outputs[i]] + [k for j in zip(translated_texts[:i]+translated_texts[i+1:], dubbed_videos[:i]+dubbed_videos[i+1:]) for k in j]
|
281 |
+
)
|
282 |
+
minutes = gr.Textbox(label="Minutes of Meeting")
|
283 |
+
gr.Button(f"Generate Minutes of meeting").click(summarize, None, minutes)
|
284 |
+
|
285 |
+
# Launch with .queue() to keep it running properly in Jupyter
|
286 |
+
demo.queue().launch(debug=True, share=True)
|
287 |
+
|
288 |
+
|
289 |
+
create_gradio_interface(n_participants, language_choices)
|
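# Note: share=True also exposes the app on a temporary public gradio.live URL,
# and debug=True keeps the launch call blocking, which is what lets the app be
# stopped with a keyboard interrupt when run from a notebook.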