Baghdad99 committed
Commit 75c9b3b · Parent: d53b668

Update app.py

Files changed (1):
  1. app.py +36 -36
app.py CHANGED
@@ -15,53 +15,53 @@ def translate_speech(audio_data_tuple):
     # Extract the audio data from the tuple
     sample_rate, audio_data = audio_data_tuple
 
-    # Save the audio data to a temporary file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
-        sf.write(temp_audio_file.name, audio_data, sample_rate)
+    # Resample the audio data to 16000 Hz
+    audio_data_resampled = librosa.resample(audio_data, sample_rate, 16000)
 
-    # Prepare the input dictionary
-    input_dict = asr_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)
+    # Prepare the input dictionary
+    input_dict = asr_processor(audio_data_resampled, sampling_rate=16000, return_tensors="pt", padding=True) # Pass the resampled audio_data here
 
-    # Use the ASR model to get the logits
-    logits = asr_model(input_dict.input_values.to("cpu")).logits
+    # Use the ASR model to get the logits
+    logits = asr_model(input_dict.input_values.to("cpu")).logits
 
-    # Get the predicted IDs
-    pred_ids = torch.argmax(logits, dim=-1)[0]
+    # Get the predicted IDs
+    pred_ids = torch.argmax(logits, dim=-1)[0]
 
-    # Decode the predicted IDs to get the transcription
-    transcription = asr_processor.decode(pred_ids)
-    print(f"Transcription: {transcription}") # Print the transcription
+    # Decode the predicted IDs to get the transcription
+    transcription = asr_processor.decode(pred_ids)
+    print(f"Transcription: {transcription}") # Print the transcription
 
-    # Use the translation pipeline to translate the transcription
-    translated_text = translator(transcription, return_tensors="pt")
-    print(f"Translated text: {translated_text}") # Print the translated text
+    # Use the translation pipeline to translate the transcription
+    translated_text = translator(transcription, return_tensors="pt")
+    print(f"Translated text: {translated_text}") # Print the translated text
 
-    # Check if the translated text contains 'generated_token_ids'
-    if 'generated_token_ids' in translated_text[0]:
-        # Decode the tokens into text
-        translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'])
-        print(f"Translated text string: {translated_text_str}") # Print the translated text string
-    else:
-        print("The translated text does not contain 'generated_token_ids'")
-        return
+    # Check if the translated text contains 'generated_token_ids'
+    if 'generated_token_ids' in translated_text[0]:
+        # Decode the tokens into text
+        translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'])
+        print(f"Translated text string: {translated_text_str}") # Print the translated text string
+    else:
+        print("The translated text does not contain 'generated_token_ids'")
+        return
 
-    # Use the text-to-speech pipeline to synthesize the translated text
-    synthesised_speech = tts(translated_text_str)
+    # Use the text-to-speech pipeline to synthesize the translated text
+    synthesised_speech = tts(translated_text_str)
 
-    # Check if the synthesised speech contains 'audio'
-    if 'audio' in synthesised_speech:
-        synthesised_speech_data = synthesised_speech['audio']
-    else:
-        print("The synthesised speech does not contain 'audio'")
-        return
+    # Check if the synthesised speech contains 'audio'
+    if 'audio' in synthesised_speech:
+        synthesised_speech_data = synthesised_speech['audio']
+    else:
+        print("The synthesised speech does not contain 'audio'")
+        return
 
-    # Flatten the audio data
-    synthesised_speech_data = synthesised_speech_data.flatten()
+    # Flatten the audio data
+    synthesised_speech_data = synthesised_speech_data.flatten()
 
-    # Scale the audio data to the range of int16 format
-    synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)
-
-    return 16000, synthesised_speech
+    # Scale the audio data to the range of int16 format
+    synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)
+
+    return 16000, synthesised_speech
 
 # Define the Gradio interface
 iface = gr.Interface(
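
A note on the new resampling call: librosa.resample(audio_data, sample_rate, 16000) uses the legacy positional signature. In librosa 0.10 and later, orig_sr and target_sr are keyword-only, so the positional form raises a TypeError; resampling also expects floating-point samples, while Gradio's numpy microphone audio usually arrives as int16. A minimal sketch of the same step against current librosa (the helper name and the float conversion are assumptions, not part of the commit):

import numpy as np
import librosa

def resample_to_16k(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
    # Gradio's numpy audio is typically int16; librosa operates on floats.
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / 32768.0
    # librosa >= 0.10 requires the sample rates as keyword arguments.
    return librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)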
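
The unchanged tail of the function flattens the synthesised audio, scales it by 32767, and casts to int16, matching the (sample_rate, int16 array) tuple that Gradio's numpy audio output expects. A self-contained sketch of that conversion, with an added clip that is not in the commit but prevents wrap-around if a sample strays outside [-1, 1]:

import numpy as np

def to_gradio_audio(samples, sample_rate=16000):
    samples = np.asarray(samples).flatten()
    # Clip before casting so out-of-range floats don't overflow int16.
    samples = np.clip(samples, -1.0, 1.0)
    return sample_rate, (samples * 32767).astype(np.int16)

One caveat: the function returns a hard-coded 16000 Hz. If the tts pipeline reports its own rate (Hugging Face text-to-speech pipelines generally return a 'sampling_rate' alongside 'audio'), returning that value instead would avoid pitch-shifted playback.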
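
The hunk's trailing context stops at the opening of the gr.Interface( call, so the actual wiring is not visible in this diff. A typical setup for a function with this signature follows; every argument below is an assumption for illustration, not a quote from app.py:

import gradio as gr

iface = gr.Interface(
    fn=translate_speech,
    # type="numpy" delivers microphone audio as the (sample_rate, np.ndarray)
    # tuple that translate_speech unpacks, and plays back the tuple it returns.
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="numpy"),
)
iface.launch()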