mr2along committed on
Commit
e3a58c6
·
verified ·
1 Parent(s): 6369c87

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -50
app.py CHANGED
@@ -2,41 +2,38 @@ import os
2
  import speech_recognition as sr
3
  import difflib
4
  import gradio as gr
5
- from gtts import gTTS
6
- import io
7
- from pydub import AudioSegment
8
 
9
- # Create audio directory if it doesn't exist
10
  if not os.path.exists('audio'):
11
  os.makedirs('audio')
12
 
13
- # Step 1: Transcribe the audio file
14
  def transcribe_audio(audio):
15
  if audio is None:
16
- return "No audio file provided." # Handle the case when no audio is uploaded
17
 
18
  recognizer = sr.Recognizer()
19
  audio_format = audio.split('.')[-1].lower()
20
 
21
- # Convert to WAV if the audio is not in a supported format
22
  if audio_format != 'wav':
23
  try:
24
- # Load the audio file with pydub
25
  audio_segment = AudioSegment.from_file(audio)
26
  wav_path = audio.replace(audio_format, 'wav')
27
- audio_segment.export(wav_path, format='wav') # Convert to WAV
28
- audio = wav_path # Update audio path to the converted file
29
  except Exception as e:
30
  return f"Error converting audio: {e}"
31
 
32
- # Convert audio into recognizable format for the Recognizer
33
  audio_file = sr.AudioFile(audio)
34
 
35
  with audio_file as source:
36
  audio_data = recognizer.record(source)
37
 
38
  try:
39
- # Recognize the audio using Google Web Speech API
40
  transcription = recognizer.recognize_google(audio_data)
41
  return transcription
42
  except sr.UnknownValueError:
@@ -44,78 +41,75 @@ def transcribe_audio(audio):
44
  except sr.RequestError as e:
45
  return f"Error with Google Speech Recognition service: {e}"
46
 
47
- # Step 2: Create pronunciation audio for incorrect words
48
  def create_pronunciation_audio(word):
49
- tts = gTTS(word)
50
- audio_file_path = f"audio/{word}.mp3" # Save the audio to a file
51
- tts.save(audio_file_path)
52
- return audio_file_path # Return the file path of the saved audio
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- # Step 3: Compare the transcribed text with the input paragraph
55
  def compare_texts(reference_text, transcribed_text):
 
56
  reference_words = reference_text.split()
57
  transcribed_words = transcribed_text.split()
58
- incorrect_words_audios = [] # Store audio paths for incorrect words
59
 
60
  sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
61
  similarity_score = round(sm.ratio() * 100, 2)
62
 
63
- # Construct HTML output
64
  html_output = f"<strong>Fidelity Class:</strong> {'CORRECT' if similarity_score > 50 else 'INCORRECT'}<br>"
65
  html_output += f"<strong>Quality Score:</strong> {similarity_score}<br>"
66
  html_output += f"<strong>Transcribed Text:</strong> {transcribed_text}<br>"
67
  html_output += "<strong>Word Score List:</strong><br>"
68
 
69
- # Generate colored word score list
70
  for i, word in enumerate(reference_words):
71
  try:
72
  if word.lower() == transcribed_words[i].lower():
73
- html_output += f'<span style="color: green;">{word}</span> ' # Correct words in green
74
  elif difflib.get_close_matches(word, transcribed_words):
75
- html_output += f'<span style="color: yellow;">{word}</span> ' # Close matches in yellow
76
  else:
77
- # Incorrect words in red
78
- html_output += f'<span style="color: red;">{word}</span> '
79
- # Create pronunciation audio for the incorrect word
80
  audio_file_path = create_pronunciation_audio(word)
81
  incorrect_words_audios.append((word, audio_file_path))
82
  except IndexError:
83
- html_output += f'<span style="color: red;">{word}</span> ' # Words in reference that were not transcribed
84
 
85
- # Provide audio for incorrect words
86
  if incorrect_words_audios:
87
  html_output += "<br><strong>Pronunciation for Incorrect Words:</strong><br>"
88
  for word, audio in incorrect_words_audios:
89
- suggestion = difflib.get_close_matches(word, reference_words, n=1)
90
- suggestion_text = f" (Did you mean: <em>{suggestion[0]}</em>?)" if suggestion else ""
91
  html_output += f'{word}: '
92
- html_output += f'<audio controls><source src="{audio}" type="audio/mpeg">Your browser does not support the audio tag.</audio>{suggestion_text}<br>'
93
-
94
 
95
  return html_output
96
 
97
- # Step 4: Text-to-Speech Function
98
  def text_to_speech(paragraph):
99
- if not paragraph:
100
- return None # Handle the case when no text is provided
101
-
102
- tts = gTTS(paragraph)
103
- audio_file_path = "audio/paragraph.mp3" # Save the audio to a file
104
- tts.save(audio_file_path)
105
- return audio_file_path # Return the file path instead of None
106
 
107
- # Gradio Interface Function
108
  def gradio_function(paragraph, audio):
109
- # Transcribe the audio
110
  transcribed_text = transcribe_audio(audio)
111
-
112
- # Compare the original paragraph with the transcribed text
113
  comparison_result = compare_texts(paragraph, transcribed_text)
114
-
115
- # Return comparison result
116
  return comparison_result
117
 
118
- # Gradio Interface using the updated API
119
  interface = gr.Interface(
120
  fn=gradio_function,
121
  inputs=[
@@ -127,7 +121,6 @@ interface = gr.Interface(
127
  description="Input a paragraph, record your audio, and compare the transcription to the original text."
128
  )
129
 
130
- # Gradio Interface for Text-to-Speech
131
  tts_interface = gr.Interface(
132
  fn=text_to_speech,
133
  inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
@@ -136,8 +129,8 @@ tts_interface = gr.Interface(
136
  description="This tool will read your input paragraph aloud."
137
  )
138
 
139
- # Combine both interfaces into one
140
  demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])
141
 
142
- # Launch Gradio app
143
  demo.launch()
 
2
  import speech_recognition as sr
3
  import difflib
4
  import gradio as gr
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ import soundfile as sf
8
 
9
+ # Tạo thư mục audio nếu chưa tồn tại
10
  if not os.path.exists('audio'):
11
  os.makedirs('audio')
12
 
13
+ # Bước 1: Chuyển đổi âm thanh thành văn bản
14
  def transcribe_audio(audio):
15
  if audio is None:
16
+ return "No audio file provided." # Xử trường hợp không tệp âm thanh
17
 
18
  recognizer = sr.Recognizer()
19
  audio_format = audio.split('.')[-1].lower()
20
 
21
+ # Chuyển đổi sang WAV nếu âm thanh không định dạng hỗ trợ
22
  if audio_format != 'wav':
23
  try:
 
24
  audio_segment = AudioSegment.from_file(audio)
25
  wav_path = audio.replace(audio_format, 'wav')
26
+ audio_segment.export(wav_path, format='wav') # Chuyển đổi sang WAV
27
+ audio = wav_path # Cập nhật đường dẫn âm thanh
28
  except Exception as e:
29
  return f"Error converting audio: {e}"
30
 
 
31
  audio_file = sr.AudioFile(audio)
32
 
33
  with audio_file as source:
34
  audio_data = recognizer.record(source)
35
 
36
  try:
 
37
  transcription = recognizer.recognize_google(audio_data)
38
  return transcription
39
  except sr.UnknownValueError:
 
41
  except sr.RequestError as e:
42
  return f"Error with Google Speech Recognition service: {e}"
43
 
44
+ # Bước 2: Tạo âm thanh phát âm cho các từ sai
45
  def create_pronunciation_audio(word):
46
+ model_name = "tts_models/en/ljspeech/tacotron2" # Mô hình TTS
47
+ model = AutoModelForCausalLM.from_pretrained(model_name)
48
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
49
+
50
+ inputs = tokenizer(word, return_tensors="pt")
51
+
52
+ # Tạo âm thanh từ văn bản
53
+ with torch.no_grad():
54
+ outputs = model.generate(**inputs)
55
+
56
+ # Lưu âm thanh vào tệp
57
+ audio_file_path = f"audio/{word}.wav"
58
+ sf.write(audio_file_path, outputs.numpy(), 22050) # Giả định tần số mẫu 22050Hz
59
+
60
+ return audio_file_path
61
 
62
+ # Bước 3: So sánh văn bản đã chuyển đổi với đoạn văn bản gốc
63
  def compare_texts(reference_text, transcribed_text):
64
+ word_scores = []
65
  reference_words = reference_text.split()
66
  transcribed_words = transcribed_text.split()
67
+ incorrect_words_audios = [] # Lưu trữ đường dẫn âm thanh cho các từ sai
68
 
69
  sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
70
  similarity_score = round(sm.ratio() * 100, 2)
71
 
72
+ # Tạo đầu ra HTML
73
  html_output = f"<strong>Fidelity Class:</strong> {'CORRECT' if similarity_score > 50 else 'INCORRECT'}<br>"
74
  html_output += f"<strong>Quality Score:</strong> {similarity_score}<br>"
75
  html_output += f"<strong>Transcribed Text:</strong> {transcribed_text}<br>"
76
  html_output += "<strong>Word Score List:</strong><br>"
77
 
78
+ # Tạo danh sách điểm số từ màu sắc
79
  for i, word in enumerate(reference_words):
80
  try:
81
  if word.lower() == transcribed_words[i].lower():
82
+ html_output += f'<span style="color: green;">{word}</span> ' # Từ đúng màu xanh
83
  elif difflib.get_close_matches(word, transcribed_words):
84
+ html_output += f'<span style="color: yellow;">{word}</span> ' # Từ gần đúng màu vàng
85
  else:
86
+ html_output += f'<span style="color: red;">{word}</span> ' # Từ sai màu đỏ
87
+ # Tạo âm thanh phát âm cho từ sai
 
88
  audio_file_path = create_pronunciation_audio(word)
89
  incorrect_words_audios.append((word, audio_file_path))
90
  except IndexError:
91
+ html_output += f'<span style="color: red;">{word}</span> ' # Từ tham chiếu không được chuyển đổi
92
 
93
+ # Cung cấp âm thanh cho các từ sai
94
  if incorrect_words_audios:
95
  html_output += "<br><strong>Pronunciation for Incorrect Words:</strong><br>"
96
  for word, audio in incorrect_words_audios:
 
 
97
  html_output += f'{word}: '
98
+ html_output += f'<audio controls><source src="{audio}" type="audio/wav">Your browser does not support the audio tag.</audio><br>'
 
99
 
100
  return html_output
101
 
102
+ # Bước 4: Chức năng Text-to-Speech
103
  def text_to_speech(paragraph):
104
+ audio_file_path = create_pronunciation_audio(paragraph) # Sử dụng hàm đã sửa
105
+ return audio_file_path
 
 
 
 
 
106
 
107
+ # Giao diện Gradio
108
  def gradio_function(paragraph, audio):
 
109
  transcribed_text = transcribe_audio(audio)
 
 
110
  comparison_result = compare_texts(paragraph, transcribed_text)
 
 
111
  return comparison_result
112
 
 
113
  interface = gr.Interface(
114
  fn=gradio_function,
115
  inputs=[
 
121
  description="Input a paragraph, record your audio, and compare the transcription to the original text."
122
  )
123
 
 
124
  tts_interface = gr.Interface(
125
  fn=text_to_speech,
126
  inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
 
129
  description="This tool will read your input paragraph aloud."
130
  )
131
 
132
+ # Kết hợp cả hai giao diện
133
  demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])
134
 
135
+ # Khởi động ứng dụng Gradio
136
  demo.launch()