AI-Edify committed on
Commit af1a473 · verified · 1 Parent(s): c562fea

Update app.py

Files changed (1)
  1. app.py +74 -79
app.py CHANGED
@@ -1,90 +1,85 @@
+import os
 import gradio as gr
-from huggingface_hub import InferenceClient
-import difflib
-from transformers import pipeline  # Import transformers to load the speech-to-text model
-
-# Load Hugging Face Inference client
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-# Load the speech-to-text model using transformers pipeline
-s2t_model = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h-lv60-self")
+import openai
+from openai import OpenAI
+import speech_recognition as sr
+import threading
+import time
+
+# Initialize OpenAI client with API key from environment variable
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+# Create an assistant
+assistant = client.beta.assistants.create(
+    name="Pronunciation Assistant",
+    instructions="You are a helpful pronunciation assistant. You compare the generated text with the user's transcription and then provide feedback on how the user can improve their pronunciation accordingly. You also single out specific words they pronounced incorrectly and give tips on how to improve like for example 'schedule' can be pronounced as 'sked-jool'.",
+    model="gpt-4-1106-preview"
+)
 
-def generate_text_with_huggingface(system_message, max_tokens, temperature, top_p):
-    """
-    Function to generate text using Hugging Face Inference API
-    based on the system message, max tokens, temperature, and top-p.
-    """
-    messages = [{"role": "system", "content": system_message}]
-    message = ""
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-
-    return response.strip()  # Return the generated text
-
-
-def pronunciation_feedback(transcription, reference_text):
-    """
-    Function to provide feedback on pronunciation based on differences
-    between the transcription and the reference (expected) text.
-    """
-    diff = difflib.ndiff(reference_text.split(), transcription.split())
-    # Identify words that are incorrect or missing in the transcription
-    errors = [word for word in diff if word.startswith('- ')]
-
-    if errors:
-        feedback = "Mispronounced words: " + ', '.join([error[2:] for error in errors])
-    else:
-        feedback = "Great job! Your pronunciation is spot on."
-
-    return feedback
-
-
-def transcribe_and_feedback(audio, system_message, max_tokens, temperature, top_p):
-    """
-    Transcribe the audio and provide pronunciation feedback using the generated text.
-    """
-    # Generate the reference text using Hugging Face Inference API
-    reference_text = generate_text_with_huggingface(system_message, max_tokens, temperature, top_p)
-
-    # Transcribe the audio using the speech-to-text model
-    transcription = s2t_model(audio)["text"]
-
-    # Provide pronunciation feedback based on the transcription and the generated text
-    feedback = pronunciation_feedback(transcription, reference_text)
-
-    return transcription, feedback, reference_text
-
+def generate_text():
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "Generate a short paragraph (2-3 sentences) for an English learner to read aloud."},
+            {"role": "user", "content": "Create a practice text."}
+        ]
+    )
+    return response.choices[0].message.content
+
+def get_pronunciation_feedback(original_text, transcription):
+    thread = client.beta.threads.create()
+
+    message = client.beta.threads.messages.create(
+        thread_id=thread.id,
+        role="user",
+        content=f"Original text: '{original_text}'\nTranscription: '{transcription}'\nProvide pronunciation feedback."
+    )
+
+    run = client.beta.threads.runs.create(
+        thread_id=thread.id,
+        assistant_id=assistant.id
+    )
+
+    while run.status != "completed":
+        time.sleep(1)
+        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
+
+    messages = client.beta.threads.messages.list(thread_id=thread.id)
+    return messages.data[0].content[0].text.value
+
+def transcribe_audio_realtime(audio):
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(audio) as source:
+        audio_data = recognizer.record(source)
+    try:
+        return recognizer.recognize_google(audio_data)
+    except sr.UnknownValueError:
+        return "Could not understand audio"
+    except sr.RequestError:
+        return "Could not request results from the speech recognition service"
+
+def practice_pronunciation(audio):
+    original_text = generate_text()
+    transcription = transcribe_audio_realtime(audio)
+    feedback = get_pronunciation_feedback(original_text, transcription)
+    return original_text, transcription, feedback
 
 # Gradio interface
 demo = gr.Interface(
-    fn=transcribe_and_feedback,  # The function that transcribes audio and provides feedback
+    fn=practice_pronunciation,
     inputs=[
-        gr.Audio(type="filepath", label="Record Audio"),  # Microphone input for recording
-        gr.Textbox(value="Please read a simple sentence.", label="System message"),  # Message used to generate text
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),  # Controls max token length for the generated text
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),  # Temperature control for text generation
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")  # Top-p control for text generation
+        gr.Audio(source="microphone", type="filepath")
     ],
     outputs=[
-        gr.Textbox(label="Transcription"),  # Display transcription of the audio
-        gr.Textbox(label="Pronunciation Feedback"),  # Feedback on pronunciation
-        gr.Textbox(label="Generated Text (What You Were Supposed to Read)")  # Display the text generated by the API
+        gr.Textbox(label="Text to Read"),
+        gr.Textbox(label="Your Transcription"),
+        gr.Textbox(label="Pronunciation Feedback")
     ],
-    title="Speech-to-Text with Pronunciation Feedback",
-    description="Record an audio sample and the system will transcribe it, "
-                "compare your transcription to the generated text, and give pronunciation feedback.",
-    live=True  # Real-time interaction
+    title="Pronunciation Practice Tool",
+    description="Read the generated text aloud. The system will transcribe your speech and provide pronunciation feedback.",
+    live=True
 )
 
-# Enable queuing and launch the app
-demo.queue().launch(show_error=True)
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
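
A note on the run-polling loop added in this commit: `while run.status != "completed"` never exits if the run ends in any other terminal state (the Assistants API also reports states like `failed`, `cancelled`, and `expired`), so a failed run hangs the app. A minimal defensive sketch under the same client setup; `wait_for_run`, the timeout, and the poll interval are illustrative choices, not part of the commit:

```python
import time

# Terminal run states in the Assistants API; only "completed" carries a reply.
TERMINAL_STATES = {"completed", "failed", "cancelled", "expired"}

def wait_for_run(client, thread_id, run_id, timeout=60.0, poll_interval=1.0):
    """Poll a run until it reaches a terminal state or the timeout elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        run = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if run.status in TERMINAL_STATES:
            return run
        time.sleep(poll_interval)
    raise TimeoutError(f"Run {run_id} did not finish within {timeout} seconds")
```

`get_pronunciation_feedback` could then branch on the returned `run.status` and surface an error message instead of reading the thread's latest message unconditionally.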
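One compatibility caveat: `gr.Audio(source="microphone", type="filepath")` is the Gradio 3.x signature, and the commit does not pin a `gradio` version. Gradio 4.x renamed the parameter to `sources` and made it a list, so the 3.x keyword fails there. A hedged sketch that works under either major version, assuming only that the rejected keyword raises `TypeError`:

```python
import gradio as gr

# Prefer the Gradio 4.x signature; fall back to 3.x if it is rejected.
try:
    mic_input = gr.Audio(sources=["microphone"], type="filepath")  # Gradio 4.x
except TypeError:
    mic_input = gr.Audio(source="microphone", type="filepath")     # Gradio 3.x
```

`live=True` is also worth a second look: Gradio re-runs the function as the input changes, and each run here triggers fresh OpenAI calls, so the default submit-button flow would be cheaper.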