Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,90 +1,85 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
-
import
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
""
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
messages = [{"role": "system", "content": system_message}]
|
18 |
-
message = ""
|
19 |
-
|
20 |
-
response = ""
|
21 |
-
|
22 |
-
for message in client.chat_completion(
|
23 |
-
messages,
|
24 |
-
max_tokens=max_tokens,
|
25 |
-
stream=True,
|
26 |
-
temperature=temperature,
|
27 |
-
top_p=top_p,
|
28 |
-
):
|
29 |
-
token = message.choices[0].delta.content
|
30 |
-
response += token
|
31 |
-
|
32 |
-
return response.strip() # Return the generated text
|
33 |
-
|
34 |
-
|
35 |
-
def pronunciation_feedback(transcription, reference_text):
|
36 |
-
"""
|
37 |
-
Function to provide feedback on pronunciation based on differences
|
38 |
-
between the transcription and the reference (expected) text.
|
39 |
-
"""
|
40 |
-
diff = difflib.ndiff(reference_text.split(), transcription.split())
|
41 |
-
# Identify words that are incorrect or missing in the transcription
|
42 |
-
errors = [word for word in diff if word.startswith('- ')]
|
43 |
-
|
44 |
-
if errors:
|
45 |
-
feedback = "Mispronounced words: " + ', '.join([error[2:] for error in errors])
|
46 |
-
else:
|
47 |
-
feedback = "Great job! Your pronunciation is spot on."
|
48 |
-
|
49 |
-
return feedback
|
50 |
-
|
51 |
-
|
52 |
-
def transcribe_and_feedback(audio, system_message, max_tokens, temperature, top_p):
|
53 |
-
"""
|
54 |
-
Transcribe the audio and provide pronunciation feedback using the generated text.
|
55 |
-
"""
|
56 |
-
# Generate the reference text using Hugging Face Inference API
|
57 |
-
reference_text = generate_text_with_huggingface(system_message, max_tokens, temperature, top_p)
|
58 |
-
|
59 |
-
# Transcribe the audio using the speech-to-text model
|
60 |
-
transcription = s2t_model(audio)["text"]
|
61 |
-
|
62 |
-
# Provide pronunciation feedback based on the transcription and the generated text
|
63 |
-
feedback = pronunciation_feedback(transcription, reference_text)
|
64 |
-
|
65 |
-
return transcription, feedback, reference_text
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
# Gradio interface
|
69 |
demo = gr.Interface(
|
70 |
-
fn=
|
71 |
inputs=[
|
72 |
-
gr.Audio(
|
73 |
-
gr.Textbox(value="Please read a simple sentence.", label="System message"), # Message used to generate text
|
74 |
-
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), # Controls max token length for the generated text
|
75 |
-
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), # Temperature control for text generation
|
76 |
-
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") # Top-p control for text generation
|
77 |
],
|
78 |
outputs=[
|
79 |
-
gr.Textbox(label="
|
80 |
-
gr.Textbox(label="
|
81 |
-
gr.Textbox(label="
|
82 |
],
|
83 |
-
title="
|
84 |
-
description="
|
85 |
-
|
86 |
-
live=True # Real-time interaction
|
87 |
)
|
88 |
|
89 |
-
#
|
90 |
-
|
|
|
|
1 |
+
import os
import gradio as gr
import openai  # NOTE(review): unused — `OpenAI` is imported directly below; candidate for removal
from openai import OpenAI
import speech_recognition as sr
import threading  # NOTE(review): unused in this file; candidate for removal
import time

# Initialize the OpenAI client with the API key from the environment.
# If OPENAI_API_KEY is unset this passes api_key=None and API calls will fail
# at request time with an authentication error.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Create the pronunciation-feedback assistant.
# NOTE(review): this runs at import time and creates a brand-new assistant on
# every app (re)start — consider reusing a fixed assistant ID instead; verify
# against the deployment's restart behavior.
assistant = client.beta.assistants.create(
    name="Pronunciation Assistant",
    instructions="You are a helpful pronunciation assistant. You compare the generated text with the user's transcription and then provide feedback on how the user can improve their pronunciation accordingly. You also single out specific words they pronounced incorrectly and give tips on how to improve like for example 'schedule' can be pronounced as 'sked-jool'.",
    model="gpt-4-1106-preview"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
def generate_text():
    """Generate a short practice paragraph for an English learner to read aloud.

    Returns:
        str: the model-generated practice text (2-3 sentences, per the prompt).

    Raises:
        openai.OpenAIError: if the chat-completion request fails.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Generate a short paragraph (2-3 sentences) for an English learner to read aloud."},
            {"role": "user", "content": "Create a practice text."}
        ]
    )
    return response.choices[0].message.content
|
28 |
+
|
29 |
+
def get_pronunciation_feedback(original_text, transcription):
    """Ask the Assistants API for pronunciation feedback.

    Creates a fresh thread, posts the original text and the user's
    transcription, runs the module-level `assistant` on it, and polls
    until the run reaches a terminal state.

    Args:
        original_text (str): the text the user was asked to read.
        transcription (str): what speech recognition heard the user say.

    Returns:
        str: the assistant's feedback message text.

    Raises:
        RuntimeError: if the run ends in a non-successful terminal state
            (failed / cancelled / expired).
    """
    thread = client.beta.threads.create()

    message = client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=f"Original text: '{original_text}'\nTranscription: '{transcription}'\nProvide pronunciation feedback."
    )

    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id
    )

    # BUGFIX: the original loop only exited on "completed", so a run that
    # ended as "failed"/"cancelled"/"expired" made this poll spin forever.
    # Poll until ANY terminal status, then surface failures explicitly.
    terminal_statuses = {"completed", "failed", "cancelled", "expired"}
    while run.status not in terminal_statuses:
        time.sleep(1)  # simple 1 s poll; the Assistants API has no push notification here
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

    if run.status != "completed":
        raise RuntimeError(f"Assistant run ended with status '{run.status}'")

    # messages.list returns newest first, so data[0] is the assistant's reply.
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    return messages.data[0].content[0].text.value
|
49 |
+
|
50 |
+
def transcribe_audio_realtime(audio):
    """Transcribe a recorded audio file using Google's free speech API.

    Args:
        audio (str): filesystem path to an audio file (Gradio passes a
            filepath because the Audio component uses type="filepath").

    Returns:
        str: the recognized text, or a human-readable error sentence when
            recognition fails. Callers display this string directly, so the
            error sentences are part of the contract — do not change them.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # Speech was unintelligible to the recognizer.
        return "Could not understand audio"
    except sr.RequestError:
        # Network / quota problem reaching the recognition service.
        return "Could not request results from the speech recognition service"
|
60 |
+
|
61 |
+
def practice_pronunciation(audio):
    """End-to-end pipeline backing the Gradio interface.

    Generates a practice text, transcribes the user's recording, and asks
    the assistant for pronunciation feedback.

    Args:
        audio (str | None): filepath of the recording, or None when Gradio
            fires the callback before anything has been recorded
            (live=True triggers eagerly).

    Returns:
        tuple[str, str, str]: (original text, transcription, feedback).
    """
    # ROBUSTNESS FIX: with live=True Gradio calls this with audio=None before
    # any recording exists; the original crashed inside sr.AudioFile(None).
    if audio is None:
        return "", "", "No audio received — please record yourself reading the text."

    original_text = generate_text()
    transcription = transcribe_audio_realtime(audio)
    feedback = get_pronunciation_feedback(original_text, transcription)
    return original_text, transcription, feedback
|
66 |
|
67 |
# Gradio interface: one microphone input, three read-only text outputs.
# NOTE(review): live=True re-invokes the full OpenAI pipeline on every input
# change, which is slow and costly — confirm this is intended; a submit
# button (live=False) is usually preferable here.
# NOTE(review): Gradio 4.x renamed Audio's `source=` to `sources=[...]` —
# verify against the pinned gradio version.
demo = gr.Interface(
    fn=practice_pronunciation,
    inputs=[
        gr.Audio(source="microphone", type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Text to Read"),
        gr.Textbox(label="Your Transcription"),
        gr.Textbox(label="Pronunciation Feedback")
    ],
    title="Pronunciation Practice Tool",
    description="Read the generated text aloud. The system will transcribe your speech and provide pronunciation feedback.",
    live=True
)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|