Fabriwin committed on
Commit 3175dca · verified · 1 Parent(s): 24e657d

Upload 3 files

Files changed (3)
  1. README.md +14 -14
  2. app.py +73 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -1,14 +1,14 @@
+ ---
+ title: Convx
+ emoji: 🦀
+ colorFrom: purple
+ colorTo: red
+ sdk: gradio
+ sdk_version: 5.9.1
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: Conversational small mixture model
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,73 @@
+ import gradio as gr
+ import torch
+ from transformers import pipeline
+ import time
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+
+ # Load the three models as transformers pipelines: speech recognition,
+ # text generation, and text-to-speech. Generation parameters are passed
+ # at call time (see generate_text) rather than at construction.
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)
+ text_pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-360M")
+ tts_pipe = pipeline("text-to-speech", model="mussacharles60/swahili-tts-female-voice")
+
+ # Define conversation rules
+ MAX_INPUT_SIZE = 100  # currently unused
+ PREDEFINED_ATTRIBUTES = ["name", "age", "location"]
+ CONTEXT_HISTORY = []
+
+ # Transcribe the input audio, retrying up to three times on failure
+ def recognize_speech(audio):
+     retries = 3
+     for _ in range(retries):
+         try:
+             result = asr_pipe(audio, return_timestamps=True)
+             return result["text"]
+         except Exception as e:
+             logging.error(f"ASR failed: {e}")
+             time.sleep(1)
+     return ""
+
+ # Generate a reply conditioned on the last five turns of context
+ def generate_text(prompt):
+     global CONTEXT_HISTORY
+     CONTEXT_HISTORY.append(prompt)
+     if len(CONTEXT_HISTORY) > 5:
+         CONTEXT_HISTORY.pop(0)
+     context = " ".join(CONTEXT_HISTORY)
+     # return_full_text=False strips the prompt from the output so the
+     # reply is not prefixed with the whole conversation history
+     outputs = text_pipe(context, max_length=512, do_sample=True, temperature=0.7, top_p=0.9, num_return_sequences=1, return_full_text=False)
+     return outputs[0]["generated_text"]
+
+ # Synthesize speech; the TTS pipeline returns a dict with the waveform
+ # under "audio" and its rate under "sampling_rate", while gr.Audio
+ # expects a (sample_rate, waveform) tuple
+ def synthesize_speech(text):
+     audio = tts_pipe(text)
+     return audio["sampling_rate"], audio["audio"].squeeze()
+
+ # Run the full speech-in, speech-out loop
+ def handle_conversation(audio):
+     recognized_text = recognize_speech(audio)
+     if any(attr in recognized_text.lower() for attr in PREDEFINED_ATTRIBUTES):
+         generated_text = generate_text(f"Please provide your {recognized_text}")
+     else:
+         generated_text = generate_text(recognized_text)
+     synthesized_audio = synthesize_speech(generated_text)
+     return synthesized_audio, generated_text
+
+ # Define the Gradio app; components must be created inside the Blocks
+ # context so they are attached to the layout, and the audio input is
+ # passed as a file path, which the ASR pipeline accepts directly
+ with gr.Blocks() as demo:
+     input_audio = gr.Audio(label="Input Audio", type="filepath")
+     output_audio = gr.Audio(label="Output Audio")
+     output_text = gr.Textbox(label="Output Text")
+     conversation_button = gr.Button("Start Conversation")
+     conversation_button.click(handle_conversation, inputs=input_audio, outputs=[output_audio, output_text])
+
+ # Launch the app
+ demo.launch()
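A quick way to exercise the loop outside the web UI is to call handle_conversation directly. A minimal sketch, assuming demo.launch() is moved behind an `if __name__ == "__main__":` guard so app.py can be imported, and assuming a short speech clip at the hypothetical placeholder path sample.wav:

# Local smoke test for the speech-in, speech-out loop.
# "sample.wav" is a placeholder for any short speech recording.
from app import handle_conversation  # assumes demo.launch() is guarded

(sample_rate, waveform), reply = handle_conversation("sample.wav")
print("reply:", reply)
print(f"audio: {waveform.shape[0]} samples at {sample_rate} Hz")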
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ datasets[audio]
+ transformers==4.40.1
+ torchaudio
+ accelerate
+ evaluate
+ jiwer
+ tensorboard
+ gradio
+ spaces
+ # logging is part of the Python standard library and needs no pip entry
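After `pip install -r requirements.txt`, a one-line check can confirm the pinned transformers version is the one actually installed. A minimal sketch; the version string mirrors the pin above:

# Verify the environment matches the transformers pin in requirements.txt
from importlib.metadata import version

installed = version("transformers")
assert installed == "4.40.1", f"expected transformers 4.40.1, found {installed}"
print("environment OK:", installed)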