andercorral committed on
Commit 5e48419 · 1 Parent(s): 2ce6def

Updated app

Files changed (5)
  1. app.py +157 -37
  2. assets/code.webp +0 -0
  3. assets/orai_bw.svg +418 -0
  4. assets/sermas-logo.png +0 -0
  5. requirements.txt +1 -1
app.py CHANGED
@@ -1,62 +1,165 @@
 import gradio as gr
 import re
-import subprocess
-import math
-import shutil
-import soundfile as sf
-import tempfile
 import os
 import requests
 import time
+import soundfile as sf
+import io
 
 
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
+def audio_to_bytes(audio):
+    data, sr = sf.read(audio)
+    audio_bytes = io.BytesIO()
+    sf.write(audio_bytes, data, sr, format='WAV')
+    audio_bytes.seek(0)
+    return audio_bytes
 
+def langswitch_API_call(audio, language):
+    audio_bytes = audio_to_bytes(audio)
+    files = {'file': (f'audio_chunk.wav', audio_bytes, 'audio/wav')}
+    data = {'language': language}
+    response = requests.post(os.getenv("api_url"), files=files, data=data)
+    return response.json()
 
 def transcribe_base(audio, language):
-    start_time = time.time()
-
-    d, sr = sf.read(audio)
-    data = {'audio': d.tolist(),
-            'sampling_rate': sr,
-            'language': language}
-
-    response = requests.post(os.getenv("api_url"), json=data).json()
-    transcription = response["text"]
-    speaker_class_string = response["speaker_class_string"]
-
-    end_time = time.time()
-    print("-"*50)
-    print(len(data["audio"])/float(sr))
-    print(end_time-start_time)
-    print("-"*50)
-
-    return transcription, speaker_class_string
+    response = langswitch_API_call(audio, language)
+    print(response)
+    transcription = response["transcription"]
+    is_new_speaker = response["is_new_speaker"]
+    speaker = response["classified_speaker"]
+    if is_new_speaker:
+        speaker_class_string = f'New speaker detected. Assigned new ID {speaker}'
+    else:
+        speaker_class_string = f'Speaker found in database, ID {speaker}'
+    return transcription, speaker_class_string
 
-
-def transcribe(audio_microphone, audio_upload, language):
-    print("Transcription request")
-    print(audio_microphone, audio_upload, language)
-    audio = audio_microphone if audio_microphone is not None else audio_upload
-    return transcribe_base(audio, language)
+def transcribe_mic(audio_microphone, language):
+    print("Transcription microphone")
+    return transcribe_base(audio_microphone, language)
+
+def transcribe_file(audio_upload, language):
+    print("Transcription local file")
+    return transcribe_base(audio_upload, language)
 
 
-demo = gr.Blocks()
+css_content = """
+/*
+.gradio-container{
+    padding: 0 !important;
+}
+.html-container{
+    padding: 0 !important;
+}
+*/
+#orai-info{
+    padding: 50px;
+    text-align: center;
+    font-size: 1rem;
+    background: url('gradio_api/file=assets/code.webp') rgba(0,0,0,0.8);
+    background-repeat: no-repeat;
+    background-position: center center;
+    background-size: cover;
+    background-blend-mode: multiply;
+}
+#orai-info-text p{
+    color: white !important;
+}
+/*
+#orai-info img{
+    margin: auto;
+    display: block;
+    margin-bottom: 1rem;
+}*/
+.bold{
+    font-weight: bold;
+    color: inherit !important;
+}
+footer{
+    display:none !important
+}
+
+.logos{
+    display: flex;
+    justify-content: center;
+}
+.sermas-logo{
+    display: flex;
+    align-items: center;
+    margin-right: 3rem;
+}
+.sermas-logo span{
+    color: white !important;
+    font-size: 2.5rem;
+    font-family: Verdana, Geneva, sans-serif !important;
+    font-weight: bold;
+}
+
+.text-elhuyar{
+    color: #0045e7;
+}
+
+#header{
+    padding: 50px;
+    padding-top: 30px;
+    background-color: #5b65a7;
+}
+#header h1,h3{
+    color: white;
+}
+
+button.primary{
+    background-color: #5b65a7;
+}
+button.primary:hover{
+    background-color: #3c4687;
+}
+
+button.selected{
+    color: #5b65a7 !important;
+}
+button.selected::after{
+    background-color: #5b65a7;
+}
+
+.record-button::before{
+    background: #5b65a7;
+}
+
+"""
+
+
+demo = gr.Blocks(css=css_content) #, fill_width=True)
 with demo:
-    gr.Markdown("# Speech recognition using Whisper models")
-    gr.Markdown("Orai NLP Technologies")
+    gr.HTML("""
+        <div id="header">
+            <h1>LANGSWITCH</h1>
+            <h3>Multilingual Automatic Speech Recognition in noisy environments</h3>
+        </div>
+    """)
 
-    with gr.Tab("Trancribe Audio"):
+    with gr.Tab("Transcribe microphone"):
         iface = gr.Interface(
-            fn=transcribe,
+            fn=transcribe_mic,
            inputs=[
                 gr.Audio(sources="microphone", type="filepath"),
+                gr.Dropdown(choices=[("Basque", "eu"),
+                                     ("Spanish", "es"),
+                                     ("English", "en")],
+                            #("French", "fr"),
+                            #("Italian", "it"),
+                            value="en")
+            ],
+            outputs=[
+                gr.Textbox(label="Transcription", autoscroll=False),
+                gr.Textbox(label="Speaker Identification", autoscroll=False)
+            ],
+            allow_flagging="never",
+        )
+
+    with gr.Tab("Transcribe local file"):
+        iface = gr.Interface(
+            fn=transcribe_file,
+            inputs=[
                 gr.Audio(sources="upload", type="filepath"),
                 gr.Dropdown(choices=[("Basque", "eu"),
                             ("Spanish", "es"),
@@ -71,5 +174,22 @@ with demo:
             ],
             allow_flagging="never",
         )
+
+    gr.HTML("""
+        <div id="orai-info">
+            <div class="logos">
+                <div class="sermas-logo">
+                    <img src="gradio_api/file=assets/sermas-logo.png" width=100/>
+                    <span>SERMAS</span>
+                </div>
+                <img src="gradio_api/file=assets/orai_bw.svg" width=175/>
+            </div>
+            <div id="orai-info-text">
+                <p>The <span class="bold">LANGSWITCH</span> sub-project is part of the Open Call 1 of the <span class="bold">SERMAS</span> project. The goal of the <span class="bold">SERMAS</span> project is to provide socially-acceptable extended reality models and systems.</p>
+                <p>The technology powering LANGSWITCH was developed by <span class="bold">Orai NLP Teknologiak</span></p>
+                <p><span class="bold">Orai NLP Teknologiak</span> specializes in research, development, and innovation in artificial intelligence, with a focus on fostering a more competitive industrial and business landscape, enhancing public administration efficiency, and promoting a more inclusive society.</p>
+            </div>
+        </div>
+        <p>""")
     demo.queue(max_size=1)
-    demo.launch(share=False, max_threads=3, auth=(os.getenv("username"), os.getenv("password")), auth_message="Please provide a username and a password.")
+    demo.launch(share=False, max_threads=3, allowed_paths=[f"{os.getcwd()}/assets/"], auth=(os.getenv("username"), os.getenv("password")), auth_message="Please provide a username and a password.")
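
Note on the API change visible in this diff: the old app posted raw samples as JSON (an "audio" list plus "sampling_rate"), while the new langswitch_API_call sends the audio as a multipart WAV upload with the language as a plain form field, and transcribe_base now reads the keys transcription, is_new_speaker and classified_speaker from the response. A minimal standalone sketch of the new request shape, assuming a hypothetical endpoint URL and a local test.wav file (neither is part of this commit):

import io
import os

import requests
import soundfile as sf

# Hypothetical endpoint for illustration; app.py reads the real URL from the api_url env var.
API_URL = os.getenv("api_url", "https://example.invalid/transcribe")

def call_langswitch(path, language="en"):
    # Re-encode the local audio file to an in-memory WAV, mirroring audio_to_bytes().
    data, sr = sf.read(path)
    buf = io.BytesIO()
    sf.write(buf, data, sr, format="WAV")
    buf.seek(0)
    # Multipart upload: audio bytes go in files, the language as a form field.
    files = {"file": ("audio_chunk.wav", buf, "audio/wav")}
    response = requests.post(API_URL, files=files, data={"language": language})
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    result = call_langswitch("test.wav", "eu")  # test.wav is a placeholder file
    print(result["transcription"])
    print(result["classified_speaker"])
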
assets/code.webp ADDED
assets/orai_bw.svg ADDED
assets/sermas-logo.png ADDED
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 soundfile==0.12.1
 requests==2.31.0
-PyYAML==6.0.1
+gradio==5.9.1