app.py
CHANGED
@@ -27,20 +27,13 @@ pipe = pipeline(
|
|
@spaces.GPU
def transcribe(audio: str) -> str:
    """Transcribe an audio file with the Whisper pipeline.

    Args:
        audio: Filesystem path to the input audio (Gradio ``type="filepath"``
            value); may be ``None``/empty when no file was provided.

    Returns:
        The transcribed text, or a placeholder message when no audio is given.
    """
    # Guard: Gradio passes None/"" when the user submits without audio.
    # Return a plain string so the return type is consistent with the
    # success path (the old tuple return did not match it).
    if not audio:
        return "No audio file"
    filename = Path(audio).name
    logger.info(f"Model: {model}")
    # Log the actual file name so the computed value is not dead code.
    logger.info(f"Audio: {filename}")
    # Read and resample audio to 16 kHz mono — the rate Whisper expects.
    y, sr = librosa.load(audio, mono=True, sr=16000)
    # Get duration of audio for diagnostics.
    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")
    start_time = time.time()
    result = pipe(y, generate_kwargs=generate_kwargs)["text"]
    # Use start_time (previously assigned but never read) to report latency.
    logger.info(f"Elapsed: {time.time() - start_time:.2f}s")
    return result
|
@@ -53,9 +46,8 @@ A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitc
|
|
# Build the Gradio UI: markdown header, audio input, trigger button, result box.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    transcript = gr.Button("Transcribe with Galgame-Whisper (WIP)")
    output = gr.Textbox(label="Result")
    # Pass the function itself — not a call — so Gradio invokes it on click
    # with the current value of `audio` and routes the result to `output`.
    transcript.click(transcribe, inputs=[audio], outputs=[output])


# app.load(warmup, inputs=[], outputs=[warmup_result], queue=True)
|
|
|
@spaces.GPU
def transcribe(audio: str) -> str:
    """Transcribe an audio file with the Whisper pipeline.

    Args:
        audio: Filesystem path to the input audio (Gradio ``type="filepath"``
            value); may be ``None``/empty when no file was provided.

    Returns:
        The transcribed text, or a placeholder message when no audio is given.
    """
    # Guard: without this, librosa.load(None, ...) raises. The annotation
    # previously claimed `tuple[str, float]` but only a string is returned.
    if not audio:
        return "No audio file"
    # Read and resample audio to 16kHz mono — the rate Whisper expects.
    y, sr = librosa.load(audio, mono=True, sr=16000)
    result = pipe(y, generate_kwargs=generate_kwargs)["text"]
    print(result)
    return result
|
39 |
|
|
|
# Build the Gradio UI: markdown header, audio input, trigger button, result box.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    # The button must exist before it can be wired up; `transcript` was
    # referenced below without ever being defined.
    transcript = gr.Button("Transcribe with Galgame-Whisper (WIP)")
    output = gr.Textbox(label="Result")
    # Pass the function itself — `transcribe(audio=audio)` would call the
    # handler at build time with a Gradio component and register its string
    # result (not a callable) as the click callback.
    transcript.click(transcribe, inputs=[audio], outputs=[output])


# app.load(warmup, inputs=[], outputs=[warmup_result], queue=True)