mrmuminov committed on
Commit
51d1944
·
verified ·
1 Parent(s): 270cde7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -29
app.py CHANGED
@@ -1,19 +1,11 @@
1
  import torch
2
  import gradio as gr
3
- import yt_dlp as youtube_dl
4
  from transformers import pipeline
5
  from transformers.pipelines.audio_utils import ffmpeg_read
6
- from urllib.parse import urlparse, parse_qs
7
-
8
- import tempfile
9
- import time
10
- import os
11
  import numpy as np
12
 
13
  MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
14
  BATCH_SIZE = 8
15
- FILE_LIMIT_MB = 1000
16
- YT_LENGTH_LIMIT_S = 3600 # 1 hour limit
17
 
18
  device = 0 if torch.cuda.is_available() else "cpu"
19
 
@@ -23,7 +15,6 @@ pipe = pipeline(
23
  chunk_length_s=9,
24
  device=device,
25
  model_kwargs={
26
- # "torch_dtype": torch.float16,
27
  "attn_implementation": "eager"
28
  },
29
  )
@@ -35,38 +26,29 @@ def transcribe(audio_file):
35
  with open(audio_file, "rb") as f:
36
  audio_data = f.read()
37
 
38
- audio_array = ffmpeg_read(audio_data, pipe.feature_extractor.sampling_rate)
39
  duration = len(audio_array) / pipe.feature_extractor.sampling_rate
40
  print(f"Audio duration: {duration:.2f} seconds")
41
 
42
- inputs = {
43
- "array": np.array(audio_array),
44
- "sampling_rate": pipe.feature_extractor.sampling_rate
45
- }
46
-
47
- generate_kwargs = {
48
- "task": "transcribe",
49
- "no_speech_threshold": 0.4,
50
- "logprob_threshold": -1.0,
51
- "compression_ratio_threshold": 2.4
52
- }
53
-
54
  result = pipe(
55
- inputs,
56
  batch_size=BATCH_SIZE,
57
- generate_kwargs=generate_kwargs,
58
- return_timestamps=False
 
 
 
 
 
59
  )
60
 
61
- return result["text"]
62
 
63
  demo = gr.Blocks()
64
 
65
  file_transcribe = gr.Interface(
66
  fn=transcribe,
67
- inputs=[
68
- gr.Audio(type="filepath", label="Audio file"),
69
- ],
70
  outputs="text",
71
  title="Whisper Large V3: Transcribe Audio",
72
  description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
 
1
  import torch
2
  import gradio as gr
 
3
  from transformers import pipeline
4
  from transformers.pipelines.audio_utils import ffmpeg_read
 
 
 
 
 
5
  import numpy as np
6
 
7
  MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
8
  BATCH_SIZE = 8
 
 
9
 
10
  device = 0 if torch.cuda.is_available() else "cpu"
11
 
 
15
  chunk_length_s=9,
16
  device=device,
17
  model_kwargs={
 
18
  "attn_implementation": "eager"
19
  },
20
  )
 
26
  with open(audio_file, "rb") as f:
27
  audio_data = f.read()
28
 
29
+ audio_array = ffmpeg_read(audio_data, sampling_rate=pipe.feature_extractor.sampling_rate)
30
  duration = len(audio_array) / pipe.feature_extractor.sampling_rate
31
  print(f"Audio duration: {duration:.2f} seconds")
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  result = pipe(
34
+ inputs=audio_array,
35
  batch_size=BATCH_SIZE,
36
+ return_timestamps=False,
37
+ generate_kwargs={
38
+ "task": "transcribe",
39
+ "no_speech_threshold": 0.4,
40
+ "logprob_threshold": -1.0,
41
+ "compression_ratio_threshold": 2.4
42
+ }
43
  )
44
 
45
+ return result["text"] if isinstance(result, dict) else result
46
 
47
  demo = gr.Blocks()
48
 
49
  file_transcribe = gr.Interface(
50
  fn=transcribe,
51
+ inputs=gr.Audio(type="filepath", label="Audio file"),
 
 
52
  outputs="text",
53
  title="Whisper Large V3: Transcribe Audio",
54
  description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",