alakxender committed
Commit 6fb3e63 · 0 Parent(s)
Files changed (7)
  1. .gitattributes +36 -0
  2. .gitignore +1 -0
  3. README.md +12 -0
  4. app.py +110 -0
  5. packages.txt +1 -0
  6. requirements.txt +1 -0
  7. sample.mp3 +3 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ gradio_cached_examples
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Dhivehi Whisper Demo
+ emoji: 🏆
+ colorFrom: pink
+ colorTo: green
+ sdk: gradio
+ sdk_version: 5.7.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,110 @@
+ import spaces
+ import torch
+ import gradio as gr
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import tempfile
+ import os
+
+ # Model configuration; this model contains synthetic data
+ MODEL_ID = "alakxender/whisper-small-dv-full"
+ BATCH_SIZE = 8
+ FILE_LIMIT_MB = 1000
+ CHUNK_LENGTH_S = 30
+ STRIDE_LENGTH_S = 5
+
+ # Device and dtype setup
+ device = 0 if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ # Initialize model with memory optimizations
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch_dtype,
+     low_cpu_mem_usage=True,
+     use_safetensors=True
+ )
+ model.to(device)
+
+ # Initialize processor
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
+
+ # Single pipeline initialization with all components
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     chunk_length_s=CHUNK_LENGTH_S,
+     stride_length_s=STRIDE_LENGTH_S,
+     batch_size=BATCH_SIZE,
+     torch_dtype=torch_dtype,
+     device=device,
+ )
+
+ # Define the generation arguments
+
+ generate_kwargs = {
+     "max_new_tokens": model.config.max_target_positions - 4,
+     "num_beams": 4,
+     "condition_on_prev_tokens": False,
+     "compression_ratio_threshold": 1.35,
+     # "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+     # "logprob_threshold": -1.0,
+     # "no_speech_threshold": 0.6,
+     # "return_timestamps": True,
+ }
+
+ @spaces.GPU
+ def transcribe(audio_input):
+     if audio_input is None:
+         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+     try:
+         # Use the defined generate_kwargs dictionary
+         result = pipe(
+             audio_input,
+             generate_kwargs=generate_kwargs
+         )
+         return result["text"]
+     except Exception as e:
+         # More detailed error logging might be helpful here if issues persist
+         print(f"Detailed Error: {e}")
+         raise gr.Error(f"Transcription failed: {str(e)}")
+
+ # Custom CSS with modern Gradio styling
+ custom_css = """
+ .thaana-textbox textarea {
+     font-size: 18px !important;
+     font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma', 'Noto Sans Thaana', 'MV Boli' !important;
+     line-height: 1.8 !important;
+     direction: rtl !important;
+ }
+ """
+
+ demo = gr.Blocks(css=custom_css)
+
+ file_transcribe = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio file"),
+     ],
+     outputs=gr.Textbox(
+         label="",
+         lines=2,
+         elem_classes=["thaana-textbox"],
+         rtl=True
+     ),
+     title="Transcribe Dhivehi Audio",
+     description=(
+         "Upload an audio file or record using your microphone to transcribe."
+     ),
+     allow_flagging="never",
+     examples=[
+         ["sample.mp3"]
+     ],
+ )
+
+ with demo:
+     gr.TabbedInterface([file_transcribe], ["Audio file"])
+
+ demo.queue().launch()
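
For reference, a minimal sketch (not part of this commit) of exercising the same checkpoint outside the Space, assuming a local environment with transformers, torch, and ffmpeg installed; it reuses the chunking, stride, and beam settings from app.py, and the audio path is just the sample file shipped with the Space — substitute any decodable audio file.

import torch
from transformers import pipeline

# Same checkpoint and long-form settings as app.py; device/dtype follow GPU availability
asr = pipeline(
    "automatic-speech-recognition",
    model="alakxender/whisper-small-dv-full",
    chunk_length_s=30,
    stride_length_s=5,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device=0 if torch.cuda.is_available() else "cpu",
)

# "sample.mp3" is the example audio committed here; any local path works
result = asr("sample.mp3", generate_kwargs={"num_beams": 4})
print(result["text"])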
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1 @@
+ transformers
sample.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01db7c01f8f9dc7cb22c1252e04bfd46785ccb5cc50776b2f92195b64942cec5
+ size 1213926