Files changed (1) hide show
  1. app.py +199 -95
app.py CHANGED
@@ -1,106 +1,210 @@
1
  import torch
2
-
3
  import gradio as gr
4
  import pytube as pt
5
  from transformers import pipeline
6
- from huggingface_hub import model_info
7
- #from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
8
-
9
- MODEL_NAME = "ihanif/wav2vec2-xls-r-300m-pashto"
10
- lang = "ps"
11
 
12
- #load pre-trained model and tokenizer
13
- #processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
14
- #model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
15
 
 
16
  device = 0 if torch.cuda.is_available() else "cpu"
17
- pipe = pipeline(
18
- task="automatic-speech-recognition",
19
- model=MODEL_NAME,
20
- #chunk_length_s=30,
21
- device=device,
22
- )
23
 
24
- #pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def transcribe(microphone, file_upload):
27
- warn_output = ""
28
- # if (microphone is not None) and (file_upload is not None):
29
- # warn_output = (
30
- # "WARNING: You've uploaded an audio file and used the microphone. "
31
- # "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
32
- # )
33
-
34
- # elif (microphone is None) and (file_upload is None):
35
- # return "ERROR: You have to either use the microphone or upload an audio file"
36
-
37
- if (microphone is None) and (file_upload is None):
38
- return "ERROR: You have to either use the microphone or upload an audio file"
 
 
39
 
40
- file = microphone if microphone is not None else file_upload
41
-
42
- text = pipe(file)["text"]
43
- #transcription = wav2vec_model(audio)["text"]
44
-
45
- return warn_output + text
46
-
47
-
48
- def _return_yt_html_embed(yt_url):
49
- video_id = yt_url.split("?v=")[-1]
50
- HTML_str = (
51
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
52
- " </center>"
53
- )
54
- return HTML_str
55
-
56
-
57
- def yt_transcribe(yt_url):
58
- yt = pt.YouTube(yt_url)
59
- html_embed_str = _return_yt_html_embed(yt_url)
60
- stream = yt.streams.filter(only_audio=True)[0]
61
- stream.download(filename="audio.mp3")
62
-
63
- text = pipe("audio.mp3")["text"]
64
-
65
- return html_embed_str, text
66
-
67
-
68
- demo = gr.Blocks()
69
-
70
- examples=[["example-1.wav","example-2.wav"]]
71
- # examples=["example-1.wav"]
72
-
73
- mf_transcribe = gr.Interface(
74
- fn=transcribe,
75
- inputs=[
76
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
77
- gr.inputs.Audio(source="upload", type="filepath", optional=True),
78
- ],
79
- outputs="text",
80
- layout="horizontal",
81
- theme="huggingface",
82
- title="(Pashto ASR) د پښتو اتوماتیک وینا پیژندنه",
83
- description=(
84
- "</p> تاسو کولی شئ یو آډیو فایل اپلوډ کړئ یا په خپل وسیله مایکروفون وکاروئ. مهرباني وکړئ ډاډ ترلاسه کړئ چې تاسو اجازه ورکړې ده<p>"
85
- ),
86
- allow_flagging="never",
87
- examples=examples,
88
- )
89
-
90
- yt_transcribe = gr.Interface(
91
- fn=yt_transcribe,
92
- inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
93
- outputs=["html", "text"],
94
- layout="horizontal",
95
- theme="huggingface",
96
- title="(Transcribe YouTube) د پښتو اتوماتیک وینا پیژندنه",
97
- description=(
98
- "مهرباني وکړئ د خپل غږ په کارولو سره د پښتو لیکلو لپاره لاندې اپلیکیشن وکاروئ. تاسو کولی شئ یو آډیو فایل اپلوډ کړئ یا په خپل وسیله مایکروفون وکاروئ. مهرباني وکړئ ډاډ ترلاسه کړئ چې تاسو اجازه ورکړې ده"
99
- ),
100
- allow_flagging="never",
101
- )
102
-
103
- with demo:
104
- gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
105
-
106
- demo.launch(enable_queue=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
 
2
  import gradio as gr
3
  import pytube as pt
4
  from transformers import pipeline
5
+ import os
6
+ import tempfile
7
+ import warnings
 
 
8
 
9
+ # Suppress warnings for cleaner output in Spaces
10
+ warnings.filterwarnings("ignore")
 
11
 
12
+ MODEL_NAME = "ihanif/wav2vec2-xls-r-300m-pashto"
13
  device = 0 if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
14
 
15
def _load_asr_pipeline():
    """Create the speech-recognition pipeline, or return None on failure.

    Progress is reported on stdout. Any exception raised while building the
    pipeline is printed and swallowed so that the module still imports; the
    request handlers check for ``None`` before using the pipeline.
    """
    print("🔄 Loading Pashto ASR model...")
    try:
        loaded = pipeline(
            task="automatic-speech-recognition",
            model=MODEL_NAME,
            device=device,
        )
        print("✅ Model loaded successfully!")
        return loaded
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        return None


# Built once at import time so every request reuses the same model.
pipe = _load_asr_pipeline()
27
 
def transcribe(microphone, file_upload):
    """Transcribe a recording or an uploaded audio file.

    Args:
        microphone: Audio captured from the microphone input, or None.
        file_upload: Audio provided through the upload input, or None.

    Returns:
        A display string: the transcription on success, otherwise a message
        explaining that the model is unavailable, that no input was given,
        that no speech was found, or that transcription failed.
    """
    if pipe is None:
        return "❌ Model not available. Please try again later."

    # The microphone recording wins whenever both inputs are supplied.
    source = file_upload if microphone is None else microphone
    if source is None:
        return "⚠️ Please provide audio input through microphone or file upload."

    try:
        output = pipe(source)
        if isinstance(output, dict):
            text = output["text"]
        else:
            text = str(output)

        if text.strip():
            return f"📝 **Transcription:**\n\n{text}"
        return "⚠️ No speech detected. Please ensure the audio contains clear Pashto speech."
    except Exception as exc:
        # Report the failure in the UI instead of letting the handler crash.
        return f"❌ Transcription failed: {str(exc)}"
51
+
52
def create_youtube_embed(yt_url):
    """Build the HTML snippet that previews a YouTube video.

    The video id is taken from the parsed URL: ``youtu.be/<id>``,
    ``/watch`` with a ``v`` query parameter in any position, or an
    ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` or ``/v/<id>`` path.
    The id is only interpolated into the markup after it has been checked
    against the 11-character id alphabet, so user-supplied text cannot break
    out of the ``src`` attribute.

    Args:
        yt_url: URL string typed by the user (the scheme may be omitted).

    Returns:
        An HTML string: the centered ``<iframe>`` for a recognised URL, a red
        "Invalid YouTube URL" notice when no valid id can be extracted, or a
        red "Error creating video embed" notice when ``yt_url`` cannot be
        processed at all (for example, it is not a string).
    """
    # Function-scope imports keep this block self-contained.
    import re
    from urllib.parse import parse_qs, urlparse

    invalid = '<div style="text-align: center; color: red;">Invalid YouTube URL</div>'
    try:
        url = yt_url.strip()
        # urlparse only fills in the host when a scheme is present.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        segments = [part for part in parsed.path.split("/") if part]

        video_id = None
        if host == "youtu.be" or host.endswith(".youtu.be"):
            video_id = segments[0] if segments else None
        elif segments and segments[0] == "watch":
            # parse_qs finds "v" wherever it sits and ignores the #fragment.
            video_id = parse_qs(parsed.query).get("v", [None])[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]

        # Reject anything that is not a plain id before it reaches the HTML.
        if video_id is None or not re.fullmatch(r"[A-Za-z0-9_-]{11}", video_id):
            return invalid

        return f'''
<div style="text-align: center;">
<iframe width="560" height="315"
src="https://www.youtube.com/embed/{video_id}"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
allowfullscreen>
</iframe>
</div>
'''
    except Exception:
        # Callers rely on this helper never raising; degrade to a notice.
        return '<div style="text-align: center; color: red;">Error creating video embed</div>'
75
+
76
def transcribe_youtube(yt_url):
    """Download the audio of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the video, as typed by the user.

    Returns:
        A ``(html, message)`` pair. ``html`` is the video preview markup (an
        empty string when the request is rejected before any work is done);
        ``message`` is the transcription or a description of what went wrong.
    """
    if pipe is None:
        return "", "❌ Model not available. Please try again later."

    if not (yt_url and yt_url.strip()):
        return "", "⚠️ Please provide a valid YouTube URL."

    try:
        preview = create_youtube_embed(yt_url)

        # Take the first audio-only stream the video offers.
        stream = pt.YouTube(yt_url).streams.filter(only_audio=True).first()
        if not stream:
            return preview, "❌ No audio stream found in this video."

        # Only the path is needed here; the file is kept (delete=False) so the
        # download can write to it after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
            download_path = handle.name

        try:
            stream.download(filename=download_path)

            output = pipe(download_path)
            text = output["text"] if isinstance(output, dict) else str(output)

            if text.strip():
                message = f"📝 **Transcription:**\n\n{text}"
            else:
                message = "⚠️ No Pashto speech detected in the video."
            return preview, message
        finally:
            # Remove the downloaded audio whether or not transcription worked.
            if os.path.exists(download_path):
                os.remove(download_path)

    except Exception as exc:
        return create_youtube_embed(yt_url), f"❌ YouTube transcription failed: {str(exc)}"
120
+
121
# Static page text, kept apart from the layout code below.
_PAGE_CSS = """
.gradio-container {
max-width: 900px !important;
margin: auto !important;
}
"""

_HEADER_MD = """
# 🎤 Pashto Speech Recognition
# د پښتو اتوماتیک وینا پیژندنه

This application transcribes Pashto speech to text using advanced AI models.
"""

_FOOTER_MD = """
---
### 📋 Instructions:
- **Audio Transcription**: Upload a Pashto audio file or record directly using your microphone
- **YouTube Transcription**: Paste a YouTube URL containing Pashto speech
- **Supported formats**: WAV, MP3, MP4, and other common audio formats
- **Note**: This model works best with clear Pashto speech

### 🔧 Powered by:
- Model: `ihanif/wav2vec2-xls-r-300m-pashto`
- Framework: Transformers + Gradio
"""

# Page layout: a header, one tab per input mode, and a footer with usage notes.
# NOTE(review): nesting was reconstructed from a view without indentation;
# confirm the Row/Column/TabItem structure against the real file.
with gr.Blocks(
    title="Pashto ASR - د پښتو وینا پیژندنه",
    theme=gr.themes.Soft(),
    css=_PAGE_CSS,
) as demo:

    gr.Markdown(_HEADER_MD)

    with gr.Tabs():
        # Tab 1: microphone recording or uploaded file.
        with gr.TabItem("🎵 Audio Transcription"):
            gr.Markdown("### Upload an audio file or record using your microphone")

            with gr.Row():
                with gr.Column():
                    microphone_input = gr.Audio(
                        source="microphone",
                        type="filepath",
                        label="🎤 Record Audio",
                    )
                    file_input = gr.Audio(
                        source="upload",
                        type="filepath",
                        label="📁 Upload Audio File",
                    )
                    transcribe_btn = gr.Button("🔄 Transcribe", variant="primary")

                with gr.Column():
                    audio_output = gr.Textbox(
                        label="📝 Transcription Result",
                        lines=8,
                        placeholder="Transcription will appear here...",
                    )

            transcribe_btn.click(
                fn=transcribe,
                inputs=[microphone_input, file_input],
                outputs=audio_output,
            )

        # Tab 2: audio pulled from a YouTube video.
        with gr.TabItem("📺 YouTube Transcription"):
            gr.Markdown("### Enter a YouTube URL to transcribe Pashto content")

            with gr.Row():
                youtube_url = gr.Textbox(
                    label="🔗 YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...",
                    lines=1,
                )
                youtube_btn = gr.Button("🔄 Transcribe YouTube", variant="primary")

            youtube_video = gr.HTML(label="📺 Video Preview")
            youtube_output = gr.Textbox(
                label="📝 Transcription Result",
                lines=8,
                placeholder="YouTube transcription will appear here...",
            )

            youtube_btn.click(
                fn=transcribe_youtube,
                inputs=youtube_url,
                outputs=[youtube_video, youtube_output],
            )

    gr.Markdown(_FOOTER_MD)

# Start the server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()