wav2vec-pashto-asr

Runtime error

App Files Files Community

Update app.py

by raz-135 - opened Jun 4

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+199

-95

Files changed (1) hide show

app.py +199 -95

app.py CHANGED Viewed

@@ -1,106 +1,210 @@
 import torch
 import gradio as gr
 import pytube as pt
 from transformers import pipeline
-from huggingface_hub import model_info
-#from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-MODEL_NAME = "ihanif/wav2vec2-xls-r-300m-pashto"
-lang = "ps"
-#load pre-trained model and tokenizer
-#processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-#model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
 device = 0 if torch.cuda.is_available() else "cpu"
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    #chunk_length_s=30,
-    device=device,
-)
-#pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
 def transcribe(microphone, file_upload):
-    warn_output = ""
-    # if (microphone is not None) and (file_upload is not None):
-    #     warn_output = (
-    #         "WARNING: You've uploaded an audio file and used the microphone. "
-    #         "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-    #     )
-    # elif (microphone is None) and (file_upload is None):
-    #     return "ERROR: You have to either use the microphone or upload an audio file"
-    if (microphone is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    file = microphone if microphone is not None else file_upload
-    text = pipe(file)["text"]
-    #transcription = wav2vec_model(audio)["text"]
-    return warn_output + text
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
-def yt_transcribe(yt_url):
-    yt = pt.YouTube(yt_url)
-    html_embed_str = _return_yt_html_embed(yt_url)
-    stream = yt.streams.filter(only_audio=True)[0]
-    stream.download(filename="audio.mp3")
-    text = pipe("audio.mp3")["text"]
-    return html_embed_str, text
-demo = gr.Blocks()
-examples=[["example-1.wav","example-2.wav"]]
-# examples=["example-1.wav"]
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
-        gr.inputs.Audio(source="upload", type="filepath", optional=True),
-    ],
-    outputs="text",
-    layout="horizontal",
-    theme="huggingface",
-    title="(Pashto ASR) د پښتو اتوماتیک وینا پیژندنه",
-    description=(
-        "</p> تاسو کولی شئ یو آډیو فایل اپلوډ کړئ یا په خپل وسیله مایکروفون وکاروئ. مهرباني وکړئ ډاډ ترلاسه کړئ چې تاسو اجازه ورکړې ده<p>"
-    ),
-    allow_flagging="never",
-    examples=examples,
-)
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
-    outputs=["html", "text"],
-    layout="horizontal",
-    theme="huggingface",
-    title="(Transcribe YouTube) د پښتو اتوماتیک وینا پیژندنه",
-    description=(
-        "مهرباني وکړئ د خپل غږ په کارولو سره د پښتو لیکلو لپاره لاندې اپلیکیشن وکاروئ. تاسو کولی شئ یو آډیو فایل اپلوډ کړئ یا په خپل وسیله مایکروفون وکاروئ. مهرباني وکړئ ډاډ ترلاسه کړئ چې تاسو اجازه ورکړې ده"
-    ),
-    allow_flagging="never",
-)
-with demo:
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
-demo.launch(enable_queue=False)

 import torch
 import gradio as gr
 import pytube as pt
 from transformers import pipeline
+import os
+import tempfile
+import warnings
+# Silence every Python warning, process-wide, to keep the Space logs short.
+# NOTE(review): this also hides deprecation warnings raised by dependencies —
+# confirm that is intended.
+warnings.filterwarnings("ignore")
+# Model identifier handed to the ASR pipeline below.
+MODEL_NAME = "ihanif/wav2vec2-xls-r-300m-pashto"
+# Pipeline device: index 0 when CUDA is available, otherwise "cpu".
 device = 0 if torch.cuda.is_available() else "cpu"
+# Build the pipeline once at import time so the request handlers share it.
+# If loading fails, the error is printed and `pipe` is set to None; the
+# handlers test for None and return an error message instead of crashing.
+print("🔄 Loading Pashto ASR model...")
+try:
+    pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=MODEL_NAME,
+        device=device,
+    )
+    print("✅ Model loaded successfully!")
+except Exception as e:
+    print(f"❌ Failed to load model: {e}")
+    pipe = None
 def transcribe(microphone, file_upload):
+    """Transcribe audio from the microphone recording or an uploaded file.
+
+    When both inputs are supplied, the microphone recording is used.
+
+    Args:
+        microphone: Value of the microphone audio component, or None.
+        file_upload: Value of the upload audio component, or None.
+
+    Returns:
+        The transcription as plain text, or a user-facing warning/error
+        message when there is no model, no input, no speech, or a failure.
+    """
+    if pipe is None:
+        return "❌ Model not available. Please try again later."
+    if microphone is None and file_upload is None:
+        return "⚠️ Please provide audio input through microphone or file upload."
+    # The microphone recording wins when both inputs are present.
+    audio_input = microphone if microphone is not None else file_upload
+    try:
+        result = pipe(audio_input)
+        transcription = result["text"] if isinstance(result, dict) else str(result)
+        if not transcription.strip():
+            return "⚠️ No speech detected. Please ensure the audio contains clear Pashto speech."
+        # Plain text on purpose: the result is displayed in a gr.Textbox,
+        # which shows Markdown markup such as ** literally.
+        return f"📝 Transcription:\n\n{transcription}"
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of raising.
+        return f"❌ Transcription failed: {e}"
+def create_youtube_embed(yt_url):
+    """Build the HTML snippet that embeds the video referenced by ``yt_url``.
+
+    Recognises ``youtu.be/<id>`` and ``watch?v=<id>`` URLs. The extracted ID
+    is placed in the iframe ``src`` only when it consists solely of ASCII
+    letters, digits, ``-`` and ``_``, so a crafted URL cannot inject markup.
+
+    Args:
+        yt_url: YouTube video URL.
+
+    Returns:
+        An HTML string: the centred iframe embed, or a red error ``<div>``
+        when the input is not a string or no valid video ID can be extracted.
+    """
+    if not isinstance(yt_url, str):
+        return '<div style="text-align: center; color: red;">Error creating video embed</div>'
+    if "youtu.be/" in yt_url:
+        video_id = yt_url.split("youtu.be/")[-1]
+    elif "watch?v=" in yt_url:
+        video_id = yt_url.split("watch?v=")[-1]
+    else:
+        return '<div style="text-align: center; color: red;">Invalid YouTube URL</div>'
+    # Keep only the ID: cut at the first query, fragment or path delimiter.
+    for delimiter in "?&#/":
+        video_id = video_id.split(delimiter)[0]
+    video_id = video_id.strip()
+    # The ID is user input that ends up inside an HTML attribute, so reject
+    # anything other than ASCII letters, digits, '-' and '_'.
+    is_safe_id = bool(video_id) and all(
+        char.isascii() and (char.isalnum() or char in "-_") for char in video_id
+    )
+    if not is_safe_id:
+        return '<div style="text-align: center; color: red;">Invalid YouTube URL</div>'
+    return f'''
+        <div style="text-align: center;">
+            <iframe width="560" height="315"
+                    src="https://www.youtube.com/embed/{video_id}"
+                    frameborder="0"
+                    allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
+                    allowfullscreen>
+            </iframe>
+        </div>
+        '''
+def transcribe_youtube(yt_url):
+    """Download the audio track of a YouTube video and transcribe it.
+
+    Args:
+        yt_url: YouTube video URL as entered by the user.
+
+    Returns:
+        A ``(embed_html, message)`` tuple: the HTML for the video preview and
+        either the transcription as plain text or a user-facing
+        warning/error message. ``embed_html`` is ``""`` when the model is
+        unavailable or the URL is empty.
+    """
+    if pipe is None:
+        return "", "❌ Model not available. Please try again later."
+    if not yt_url or not yt_url.strip():
+        return "", "⚠️ Please provide a valid YouTube URL."
+    # Use the URL without the surrounding whitespace a paste may carry.
+    yt_url = yt_url.strip()
+    # Built once, outside the try, so the error path can reuse it.
+    embed_html = create_youtube_embed(yt_url)
+    try:
+        yt = pt.YouTube(yt_url)
+        # First audio-only stream listed for the video.
+        audio_stream = yt.streams.filter(only_audio=True).first()
+        if not audio_stream:
+            return embed_html, "❌ No audio stream found in this video."
+        # Reserve a unique temp path; the handle is closed on leaving the
+        # `with`, and the download below writes to that path.
+        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
+            audio_file = tmp_file.name
+        try:
+            audio_stream.download(filename=audio_file)
+            result = pipe(audio_file)
+            transcription = result["text"] if isinstance(result, dict) else str(result)
+            if not transcription.strip():
+                return embed_html, "⚠️ No Pashto speech detected in the video."
+            # Plain text on purpose: the result is displayed in a gr.Textbox,
+            # which shows Markdown markup such as ** literally.
+            return embed_html, f"📝 Transcription:\n\n{transcription}"
+        finally:
+            # delete=False above means the file has to be removed by hand.
+            if os.path.exists(audio_file):
+                os.remove(audio_file)
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of raising.
+        return embed_html, f"❌ YouTube transcription failed: {e}"
+# Build the Gradio UI: a header, two tabs (audio transcription and YouTube
+# transcription) and a footer with usage instructions. Components are laid
+# out in the order they are created inside the nested context managers.
+with gr.Blocks(
+    title="Pashto ASR - د پښتو وینا پیژندنه",
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 900px !important;
+        margin: auto !important;
+    }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🎤 Pashto Speech Recognition
+    # د پښتو اتوماتیک وینا پیژندنه
+    This application transcribes Pashto speech to text using advanced AI models.
+    """)
+    with gr.Tabs():
+        with gr.TabItem("🎵 Audio Transcription"):
+            gr.Markdown("### Upload an audio file or record using your microphone")
+            with gr.Row():
+                with gr.Column():
+                    # NOTE(review): `source=` is the Gradio 3.x keyword; newer
+                    # Gradio releases reportedly use `sources=[...]` — confirm
+                    # against the SDK version this Space pins.
+                    microphone_input = gr.Audio(
+                        source="microphone",
+                        type="filepath",
+                        label="🎤 Record Audio"
+                    )
+                    file_input = gr.Audio(
+                        source="upload",
+                        type="filepath",
+                        label="📁 Upload Audio File"
+                    )
+                    transcribe_btn = gr.Button("🔄 Transcribe", variant="primary")
+                with gr.Column():
+                    audio_output = gr.Textbox(
+                        label="📝 Transcription Result",
+                        lines=8,
+                        placeholder="Transcription will appear here..."
+                    )
+            # Both audio inputs are passed to transcribe(), microphone first;
+            # its return value fills the textbox.
+            transcribe_btn.click(
+                fn=transcribe,
+                inputs=[microphone_input, file_input],
+                outputs=audio_output
+            )
+        with gr.TabItem("📺 YouTube Transcription"):
+            gr.Markdown("### Enter a YouTube URL to transcribe Pashto content")
+            with gr.Row():
+                youtube_url = gr.Textbox(
+                    label="🔗 YouTube URL",
+                    placeholder="https://www.youtube.com/watch?v=...",
+                    lines=1
+                )
+                youtube_btn = gr.Button("🔄 Transcribe YouTube", variant="primary")
+            youtube_video = gr.HTML(label="📺 Video Preview")
+            youtube_output = gr.Textbox(
+                label="📝 Transcription Result",
+                lines=8,
+                placeholder="YouTube transcription will appear here..."
+            )
+            # transcribe_youtube() returns two values: the first goes to the
+            # video preview, the second to the textbox.
+            youtube_btn.click(
+                fn=transcribe_youtube,
+                inputs=youtube_url,
+                outputs=[youtube_video, youtube_output]
+            )
+    gr.Markdown("""
+    ---
+    ### 📋 Instructions:
+    - **Audio Transcription**: Upload a Pashto audio file or record directly using your microphone
+    - **YouTube Transcription**: Paste a YouTube URL containing Pashto speech
+    - **Supported formats**: WAV, MP3, MP4, and other common audio formats
+    - **Note**: This model works best with clear Pashto speech
+    ### 🔧 Powered by:
+    - Model: `ihanif/wav2vec2-xls-r-300m-pashto`
+    - Framework: Transformers + Gradio
+    """)
+# Start the server only when this file is executed as a script, not when it
+# is imported.
+if __name__ == "__main__":
+    demo.launch()