import torch
import torchaudio
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from pydub import AudioSegment
import os
import gradio as gr
# Load the model and processor
model_id = "hackergeek98/whisper-fa-tinyyy"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
processor = AutoProcessor.from_pretrained(model_id)
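# Note: on a CUDA device you could optionally load the model in half precision
# to reduce memory, e.g. AutoModelForSpeechSeq2Seq.from_pretrained(model_id,
# torch_dtype=torch.float16); the default full-precision load is kept here.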
# Create ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
)
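# Note: the transformers pipeline can also handle long inputs natively via its
# chunk_length_s argument (e.g. pipeline(..., chunk_length_s=30)). The manual
# pydub-based chunking below is this app's approach; the built-in option is a
# simpler alternative worth considering.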
# Convert audio to WAV format
def convert_to_wav(audio_path):
    audio = AudioSegment.from_file(audio_path)
    wav_path = "converted_audio.wav"
    audio.export(wav_path, format="wav")
    return wav_path
# Split long audio into chunks
def split_audio(audio_path, chunk_length_ms=30000):  # Default: 30 s per chunk
    audio = AudioSegment.from_wav(audio_path)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    chunk_paths = []
    for i, chunk in enumerate(chunks):
        chunk_path = f"chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        chunk_paths.append(chunk_path)
    return chunk_paths
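# For example, a 95-second file produces chunks of 30 s, 30 s, 30 s and 5 s:
# pydub slices by milliseconds, and a slice past the end simply returns the
# remainder, so no padding is needed.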
# Read the audio from disk before passing it to the model
def transcribe_audio_chunk(chunk_path):
    waveform, sampling_rate = torchaudio.load(chunk_path)  # shape: (channels, samples)
    waveform = waveform.mean(dim=0).numpy()  # downmix to mono: the pipeline expects a 1-D array
    result = pipe({"raw": waveform, "sampling_rate": sampling_rate})  # pass raw samples
    return result["text"]
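# Note: when "sampling_rate" is supplied like this, recent transformers
# versions resample the raw array to the model's expected 16 kHz internally,
# so the chunks do not need to be resampled by hand.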
# Transcribe a long audio file
def transcribe_long_audio(audio_path):
    wav_path = convert_to_wav(audio_path)
    chunk_paths = split_audio(wav_path)
    transcription = ""
    for chunk in chunk_paths:
        transcription += transcribe_audio_chunk(chunk) + "\n"
        os.remove(chunk)  # remove the processed chunk
    os.remove(wav_path)  # clean up the converted file
    return transcription
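# Example usage (the file name "sample.mp3" is just a placeholder):
#     print(transcribe_long_audio("sample.mp3"))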
# Gradio interface
def transcribe_interface(audio_file):
    if not audio_file:
        return "No file uploaded."
    return transcribe_long_audio(audio_file)
iface = gr.Interface(
    fn=transcribe_interface,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper ASR - Transcription",
    description="Upload an audio file, and the model will transcribe it.",
)
if __name__ == "__main__":
    iface.launch()