Spaces:

peterkros
/

transcribeapi

Sleeping

File size: 1,802 Bytes

f345224
4ca61bc
d2753e9
4ec81f7
d222613
d2753e9
 
 
 
d222613
d2753e9
 
 
 
4ca61bc
 
 
 
 
 
4ec81f7
4ca61bc
d2753e9
 
 
 
 
 
 
 
 
 
 
 
f345224
 
 
 
4ca61bc
f345224
 
dd6327f
f345224
d222613
f345224
4ca61bc

import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import soundfile as sf

# Checkpoint on the Hugging Face Hub that supplies both the processor
# and the model weights.
model_name = "openai/whisper-large-v3"

# Run on CUDA when PyTorch reports a usable GPU; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The processor and the model are both loaded from the same checkpoint name.
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Move the model's weights onto the selected device (in place for nn.Module).
model.to(device)

# Sampling rate (Hz) that every waveform is converted to before it is handed
# to the Whisper processor.
_WHISPER_SAMPLE_RATE = 16000


def _load_waveform(audio, target_rate=_WHISPER_SAMPLE_RATE):
    """Return *audio* as a mono float32 NumPy waveform sampled at *target_rate*.

    Args:
        audio: Either a file path (``str``), which is read with ``soundfile``,
            or a ``(sample_rate, samples)`` pair. ``samples`` may be integer
            PCM or floating point, shaped ``(n,)`` or ``(n, channels)``.
        target_rate: Sampling rate in Hz of the returned waveform.

    Returns:
        A 1-D float32 NumPy array. It is empty when the input has no samples.
    """
    if isinstance(audio, str):
        samples, rate = sf.read(audio, dtype="float32")
    else:
        rate, samples = audio

    wave = torch.as_tensor(samples)
    if wave.is_floating_point():
        wave = wave.to(torch.float32)
    else:
        # Integer PCM (e.g. int16): scale by the dtype's full range so the
        # values land in [-1.0, 1.0).
        full_scale = float(torch.iinfo(wave.dtype).max) + 1.0
        wave = wave.to(torch.float32) / full_scale

    if wave.ndim > 1:
        # (samples, channels) -> mono by averaging the channels.
        wave = wave.mean(dim=-1)

    if wave.numel() > 0 and rate != target_rate:
        # NOTE: plain linear interpolation, no dedicated anti-aliasing filter.
        # It keeps the block free of extra dependencies; swap in a proper
        # resampler if transcription quality on high-rate input matters.
        new_length = max(1, round(wave.numel() * target_rate / rate))
        wave = torch.nn.functional.interpolate(
            wave[None, None, :],
            size=new_length,
            mode="linear",
            align_corners=False,
        )[0, 0]

    return wave.numpy()


def transcribe(audio) -> str:
    """Transcribe an audio clip to English text with the loaded Whisper model.

    Args:
        audio: ``None`` (nothing supplied), a file path, or a
            ``(sample_rate, samples)`` pair as described in
            :func:`_load_waveform`.

    Returns:
        The decoded transcription, or ``""`` when there is no audio to
        transcribe (``None`` or zero samples).

    NOTE(review): the processor is called with its default padding and
    truncation settings; for Whisper these presumably limit the input to a
    single 30-second window, so longer clips would be cut off. Confirm, and
    consider long-form generation if that matters.
    """
    if audio is None:
        return ""

    waveform = _load_waveform(audio)
    if waveform.size == 0:
        return ""

    # Let the processor build the attention mask so that its shape matches
    # what the model expects, instead of fabricating one from the feature
    # tensor's shape.
    inputs = processor(
        waveform,
        sampling_rate=_WHISPER_SAMPLE_RATE,
        return_tensors="pt",
        return_attention_mask=True,
    )
    input_features = inputs["input_features"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    generated_ids = model.generate(
        input_features=input_features,
        attention_mask=attention_mask,
        language="en",  # Decode as English; this sets the language, not the task.
    )

    # Only one clip was submitted, so take the single decoded string.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Input widget: upload-only audio, requested from Gradio in its "numpy"
# format rather than as a file path.
audio_input = gr.Audio(sources="upload", type="numpy")

# One audio input, one text output, with `transcribe` as the callback.
interface = gr.Interface(
    title="Whisper Speech-to-Text API",
    description="Upload an audio file and get a transcription using OpenAI's Whisper model from Hugging Face.",
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
)

# Start serving the interface (runs as soon as this module is executed).
interface.launch()