"""Gradio demo for Rapa Nui automatic speech recognition (ASR).

Loads a Speech2Text model from the Hugging Face Hub and exposes a minimal
web UI that transcribes an uploaded audio file.
"""

import os

import gradio as gr
import librosa
import torch
from huggingface_hub import login
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor

# Read the Hugging Face token from the environment; fail fast if it is missing
# (the model repo may be private and requires authentication).
token = os.getenv("HF_TOKEN")
if token:
    login(token=token)
else:
    raise ValueError("El token de Hugging Face no está configurado en las variables de entorno.")

# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load processor and model from the Hub; adjust `repo_name` to your own repo.
repo_name = "HugoZeballos/rapa_nui_asr_2"
processor = Speech2TextProcessor.from_pretrained(repo_name)
model = Speech2TextForConditionalGeneration.from_pretrained(repo_name).to(device)

# Gradio components: file-path audio input, plain-text output.
inputs = gr.Audio(type="filepath")
outputs = gr.Textbox(label="Transcripción")


def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* and return the text.

    Args:
        audio_path: Path to an audio file readable by librosa.

    Returns:
        The decoded transcription string for the first (only) batch item.
    """
    # Resample to 16 kHz, the rate Speech2Text models expect.
    audio, sr = librosa.load(audio_path, sr=16000)
    # BUG FIX: move the features to `device`, not a hard-coded "cuda" —
    # the original crashed on CPU-only hosts while the model sat on CPU.
    # (Also renamed from `inputs` to avoid shadowing the module-level
    # Gradio component of the same name.)
    features = processor(
        audio, sampling_rate=sr, return_tensors="pt", padding="longest"
    ).to(device)
    with torch.no_grad():
        predicted_ids = model.generate(
            features["input_features"], attention_mask=features["attention_mask"]
        )
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]


# Build the web interface around the transcription function.
interface = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs=outputs,
    title="ASR Demo",
)

# Launch the app only when executed as a script.
if __name__ == "__main__":
    interface.launch()