Spaces:
Running
Running
File size: 4,421 Bytes
cef05ee dff819c 0067ac9 cef05ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import os
import sys
import logging
import gradio as gr
import shutil
from demucs_handler import DemucsProcessor, check_dependencies, configure_model
from whisper_handler import WhisperTranscriber
import tempfile
import torch
import torchaudio
import soundfile as sf
import librosa
import numpy as np
# Root logger configuration, applied once at import time: records at INFO and
# above are emitted, each prefixed with its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def validate_environment():
    """Check that the audio/ML stack is importable and log its versions.

    Imports ``torch``, ``torchaudio`` and ``demucs``; on success logs the
    PyTorch and Torchaudio versions and whether CUDA is available. If any of
    the imports raises ``ImportError`` the failure is logged and the process
    exits with status 1.
    """
    try:
        import torch
        import torchaudio
        import demucs  # imported only to prove the package is installed

        report = (
            f"PyTorch version: {torch.__version__}",
            f"Torchaudio version: {torchaudio.__version__}",
            f"CUDA available: {torch.cuda.is_available()}",
        )
        for entry in report:
            logging.info(entry)
    except ImportError as missing:
        logging.error(f"Environment validation failed: {missing}")
        sys.exit(1)
def create_interface():
    """Build the Gradio interface for vocal isolation and lyric transcription.

    Runs ``validate_environment()`` first, then creates one ``DemucsProcessor``
    and one ``WhisperTranscriber`` that are shared by every request handled by
    the nested ``process_audio`` callback.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    validate_environment()
    processor = DemucsProcessor()
    transcriber = WhisperTranscriber()

    def process_audio(audio_file, whisper_model="base", progress=gr.Progress()):
        """Separate the vocals from ``audio_file`` and transcribe them.

        Args:
            audio_file: Path of the uploaded audio file, or ``None`` when
                nothing was uploaded.
            whisper_model: Model size selected in the dropdown.
                NOTE(review): this value is never forwarded to the
                transcriber in this function, so the dropdown currently has
                no effect here -- confirm how ``WhisperTranscriber`` is meant
                to receive it.
            progress: Gradio progress tracker used to report stage updates.

        Returns:
            ``((sample_rate, samples), lyrics)`` on success, where the audio
            pair comes from ``librosa.load`` on the separated vocals and
            ``lyrics`` is whatever ``transcriber.transcribe`` returns.
            On any failure, ``(None, message)`` with a human-readable message.
        """
        if audio_file is None:
            return None, "Please upload an audio file."

        temp_files = []
        try:
            progress(0, desc="Starting processing")
            logging.info(f"Processing file: {audio_file}")

            with tempfile.TemporaryDirectory() as temp_dir:
                temp_audio_path = os.path.join(temp_dir, "input.wav")
                vocals_output_path = os.path.join(temp_dir, "vocals.wav")

                # Normalise the upload to a 44.1 kHz WAV before separation.
                audio, sr = librosa.load(audio_file, sr=44100)
                # soundfile's keyword is ``samplerate``, not ``sr``.
                sf.write(temp_audio_path, audio, samplerate=sr)
                temp_files.append(temp_audio_path)

                progress(0.1, desc="Separating vocals")
                try:
                    vocals_path = processor.separate_vocals(temp_audio_path)
                    # Copy the separated stem next to the input so it lives
                    # inside the temporary directory.
                    shutil.copy2(vocals_path, vocals_output_path)
                    temp_files.append(vocals_output_path)
                except RuntimeError as e:
                    # logging.exception keeps the traceback for diagnosis.
                    logging.exception(f"Vocal separation failed: {e}")
                    return None, f"Vocal separation failed: {e}"

                # Load the vocals into memory now: the file disappears when
                # the temporary directory is removed.
                vocals_audio, vocals_sr = librosa.load(vocals_output_path, sr=None)

                progress(0.75, desc="Transcribing")
                lyrics = transcriber.transcribe(vocals_output_path)

                progress(1.0, desc="Processing complete")
                # Gradio's numpy audio output takes (sample_rate, data).
                return (vocals_sr, vocals_audio), lyrics
        except Exception as e:
            # Top-level boundary for the UI callback: report the error to the
            # user instead of letting it propagate, but keep the traceback.
            error_message = f"Processing error: {e}"
            logging.exception(error_message)
            return None, error_message
        finally:
            # Best-effort removal of anything that outlived the temporary
            # directory. Only filesystem errors are tolerated here; a bare
            # ``except`` would also swallow KeyboardInterrupt/SystemExit.
            for leftover in temp_files:
                if leftover and os.path.exists(leftover):
                    try:
                        os.remove(leftover)
                    except OSError as cleanup_error:
                        logging.warning(
                            "Could not remove temporary file %s: %s",
                            leftover,
                            cleanup_error,
                        )

    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(label="Upload Audio File", type="filepath"),
            gr.Dropdown(
                choices=["tiny", "base", "small", "medium", "large-v2"],
                value="medium",
                label="Whisper Model Size",
            ),
        ],
        outputs=[
            gr.Audio(label="Isolated Vocals", type="numpy"),
            gr.Textbox(label="Transcribed Lyrics", lines=10, max_lines=20),
        ],
        title="Audio Lyrics Extractor",
        description=(
            "Upload an audio file to extract vocals and transcribe lyrics\n"
            " Created by Ever Olivares - Looking for Summer 2025 Internship Opportunities\n"
            " Connect with me: [LinkedIn](https://www.linkedin.com/in/everolivares/)"
            " Currently not working as intended on HF tested on LightningAI with T4 running largeV2"
        ),
        analytics_enabled=False,
    )
    return interface
if __name__ == "__main__":
    # Do not start the UI when check_dependencies() reports a failure.
    if not check_dependencies():
        print("Please install missing dependencies")
        # sys.exit, not the bare exit() helper: exit() is injected by the
        # ``site`` module for interactive sessions and is not guaranteed to
        # exist (e.g. ``python -S`` or frozen builds).
        sys.exit(1)
    interface = create_interface()
    interface.launch()
|