import os os.environ["KERAS_BACKEND"] = "jax" os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' import logging from pathlib import Path import numpy as np import librosa import tensorflow_hub as hub from flask import Flask, render_template, request, jsonify, session from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline import keras import torch from werkzeug.utils import secure_filename import traceback # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('app.log'), logging.StreamHandler() ] ) logger = logging.getLogger(__name__) # Environment setup class AudioProcessor: _instance = None _initialized = False def __new__(cls): if cls._instance is None: cls._instance = super(AudioProcessor, cls).__new__(cls) return cls._instance def __init__(self): if not AudioProcessor._initialized: self.device = "cuda:0" if torch.cuda.is_available() else "cpu" self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 self.initialize_models() AudioProcessor._initialized = True def initialize_models(self): try: logger.info("Initializing models...") # Initialize transcription model model_id = "distil-whisper/distil-large-v3" self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True ) self.transcription_model.to(self.device) self.processor = AutoProcessor.from_pretrained(model_id) # Initialize classification model self.classification_model = keras.saving.load_model("hf://datasciencesage/attentionaudioclassification") # Initialize pipeline self.pipe = pipeline( "automatic-speech-recognition", model=self.transcription_model, tokenizer=self.processor.tokenizer, feature_extractor=self.processor.feature_extractor, max_new_tokens=128, chunk_length_s=25, batch_size=16, torch_dtype=self.torch_dtype, device=self.device, ) # Initialize YAMNet model self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1') logger.info("Models initialized successfully") except Exception as e: logger.error(f"Error initializing models: {str(e)}") raise def load_wav_16k_mono(self, filename): try: wav, sr = librosa.load(filename, mono=True, sr=None) if sr != 16000: wav = librosa.resample(wav, orig_sr=sr, target_sr=16000) return wav except Exception as e: logger.error(f"Error loading audio file: {str(e)}") raise def get_features_yamnet_extract_embedding(self, wav_data): try: scores, embeddings, spectrogram = self.yamnet_model(wav_data) return np.mean(embeddings.numpy(), axis=0) except Exception as e: logger.error(f"Error extracting YAMNet embeddings: {str(e)}") raise # Initialize Flask application app = Flask(__name__) app.secret_key = 'your_secret_key_here' app.config['UPLOAD_FOLDER'] = Path('uploads') app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # Create upload folder app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True) # Initialize audio processor (will only happen once) audio_processor = AudioProcessor() @app.route('/') def index(): session.clear() return render_template('terminal.html') @app.route('/process', methods=['POST']) def process(): try: data = request.json command = data.get('command', '').strip().lower() if command in ['classify', 'transcribe']: session['operation'] = command return jsonify({ 'result': f'root@math:~$ Upload a .mp3 file for {command} operation.', 'upload': True }) else: return jsonify({ 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".' }) except Exception as e: logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}") session.pop('operation', None) return jsonify({'result': f'root@math:~$ Error: {str(e)}'}) @app.route('/upload', methods=['POST']) def upload(): filepath = None try: operation = session.get('operation') if not operation: return jsonify({ 'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".' }) if 'file' not in request.files: return jsonify({'result': 'root@math:~$ No file uploaded.'}) file = request.files['file'] if file.filename == '' or not file.filename.lower().endswith('.mp3'): return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'}) filename = secure_filename(file.filename) filepath = app.config['UPLOAD_FOLDER'] / filename file.save(filepath) wav_data = audio_processor.load_wav_16k_mono(filepath) if operation == 'classify': embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data) embeddings = np.reshape(embeddings, (-1, 1024)) result = np.argmax(audio_processor.classification_model.predict(embeddings)) elif operation == 'transcribe': result = audio_processor.pipe(str(filepath))['text'] else: result = 'Invalid operation' return jsonify({ 'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".', 'upload': False }) except Exception as e: logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}") return jsonify({ 'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".' }) finally: session.pop('operation', None) if filepath and Path(filepath).exists(): try: Path(filepath).unlink() except Exception as e: logger.error(f"Error deleting file {filepath}: {str(e)}") if __name__ == '__main__': # Set debug=False to prevent reloading app.run(debug=False, host='0.0.0.0', port=5000)