"""Flask web app that classifies or transcribes uploaded .mp3 files.

The client UI (templates/terminal.html) drives a two-step flow:
1. POST /process with {"command": "classify" | "transcribe"} to select an
   operation (stored in the session).
2. POST /upload with an .mp3 file; the selected operation is applied and
   the session operation is cleared.

Classification: YAMNet embeddings -> Keras model hosted on the HF Hub.
Transcription: distil-whisper ASR pipeline (transformers).
All audio is processed in memory; nothing is written to disk.
"""
import os

# Backend/env flags must be set before keras/tensorflow are imported.
os.environ["KERAS_BACKEND"] = "jax"
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import io
import logging
import traceback

import librosa
import numpy as np
import tensorflow_hub as hub
import torch
import keras
from flask import Flask, jsonify, render_template, request, session
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Configure logging to print to terminal only.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)


class AudioProcessor:
    """Process-wide singleton that loads and owns all ML models.

    The singleton + `_initialized` guard ensures the (expensive) model
    downloads and GPU transfers happen exactly once per process.
    """

    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # Guard so repeated construction never reloads the models.
        if not AudioProcessor._initialized:
            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
            self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.initialize_models()
            AudioProcessor._initialized = True

    def initialize_models(self):
        """Load the ASR model/pipeline, the Keras classifier and YAMNet.

        Raises:
            Exception: re-raised after logging if any model fails to load,
                so the app fails fast at startup rather than serving
                half-initialized state.
        """
        try:
            logger.info("Initializing models...")

            # Transcription model (distil-whisper).
            model_id = "distil-whisper/distil-large-v3"
            self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id,
                torch_dtype=self.torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True,
            )
            self.transcription_model.to(self.device)
            self.processor = AutoProcessor.from_pretrained(model_id)

            # Classification model: Keras model hosted on the HF Hub.
            self.classification_model = keras.saving.load_model(
                "hf://datasciencesage/attentionaudioclassification")

            # ASR pipeline wrapping the transcription model.
            self.pipe = pipeline(
                "automatic-speech-recognition",
                model=self.transcription_model,
                tokenizer=self.processor.tokenizer,
                feature_extractor=self.processor.feature_extractor,
                max_new_tokens=128,
                chunk_length_s=25,
                batch_size=16,
                torch_dtype=self.torch_dtype,
                device=self.device,
            )

            # YAMNet audio-event embedding model from TF-Hub.
            self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def load_wav_16k_mono(self, audio_data):
        """Decode in-memory audio bytes to a mono 16 kHz float32 waveform.

        Args:
            audio_data: raw encoded audio bytes (e.g. the content of an
                uploaded .mp3 file).

        Returns:
            1-D numpy array of samples at 16 kHz, as YAMNet and the ASR
            feature extractor expect.
        """
        try:
            # librosa accepts a file-like object, so no temp file is needed.
            wav, sr = librosa.load(io.BytesIO(audio_data), mono=True, sr=None)
            if sr != 16000:
                wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
            return wav
        except Exception as e:
            logger.error(f"Error loading audio data: {str(e)}")
            raise

    def get_features_yamnet_extract_embedding(self, wav_data):
        """Return the time-averaged YAMNet embedding (1024-dim) of a waveform."""
        try:
            scores, embeddings, spectrogram = self.yamnet_model(wav_data)
            # Average over time frames -> one fixed-size clip embedding.
            return np.mean(embeddings.numpy(), axis=0)
        except Exception as e:
            logger.error(f"Error extracting YAMNet embeddings: {str(e)}")
            raise


# Initialize Flask application.
app = Flask(__name__)
# NOTE(review): the hard-coded fallback secret is unsafe for production;
# set FLASK_SECRET_KEY in the environment to override it.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'your_secret_key_here')
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB upload cap

# Initialize audio processor (models load once per process).
audio_processor = AudioProcessor()


@app.route('/')
def index():
    """Serve the terminal UI with a fresh session."""
    session.clear()
    return render_template('terminal.html')


@app.route('/process', methods=['POST'])
def process():
    """Store the requested operation ("classify"/"transcribe") in the session."""
    try:
        # get_json(silent=True) returns None instead of raising on
        # missing/invalid JSON bodies; fall back to an empty dict.
        data = request.get_json(silent=True) or {}
        command = data.get('command', '').strip().lower()
        if command in ['classify', 'transcribe']:
            session['operation'] = command
            return jsonify({
                'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
                'upload': True
            })
        else:
            return jsonify({
                'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
            })
    except Exception as e:
        logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
        session.pop('operation', None)
        return jsonify({'result': f'root@math:~$ Error: {str(e)}'})


@app.route('/upload', methods=['POST'])
def upload():
    """Apply the previously selected operation to an uploaded .mp3 file."""
    try:
        operation = session.get('operation')
        if not operation:
            return jsonify({
                'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
            })
        if 'file' not in request.files:
            return jsonify({'result': 'root@math:~$ No file uploaded.'})
        file = request.files['file']
        if file.filename == '' or not file.filename.lower().endswith('.mp3'):
            return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})

        # Process entirely in memory; nothing is written to disk.
        audio_data = file.read()
        wav_data = audio_processor.load_wav_16k_mono(audio_data)

        if operation == 'classify':
            embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
            embeddings = np.reshape(embeddings, (-1, 1024))
            # NOTE: result is the argmax class *index*; no label mapping here.
            result = np.argmax(audio_processor.classification_model.predict(embeddings))
        elif operation == 'transcribe':
            # FIX: the HF ASR pipeline does not accept a BytesIO object
            # (it takes a path, raw bytes, an ndarray, or a dict); feed the
            # already-decoded 16 kHz waveform instead of re-reading bytes.
            result = audio_processor.pipe(
                {"raw": wav_data, "sampling_rate": 16000})['text']
        else:
            result = 'Invalid operation'

        return jsonify({
            'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
            'upload': False
        })
    except Exception as e:
        logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
        return jsonify({
            'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
        })
    finally:
        # One-shot flow: always require a fresh command for the next upload.
        session.pop('operation', None)


# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=7860)