Spaces:

datasciencesage
/

AudioModels

Sleeping

App Files Files Community

datasciencesage committed on Dec 29, 2024

Commit

8fa04cd

1 Parent(s): c6db08c

app.py

Browse files

Files changed (1) hide show

app.py +202 -28

app.py CHANGED Viewed

@@ -1,8 +1,193 @@
 import os
 os.environ["KERAS_BACKEND"] = "jax"
 os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 import logging
-from pathlib import Path
 import numpy as np
 import librosa
 import tensorflow_hub as hub
@@ -10,23 +195,19 @@ from flask import Flask, render_template, request, jsonify, session
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import keras
 import torch
-from werkzeug.utils import secure_filename
 import traceback
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     handlers=[
-        logging.FileHandler('app.log'),
         logging.StreamHandler()
     ]
 )
 logger = logging.getLogger(__name__)
-# Environment setup
 class AudioProcessor:
     _instance = None
     _initialized = False
@@ -78,14 +259,15 @@ class AudioProcessor:
             logger.error(f"Error initializing models: {str(e)}")
             raise
-    def load_wav_16k_mono(self, filename):
         try:
-            wav, sr = librosa.load(filename, mono=True, sr=None)
             if sr != 16000:
                 wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
             return wav
         except Exception as e:
-            logger.error(f"Error loading audio file: {str(e)}")
             raise
     def get_features_yamnet_extract_embedding(self, wav_data):
@@ -99,12 +281,8 @@ class AudioProcessor:
 # Initialize Flask application
 app = Flask(__name__)
 app.secret_key = 'your_secret_key_here'
-app.config['UPLOAD_FOLDER'] = Path('uploads')
 app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
-# Create upload folder
-app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True)
 # Initialize audio processor (will only happen once)
 audio_processor = AudioProcessor()
@@ -130,13 +308,12 @@ def process():
                 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
             })
     except Exception as e:
-        logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
         session.pop('operation', None)
         return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
 @app.route('/upload', methods=['POST'])
 def upload():
-    filepath = None
     try:
         operation = session.get('operation')
         if not operation:
@@ -151,18 +328,18 @@ def upload():
         if file.filename == '' or not file.filename.lower().endswith('.mp3'):
             return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
-        filename = secure_filename(file.filename)
-        filepath = app.config['UPLOAD_FOLDER'] / filename
-        file.save(filepath)
-        wav_data = audio_processor.load_wav_16k_mono(filepath)
         if operation == 'classify':
             embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
             embeddings = np.reshape(embeddings, (-1, 1024))
             result = np.argmax(audio_processor.classification_model.predict(embeddings))
         elif operation == 'transcribe':
-            result = audio_processor.pipe(str(filepath))['text']
         else:
             result = 'Invalid operation'
@@ -172,15 +349,12 @@ def upload():
         })
     except Exception as e:
-        logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
         return jsonify({
             'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
         })
     finally:
         session.pop('operation', None)
-        if filepath and Path(filepath).exists():
-            try:
-                Path(filepath).unlink()
-            except Exception as e:
-                logger.error(f"Error deleting file {filepath}: {str(e)}")

+# import os
+# os.environ["KERAS_BACKEND"] = "jax"
+# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+# import logging
+# from pathlib import Path
+# import numpy as np
+# import librosa
+# import tensorflow_hub as hub
+# from flask import Flask, render_template, request, jsonify, session
+# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+# import keras
+# import torch
+# from werkzeug.utils import secure_filename
+# import traceback
+# # Configure logging
+# logging.basicConfig(
+#     level=logging.INFO,
+#     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+#     handlers=[
+#         logging.FileHandler('app.log'),
+#         logging.StreamHandler()
+#     ]
+# )
+# logger = logging.getLogger(__name__)
+# # Environment setup
+# class AudioProcessor:
+#     _instance = None
+#     _initialized = False
+#     def __new__(cls):
+#         if cls._instance is None:
+#             cls._instance = super(AudioProcessor, cls).__new__(cls)
+#         return cls._instance
+#     def __init__(self):
+#         if not AudioProcessor._initialized:
+#             self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+#             self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+#             self.initialize_models()
+#             AudioProcessor._initialized = True
+#     def initialize_models(self):
+#         try:
+#             logger.info("Initializing models...")
+#             # Initialize transcription model
+#             model_id = "distil-whisper/distil-large-v3"
+#             self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+#                 model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+#             )
+#             self.transcription_model.to(self.device)
+#             self.processor = AutoProcessor.from_pretrained(model_id)
+#             # Initialize classification model
+#             self.classification_model = keras.saving.load_model("hf://datasciencesage/attentionaudioclassification")
+#             # Initialize pipeline
+#             self.pipe = pipeline(
+#                 "automatic-speech-recognition",
+#                 model=self.transcription_model,
+#                 tokenizer=self.processor.tokenizer,
+#                 feature_extractor=self.processor.feature_extractor,
+#                 max_new_tokens=128,
+#                 chunk_length_s=25,
+#                 batch_size=16,
+#                 torch_dtype=self.torch_dtype,
+#                 device=self.device,
+#             )
+#             # Initialize YAMNet model
+#             self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
+#             logger.info("Models initialized successfully")
+#         except Exception as e:
+#             logger.error(f"Error initializing models: {str(e)}")
+#             raise
+#     def load_wav_16k_mono(self, filename):
+#         try:
+#             wav, sr = librosa.load(filename, mono=True, sr=None)
+#             if sr != 16000:
+#                 wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
+#             return wav
+#         except Exception as e:
+#             logger.error(f"Error loading audio file: {str(e)}")
+#             raise
+#     def get_features_yamnet_extract_embedding(self, wav_data):
+#         try:
+#             scores, embeddings, spectrogram = self.yamnet_model(wav_data)
+#             return np.mean(embeddings.numpy(), axis=0)
+#         except Exception as e:
+#             logger.error(f"Error extracting YAMNet embeddings: {str(e)}")
+#             raise
+# # Initialize Flask application
+# app = Flask(__name__)
+# app.secret_key = 'your_secret_key_here'
+# app.config['UPLOAD_FOLDER'] = Path('uploads')
+# app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
+# # Create upload folder
+# app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True)
+# # Initialize audio processor (will only happen once)
+# audio_processor = AudioProcessor()
+# @app.route('/')
+# def index():
+#     session.clear()
+#     return render_template('terminal.html')
+# @app.route('/process', methods=['POST'])
+# def process():
+#     try:
+#         data = request.json
+#         command = data.get('command', '').strip().lower()
+#         if command in ['classify', 'transcribe']:
+#             session['operation'] = command
+#             return jsonify({
+#                 'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
+#                 'upload': True
+#             })
+#         else:
+#             return jsonify({
+#                 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
+#             })
+#     except Exception as e:
+#         logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
+#         session.pop('operation', None)
+#         return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
+# @app.route('/upload', methods=['POST'])
+# def upload():
+#     filepath = None
+#     try:
+#         operation = session.get('operation')
+#         if not operation:
+#             return jsonify({
+#                 'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
+#             })
+#         if 'file' not in request.files:
+#             return jsonify({'result': 'root@math:~$ No file uploaded.'})
+#         file = request.files['file']
+#         if file.filename == '' or not file.filename.lower().endswith('.mp3'):
+#             return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
+#         filename = secure_filename(file.filename)
+#         filepath = app.config['UPLOAD_FOLDER'] / filename
+#         file.save(filepath)
+#         wav_data = audio_processor.load_wav_16k_mono(filepath)
+#         if operation == 'classify':
+#             embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
+#             embeddings = np.reshape(embeddings, (-1, 1024))
+#             result = np.argmax(audio_processor.classification_model.predict(embeddings))
+#         elif operation == 'transcribe':
+#             result = audio_processor.pipe(str(filepath))['text']
+#         else:
+#             result = 'Invalid operation'
+#         return jsonify({
+#             'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
+#             'upload': False
+#         })
+#     except Exception as e:
+#         logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
+#         return jsonify({
+#             'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
+#         })
+#     finally:
+#         session.pop('operation', None)
+#         if filepath and Path(filepath).exists():
+#             try:
+#                 Path(filepath).unlink()
+#             except Exception as e:
+#                 logger.error(f"Error deleting file {filepath}: {str(e)}")
 import os
 os.environ["KERAS_BACKEND"] = "jax"
 os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 import logging
 import numpy as np
 import librosa
 import tensorflow_hub as hub
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import keras
 import torch
+import io
 import traceback
+# Configure logging to print to terminal only
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     handlers=[
         logging.StreamHandler()
     ]
 )
 logger = logging.getLogger(__name__)
 class AudioProcessor:
     _instance = None
     _initialized = False
             logger.error(f"Error initializing models: {str(e)}")
             raise
+    def load_wav_16k_mono(self, audio_data):
         try:
+            # Load audio from bytes buffer instead of file
+            wav, sr = librosa.load(io.BytesIO(audio_data), mono=True, sr=None)
             if sr != 16000:
                 wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
             return wav
         except Exception as e:
+            logger.error(f"Error loading audio data: {str(e)}")
             raise
     def get_features_yamnet_extract_embedding(self, wav_data):
 # Initialize Flask application
 app = Flask(__name__)
 app.secret_key = 'your_secret_key_here'
 app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
 # Initialize audio processor (will only happen once)
 audio_processor = AudioProcessor()
                 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
             })
     except Exception as e:
+        logger.error(f"Error in process route: {str(e)}")
         session.pop('operation', None)
         return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
 @app.route('/upload', methods=['POST'])
 def upload():
     try:
         operation = session.get('operation')
         if not operation:
         if file.filename == '' or not file.filename.lower().endswith('.mp3'):
             return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
+        # Read file content into memory
+        audio_data = file.read()
+        wav_data = audio_processor.load_wav_16k_mono(audio_data)
         if operation == 'classify':
             embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
             embeddings = np.reshape(embeddings, (-1, 1024))
             result = np.argmax(audio_processor.classification_model.predict(embeddings))
         elif operation == 'transcribe':
+            # Create temporary buffer for transcription
+            audio_buffer = io.BytesIO(audio_data)
+            result = audio_processor.pipe(audio_buffer)['text']
         else:
             result = 'Invalid operation'
         })
     except Exception as e:
+        logger.error(f"Error in upload route: {str(e)}")
         return jsonify({
             'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
         })
     finally:
         session.pop('operation', None)
+# if __name__ == '__main__':
+#     app.run(host='0.0.0.0', port=7860)