datasciencesage committed on
Commit
8fa04cd
·
1 Parent(s): c6db08c
Files changed (1) hide show
  1. app.py +202 -28
app.py CHANGED
@@ -1,8 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  os.environ["KERAS_BACKEND"] = "jax"
3
  os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
4
  import logging
5
- from pathlib import Path
6
  import numpy as np
7
  import librosa
8
  import tensorflow_hub as hub
@@ -10,23 +195,19 @@ from flask import Flask, render_template, request, jsonify, session
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
11
  import keras
12
  import torch
13
- from werkzeug.utils import secure_filename
14
  import traceback
15
 
16
- # Configure logging
17
  logging.basicConfig(
18
  level=logging.INFO,
19
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
  handlers=[
21
- logging.FileHandler('app.log'),
22
  logging.StreamHandler()
23
  ]
24
  )
25
  logger = logging.getLogger(__name__)
26
 
27
- # Environment setup
28
-
29
-
30
  class AudioProcessor:
31
  _instance = None
32
  _initialized = False
@@ -78,14 +259,15 @@ class AudioProcessor:
78
  logger.error(f"Error initializing models: {str(e)}")
79
  raise
80
 
81
- def load_wav_16k_mono(self, filename):
82
  try:
83
- wav, sr = librosa.load(filename, mono=True, sr=None)
 
84
  if sr != 16000:
85
  wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
86
  return wav
87
  except Exception as e:
88
- logger.error(f"Error loading audio file: {str(e)}")
89
  raise
90
 
91
  def get_features_yamnet_extract_embedding(self, wav_data):
@@ -99,12 +281,8 @@ class AudioProcessor:
99
  # Initialize Flask application
100
  app = Flask(__name__)
101
  app.secret_key = 'your_secret_key_here'
102
- app.config['UPLOAD_FOLDER'] = Path('uploads')
103
  app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
104
 
105
- # Create upload folder
106
- app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True)
107
-
108
  # Initialize audio processor (will only happen once)
109
  audio_processor = AudioProcessor()
110
 
@@ -130,13 +308,12 @@ def process():
130
  'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
131
  })
132
  except Exception as e:
133
- logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
134
  session.pop('operation', None)
135
  return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
136
 
137
  @app.route('/upload', methods=['POST'])
138
  def upload():
139
- filepath = None
140
  try:
141
  operation = session.get('operation')
142
  if not operation:
@@ -151,18 +328,18 @@ def upload():
151
  if file.filename == '' or not file.filename.lower().endswith('.mp3'):
152
  return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
153
 
154
- filename = secure_filename(file.filename)
155
- filepath = app.config['UPLOAD_FOLDER'] / filename
156
-
157
- file.save(filepath)
158
- wav_data = audio_processor.load_wav_16k_mono(filepath)
159
 
160
  if operation == 'classify':
161
  embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
162
  embeddings = np.reshape(embeddings, (-1, 1024))
163
  result = np.argmax(audio_processor.classification_model.predict(embeddings))
164
  elif operation == 'transcribe':
165
- result = audio_processor.pipe(str(filepath))['text']
 
 
166
  else:
167
  result = 'Invalid operation'
168
 
@@ -172,15 +349,12 @@ def upload():
172
  })
173
 
174
  except Exception as e:
175
- logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
176
  return jsonify({
177
  'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
178
  })
179
  finally:
180
  session.pop('operation', None)
181
- if filepath and Path(filepath).exists():
182
- try:
183
- Path(filepath).unlink()
184
- except Exception as e:
185
- logger.error(f"Error deleting file {filepath}: {str(e)}")
186
 
 
 
 
1
+ # import os
2
+ # os.environ["KERAS_BACKEND"] = "jax"
3
+ # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
4
+ # import logging
5
+ # from pathlib import Path
6
+ # import numpy as np
7
+ # import librosa
8
+ # import tensorflow_hub as hub
9
+ # from flask import Flask, render_template, request, jsonify, session
10
+ # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
11
+ # import keras
12
+ # import torch
13
+ # from werkzeug.utils import secure_filename
14
+ # import traceback
15
+
16
+ # # Configure logging
17
+ # logging.basicConfig(
18
+ # level=logging.INFO,
19
+ # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
+ # handlers=[
21
+ # logging.FileHandler('app.log'),
22
+ # logging.StreamHandler()
23
+ # ]
24
+ # )
25
+ # logger = logging.getLogger(__name__)
26
+
27
+ # # Environment setup
28
+
29
+
30
+ # class AudioProcessor:
31
+ # _instance = None
32
+ # _initialized = False
33
+
34
+ # def __new__(cls):
35
+ # if cls._instance is None:
36
+ # cls._instance = super(AudioProcessor, cls).__new__(cls)
37
+ # return cls._instance
38
+
39
+ # def __init__(self):
40
+ # if not AudioProcessor._initialized:
41
+ # self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
42
+ # self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
43
+ # self.initialize_models()
44
+ # AudioProcessor._initialized = True
45
+
46
+ # def initialize_models(self):
47
+ # try:
48
+ # logger.info("Initializing models...")
49
+ # # Initialize transcription model
50
+ # model_id = "distil-whisper/distil-large-v3"
51
+ # self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
52
+ # model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
53
+ # )
54
+ # self.transcription_model.to(self.device)
55
+ # self.processor = AutoProcessor.from_pretrained(model_id)
56
+
57
+ # # Initialize classification model
58
+ # self.classification_model = keras.saving.load_model("hf://datasciencesage/attentionaudioclassification")
59
+
60
+ # # Initialize pipeline
61
+ # self.pipe = pipeline(
62
+ # "automatic-speech-recognition",
63
+ # model=self.transcription_model,
64
+ # tokenizer=self.processor.tokenizer,
65
+ # feature_extractor=self.processor.feature_extractor,
66
+ # max_new_tokens=128,
67
+ # chunk_length_s=25,
68
+ # batch_size=16,
69
+ # torch_dtype=self.torch_dtype,
70
+ # device=self.device,
71
+ # )
72
+
73
+ # # Initialize YAMNet model
74
+ # self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
75
+
76
+ # logger.info("Models initialized successfully")
77
+ # except Exception as e:
78
+ # logger.error(f"Error initializing models: {str(e)}")
79
+ # raise
80
+
81
+ # def load_wav_16k_mono(self, filename):
82
+ # try:
83
+ # wav, sr = librosa.load(filename, mono=True, sr=None)
84
+ # if sr != 16000:
85
+ # wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
86
+ # return wav
87
+ # except Exception as e:
88
+ # logger.error(f"Error loading audio file: {str(e)}")
89
+ # raise
90
+
91
+ # def get_features_yamnet_extract_embedding(self, wav_data):
92
+ # try:
93
+ # scores, embeddings, spectrogram = self.yamnet_model(wav_data)
94
+ # return np.mean(embeddings.numpy(), axis=0)
95
+ # except Exception as e:
96
+ # logger.error(f"Error extracting YAMNet embeddings: {str(e)}")
97
+ # raise
98
+
99
+ # # Initialize Flask application
100
+ # app = Flask(__name__)
101
+ # app.secret_key = 'your_secret_key_here'
102
+ # app.config['UPLOAD_FOLDER'] = Path('uploads')
103
+ # app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
104
+
105
+ # # Create upload folder
106
+ # app.config['UPLOAD_FOLDER'].mkdir(exist_ok=True)
107
+
108
+ # # Initialize audio processor (will only happen once)
109
+ # audio_processor = AudioProcessor()
110
+
111
+ # @app.route('/')
112
+ # def index():
113
+ # session.clear()
114
+ # return render_template('terminal.html')
115
+
116
+ # @app.route('/process', methods=['POST'])
117
+ # def process():
118
+ # try:
119
+ # data = request.json
120
+ # command = data.get('command', '').strip().lower()
121
+
122
+ # if command in ['classify', 'transcribe']:
123
+ # session['operation'] = command
124
+ # return jsonify({
125
+ # 'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
126
+ # 'upload': True
127
+ # })
128
+ # else:
129
+ # return jsonify({
130
+ # 'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
131
+ # })
132
+ # except Exception as e:
133
+ # logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
134
+ # session.pop('operation', None)
135
+ # return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
136
+
137
+ # @app.route('/upload', methods=['POST'])
138
+ # def upload():
139
+ # filepath = None
140
+ # try:
141
+ # operation = session.get('operation')
142
+ # if not operation:
143
+ # return jsonify({
144
+ # 'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
145
+ # })
146
+
147
+ # if 'file' not in request.files:
148
+ # return jsonify({'result': 'root@math:~$ No file uploaded.'})
149
+
150
+ # file = request.files['file']
151
+ # if file.filename == '' or not file.filename.lower().endswith('.mp3'):
152
+ # return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
153
+
154
+ # filename = secure_filename(file.filename)
155
+ # filepath = app.config['UPLOAD_FOLDER'] / filename
156
+
157
+ # file.save(filepath)
158
+ # wav_data = audio_processor.load_wav_16k_mono(filepath)
159
+
160
+ # if operation == 'classify':
161
+ # embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
162
+ # embeddings = np.reshape(embeddings, (-1, 1024))
163
+ # result = np.argmax(audio_processor.classification_model.predict(embeddings))
164
+ # elif operation == 'transcribe':
165
+ # result = audio_processor.pipe(str(filepath))['text']
166
+ # else:
167
+ # result = 'Invalid operation'
168
+
169
+ # return jsonify({
170
+ # 'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
171
+ # 'upload': False
172
+ # })
173
+
174
+ # except Exception as e:
175
+ # logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
176
+ # return jsonify({
177
+ # 'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
178
+ # })
179
+ # finally:
180
+ # session.pop('operation', None)
181
+ # if filepath and Path(filepath).exists():
182
+ # try:
183
+ # Path(filepath).unlink()
184
+ # except Exception as e:
185
+ # logger.error(f"Error deleting file {filepath}: {str(e)}")
186
+
187
  import os
188
  os.environ["KERAS_BACKEND"] = "jax"
189
  os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
190
  import logging
 
191
  import numpy as np
192
  import librosa
193
  import tensorflow_hub as hub
 
195
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
196
  import keras
197
  import torch
198
+ import io
199
  import traceback
200
 
201
+ # Configure logging to print to terminal only
202
  logging.basicConfig(
203
  level=logging.INFO,
204
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
205
  handlers=[
 
206
  logging.StreamHandler()
207
  ]
208
  )
209
  logger = logging.getLogger(__name__)
210
 
 
 
 
211
  class AudioProcessor:
212
  _instance = None
213
  _initialized = False
 
259
  logger.error(f"Error initializing models: {str(e)}")
260
  raise
261
 
262
+ def load_wav_16k_mono(self, audio_data):
263
  try:
264
+ # Load audio from bytes buffer instead of file
265
+ wav, sr = librosa.load(io.BytesIO(audio_data), mono=True, sr=None)
266
  if sr != 16000:
267
  wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
268
  return wav
269
  except Exception as e:
270
+ logger.error(f"Error loading audio data: {str(e)}")
271
  raise
272
 
273
  def get_features_yamnet_extract_embedding(self, wav_data):
 
281
  # Initialize Flask application
282
  app = Flask(__name__)
283
  app.secret_key = 'your_secret_key_here'
 
284
  app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
285
 
 
 
 
286
  # Initialize audio processor (will only happen once)
287
  audio_processor = AudioProcessor()
288
 
 
308
  'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
309
  })
310
  except Exception as e:
311
+ logger.error(f"Error in process route: {str(e)}")
312
  session.pop('operation', None)
313
  return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
314
 
315
  @app.route('/upload', methods=['POST'])
316
  def upload():
 
317
  try:
318
  operation = session.get('operation')
319
  if not operation:
 
328
  if file.filename == '' or not file.filename.lower().endswith('.mp3'):
329
  return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})
330
 
331
+ # Read file content into memory
332
+ audio_data = file.read()
333
+ wav_data = audio_processor.load_wav_16k_mono(audio_data)
 
 
334
 
335
  if operation == 'classify':
336
  embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
337
  embeddings = np.reshape(embeddings, (-1, 1024))
338
  result = np.argmax(audio_processor.classification_model.predict(embeddings))
339
  elif operation == 'transcribe':
340
+ # Create temporary buffer for transcription
341
+ audio_buffer = io.BytesIO(audio_data)
342
+ result = audio_processor.pipe(audio_buffer)['text']
343
  else:
344
  result = 'Invalid operation'
345
 
 
349
  })
350
 
351
  except Exception as e:
352
+ logger.error(f"Error in upload route: {str(e)}")
353
  return jsonify({
354
  'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
355
  })
356
  finally:
357
  session.pop('operation', None)
 
 
 
 
 
358
 
359
+ # if __name__ == '__main__':
360
+ # app.run(host='0.0.0.0', port=7860)