Spaces:

ASLP-lab
/

OSUM

Runtime error

App Files Community

tomxxie committed on Feb 15

Commit

66817ed

1 Parent(s): d845e75

适配zeroGPU

Browse files

Files changed (4) hide show

.idea/OSUM.iml +1 -1
.idea/misc.xml +1 -1
app.py +78 -84
实验室.png → lab.png +0 -0

.idea/OSUM.iml CHANGED Viewed

@@ -4,7 +4,7 @@
     <content url="file://$MODULE_DIR$">
       <excludeFolder url="file://$MODULE_DIR$/venv" />
     </content>
-    <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyDocumentationSettings">

     <content url="file://$MODULE_DIR$">
       <excludeFolder url="file://$MODULE_DIR$/venv" />
     </content>
+    <orderEntry type="jdk" jdkName="k2_gxl" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyDocumentationSettings">

.idea/misc.xml CHANGED Viewed

@@ -3,5 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.12 (OSUM)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (OSUM)" project-jdk-type="Python SDK" />
 </project>

   <component name="Black">
     <option name="sdkName" value="Python 3.12 (OSUM)" />
   </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="k2_gxl" project-jdk-type="Python SDK" />
 </project>

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import base64
 import json
 import time
 import spaces
 import gradio as gr
@@ -9,19 +11,18 @@ import os
 import sys
-# sys.path.insert(0, '../../../../')
-# from gxl_ai_utils.utils import utils_file
-# from wenet.utils.init_tokenizer import init_tokenizer
-# from gxl_ai_utils.config.gxl_config import GxlNode
-# from wenet.utils.init_model import init_model
 import logging
-# import librosa
-# import torch
-# import torchaudio
-# import numpy as np
 # 将图片转换为 Base64
-with open("./实验室.png", "rb") as image_file:
     encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
 # with open("./cat.jpg", "rb") as image_file:
@@ -44,81 +45,74 @@ TASK_PROMPT_MAPPING = {
     "STTC (Speech to Text Chat)": "首先将语音转录为文字，然后对语音内容进行回复，转录和文字之间使用<开始回答>分割。"
 }
-gpu_id = 4
-# def init_model_my():
-#     logging.basicConfig(level=logging.DEBUG,
-#                         format='%(asctime)s %(levelname)s %(message)s')
-#     config_path = "/home/node54_tmpdata/xlgeng/code/wenet_undersdand_and_speech_xlgeng/examples/wenetspeech/whisper/exp/update_data/epoch_1_with_token/epoch_11.yaml"
-#     #config_path = "/home/work_nfs15/asr_data/ckpt/understanding_model/step_24999.yaml"
-#
-#     checkpoint_path = "/home/node54_tmpdata/xlgeng/code/wenet_undersdand_and_speech_xlgeng/examples/wenetspeech/whisper/exp/update_data/epoch_1_with_token/epoch_11.pt"
-#     checkpoint_path = "/home/work_nfs15/asr_data/ckpt/understanding_model/epoch4/step_21249.pt"
-#     checkpoint_path = "/home/work_nfs15/asr_data/ckpt/understanding_model/epoch_13_with_asr-chat_full_data/step_32499/step_32499.pt"
-#     args = GxlNode({
-#         "checkpoint": checkpoint_path,
-#     })
-#     configs = utils_file.load_dict_from_yaml(config_path)
-#     model, configs = init_model(args, configs)
-#     model = model.cuda(gpu_id)
-#     tokenizer = init_tokenizer(configs)
-#     print(model)
-#     return model, tokenizer
-#
 # model, tokenizer = init_model_my()
-#
-# def do_resample(input_wav_path, output_wav_path):
-#     """"""
-#     print(f'input_wav_path: {input_wav_path}, output_wav_path: {output_wav_path}')
-#     waveform, sample_rate = torchaudio.load(input_wav_path)
-#     # 检查音频的维度
-#     num_channels = waveform.shape[0]
-#     # 如果音频是多通道的，则进行通道平均
-#     if num_channels > 1:
-#         waveform = torch.mean(waveform, dim=0, keepdim=True)
-#     waveform = torchaudio.transforms.Resample(
-#         orig_freq=sample_rate, new_freq=16000)(waveform)
-#     utils_file.makedir_for_file(output_wav_path)
-#     torchaudio.save(output_wav_path, waveform, 16000)
-#
-# def true_decode_fuc(input_wav_path, input_prompt):
-#     # input_prompt = TASK_PROMPT_MAPPING.get(input_prompt, "未知任务类型")
-#     print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
-#     timestamp_ms = int(time.time() * 1000)
-#     now_file_tmp_path_resample = f'/home/xlgeng/.cache/.temp/{timestamp_ms}_resample.wav'
-#     do_resample(input_wav_path, now_file_tmp_path_resample)
-#     # tmp_vad_path = f'/home/xlgeng/.cache/.temp/{timestamp_ms}_vad.wav'
-#     # remove_silence_torchaudio_ends(now_file_tmp_path_resample, tmp_vad_path)
-#     # input_wav_path  = tmp_vad_path
-#     input_wav_path = now_file_tmp_path_resample
-#     waveform, sample_rate = torchaudio.load(input_wav_path)
-#     waveform = waveform.squeeze(0)  # (channel=1, sample) -> (sample,)
-#     print(f'wavform shape: {waveform.shape}, sample_rate: {sample_rate}')
-#     window = torch.hann_window(400)
-#     stft = torch.stft(waveform,
-#                       400,
-#                       160,
-#                       window=window,
-#                       return_complex=True)
-#     magnitudes = stft[..., :-1].abs() ** 2
-#
-#     filters = torch.from_numpy(
-#         librosa.filters.mel(sr=sample_rate,
-#                             n_fft=400,
-#                             n_mels=80))
-#     mel_spec = filters @ magnitudes
-#
-#     # NOTE(xcsong): https://github.com/openai/whisper/discussions/269
-#     log_spec = torch.clamp(mel_spec, min=1e-10).log10()
-#     log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
-#     log_spec = (log_spec + 4.0) / 4.0
-#     feat = log_spec.transpose(0, 1)
-#     feat_lens = torch.tensor([feat.shape[0]], dtype=torch.int64).to(gpu_id)
-#     feat = feat.unsqueeze(0).to(gpu_id)
-#     # feat = feat.half()
-#     # feat_lens = feat_lens.half()
-#     res_text = model.generate(wavs=feat, wavs_len=feat_lens, prompt=input_prompt)[0]
-#     print("耿雪龙哈哈：", res_text)
-#     return res_text, now_file_tmp_path_resample
 @spaces.GPU
 def do_decode(input_wav_path, input_prompt):
     print(f'input_wav_path= {input_wav_path}, input_prompt= {input_prompt}')

 import base64
 import json
 import time
+from types import SimpleNamespace
 import spaces
 import gradio as gr
 import sys
+sys.path.insert(0, './')
+from gxl_ai_utils.utils import utils_file
+from wenet.utils.init_tokenizer import init_tokenizer
+from wenet.utils.init_model import init_model
 import logging
+import librosa
+import torch
+import torchaudio
+import numpy as np
 # 将图片转换为 Base64
+with open("lab.png", "rb") as image_file:
     encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
 # with open("./cat.jpg", "rb") as image_file:
     "STTC (Speech to Text Chat)": "首先将语音转录为文字，然后对语音内容进行回复，转录和文字之间使用<开始回答>分割。"
 }
+def init_model_my():
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    config_path = "/home/node54_tmpdata/xlgeng/code/wenet_undersdand_and_speech_xlgeng/examples/wenetspeech/whisper/exp/update_data/epoch_1_with_token/epoch_11.yaml"
+    checkpoint_path = "/home/work_nfs15/asr_data/ckpt/understanding_model/epoch_13_with_asr-chat_full_data/step_32499/step_32499.pt"
+    args = SimpleNamespace(**{
+        "checkpoint": checkpoint_path,
+    })
+    configs = utils_file.load_dict_from_yaml(config_path)
+    model, configs = init_model(args, configs)
+    model = model.cuda()
+    tokenizer = init_tokenizer(configs)
+    print(model)
+    return model, tokenizer
 # model, tokenizer = init_model_my()
+print("model init success")
+def do_resample(input_wav_path, output_wav_path):
+    """"""
+    print(f'input_wav_path: {input_wav_path}, output_wav_path: {output_wav_path}')
+    waveform, sample_rate = torchaudio.load(input_wav_path)
+    # 检查音频的维度
+    num_channels = waveform.shape[0]
+    # 如果音频是多通道的，则进行通道平均
+    if num_channels > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    waveform = torchaudio.transforms.Resample(
+        orig_freq=sample_rate, new_freq=16000)(waveform)
+    utils_file.makedir_for_file(output_wav_path)
+    torchaudio.save(output_wav_path, waveform, 16000)
+def true_decode_fuc(input_wav_path, input_prompt):
+    # input_prompt = TASK_PROMPT_MAPPING.get(input_prompt, "未知任务类型")
+    print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
+    timestamp_ms = int(time.time() * 1000)
+    now_file_tmp_path_resample = f'/home/xlgeng/.cache/.temp/{timestamp_ms}_resample.wav'
+    do_resample(input_wav_path, now_file_tmp_path_resample)
+    input_wav_path = now_file_tmp_path_resample
+    waveform, sample_rate = torchaudio.load(input_wav_path)
+    waveform = waveform.squeeze(0)  # (channel=1, sample) -> (sample,)
+    print(f'wavform shape: {waveform.shape}, sample_rate: {sample_rate}')
+    window = torch.hann_window(400)
+    stft = torch.stft(waveform,
+                      400,
+                      160,
+                      window=window,
+                      return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+    filters = torch.from_numpy(
+        librosa.filters.mel(sr=sample_rate,
+                            n_fft=400,
+                            n_mels=80))
+    mel_spec = filters @ magnitudes
+    # NOTE(xcsong): https://github.com/openai/whisper/discussions/269
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    feat = log_spec.transpose(0, 1)
+    feat_lens = torch.tensor([feat.shape[0]], dtype=torch.int64).cuda()
+    feat = feat.unsqueeze(0).cuda()
+    # feat = feat.half()
+    # feat_lens = feat_lens.half()
+    model = None
+    res_text = model.generate(wavs=feat, wavs_len=feat_lens, prompt=input_prompt)[0]
+    print("耿雪龙哈哈：", res_text)
+    return res_text, now_file_tmp_path_resample
 @spaces.GPU
 def do_decode(input_wav_path, input_prompt):
     print(f'input_wav_path= {input_wav_path}, input_prompt= {input_prompt}')

实验室.png → lab.png RENAMED Viewed

File without changes