from modelscope.pipelines import pipeline as pipeline_ali
from modelscope.utils.constant import Tasks
from moviepy.editor import VideoFileClip
import httpx, json
import os

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

import ffmpeg
from faster_whisper import WhisperModel
import math
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from slicer2 import Slicer
import librosa
import soundfile
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Local directory for ModelScope model downloads
local_dir_root = "./models_from_modelscope"

# model_dir_cirm = snapshot_download('damo/speech_frcrn_ans_cirm_16k', cache_dir=local_dir_root)
# model_dir_ins = snapshot_download('damo/nlp_csanmt_translation_en2zh', cache_dir=local_dir_root)

model_dir_cirm = f'{ROOT_DIR}/models_from_modelscope/damo/speech_frcrn_ans_cirm_16k'
model_dir_ins = f'{ROOT_DIR}/models_from_modelscope/damo/nlp_csanmt_translation_en2zh'

device = "cuda" if torch.cuda.is_available() else "cpu"

import ollama


def deep_tran(text, _s, _t):
    """Translate one line of text through a local DeepLX server."""
    deeplx_api = "http://127.0.0.1:1188/translate"

    data = {
        "text": text,
        "source_lang": _s,
        "target_lang": _t
    }

    post_data = json.dumps(data)
    # httpx expects raw request bodies via `content=`, not `data=`
    r = httpx.post(url=deeplx_api, content=post_data).json()
    print(r["data"])
    return r["data"]


# Burn subtitles into the video
def merge_sub(video_path, srt_path):
    if os.path.exists("test_srt.mp4"):
        os.remove("test_srt.mp4")

    ffmpeg.input(video_path).output("test_srt.mp4", vf="subtitles=" + srt_path).run()

    return "test_srt.mp4"


def make_tran_ja2zh_neverLife(srt_path):
    model_path = "neverLife/nllb-200-distilled-600M-ja-zh"

    # `from_pt=True` is a TF/Flax loading flag and is meaningless here, so it is dropped
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, src_lang="jpn_Jpan", tgt_lang="zho_Hans")

    # pipe = pipeline(model="larryvrh/mt5-translation-ja_zh")

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:
        line_srt = res.split("\n")
        try:
            # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
            input_ids = tokenizer.encode(line_srt[2], max_length=128, padding=True, return_tensors='pt')
            outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
            translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(translated_text)
        except IndexError:
            # Index out of range: we've run past the last subtitle block
            print("Translation finished")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8") as f:
            f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content
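
# Usage sketch (not part of the pipeline): exercise deep_tran() against a local
# DeepLX server. This assumes the server hard-coded above is already running on
# 127.0.0.1:1188; the sample text and language codes are illustrative only.
def _demo_deep_tran():
    # DeepL-style language codes: "EN" -> "ZH"
    print(deep_tran("Hello, world!", "EN", "ZH"))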
def make_tran_ko2zh(srt_path):
    # pipe = pipeline(model="yesj1234/mbart_cycle1_ko-zh", device=device, from_pt=True)

    model_path = "./model_from_hg/ko-zh/"
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:
        line_srt = res.split("\n")
        try:
            # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
            input_ids = tokenizer.encode(line_srt[2], max_length=128, padding=True, return_tensors='pt')
            outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
            translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(translated_text)
        except IndexError:
            # Index out of range: past the last subtitle block
            print("Translation finished")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8") as f:
            f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


def make_tran_ja2zh(srt_path):
    # pipe = pipeline(model="larryvrh/mt5-translation-ja_zh", device=device)

    model_path = "./model_from_hg/ja-zh/"
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:
        line_srt = res.split("\n")
        try:
            # translated_text = pipe(f'<-ja2zh-> {line_srt[2]}')[0]['translation_text']
            input_ids = tokenizer.encode(f'<-ja2zh-> {line_srt[2]}', max_length=128, padding=True, return_tensors='pt')
            outputs = model.generate(input_ids, num_beams=4, max_new_tokens=128)
            translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(translated_text)
        except IndexError:
            print("Translation finished")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8") as f:
            f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


def make_tran_zh2en(srt_path):
    model_path = "./model_from_hg/zh-en/"
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:
        line_srt = res.split("\n")
        try:
            tokenized_text = tokenizer.prepare_seq2seq_batch([line_srt[2]], return_tensors='pt')
            translation = model.generate(**tokenized_text)
            # Skip special tokens at decode time (the original decoded with
            # skip_special_tokens=False and stripped the token strings by hand;
            # those strings were lost in formatting)
            translated_text = tokenizer.batch_decode(translation, skip_special_tokens=True)[0].strip()
            print(translated_text)
        except IndexError:
            print("Translation finished")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8") as f:
            f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


# Translate subtitles: English -> Chinese
def make_tran(srt_path):
    model_path = "./model_from_hg/en-zh/"
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

    with open(srt_path, 'r', encoding="utf-8") as file:
        gweight_data = file.read()

    result = gweight_data.split("\n\n")

    if os.path.exists("./two.srt"):
        os.remove("./two.srt")

    for res in result:
        line_srt = res.split("\n")
        try:
            tokenized_text = tokenizer.prepare_seq2seq_batch([line_srt[2]], return_tensors='pt')
            translation = model.generate(**tokenized_text)
            # Same decode fix as make_tran_zh2en above
            translated_text = tokenizer.batch_decode(translation, skip_special_tokens=True)[0].strip()
            print(translated_text)
        except IndexError:
            print("Translation finished")
            break
        except Exception as e:
            print(str(e))

        with open("./two.srt", "a", encoding="utf-8") as f:
            f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n")

    with open("./two.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content
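
# All of the make_tran_* functions above assume plain three-line SRT blocks
# (index, timing, one text line) separated by blank lines, which is what
# make_srt() below produces. A minimal sketch that writes such a file
# (the file name is hypothetical):
def _demo_srt_input(path="demo.srt"):
    block = "1\n00:00:00,000 --> 00:00:02,500\nHello there.\n\n"
    with open(path, "w", encoding="utf-8") as f:
        f.write(block)
    return path  # e.g. make_tran(_demo_srt_input()) with the en-zh model in place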
f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n") with open("./two.srt","r",encoding="utf-8") as f: content = f.read() return content # 翻译字幕 deepl def make_tran_deep(srt_path,_s,_t): with open(srt_path, 'r',encoding="utf-8") as file: gweight_data = file.read() result = gweight_data.split("\n\n") if os.path.exists(f"{ROOT_DIR}/output/two.srt"): os.remove(f"{ROOT_DIR}/output/two.srt") if os.path.exists(f"{ROOT_DIR}/output/t_sin_{_t}.srt"): os.remove(f"{ROOT_DIR}/output/t_sin_{_t}.srt") for res in result: line_srt = res.split("\n") try: text = line_srt[2] translated_text = deep_tran(text,_s,_t) with open(f"{ROOT_DIR}/output/two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n") with open(f"{ROOT_DIR}/output/t_sin_{_t}.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{translated_text}\n\n") except IndexError as e: print(str(e)) # 处理下标越界异常 print(f"翻译完毕") break except Exception as e: print(str(e)) with open(f"{ROOT_DIR}/output/two.srt","r",encoding="utf-8") as f: content = f.read() with open(f"{ROOT_DIR}/output/t_sin_{_t}.srt","r",encoding="utf-8") as f: content_2 = f.read() return content,content_2,f"{ROOT_DIR}/output/t_sin_{_t}.srt" # 翻译字幕 英译中 qwen2 def make_tran_qwen2(model_name,srt_path,lang): with open(srt_path, 'r',encoding="utf-8") as file: gweight_data = file.read() result = gweight_data.split("\n\n") if os.path.exists(f"{ROOT_DIR}/output/two.srt"): os.remove(f"{ROOT_DIR}/output/two.srt") if os.path.exists(f"{ROOT_DIR}/output/two_single.srt"): os.remove(f"{ROOT_DIR}/output/two_single.srt") for res in result: line_srt = res.split("\n") try: if lang == "zh": lang = "中文" elif lang == "en": lang = "英文" elif lang == "ja": lang = "日文" elif lang == "ko": lang = "韩文" text = line_srt[2] content = f'"{text}" 翻译为{lang},只给我文本的翻译,别添加其他的内容,因为我要做字幕,谢谢' response = ollama.chat(model=model_name,messages=[ { 'role':'user', 'content':content }]) translated_text = response['message']['content'] print(translated_text) except IndexError as e: # 处理下标越界异常 print(f"翻译完毕") break except Exception as e: print(str(e)) with open(f"{ROOT_DIR}/output/two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{translated_text}\n\n") with open(f"{ROOT_DIR}/output/two_single.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{translated_text}\n\n") with open(f"{ROOT_DIR}/output/two.srt","r",encoding="utf-8") as f: content = f.read() with open(f"{ROOT_DIR}/output/two_single.srt","r",encoding="utf-8") as f: content_2 = f.read() return content,content_2 # # 翻译字幕 # def make_tran_ali(): # pipeline_ins = pipeline(task=Tasks.translation, model=model_dir_ins) # with open("./video.srt", 'r',encoding="utf-8") as file: # gweight_data = file.read() # result = gweight_data.split("\n\n") # if os.path.exists("./two.srt"): # os.remove("./two.srt") # for res in result: # line_srt = res.split("\n") # try: # outputs = pipeline_ins(input=line_srt[2]) # print(outputs['translation']) # except IndexError as e: # # 处理下标越界异常 # print(f"翻译完毕") # break # except Exception as e: # print(str(e)) # with open("./two.srt","a",encoding="utf-8")as f:f.write(f"{line_srt[0]}\n{line_srt[1]}\n{line_srt[2]}\n{outputs['translation']}\n\n") # return "翻译完毕" def convert_seconds_to_hms(seconds): hours, remainder = divmod(seconds, 3600) minutes, seconds = divmod(remainder, 60) milliseconds = math.floor((seconds % 1) * 1000) output = f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}" 
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

# All SenseVoice special tokens that should simply be removed
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "",
    "<|zh|>": "", "<|en|>": "", "<|yue|>": "", "<|ja|>": "", "<|ko|>": "", "<|nospeech|>": "",
    "<|HAPPY|>": "", "<|SAD|>": "", "<|ANGRY|>": "", "<|NEUTRAL|>": "",
    "<|BGM|>": "", "<|Speech|>": "", "<|Applause|>": "", "<|Laughter|>": "",
    "<|FEARFUL|>": "", "<|DISGUSTED|>": "", "<|SURPRISED|>": "",
    "<|Cry|>": "", "<|EMO_UNKNOWN|>": "", "<|Sneeze|>": "", "<|Breath|>": "", "<|Cough|>": "",
    "<|Sing|>": "", "<|Speech_Noise|>": "", "<|withitn|>": "", "<|woitn|>": "",
    "<|GBG|>": "", "<|Event_UNK|>": "",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}

lang2token = {
    'zh': "ZH|",
    'ja': "JP|",
    "en": "EN|",
    "ko": "KO|",
    "yue": "YUE|",
}


def format_str(s):
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s


def format_str_v2(s):
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()


def format_str_v3(s):
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
            s_list[i] = s_list[i][1:]
        # else:
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()


def ms_to_srt_time(ms):
    N = int(ms)
    hours, remainder = divmod(N, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    timesrt = f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
    # print(timesrt)
    return timesrt


def time_to_srt(time_in_seconds):
    """Convert a duration in seconds to an SRT timestamp string.

    Args:
        time_in_seconds: duration in seconds.

    Returns:
        An SRT timestamp string.
    """
    milliseconds = int(time_in_seconds * 1000)
    hours = milliseconds // 3600000
    minutes = (milliseconds % 3600000) // 60000
    seconds = (milliseconds % 60000) // 1000
    milliseconds = milliseconds % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
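
# Quick self-check for the three timestamp helpers above: 3661.5 s and
# 3661500 ms denote the same instant and should all render as "01:01:01,500".
def _demo_timestamps():
    assert time_to_srt(3661.5) == "01:01:01,500"
    assert ms_to_srt_time(3661500) == "01:01:01,500"
    assert convert_seconds_to_hms(3661.5) == "01:01:01,500"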
# Transcribe with SenseVoice (ModelScope)
def make_srt_sv(file_path):
    model_dir = "iic/SenseVoiceSmall"
    input_file = file_path

    model = AutoModel(model=model_dir,
                      vad_model="fsmn-vad",
                      vad_kwargs={"max_single_segment_time": 30000},
                      trust_remote_code=True, device="cuda:0")

    res = model.generate(
        input=input_file,
        cache={},
        language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=False,
        batch_size_s=0,
    )

    print(res)
    text = res[0]["text"]
    # text = format_str_v3(text)
    text = rich_transcription_postprocess(text)
    print(text)
    return text

    # for filename in os.listdir("./wavs"):
    #     if filename.endswith(".wav"):
    #         filepath = os.path.join("./wavs/", filename)
    #         try:
    #             if os.path.isfile(filepath):
    #                 os.remove(filepath)
    #                 print(f"Deleted file: {filepath}")
    #         except Exception as e:
    #             print(f"Error deleting file: {filepath} - {e}")

    # # Step 1: slice the audio
    # audio, sr = librosa.load(file_path, sr=None, mono=False)

    # # Create a Slicer instance
    # slicer = Slicer(
    #     sr=sr,
    #     threshold=-40,
    #     min_length=1500,
    #     min_interval=300,
    #     hop_size=1,
    #     max_sil_kept=150000
    # )

    # # Slice the audio
    # chunks = slicer.slice(audio)
    # for i, chunk in enumerate(chunks):
    #     if len(chunk.shape) > 1:
    #         chunk = chunk.T  # Swap axes if the audio is stereo.
    #     soundfile.write(f'./wavs/chunk_{i}.wav', chunk, sr)

    # srtlines = []
    # audio_samples = 0
    # audio_opt = []
    # for filename in os.listdir("./wavs"):
    #     if filename.endswith(".wav"):
    #         filepath = os.path.join("./wavs/", filename)
    #         print(filepath)
    #
    #         model_dir = "iic/SenseVoiceSmall"
    #         input_file = filepath
    #
    #         model = AutoModel(model=model_dir,
    #                           vad_model="fsmn-vad",
    #                           vad_kwargs={"max_single_segment_time": 30000},
    #                           trust_remote_code=True, device="cuda:0")
    #
    #         res = model.generate(
    #             input=input_file,
    #             cache={},
    #             language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
    #             use_itn=False,
    #             batch_size_s=0,
    #         )
    #
    #         # print(res)
    #         text = res[0]["text"]
    #         # text = format_str_v3(text)
    #         text = rich_transcription_postprocess(text)
    #         print(text)
    #
    #         audio, sampling_rate = soundfile.read(filepath)
    #         audio_opt.append(audio)
    #         srtline_begin = ms_to_srt_time(audio_samples * 1000.0 / sampling_rate)
    #         audio_samples += audio.size
    #         srtline_end = ms_to_srt_time(audio_samples * 1000.0 / sampling_rate)
    #         srtlines.append(f"{len(audio_opt)}\n")
    #         srtlines.append(srtline_begin + ' --> ' + srtline_end + "\n")
    #         srtlines.append(text + "\n\n")

    # exit(-1)

    # Unreachable after the `return text` above, and `srtlines` is only built
    # by the disabled slicing path, so this tail is kept commented out too:
    # with open('./video.srt', 'w', encoding='utf-8') as f:
    #     f.writelines(srtlines)
    # with open("./video.srt", "r", encoding="utf-8") as f:
    #     content = f.read()
    # return content
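
# Usage sketch: one-shot transcription with SenseVoice. Assumes a CUDA device
# (the function pins device="cuda:0") and that funasr can fetch
# iic/SenseVoiceSmall; the wav path is illustrative.
def _demo_make_srt_sv():
    print(make_srt_sv(f"{ROOT_DIR}/audio.wav"))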
# Build the subtitle file with faster-whisper
def make_srt(file_path, model_name="small"):
    # if device == "cuda":
    #     model = WhisperModel(model_name, device="cuda", compute_type="float16", download_root="./model_from_whisper", local_files_only=False)
    # else:
    #     model = WhisperModel(model_name, device="cpu", compute_type="int8", download_root="./model_from_whisper", local_files_only=False)
    # or run on GPU with INT8:
    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")

    if model_name != "faster-whisper-large-v3-turbo-ct2":
        if device == "cuda":
            try:
                model = WhisperModel(model_name, device="cuda", compute_type="float16",
                                     download_root="./model_from_whisper", local_files_only=False)
            except Exception:
                # Fall back when the GPU lacks float16 support
                model = WhisperModel(model_name, device="cuda", compute_type="int8_float16",
                                     download_root="./model_from_whisper", local_files_only=False)
        else:
            model = WhisperModel(model_name, device="cpu", compute_type="int8",
                                 download_root="./model_from_whisper", local_files_only=False)
    else:
        model_name = f"{ROOT_DIR}/faster-whisper-large-v3-turbo-ct2"
        print(model_name)
        if device == "cuda":
            try:
                model = WhisperModel(model_name, device="cuda", compute_type="float16")
            except Exception:
                model = WhisperModel(model_name, device="cuda", compute_type="int8_float16")
        else:
            model = WhisperModel(model_name, device="cpu", compute_type="int8")

    segments, info = model.transcribe(file_path, beam_size=5, vad_filter=True,
                                      vad_parameters=dict(min_silence_duration_ms=500))

    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    count = 0
    with open(f'{ROOT_DIR}/output/video.srt', 'w', encoding="utf-8") as f:  # open file for writing
        for segment in segments:
            count += 1
            duration = f"{convert_seconds_to_hms(segment.start)} --> {convert_seconds_to_hms(segment.end)}\n"
            text = f"{segment.text.lstrip()}\n\n"
            f.write(f"{count}\n{duration}{text}")  # write one formatted SRT block
            print(f"{duration}{text}", end='')

    with open(f"{ROOT_DIR}/output/video.srt", "r", encoding="utf-8") as f:
        content = f.read()

    return content


# Extract vocals from the video
def movie2audio(video_path):
    # Load the video file
    video = VideoFileClip(video_path)

    # Pull the audio stream out of the video
    audio = video.audio

    # Save the audio as WAV
    audio.write_audiofile(f"{ROOT_DIR}/audio.wav")

    # Denoise / isolate speech with the FRCRN noise-suppression model
    ans = pipeline_ali(
        Tasks.acoustic_noise_suppression,
        model=model_dir_cirm)

    ans(f'{ROOT_DIR}/audio.wav', output_path=f'{ROOT_DIR}/output.wav')

    return f"{ROOT_DIR}/output.wav"
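
# End-to-end sketch of how the pieces chain together: extract and denoise the
# audio, transcribe it to SRT, translate, then burn the bilingual subtitles
# back in. Assumes the {ROOT_DIR}/output directory exists and a DeepLX server
# is running; "input.mp4" and the language codes are illustrative.
if __name__ == "__main__":
    wav = movie2audio("input.mp4")            # vocals -> {ROOT_DIR}/output.wav
    srt = make_srt(wav, "small")              # -> {ROOT_DIR}/output/video.srt
    _, _, single_srt = make_tran_deep(f"{ROOT_DIR}/output/video.srt", "EN", "ZH")
    print(merge_sub("input.mp4", single_srt))  # -> test_srt.mp4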