daihui.zhang
committed on
Commit
·
3acb7f3
1
Parent(s):
afeed8e
change config folder structure
Browse files
config/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .settings import *
|
2 |
+
from .prompt import *
|
{moyoyo_asr_models → config}/hotwords.json
RENAMED
File without changes
|
{moyoyo_asr_models → config}/hotwords.txt
RENAMED
File without changes
|
config/keyword_list.txt
ADDED
File without changes
|
config.py → config/prompt.py
RENAMED
@@ -1,71 +1,6 @@
|
|
1 |
-
import
|
2 |
-
import re
|
3 |
-
import logging
|
4 |
import json
|
5 |
|
6 |
-
|
7 |
-
DEBUG = False
|
8 |
-
LOG_LEVEL = logging.DEBUG if DEBUG else logging.WARNING
|
9 |
-
|
10 |
-
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
11 |
-
logging.basicConfig(
|
12 |
-
level=LOG_LEVEL,
|
13 |
-
format="%(asctime)s - %(levelname)s - %(message)s",
|
14 |
-
filename='translator.log',
|
15 |
-
datefmt="%H:%M:%S"
|
16 |
-
)
|
17 |
-
# save pipelines data to disk
|
18 |
-
SAVE_DATA_SAVE = False
|
19 |
-
# Add terminal log
|
20 |
-
console_handler = logging.StreamHandler()
|
21 |
-
console_handler.setLevel(LOG_LEVEL)
|
22 |
-
console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
23 |
-
console_handler.setFormatter(console_formatter)
|
24 |
-
logging.getLogger().addHandler(console_handler)
|
25 |
-
|
26 |
-
# 音频段的决策时间
|
27 |
-
FRAME_SCOPE_TIME_THRESHOLD = 4
|
28 |
-
# 最长语音时长
|
29 |
-
MAX_SPEECH_DURATION_S = 15
|
30 |
-
|
31 |
-
BASE_DIR = pathlib.Path(__file__).parent
|
32 |
-
MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
|
33 |
-
ASSERT_DIR = BASE_DIR / "assets"
|
34 |
-
|
35 |
-
SAMPLE_RATE = 16000
|
36 |
-
# 标点
|
37 |
-
SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', ':']
|
38 |
-
PAUSE_END_MARKERS = [',', ',', '、']
|
39 |
-
# 合并所有标点
|
40 |
-
ALL_MARKERS = SENTENCE_END_MARKERS + PAUSE_END_MARKERS
|
41 |
-
# 构造正则表达式字符类
|
42 |
-
REGEX_MARKERS = re.compile(r'[' + re.escape(''.join(ALL_MARKERS)) + r']$')
|
43 |
-
|
44 |
-
sentence_end_chars = ''.join([re.escape(char) for char in SENTENCE_END_MARKERS])
|
45 |
-
SENTENCE_END_PATTERN = re.compile(f'[{sentence_end_chars}]')
|
46 |
-
|
47 |
-
# Method 2: Alternative approach with a character class
|
48 |
-
pattern_string = '[' + ''.join([re.escape(char) for char in PAUSE_END_MARKERS]) + r']$'
|
49 |
-
PAUSE_END_PATTERN = re.compile(pattern_string)
|
50 |
-
# whisper推理参数
|
51 |
-
WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
|
52 |
-
MAX_LENGTH_ZH = 4
|
53 |
-
|
54 |
-
WHISPER_PROMPT_EN = "" # "The following is an English sentence."
|
55 |
-
MAX_LENGTH_EN = 8
|
56 |
-
|
57 |
-
WHISPER_MODEL_EN = 'medium-q5_0'
|
58 |
-
# WHISPER_MODEL = 'large-v3-turbo-q5_0'
|
59 |
-
# WHISPER_MODEL_ZH = 'small'
|
60 |
-
WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
|
61 |
-
# LLM
|
62 |
-
LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
63 |
-
LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
64 |
-
# LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
|
65 |
-
|
66 |
-
# VAD
|
67 |
-
VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
|
68 |
-
|
69 |
LLM_SYS_PROMPT_ZH = """
|
70 |
你是一个中英文翻译专家,将用户输入的中文翻译成英文。对于非中文内容,它将提供中文翻译结果。用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合中文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将原文翻译成具有信达雅标准的译文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。注意,翻译的文本只能包含拼音化字符,不能包含任何中文字符。
|
71 |
"""
|
@@ -74,29 +9,32 @@ LLM_SYS_PROMPT_EN = """
|
|
74 |
你是一个英中文翻译专家,将用户输入的英文翻译成中文,用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合英文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将英文翻译成具有信达雅标准的中文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。
|
75 |
"""
|
76 |
|
77 |
-
hotwords_file =
|
78 |
-
hotwords_json = json.loads((
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
83 |
keywords_mapping_string = '\n'.join([
|
84 |
-
f'
|
85 |
-
for
|
86 |
])
|
87 |
|
88 |
LLM_SYS_7B_PROMPT_EN = """
|
89 |
-
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
1. 保持专业术语的准确性,严格按照关键词对照表翻译
|
97 |
-
2. 保持原文的段落结构和格式
|
98 |
-
3. 翻译应当流畅自然,符合英语表达习惯
|
99 |
-
4. 如遇到关键词对照表中未包含的专业术语,请尽量使用通用翻译
|
100 |
|
101 |
文本:
|
102 |
""".format(keywords_mapping_string=keywords_mapping_string)
|
@@ -115,4 +53,4 @@ LLM_SYS_7B_PROMPT_ZH = """
|
|
115 |
- 翻译内容符合中文表达习惯
|
116 |
- 保持专业性和准确性
|
117 |
如遇到难以判断是否需要保留英文的情况,请优先保留原始英文形式。
|
118 |
-
文本:"""
|
|
|
1 |
+
from .settings import CONFIG_DIR
|
|
|
|
|
2 |
import json
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
LLM_SYS_PROMPT_ZH = """
|
5 |
你是一个中英文翻译专家,将用户输入的中文翻译成英文。对于非中文内容,它将提供中文翻译结果。用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合中文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将原文翻译成具有信达雅标准的译文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。注意,翻译的文本只能包含拼音化字符,不能包含任何中文字符。
|
6 |
"""
|
|
|
9 |
你是一个英中文翻译专家,将用户输入的英文翻译成中文,用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合英文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将英文翻译成具有信达雅标准的中文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。
|
10 |
"""
|
11 |
|
12 |
+
hotwords_file = CONFIG_DIR / 'hotwords.txt'
|
13 |
+
hotwords_json = json.loads((CONFIG_DIR / 'hotwords.json').read_text())
|
14 |
|
15 |
+
# 翻译提示词
|
16 |
+
keywords_list = [
|
17 |
+
"GOSIM",
|
18 |
+
"GO SIM",
|
19 |
+
'Rust',
|
20 |
+
]
|
21 |
keywords_mapping_string = '\n'.join([
|
22 |
+
f' * {value}'
|
23 |
+
for value in keywords_list
|
24 |
])
|
25 |
|
26 |
LLM_SYS_7B_PROMPT_EN = """
|
27 |
+
你是一位精通简体中文的专业翻译,尤其擅长将专业学术论文翻译成浅显易懂的科普文章。请你帮我将以下英文段落翻译成中文,风格与中文科普读物相似。
|
28 |
|
29 |
+
规则:
|
30 |
+
- 翻译时要准确传达原文的事实和背景;
|
31 |
+
- 即使上意译也要保留原始段落格式,以及保留术语,例如 FLAC,JPEG 等。保留公司缩写,例如 Microsoft, Amazon, OpenAI 等;
|
32 |
+
- 人物的名称不需要翻译;
|
33 |
+
- 全角括号换成半角括号,并在左括号前面加半角空格,右括号后面加半角空格;
|
34 |
+
- 在翻译专业术语时,第一次出现时要在括号里面写上英文原文,例如:“生成式 AI (Generative AI)”,之后就可以只写中文了;
|
35 |
+
- 以下是常见的AI相关术语,这部分的术语不需要翻译;
|
36 |
|
37 |
+
{keywords_mapping_string}
|
|
|
|
|
|
|
|
|
38 |
|
39 |
文本:
|
40 |
""".format(keywords_mapping_string=keywords_mapping_string)
|
|
|
53 |
- 翻译内容符合中文表达习惯
|
54 |
- 保持专业性和准确性
|
55 |
如遇到难以判断是否需要保留英文的情况,请优先保留原始英文形式。
|
56 |
+
文本:"""
|
config/settings.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pathlib
|
2 |
+
import re
|
3 |
+
import logging
|
4 |
+
import json
|
5 |
+
|
6 |
+
DEBUG = False
|
7 |
+
LOG_LEVEL = logging.DEBUG if DEBUG else logging.WARNING
|
8 |
+
|
9 |
+
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
10 |
+
logging.basicConfig(
|
11 |
+
level=LOG_LEVEL,
|
12 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
13 |
+
filename='translator.log',
|
14 |
+
datefmt="%H:%M:%S"
|
15 |
+
)
|
16 |
+
# save pipelines data to disk
|
17 |
+
SAVE_DATA_SAVE = False
|
18 |
+
# Add terminal log
|
19 |
+
console_handler = logging.StreamHandler()
|
20 |
+
console_handler.setLevel(LOG_LEVEL)
|
21 |
+
console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
22 |
+
console_handler.setFormatter(console_formatter)
|
23 |
+
logging.getLogger().addHandler(console_handler)
|
24 |
+
|
25 |
+
# 音频段的决策时间
|
26 |
+
FRAME_SCOPE_TIME_THRESHOLD = 4
|
27 |
+
# 最长语音时长
|
28 |
+
MAX_SPEECH_DURATION_S = 15
|
29 |
+
|
30 |
+
BASE_DIR = pathlib.Path(__file__).parent.parent
|
31 |
+
MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
|
32 |
+
ASSERT_DIR = BASE_DIR / "assets"
|
33 |
+
CONFIG_DIR = BASE_DIR / "config"
|
34 |
+
|
35 |
+
SAMPLE_RATE = 16000
|
36 |
+
# 标点
|
37 |
+
SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', ':']
|
38 |
+
PAUSE_END_MARKERS = [',', ',', '、']
|
39 |
+
# 合并所有标点
|
40 |
+
ALL_MARKERS = SENTENCE_END_MARKERS + PAUSE_END_MARKERS
|
41 |
+
# 构造正则表达式字符类
|
42 |
+
REGEX_MARKERS = re.compile(r'[' + re.escape(''.join(ALL_MARKERS)) + r']$')
|
43 |
+
|
44 |
+
sentence_end_chars = ''.join([re.escape(char) for char in SENTENCE_END_MARKERS])
|
45 |
+
SENTENCE_END_PATTERN = re.compile(f'[{sentence_end_chars}]')
|
46 |
+
|
47 |
+
# Method 2: Alternative approach with a character class
|
48 |
+
pattern_string = '[' + ''.join([re.escape(char) for char in PAUSE_END_MARKERS]) + r']$'
|
49 |
+
PAUSE_END_PATTERN = re.compile(pattern_string)
|
50 |
+
# whisper推理参数
|
51 |
+
WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
|
52 |
+
MAX_LENGTH_ZH = 4
|
53 |
+
|
54 |
+
WHISPER_PROMPT_EN = "" # "The following is an English sentence."
|
55 |
+
MAX_LENGTH_EN = 8
|
56 |
+
|
57 |
+
WHISPER_MODEL_EN = 'medium-q5_0'
|
58 |
+
# WHISPER_MODEL = 'large-v3-turbo-q5_0'
|
59 |
+
# WHISPER_MODEL_ZH = 'small'
|
60 |
+
WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
|
61 |
+
# LLM
|
62 |
+
LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
63 |
+
LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
64 |
+
# LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
|
65 |
+
|
66 |
+
# VAD
|
67 |
+
VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
|
transcribe/pipelines/pipe_translate.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
from .base import MetaItem, BasePipe, Segment
|
3 |
from llama_cpp import Llama
|
4 |
from ..helpers.translator import QwenTranslator
|
5 |
-
from config import LLM_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH, LLM_LARGE_MODEL_PATH, ALL_MARKERS
|
6 |
|
7 |
|
8 |
class TranslatePipe(BasePipe):
|
@@ -33,7 +33,7 @@ class Translate7BPipe(TranslatePipe):
|
|
33 |
@classmethod
|
34 |
def init(cls):
|
35 |
if cls.translator is None:
|
36 |
-
cls.translator = QwenTranslator(LLM_LARGE_MODEL_PATH,
|
37 |
|
38 |
|
39 |
|
|
|
2 |
from .base import MetaItem, BasePipe, Segment
|
3 |
from llama_cpp import Llama
|
4 |
from ..helpers.translator import QwenTranslator
|
5 |
+
from config import LLM_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH, LLM_LARGE_MODEL_PATH, ALL_MARKERS, LLM_SYS_7B_PROMPT_EN, LLM_SYS_7B_PROMPT_ZH
|
6 |
|
7 |
|
8 |
class TranslatePipe(BasePipe):
|
|
|
33 |
@classmethod
|
34 |
def init(cls):
|
35 |
if cls.translator is None:
|
36 |
+
cls.translator = QwenTranslator(LLM_LARGE_MODEL_PATH, LLM_SYS_7B_PROMPT_EN, LLM_SYS_7B_PROMPT_ZH)
|
37 |
|
38 |
|
39 |
|
transcribe/whisper_llm_serve.py
CHANGED
@@ -107,7 +107,7 @@ class WhisperTranscriptionService:
|
|
107 |
|
108 |
if frame_np is None:
|
109 |
continue
|
110 |
-
|
111 |
with self.lock:
|
112 |
self.frames_np = np.append(self.frames_np, frame_np)
|
113 |
|
@@ -123,6 +123,8 @@ class WhisperTranscriptionService:
|
|
123 |
self.frames_np = np.array([], dtype=np.float32)
|
124 |
|
125 |
# 音频结束信号的时候 整合当前缓冲区
|
|
|
|
|
126 |
elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
|
127 |
time_diff = time.time() - self.frames_np_start_timestamp
|
128 |
if time_diff >= config.FRAME_SCOPE_TIME_THRESHOLD:
|
@@ -141,7 +143,7 @@ class WhisperTranscriptionService:
|
|
141 |
frame_epoch = 1
|
142 |
|
143 |
while not self._translate_thread_stop.is_set():
|
144 |
-
|
145 |
if len(self.frames_np) ==0:
|
146 |
time.sleep(0.1)
|
147 |
continue
|
|
|
107 |
|
108 |
if frame_np is None:
|
109 |
continue
|
110 |
+
# logger.critical(f"frame np:{frame_np.shape}, {speech_status}")
|
111 |
with self.lock:
|
112 |
self.frames_np = np.append(self.frames_np, frame_np)
|
113 |
|
|
|
123 |
self.frames_np = np.array([], dtype=np.float32)
|
124 |
|
125 |
# 音频结束信号的时候 整合当前缓冲区
|
126 |
+
# START -- END -- START -- END 通常
|
127 |
+
# START -- END -- END end块带有音频信息的通常是4096内断的一个短音
|
128 |
elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
|
129 |
time_diff = time.time() - self.frames_np_start_timestamp
|
130 |
if time_diff >= config.FRAME_SCOPE_TIME_THRESHOLD:
|
|
|
143 |
frame_epoch = 1
|
144 |
|
145 |
while not self._translate_thread_stop.is_set():
|
146 |
+
|
147 |
if len(self.frames_np) ==0:
|
148 |
time.sleep(0.1)
|
149 |
continue
|