Spaces:

stepfun-ai
/

Step-Audio

Running

Step-Audio / funasr_detach /models /ct_transformer /utils.py

martin

initial

67c46fd 5 months ago

3.07 kB

	#!/usr/bin/env python3
	# -*- encoding: utf-8 -*-
	# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
	# MIT License (https://opensource.org/licenses/MIT)

	import re


	def split_to_mini_sentence(words: list, word_limit: int = 20):
	    """Cut ``words`` into consecutive pieces of at most ``word_limit`` items.

	    Args:
	        words: The sequence to partition.
	        word_limit: Maximum number of items per piece; must be greater than 1.

	    Returns:
	        A list of pieces. When ``words`` already fits within ``word_limit``
	        (including when it is empty) the result is ``[words]``, holding the
	        original object rather than a copy. Otherwise each piece is a slice
	        of ``words``; only the last piece may be shorter than ``word_limit``.

	    Raises:
	        AssertionError: If ``word_limit`` is not greater than 1.
	    """
	    assert word_limit > 1
	    total = len(words)
	    if total <= word_limit:
	        # Short input is wrapped as-is, without slicing.
	        return [words]
	    # Stepping by word_limit yields every full piece and, when the length is
	    # not an exact multiple, one trailing shorter piece.
	    return [
	        words[start : start + word_limit]
	        for start in range(0, total, word_limit)
	    ]


	def split_words(text: str, jieba_usr_dict=None, **kwargs):
	    """Break ``text`` into a flat list of word tokens.

	    Two strategies are used, selected by the truthiness of ``jieba_usr_dict``:

	    * Without a tokenizer, the text is split on whitespace and each piece is
	      scanned character by character. Consecutive single-byte (ASCII)
	      characters are kept together as one word; every multi-byte character
	      becomes a word of its own.
	    * With a tokenizer, whitespace-separated tokens are grouped into runs of
	      the same kind as decided by ``isEnglish``. English runs are emitted
	      unchanged; every other run is joined with ``join_chinese_and_english``
	      and segmented by ``jieba_usr_dict.cut(..., HMM=False)``.

	    Args:
	        text: The input string.
	        jieba_usr_dict: Optional object exposing ``cut(text, HMM=False)``
	            (presumably a jieba tokenizer loaded with a user dictionary;
	            confirm against the caller).
	        **kwargs: Accepted and ignored.

	    Returns:
	        A list of word strings in their original order.
	    """
	    if not jieba_usr_dict:
	        pieces = []
	        for chunk in text.split():
	            # ``chunk`` contains no whitespace.
	            ascii_run = ""
	            for ch in chunk:
	                if len(ch.encode()) != 1:
	                    # Multi-byte character: close any pending ASCII run, then
	                    # emit the character as a standalone word.
	                    if ascii_run:
	                        pieces.append(ascii_run)
	                        ascii_run = ""
	                    pieces.append(ch)
	                else:
	                    # Single-byte (ASCII) character: extend the current run.
	                    ascii_run += ch
	            if ascii_run:
	                pieces.append(ascii_run)
	        return pieces

	    # Group neighbouring tokens of the same kind into (label, tokens) runs.
	    runs = []
	    pending = []
	    previous_label = None
	    for token in text.split():
	        label = "English" if isEnglish(token) else "Chinese"
	        if previous_label is not None and label != previous_label:
	            # The kind changed: close the run collected so far.
	            runs.append((previous_label, pending))
	            pending = []
	        pending.append(token)
	        previous_label = label
	    if pending:
	        runs.append((previous_label, pending))

	    merged = []
	    for label, tokens in runs:
	        if label == "English":
	            merged.extend(tokens)
	            continue
	        # Non-English runs are re-joined into one string and handed to the
	        # tokenizer for segmentation.
	        segments = jieba_usr_dict.cut(
	            join_chinese_and_english(tokens), HMM=False
	        )
	        merged.extend(segments)
	    return merged


	def isEnglish(text: str):
	    """Tell whether ``text`` is built only from ASCII letters and apostrophes.

	    Args:
	        text: The string to examine.

	    Returns:
	        ``True`` when the pattern ``^[a-zA-Z']+$`` is found in ``text``,
	        ``False`` otherwise (the empty string yields ``False``).
	        NOTE: ``$`` also matches just before a single trailing newline, so
	        such a newline is tolerated.
	    """
	    return bool(re.search("^[a-zA-Z']+$", text))


	def join_chinese_and_english(input_list):
	    """Concatenate tokens into one string, spacing out only the English ones.

	    Args:
	        input_list: Iterable of token strings.

	    Returns:
	        The tokens joined in order. Each token accepted by ``isEnglish`` is
	        preceded by a single space; all other tokens are appended directly.
	        Leading and trailing whitespace is stripped from the result.
	    """
	    fragments = (
	        " " + token if isEnglish(token) else token for token in input_list
	    )
	    return "".join(fragments).strip()