Spaces:
Running
Running
# IMPORTANT NOTE: DO NOT DIRECTLY EDIT THIS FILE
# This file was manually ported from `normalize-punctuation.perl`
# TODO: Only the English rules are implemented; add other languages
import regex as re | |
# Pre-compiled patterns used by `punc_norm`.
multispace_regex = re.compile("[ ]{2,}")
multidots_regex = re.compile(r"\.{2,}")
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
digit_space_percent = re.compile(r"(\d) %")
double_quot_punc = re.compile(r"\"([,\.]+)")

# U+00A0 NO-BREAK SPACE, spelled as an escape so that editors, copy/paste or
# HTML rendering cannot silently turn it into an ordinary space.
_NBSP = "\u00a0"

# Matches a no-break space between two digits only. An ordinary space must
# NOT match here, otherwise unrelated numbers ("10 20") are fused into "10.20".
digit_nbsp_digit = re.compile(r"(\d)" + _NBSP + r"(\d)")

# Literal (old, new) substitutions, applied strictly in this order: later
# rules rely on the output of earlier ones.
_LITERAL_REPLACEMENTS = (
    ("\r", ""),
    # Put spaces outside brackets, then remove the ones just inside them.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    (" :", ":"),
    (" ;", ";"),
    # Map typographic quotes and dashes to their ASCII counterparts.
    ("`", "'"),
    ("„", '"'),
    ("“", '"'),
    ("”", '"'),
    ("–", "-"),
    ("—", " - "),
    ("´", "'"),
    ("‘", "'"),
    ("‚", "'"),
    ("’", "'"),
    ("''", '"'),
    # Cannot match any more ("´" was rewritten above); kept so the rule list
    # stays in step with the script this file was ported from.
    ("´´", '"'),
    ("…", "..."),
    # French quotes.
    (" « ", ' "'),
    ("« ", '"'),
    ("«", '"'),
    (" » ", '" '),
    (" »", '"'),
    ("»", '"'),
    # Pseudo-spaces. Each rule is applied to the ordinary space (existing
    # behaviour) and to the no-break space. Rules whose ordinary-space form
    # would be a no-op are listed for the no-break space only.
    (" %", "%"),
    (_NBSP + "%", "%"),
    ("nº" + _NBSP, "nº "),
    (" :", ":"),
    (_NBSP + ":", ":"),
    (_NBSP + "ºC", " ºC"),
    (_NBSP + "cm", " cm"),
    (" ?", "?"),
    (_NBSP + "?", "?"),
    (" !", "!"),
    (_NBSP + "!", "!"),
    (" ;", ";"),
    (_NBSP + ";", ";"),
    ("," + _NBSP, ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation and spacing in ``text``.

    The literal substitutions in ``_LITERAL_REPLACEMENTS`` are applied in
    order, followed by these regex passes:

    1. runs of two or more spaces collapse to one space;
    2. runs of two or more dots collapse to a single dot (so an ellipsis
       ends up as ".");
    3. the space between ")" and a following punctuation mark is removed;
    4. the space between a digit and "%" is removed;
    5. commas/periods directly after a double quote are moved before it;
    6. a no-break space between two digits is replaced by ".".

    Args:
        text: The string to normalize.
        lang: Language code. Currently unused: the same (English) rules are
            applied whatever value is passed.

    Returns:
        The normalized string with leading and trailing spaces removed
        (only the space character is stripped, not other whitespace).
    """
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)

    text = multispace_regex.sub(" ", text)
    text = multidots_regex.sub(".", text)
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    text = digit_space_percent.sub(r"\1%", text)
    # English "quotation," followed by comma, style
    text = double_quot_punc.sub(r'\1"', text)
    # A no-break space between digits is treated as a separator inside one
    # number and written as "." (presumably mirroring the English rule of the
    # Perl script this was ported from -- TODO confirm against upstream).
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")