|
|
|
|
|
|
|
|
|
import regex as re |
|
# Non-breaking space (U+00A0). Spelled as an escape so it can never be
# confused with, or silently degraded to, an ordinary space by an editor.
_NBSP = "\u00a0"

# Two or more consecutive plain spaces.
multispace_regex = re.compile(r"[ ]{2,}")
# Two or more consecutive dots.
multidots_regex = re.compile(r"\.{2,}")
# A closing bracket, one space, then a punctuation mark.
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
# A digit, one plain space, then a percent sign.
digit_space_percent = re.compile(r"(\d) %")
# A double quote immediately followed by a run of commas/periods.
double_quot_punc = re.compile(r"\"([,\.]+)")
# Two digits separated by a non-breaking space (NOT a plain space, which
# would wrongly glue unrelated numbers such as "3 4" into "3.4").
digit_nbsp_digit = re.compile(r"(\d)" + _NBSP + r"(\d)")

# Literal (old, new) substitutions, applied strictly in this order; several
# entries rely on the output of earlier ones.
_PUNC_REPLACEMENTS = (
    ("\r", ""),
    # Pad brackets with a space on the outside, then drop inner padding.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    # No space before colon / semicolon.
    (" :", ":"),
    (" ;", ";"),
    ("`", "'"),
    # Unicode quotes and dashes -> ASCII.
    ("\u201e", '"'),    # double low-9 quotation mark
    ("\u201c", '"'),    # left double quotation mark
    ("\u201d", '"'),    # right double quotation mark
    ("\u2013", "-"),    # en dash
    ("\u2014", " - "),  # em dash
    ("\u00b4", "'"),    # acute accent
    ("\u2018", "'"),    # left single quotation mark
    ("\u201a", "'"),    # single low-9 quotation mark
    ("\u2019", "'"),    # right single quotation mark
    # Two apostrophes become one double quote. This also covers a doubled
    # acute accent, because each accent was already turned into "'" above.
    ("''", '"'),
    ("\u2026", "..."),  # horizontal ellipsis
    # French guillemets.
    (" \u00ab ", ' "'),
    ("\u00ab ", '"'),
    ("\u00ab", '"'),
    (" \u00bb ", '" '),
    (" \u00bb", '"'),
    ("\u00bb", '"'),
    # Pseudo-spaces: remove, or turn into a plain space, a non-breaking
    # space that sits next to punctuation or a unit.
    (_NBSP + "%", "%"),
    ("n\u00ba" + _NBSP, "n\u00ba "),
    (_NBSP + ":", ":"),
    (_NBSP + "\u00baC", " \u00baC"),
    (_NBSP + "cm", " cm"),
    (_NBSP + "?", "?"),
    (_NBSP + "!", "!"),
    (_NBSP + ";", ";"),
    ("," + _NBSP, ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation and spacing in *text*.

    Applies a fixed, ordered list of literal substitutions (Unicode quotes
    and dashes to ASCII, bracket spacing, non-breaking "pseudo-spaces"),
    followed by a handful of regex clean-ups, and finally strips leading and
    trailing plain spaces.

    Args:
        text: The string to normalize.
        lang: Language code. Accepted for interface compatibility only; the
            body never reads it, so the same rules are applied whatever
            value is passed.

    Returns:
        The normalized string.
    """
    for old, new in _PUNC_REPLACEMENTS:
        text = text.replace(old, new)

    # Collapse runs of spaces created by the padding substitutions above.
    text = multispace_regex.sub(" ", text)
    # Any run of dots (including "..." produced from an ellipsis) becomes ".".
    text = multidots_regex.sub(".", text)
    # ") ," -> "),"
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    # "10 %" -> "10%"
    text = digit_space_percent.sub(r"\1%", text)
    # Move commas/periods that follow a double quote inside the quote.
    text = double_quot_punc.sub(r'\1"', text)
    # A non-breaking space between two digits becomes a period.
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")