|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
|
""" |
|
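Text cleaning utilities for French: abbreviation expansion, punctuation
normalization and whitespace cleanup applied before phonemization.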
|
""" |
|
|
|
# Ordered list of (compiled pattern, expansion) pairs applied one after the
# other by expand_abbreviations().
_abbreviations = [
    # Dotted abbreviations: matched case-insensitively together with their
    # trailing period. The abbreviation text is escaped so that inner dots
    # ("N.B", "p.c.q", "av. J.-C") are literal dots rather than wildcards.
    (re.compile(r"\b%s\." % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in sorted(
        [
            ("M", "monsieur"),
            ("Mlle", "mademoiselle"),
            ("Mlles", "mesdemoiselles"),
            ("Mme", "Madame"),
            ("Mmes", "Mesdames"),
            ("N.B", "nota bene"),
            ("p.c.q", "parce que"),
            ("Pr", "professeur"),
            ("qqch", "quelque chose"),
            ("rdv", "rendez-vous"),
            ("max", "maximum"),
            ("min", "minimum"),
            ("no", "numéro"),
            ("adr", "adresse"),
            ("dr", "docteur"),
            ("st", "saint"),
            ("co", "compagnie"),
            ("jr", "junior"),
            ("sgt", "sergent"),
            ("capt", "capitaine"),
            ("col", "colonel"),
            ("av", "avenue"),
            ("av. J.-C", "avant Jésus-Christ"),
            ("apr. J.-C", "après Jésus-Christ"),
            ("art", "article"),
            ("boul", "boulevard"),
            ("c.-à-d", "c’est-à-dire"),
            ("etc", "et cetera"),
            ("ex", "exemple"),
            ("excl", "exclusivement"),
        ],
        # Longest first, so "av. J.-C." is tried before its prefix "av.".
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
] + [
    # Titles written without a period: case-sensitive whole-word matches.
    # The closing \b keeps "Mlle" from matching inside "Mlles" (and "Mme"
    # inside "Mmes").
    (re.compile(r"\b%s\b" % re.escape(abbr)), expansion)
    for abbr, expansion in [
        ("Mlles", "mesdemoiselles"),
        ("Mlle", "mademoiselle"),
        ("Mmes", "Mesdames"),
        ("Mme", "Madame"),
    ]
]
|
|
|
# Punctuation substitutions: each key found in the text is replaced by its
# value; an empty value deletes the key.
rep_map = {
    # Clause and sentence separators.
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    # Quotation marks are dropped.
    "“": "",
    "”": "",
    "‘": "",
    # The typographic apostrophe is kept as an ASCII apostrophe so that
    # elisions (l’homme, c’est) are not fused into a single word.
    "’": "'",
    # Brackets of all kinds are dropped.
    "(": "",
    ")": "",
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "「": "",
    "」": "",
    # Dashes and tildes.
    "—": "",
    "~": "-",
    # Inverted marks are dropped.
    "¿": "",
    "¡": "",
}
|
|
|
|
|
# Compiled once at import time instead of on every call.
_WHITESPACE_RE = re.compile(r"\s+")


def collapse_whitespace(text):
    """Replace every run of whitespace with one space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The string with whitespace runs collapsed to a single space and no
        leading or trailing whitespace.
    """
    return _WHITESPACE_RE.sub(" ", text).strip()
|
|
|
|
|
# One or more punctuation runs at the start of the text, each optionally
# preceded by whitespace. Whitespace that is not followed by punctuation is
# left alone.
_LEADING_PUNCTUATION_RE = re.compile(r"^(?:\s*[,.!?]+)+")


def remove_punctuation_at_begin(text):
    """Strip leading ``,``, ``.``, ``!`` and ``?`` characters from *text*.

    Punctuation is removed even when it is preceded by, or interleaved
    with, whitespace (``" , . Bonjour"`` becomes ``" Bonjour"``). Text that
    starts with whitespace but no punctuation is returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        The string without its leading punctuation.
    """
    return _LEADING_PUNCTUATION_RE.sub("", text)
|
|
|
|
|
# Deletion table for angle brackets, parentheses, square brackets, double
# quotes and French guillemets; built once at import time.
_AUX_SYMBOLS_TABLE = str.maketrans("", "", '<>()[]"«»')


def remove_aux_symbols(text):
    """Delete auxiliary symbols ``< > ( ) [ ] " « »`` from *text*.

    All other characters, including the ASCII apostrophe, are kept.

    Args:
        text: The string to clean.

    Returns:
        The string with every listed symbol removed.
    """
    return text.translate(_AUX_SYMBOLS_TABLE)
|
|
|
|
|
# Single-pass translation table. None of the replacement strings contains a
# key, so this gives the same result as applying the replacements in turn.
_SYMBOL_TABLE = str.maketrans(
    {
        ";": ",",
        "-": " ",
        ":": ",",
        "&": " et ",
    }
)


def replace_symbols(text):
    """Rewrite ``;`` and ``:`` as commas, ``-`` as a space and ``&`` as " et ".

    Args:
        text: The string to rewrite.

    Returns:
        The string with the four symbols replaced.
    """
    return text.translate(_SYMBOL_TABLE)
|
|
|
|
|
def expand_abbreviations(text):
    """Expand the abbreviations listed in ``_abbreviations`` within *text*.

    Every (pattern, expansion) pair is applied in list order, each one
    working on the output of the previous substitution.

    Args:
        text: The string to expand.

    Returns:
        The string after all substitutions have been applied.
    """
    expanded = text
    for pattern, expansion in _abbreviations:
        expanded = pattern.sub(expansion, expanded)
    return expanded
|
|
|
|
|
def replace_punctuation(text):
    """Replace every ``rep_map`` key found in *text* with its mapped value.

    Args:
        text: The string to rewrite.

    Returns:
        The string with all mapped punctuation substituted in one pass.
    """
    # Regex alternation takes the first alternative that matches, so longer
    # keys are listed first: a multi-character key such as "..." can then
    # never be shadowed by a shorter key that is a prefix of it.
    keys = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: rep_map[match.group()], text)
|
|
|
|
|
def text_normalize(text):
    """Run the full cleaning pipeline on one string.

    Steps, in order: expand abbreviations, map punctuation through
    ``rep_map``, replace symbols, drop auxiliary symbols, strip leading
    punctuation, collapse whitespace. No terminal punctuation is appended.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    text = expand_abbreviations(text)
    text = replace_punctuation(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    return collapse_whitespace(text)
|
|
|
|
|
def french_to_ipa(text, text_tokenizer):
    """Normalize *text* and pass it to *text_tokenizer*.

    Args:
        text: Either a single string or an iterable of strings.
        text_tokenizer: Callable invoked with the normalized string, or with
            a list of normalized strings when *text* is not a string.

    Returns:
        Whatever *text_tokenizer* returns.
    """
    # isinstance also accepts str subclasses, which must not be iterated
    # character by character.
    if isinstance(text, str):
        return text_tokenizer(text_normalize(text))
    # Build a new list so the caller's sequence is left untouched; this also
    # lets tuples and other iterables be passed in.
    return text_tokenizer([text_normalize(item) for item in text])
|
|