import string

import regex as re


def keep_devnagri(text: str):
    """
    Remove all non-Devanagari characters from the text.

    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    Runs of Devanagari-script characters, ASCII digits, the danda, whitespace,
    '.' and '!' are kept; everything else is dropped. The kept runs are joined
    back together and runs of spaces are collapsed to a single space.

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
             the cleaned text consists only of ASCII punctuation and digits
             (this includes the empty string). Whitespace and the danda are
             not part of that set, so a whitespace-only result is NOT flagged.
    """
    # NOTE: `re` is the third-party `regex` package in this file (it is
    # imported as `re`); the \p{Devanagari} class below relies on it.
    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'

    # The pattern has no capture groups, so findall yields each full match.
    cleaned = "".join(re.findall(pattern, text))

    # Dropping characters can leave several spaces next to each other.
    cleaned = re.sub(r"[ ]+", " ", cleaned)

    # True when nothing would remain after removing ASCII punctuation and
    # digits; all() over an empty string is True, which covers empty output.
    ignorable = frozenset(string.punctuation + string.digits)
    is_just_punctuation = all(char in ignorable for char in cleaned)

    return cleaned, is_just_punctuation


def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' entry of a document with keep_devnagri.

    The entry may be a single string or a list of strings (a batch); a list
    is cleaned element by element. Any text that keep_devnagri flags as being
    just punctuation is replaced by a single space " ".

    @param document: mapping with a 'text' key holding a str or a list
                     (presumably a Hugging Face dataset example or batch,
                     going by the function name - confirm against the caller)
    @return: the same document object, with document['text'] overwritten
             by the cleaned value(s)
    @raise TypeError: if document['text'] is neither a str nor a list
    """
    def get_clean_text(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # Substitute a single space so the entry is never punctuation-only
        # or empty.
        return " " if is_just_punctuation else cleaned_text

    texts = document['text']
    if isinstance(texts, str):
        document['text'] = get_clean_text(texts)
    elif isinstance(texts, list):
        document['text'] = [get_clean_text(text) for text in texts]
    else:
        raise TypeError(
            "document['text'] must be a str or a list, "
            f"got {type(texts).__name__}."
        )

    return document