Spaces:
Sleeping
Sleeping
import re | |
from .constants import VALID_ARABIC | |
from itertools import product, combinations | |
# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r"\s+")


def collapse_whitespace(text):
    """Return *text* with each run of whitespace replaced by one space."""
    return _whitespace_re.sub(" ", text)
def basic_cleaners(text):
    """Collapse whitespace runs in *text* and trim both ends."""
    return collapse_whitespace(text).strip()
# def valid_arabic_cleaners(text): | |
# text = filter(lambda char: char in VALID_ARABIC, text) | |
# text = collapse_whitespace(''.join(list(text))) | |
# return text.strip() | |
# Short-vowel marks, in order: kasra, fatha, damma.
harakat = ["\u0650", "\u064E", "\u064F"]

# Sukun mark.
sukun = ["\u0652"]

# Letters labelled "mostly saken": alef, waw, alef maqsurah, ya'a.
mostly_saken = ["\u0627", "\u0648", "\u0649", "\u064A"]

# Letters labelled "always saken": alef, alef maqsurah.
# The cleaners in this file do not allow a harakah to sit on these.
always_saken = ["\u0627", "\u0649"]

# Tanween marks, in order: damma tanween, kasra tanween, fatha tanween.
# Index 2 (fatha tanween) is referenced by position elsewhere in this file.
tnween_chars = ["\u064c", "\u064d", "\u064b"]

# Shadda mark.
shadda_chars = ["\u0651"]

# Every diacritic handled by the cleaners.
all_tashkeel = [*harakat, *tnween_chars, *sukun, *shadda_chars]

# Base characters (Arabic letters plus the space character).
all_chars = list("إةابتثجحخدذرزسشصضطظعغفقكلمنهويىأءئؤ ")

# Concatenation of all the tables above (may contain repeated entries).
prem_chars = [
    *harakat,
    *sukun,
    *mostly_saken,
    *tnween_chars,
    *shadda_chars,
    *all_chars,
]
# Forbidden ordered pairs of adjacent diacritics, built once at import time
# rather than on every call (the cleaner calls this once per character).
# The table is the union of two Cartesian squares, so it is symmetric:
# (a, b) is present exactly when (b, a) is.
_INVALID_TASHKEEL_PAIRS = frozenset(
    list(product(harakat + sukun + tnween_chars, repeat=2))
    + list(product(shadda_chars + sukun, repeat=2))
)


def not_valid_tashkeel_comb(comb):
    """Return True when *comb* is a forbidden pair of diacritics.

    A pair is forbidden when both members come from
    ``harakat + sukun + tnween_chars`` or both come from
    ``shadda_chars + sukun``.

    Args:
        comb: A 2-tuple of single-character strings.

    Returns:
        True if the pair (in either order) is forbidden, otherwise False.
        Anything that is not a tuple is never considered forbidden.
    """
    # Only tuples can equal an entry of the table; the explicit check also
    # keeps unhashable inputs such as lists from raising in the set lookup.
    if not isinstance(comb, tuple):
        return False
    # No reversed lookup is needed because the table is symmetric.
    return comb in _INVALID_TASHKEEL_PAIRS
def remove_tanween_on_alef(text):
    """Move a fatha tanween written after alef onto the preceding character.

    Scanning left to right:
      * a base character or shadda followed by alef/alef maqsurah and then
        fatha tanween is emitted with the tanween appended right after it;
      * a harakah in that same position is replaced by the tanween;
      * a fatha tanween whose previous character is alef/alef maqsurah is
        dropped;
      * every other character is copied unchanged.
    """
    fathatan = tnween_chars[2]
    carriers = all_chars + shadda_chars
    lookahead_limit = len(text) - 2

    pieces = []
    for pos, ch in enumerate(text):
        # True when the next two characters are alef (or alef maqsurah)
        # followed by fatha tanween.
        alef_then_tanween = (
            pos < lookahead_limit
            and text[pos + 1] in always_saken
            and text[pos + 2] == fathatan
        )
        if alef_then_tanween and ch in carriers:
            pieces.append(ch + fathatan)
        elif alef_then_tanween and ch in harakat:
            pieces.append(fathatan)
        elif pos > 0 and ch == fathatan and text[pos - 1] in always_saken:
            continue
        else:
            pieces.append(ch)
    return "".join(pieces)
def dont_start_by_harakah(text):
    """Drop leading diacritics from *text*.

    Returns the suffix starting at the first character that is not in
    ``all_tashkeel``, or "" when no such character exists (including when
    *text* is empty).
    """
    for start, ch in enumerate(text):
        if ch not in all_tashkeel:
            return text[start:]
    return ""
# Upper bound on how many times the rules are re-applied while looking for a
# result that no longer changes.
_MAX_CLEANING_PASSES = 5


def _tashkeel_rule_pass(text):
    """Apply the diacritic rules once, left to right, and return the result."""
    tanween_or_sukun = tnween_chars + sukun
    harakah_blockers = tnween_chars + sukun + shadda_chars
    last_index = len(text) - 1

    kept = []
    run = 0  # length of the current run of consecutive diacritics in `text`
    for i, ch in enumerate(text):
        is_mark = ch in all_tashkeel
        run = run + 1 if is_mark else 0

        # Don't allow three consecutive tashkeel.
        if run > 2:
            continue
        # Drop a tanween/sukun when the character two positions back is
        # also a tanween/sukun.
        if i > 1 and ch in tanween_or_sukun and text[i - 2] in tanween_or_sukun:
            continue
        # Don't allow a harakah directly followed by tanween, sukun or shadda.
        if i < last_index and ch in harakat and text[i + 1] in harakah_blockers:
            continue
        # Don't allow tashkeel on a space.
        if i > 0 and is_mark and text[i - 1] == " ":
            continue
        # Only allow permissible pairs with the previous character. The
        # i > 0 guard stops text[i - 1] from wrapping to the last character.
        if i > 0 and not_valid_tashkeel_comb((ch, text[i - 1])):
            continue
        # Harakah written on alef / alef maqsurah.
        if i > 1 and ch in harakat and text[i - 1] in always_saken:
            if text[i - 2] not in all_tashkeel:
                # The letter before the alef has no mark: move the harakah
                # in front of the alef. A non-diacritic character is never
                # dropped by the rules above, so the alef is the last kept
                # character. Editing `kept` (not the raw text) preserves
                # everything this pass has already cleaned.
                kept.insert(len(kept) - 1, ch)
            # Otherwise the letter before the alef already carries a mark
            # and the harakah is simply dropped.
            continue
        kept.append(ch)
    return "".join(kept)


def valid_arabic_cleaners(text):
    """Filter *text* to valid Arabic characters and normalise its diacritics.

    Each pass:
      1. keeps only characters found in ``VALID_ARABIC``;
      2. collapses whitespace, removes leading diacritics and trims;
      3. applies the diacritic rules (no third consecutive mark, no second
         tanween/sukun, no harakah before tanween/sukun/shadda, no mark on a
         space, no forbidden mark pairs, no harakah on alef/alef maqsurah);
      4. moves a fatha tanween written after alef onto the preceding
         character and squeezes repeated spaces.

    Passes repeat until the output stops changing, at most
    ``_MAX_CLEANING_PASSES`` times.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    prev_text = text
    cleaned_text = ""
    for _ in range(_MAX_CLEANING_PASSES):
        candidate = "".join(ch for ch in prev_text if ch in VALID_ARABIC)
        candidate = collapse_whitespace(candidate)
        candidate = dont_start_by_harakah(candidate).strip()

        cleaned_text = _tashkeel_rule_pass(candidate)
        # Only allow tanween before alef.
        cleaned_text = remove_tanween_on_alef(cleaned_text)
        cleaned_text = re.sub(r" +", " ", cleaned_text).strip()

        if cleaned_text == prev_text:
            break
        prev_text = cleaned_text
    return cleaned_text