Spaces:
Running
Running
File size: 3,740 Bytes
9b2107c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import re
import bangla
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer
# Shared Normalizer instance, created once at import time; normalize() calls it
# on every whitespace-separated word.
bnorm = Normalizer()
# Substitution table used by expand_full_attribution(): every key found in the
# text is replaced (plain substring match, in insertion order) by the normalized
# form of its value.
attribution_dict = {
    # Abbreviated honorifics -> the full phrase to be spoken.
    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
    "আঃ": "আলাইহিস সালাম",
    "রাঃ": "রাদিআল্লাহু আনহু",
    "রহঃ": "রহমাতুল্লাহি আলাইহি",
    "রহিঃ": "রহিমাহুল্লাহ",
    "হাফিঃ": "হাফিযাহুল্লাহ",
    # Respelling of a single word (presumably to steer pronunciation — confirm).
    "বায়ান": "বাইআন",
    # NOTE(review): the value holds two comma-separated alternatives, and both
    # end up in the output text — confirm this is intended.
    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
    # Disabled respelling candidates, kept for reference.
    # "আয়াত" : "আইআত",#আইআত
    # "ওয়া" : "ওআ",
    # "ওয়াসাল্লাম" : "ওআসাল্লাম",
    # "কেন" : "কেনো",
    # "কোন" : "কোনো",
    # "বল" : "বলো",
    # "চল" : "চলো",
    # "কর" : "করো",
    # "রাখ" : "রাখো",
    # Curly single quotes are removed outright.
    "’": "",
    "‘": "",
    # More disabled respelling candidates.
    # "য়" : "অ",
    # "সম্প্রদায়" : "সম্প্রদাই",
    # "রয়েছে" : "রইছে",
    # "রয়েছ" : "রইছ",
    # A slash is replaced by the word " বাই " surrounded by spaces.
    "/": " বাই ",
}
def tag_text(text: str) -> str:
    """Return ``text`` with every run of consecutive spaces collapsed to one.

    The previous body wrapped the text in literal ``"start"``/``"end"``
    sentinels, split it on Arabic-script runs (U+0600-U+06FF), replaced each
    remaining segment with itself, and finally stripped the sentinels with a
    global ``str.replace``. The segment replacement was an identity
    operation, so the only effects were the space collapsing kept here and an
    unintended one: every genuine occurrence of "start" or "end" inside the
    input was deleted as well (e.g. "weekend" became "week"). Dropping the
    sentinels removes that corruption without changing any other output.

    Args:
        text: Input text.

    Returns:
        The text with each run of two or more spaces replaced by one space.
        Other whitespace (tabs, newlines) is left untouched.
    """
    return re.sub(" +", " ", text)
def normalize(sen):
    """Normalize a sentence word by word and rejoin it with single spaces.

    The sentence is split on whitespace, each word is passed through the
    module-level ``bnorm`` normalizer, and words whose ``"normalized"`` entry
    is ``None`` are left out of the result.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)["normalized"]
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)
def expand_full_attribution(text):
    """Replace every ``attribution_dict`` key found in ``text`` by its expansion.

    Keys are tried in the dictionary's insertion order using plain substring
    matching; the replacement is the value after it has been run through
    ``normalize``. Keys that do not occur in the text are skipped without
    normalizing their value.
    """
    for abbreviation in attribution_dict:
        if abbreviation not in text:
            continue
        expansion = normalize(attribution_dict[abbreviation])
        text = text.replace(abbreviation, expansion)
    return text
def collapse_whitespace(text):
    """Return ``text`` with each run of whitespace replaced by a single space."""
    return re.compile(r"\s+").sub(" ", text)
def bangla_text_to_phonemes(text: str) -> str:
    """Clean up Bangla text and return it as a string of "।"-terminated sentences.

    Processing order:

    1. If the text contains any ASCII digit, it is passed through
       ``bangla.convert_english_digit_to_bangla_digit``.
    2. A colon standing between two Bangla digits (optionally padded with
       spaces) is replaced by " এর ".
    3. The text is passed through ``numerize`` and then ``tag_text``.
    4. The stripped text is split on "।", "!" and "?"; each piece has its
       newlines removed, is normalized, has its attributions expanded and its
       whitespace collapsed, and gets "।" appended.

    Every piece produced by the split receives a "।", including an empty
    one. Input that already ends with a sentence terminator therefore yields
    one extra trailing "।", and empty input yields "।". This is existing
    behavior, kept as is so that the output shape seen by downstream
    consumers does not change.

    Args:
        text: Raw input text.

    Returns:
        The concatenation of all processed pieces, each ending in "।".
    """
    # english numbers to bangla conversion
    if re.search("[0-9]", text) is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # Replace ':' between two bangla digits with ' এর '. The former pattern
    # listed the digits as "[০, ১, ২, ...]", which puts ',' and ' ' into the
    # character class too, so a colon next to a space or comma in ordinary
    # text was rewritten as well. Lookarounds keep the digits themselves out of
    # the match, so chained forms such as h:m:s are converted at every colon.
    text = re.sub(r"(?<=[০-৯]) *: *(?=[০-৯])", " এর ", text)

    # numerize text
    text = numerize(text)
    # tag sections
    text = tag_text(text)

    # Split on sentence-ending characters and clean each piece separately.
    pieces = []
    for sent in re.split("[।!?]", text.strip()):
        res = normalize(sent.replace("\n", ""))
        # expand attributes
        res = expand_full_attribution(res)
        res = collapse_whitespace(res)
        pieces.append(res + "।")
    return "".join(pieces)
|