Commit
·
08016ad
1
Parent(s):
301bdf5
create romanian
Browse files- romanian.py +91 -0
romanian.py
CHANGED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Multi-letter spellings that are rewritten before the single-letter pass.
#
# check_special_comb() applies these as plain substring replacements in the
# order written here, so the order is significant: "Che"/"Chi" must stay ahead
# of the bare "Ch" rule, and "Sc" is consumed before "Ce"/"Ci" can see its "c".
#
# Values written in a non-Latin script ("Ч", Odia and Devanagari syllables)
# are temporary placeholders.  They keep the result from being rewritten again
# by a later entry of this table or by romanian_dict (e.g. its "c" -> "k" and
# "j" -> "zh" rules); cyrillic_equiv_dict turns them into their final Latin
# spelling at the end of the pipeline.
special_combs = {
    # Word-specific spellings given a leading "y".
    # NOTE(review): only the capitalised "El" is listed (no "el") -- presumably
    # deliberate; confirm.
    "Este": "Yeste", "este": "yeste",
    "El": "Yel",
    # Hard "ch" before e / i.
    "Che": "Ke", "che": "ke",
    "Chi": "Ki", "chi": "ki",
    # Hard "gh" before e / i (placeholders resolve to "Ge"/"ge", "Gi"/"gi").
    "Ghe": "ଗେ", "ghe": "गे",
    "Ghi": "ଗି", "ghi": "गी",
    # Any remaining "ch".  The capitalised form keeps its capital, like every
    # other upper-case entry in this table.
    "Ch": "H", "ch": "h",
    "Sc": "Sk", "sc": "sk",
    # Soft "c" before e / i (placeholder "Ч"/"ч" resolves to "Ch"/"ch").
    "Ce": "Чe", "ce": "чe",
    "Ci": "Чi", "ci": "чi",
    # Soft "g" before e / i (placeholders resolve to "Je"/"je", "Ji"/"ji").
    "Ge": "ଜେ", "ge": "जे",
    "Gi": "ଜି", "gi": "जी",
}
|
15 |
+
|
16 |
+
# Single-letter substitutions applied by romanian_replace(), one
# lower-case / upper-case pair per line.
romanian_dict = {
    "ă": "aw",
    "Ă": "Aw",
    "â": "u",
    "Â": "U",
    "î": "u",
    "Î": "U",
    "j": "zh",
    "J": "Zh",
    "q": "k",
    "Q": "K",
    "ș": "sh",
    "Ș": "Sh",
    "ț": "ts",
    "Ț": "Ts",
    # Any "c" still present at this point becomes "k".
    "c": "k",
    "C": "K",
}
|
26 |
+
|
27 |
+
# Placeholder -> final Latin spelling, applied by cyrillic_replace().
# Despite the name, the keys are not only Cyrillic: the table also holds
# Odia and Devanagari syllables.
cyrillic_equiv_dict = {
    "ч": "ch",
    "Ч": "Ch",
    "ଗି": "Gi",
    "गी": "gi",
    "ଗେ": "Ge",
    "गे": "ge",
    "ଜି": "Ji",
    "जी": "ji",
    "ଜେ": "Je",
    "जे": "je",
}
|
34 |
+
|
35 |
+
def romanian_position_conditional_replace(word):
    """Rewrite the letters whose replacement depends on their position.

    Rules, in order:

    * a one-character ``word`` is returned untouched;
    * a leading ``y`` / ``Y`` becomes ``i`` / ``I``;
    * a leading lower-case ``x`` becomes ``ks``;
    * a lower-case ``x`` with a vowel (``aeiou``, either case) on both sides
      becomes ``gz``;
    * every remaining ``x`` / ``X`` becomes ``ks`` / ``Ks``.

    Args:
        word: A single token.

    Returns:
        The token with the rules above applied.
    """
    # Imported here so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    if len(word) == 1:
        return word

    # At most one of the leading-letter rules can apply.
    if word.startswith("y"):
        word = "i" + word[1:]
    elif word.startswith("Y"):
        word = "I" + word[1:]
    elif word.startswith("x"):
        word = "ks" + word[1:]

    # x between vowels = gz.  Lookarounds leave the neighbouring vowels
    # unconsumed, so consecutive matches that share a vowel ("axaxa") are all
    # rewritten.
    word = re.sub(r"(?<=[aeiouAEIOU])x(?=[aeiouAEIOU])", "gz", word)

    # Everything else = ks.
    return word.replace("x", "ks").replace("X", "Ks")
|
57 |
+
|
58 |
+
def check_special_comb(word):
    """Apply every ``special_combs`` rule to ``word`` and return the result.

    Rules are plain substring replacements, applied one after another in the
    table's insertion order; each rule sees the output of the previous ones.
    """
    for spelling, substitute in special_combs.items():
        # str.replace is a no-op when ``spelling`` does not occur.
        word = word.replace(spelling, substitute)
    return word
|
63 |
+
|
64 |
+
def romanian_replace(word):
    """Return ``word`` with every ``romanian_dict`` substitution applied.

    Substitutions run sequentially in the table's insertion order, replacing
    all occurrences of each key.
    """
    for letter, latin in romanian_dict.items():
        word = word.replace(letter, latin)
    return word
|
68 |
+
|
69 |
+
def cyrillic_replace(word):
    """Swap every ``cyrillic_equiv_dict`` key found in ``word`` for its value.

    Keys are processed in the table's insertion order and all occurrences of
    each key are replaced.
    """
    for placeholder, latin in cyrillic_equiv_dict.items():
        # str.replace is a no-op when ``placeholder`` does not occur.
        word = word.replace(placeholder, latin)
    return word
|
74 |
+
|
75 |
+
def romanian_word_to_latin(word):
    """Run a single token through the four rewriting stages, in order.

    1. position-dependent letters (``romanian_position_conditional_replace``)
    2. multi-letter combinations (``check_special_comb``)
    3. single-letter substitutions (``romanian_replace``)
    4. placeholder resolution (``cyrillic_replace``)

    Returns the rewritten token.
    """
    pipeline = (
        romanian_position_conditional_replace,
        check_special_comb,
        romanian_replace,
        cyrillic_replace,
    )
    for stage in pipeline:
        word = stage(word)
    return word
|
84 |
+
|
85 |
+
def romanian_sentence_to_latin(text):
    """Rewrite every whitespace-delimited token of ``text``.

    Each maximal run of non-whitespace characters is passed to
    ``romanian_word_to_latin``; the whitespace between tokens (spaces, tabs,
    newlines, ...) is left exactly as it was.  Splitting on any whitespace,
    rather than on the space character alone, lets the word-initial rules see
    words that follow a tab or a line break.

    Args:
        text: The text to convert.

    Returns:
        ``text`` with every token rewritten.
    """
    # Imported here so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    return re.sub(r"\S+", lambda match: romanian_word_to_latin(match.group()), text)
|