Update app.py
Browse files
app.py
CHANGED
@@ -8,37 +8,7 @@ def load_data():
|
|
8 |
df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
|
9 |
lemmas = {}
|
10 |
current_lemma = None
|
11 |
-
|
12 |
-
def expand_ppos(ppos):
    """Expand bracketed alternatives in the tag part of a PPOS string.

    The text before the first ``==`` is treated as the tag; everything
    from that ``==`` onwards is carried over unchanged.  Each ``[xyz]``
    group in the tag stands for "one of these characters", and one string
    is produced for every combination of choices.

    Args:
        ppos: PPOS string such as ``"nc[mf]sn==iuu"``.

    Returns:
        A list of strings with every bracket group in the tag replaced by
        a single character.  Input without bracket groups comes back as a
        one-element list holding the input unchanged.
    """
    # partition keeps the separator, so an input without '==' is not
    # given a trailing '==' and text after a second '==' is not lost.
    tag, separator, rest = ppos.partition('==')

    # With one capture group, re.split returns literals and group
    # contents alternately: [literal, group, literal, group, ..., literal].
    segments = re.split(r'\[([^\]]+)\]', tag)

    expanded = [segments[0]]
    for group, literal in zip(segments[1::2], segments[2::2]):
        # Each group is expanded at its own position, so a group that
        # occurs twice no longer produces duplicate results.
        expanded = [
            prefix + char + literal
            for char in group
            for prefix in expanded
        ]

    return [f"{item}{separator}{rest}" for item in expanded]
|
28 |
-
|
29 |
-
def simplify_ppos(ppos):
    """Reduce a ``TAG==SUFFIX`` PPOS string to a three-character key.

    The key is ``number + case + definiteness`` where:

    * number is ``'s'`` when ``tag[3]`` is ``'s'`` and ``'p'`` otherwise,
    * case is ``tag[4]``,
    * definiteness is ``'i'`` when the suffix is exactly ``'iuu'`` and
      ``'d'`` otherwise.

    Args:
        ppos: PPOS string expected to contain exactly one ``==``.

    Returns:
        The three-character key, or ``None`` when ``ppos`` does not split
        into exactly two ``==``-separated parts or the tag is shorter than
        five characters.
    """
    ppos_parts = ppos.split('==')
    if len(ppos_parts) != 2:
        return None

    tag, suffix = ppos_parts
    if len(tag) < 5:
        return None

    number = 's' if tag[3] == 's' else 'p'
    case = tag[4]
    # Definiteness is read from the part after '=='.  It is kept in its own
    # variable so that taking the case letter from the tag cannot overwrite
    # it before this comparison.
    # NOTE(review): only 'iuu' counts as indefinite; confirm whether other
    # suffixes should as well.
    definiteness = 'i' if suffix == 'iuu' else 'd'
    return f"{number}{case}{definiteness}"
|
41 |
-
|
42 |
for row in df.itertuples(index=False, name=None):
|
43 |
if len(row) < 5:
|
44 |
print(f"Skipping problematic line: {row}")
|
@@ -49,55 +19,63 @@ def load_data():
|
|
49 |
elif current_lemma is None:
|
50 |
current_lemma = orto.replace("ORTO:", "")
|
51 |
lemmas[current_lemma] = []
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
if
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
'COMM': comm if comm else "",
|
62 |
-
'pronunciations': pronunciations
|
63 |
-
}
|
64 |
-
lemmas[current_lemma].append(lemma_data)
|
65 |
else:
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
if
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
'pronunciations': pronunciations
|
77 |
-
}
|
78 |
-
lemmas[current_lemma].append(lemma_data)
|
79 |
-
|
80 |
print("Loaded lemmas:", lemmas) # Debugging output
|
81 |
return lemmas
|
82 |
|
83 |
lemmas = load_data()
|
84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
def create_noun_table(lemma, forms):
|
86 |
table_data = {
|
87 |
-
'
|
88 |
-
'
|
89 |
-
'
|
90 |
-
'
|
|
|
|
|
|
|
91 |
}
|
92 |
|
93 |
for form in forms:
|
94 |
-
ppos = form['PPOS']
|
95 |
word = form['word']
|
96 |
-
print(f"Processing: word={word}, ppos={ppos}")
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
101 |
|
102 |
print(f"Final table data for {lemma}: {table_data}") # Debugging output
|
103 |
|
@@ -117,28 +95,28 @@ def create_noun_table(lemma, forms):
|
|
117 |
</thead>
|
118 |
<tbody>
|
119 |
<tr>
|
120 |
-
<td>{table_data['
|
121 |
-
<td>{table_data['
|
122 |
-
<td>{table_data['
|
123 |
-
<td>{table_data['
|
124 |
</tr>
|
125 |
<tr>
|
126 |
-
<td>{table_data['
|
127 |
-
<td>{table_data['
|
128 |
-
<td>{table_data['
|
129 |
-
<td>{table_data['
|
130 |
</tr>
|
131 |
<tr>
|
132 |
-
<td>{table_data['
|
133 |
-
<td>{table_data['
|
134 |
-
<td>{table_data['
|
135 |
-
<td>{table_data['
|
136 |
</tr>
|
137 |
<tr>
|
138 |
-
<td>{table_data['
|
139 |
-
<td>{table_data['
|
140 |
-
<td>{table_data['
|
141 |
-
<td>{table_data['
|
142 |
</tr>
|
143 |
</tbody>
|
144 |
</table>
|
|
|
8 |
df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
|
9 |
lemmas = {}
|
10 |
current_lemma = None
|
11 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
for row in df.itertuples(index=False, name=None):
|
13 |
if len(row) < 5:
|
14 |
print(f"Skipping problematic line: {row}")
|
|
|
19 |
elif current_lemma is None:
|
20 |
current_lemma = orto.replace("ORTO:", "")
|
21 |
lemmas[current_lemma] = []
|
22 |
+
lemma_data = {
|
23 |
+
'word': current_lemma,
|
24 |
+
'PPOS': ppos.replace("PPOS:", "") if ppos else "",
|
25 |
+
'PHON1': phon1.replace("PHON:", "") if phon1 else "",
|
26 |
+
'PHON2': phon2.replace("PHON:", "") if phon2 else "",
|
27 |
+
'COMM': comm if comm else "",
|
28 |
+
'pronunciations': pronunciations
|
29 |
+
}
|
30 |
+
lemmas[current_lemma].append(lemma_data)
|
|
|
|
|
|
|
|
|
31 |
else:
|
32 |
+
lemma_data = {
|
33 |
+
'word': orto.replace("ORTO:", "") if orto else "",
|
34 |
+
'PPOS': ppos.replace("PPOS:", "") if ppos else "",
|
35 |
+
'PHON1': phon1.replace("PHON:", "") if phon1 else "",
|
36 |
+
'PHON2': phon2.replace("PHON:", "") if phon2 else "",
|
37 |
+
'COMM': comm if comm else "",
|
38 |
+
'pronunciations': pronunciations
|
39 |
+
}
|
40 |
+
lemmas[current_lemma].append(lemma_data)
|
41 |
+
|
|
|
|
|
|
|
|
|
42 |
print("Loaded lemmas:", lemmas) # Debugging output
|
43 |
return lemmas
|
44 |
|
45 |
lemmas = load_data()
|
46 |
|
47 |
+
def expand_ppos(ppos):
    """Expand bracketed alternatives in a PPOS string.

    Each ``[xyz]`` group stands for "one of these characters".  One string
    is produced for every combination of choices across all groups, not
    just the first one.

    Args:
        ppos: PPOS string such as ``"nc[mf]sn==iuu"``.

    Returns:
        A list of strings with every bracket group replaced by a single
        character.  Input without bracket groups comes back as a
        one-element list holding the input unchanged.
    """
    # With one capture group, re.split returns literals and group
    # contents alternately: [literal, group, literal, group, ..., literal].
    segments = re.split(r'\[([^\]]+)\]', ppos)

    expanded = [segments[0]]
    for group, literal in zip(segments[1::2], segments[2::2]):
        # Grow every partial result by one choice from this group, then
        # append the literal text that follows the group.
        expanded = [
            prefix + char + literal
            for char in group
            for prefix in expanded
        ]
    return expanded
|
56 |
+
|
57 |
def create_noun_table(lemma, forms):
|
58 |
table_data = {
|
59 |
+
'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '',
|
60 |
+
'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '',
|
61 |
+
'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '',
|
62 |
+
'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '',
|
63 |
+
'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '',
|
64 |
+
'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '',
|
65 |
+
'ncmpd==iuu': '', 'ncmpd==duu': '', 'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': ''
|
66 |
}
|
67 |
|
68 |
for form in forms:
|
69 |
+
ppos = form['PPOS'].lower() # Normalize to lowercase
|
70 |
word = form['word']
|
71 |
+
print(f"Processing: word={word}, ppos={ppos}, key={ppos}")
|
72 |
+
expanded_ppos_list = expand_ppos(ppos)
|
73 |
+
for expanded_ppos in expanded_ppos_list:
|
74 |
+
key = expanded_ppos
|
75 |
+
if key in table_data:
|
76 |
+
table_data[key] = word
|
77 |
+
else:
|
78 |
+
print(f"Unmatched key: {key} for word: {word} with PPOS: {ppos}")
|
79 |
|
80 |
print(f"Final table data for {lemma}: {table_data}") # Debugging output
|
81 |
|
|
|
95 |
</thead>
|
96 |
<tbody>
|
97 |
<tr>
|
98 |
+
<td>{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']}</td>
|
99 |
+
<td>{table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']}</td>
|
100 |
+
<td>{table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
|
101 |
+
<td>{table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
|
102 |
</tr>
|
103 |
<tr>
|
104 |
+
<td>{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']}</td>
|
105 |
+
<td>{table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']}</td>
|
106 |
+
<td>{table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']}</td>
|
107 |
+
<td>{table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']}</td>
|
108 |
</tr>
|
109 |
<tr>
|
110 |
+
<td>{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']}</td>
|
111 |
+
<td>{table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']}</td>
|
112 |
+
<td>{table_data['ncmpd==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
|
113 |
+
<td>{table_data['ncmpd==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
|
114 |
</tr>
|
115 |
<tr>
|
116 |
+
<td>{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']}</td>
|
117 |
+
<td>{table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']}</td>
|
118 |
+
<td>{table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']}</td>
|
119 |
+
<td>{table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']}</td>
|
120 |
</tr>
|
121 |
</tbody>
|
122 |
</table>
|