import gradio as gr
import pandas as pd
import re
# Load and parse the CSV file from Hugging Face
def load_data():
    url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
    df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
    lemmas = {}
    current_lemma = None
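
    # PPOS values can contain bracketed character classes; expand_ppos turns such a
    # tag into one concrete tag per character, e.g. 'nc[mfn]sn==iuu' expands to
    # 'ncmsn==iuu', 'ncfsn==iuu' and 'ncnsn==iuu'.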
    def expand_ppos(ppos):
        parts = ppos.split('==')
        tag = parts[0]
        rest = parts[1] if len(parts) > 1 else ''
        expanded = [tag]
        bracket_matches = re.findall(r'\[([^\]]+)\]', tag)
        if bracket_matches:
            for match in bracket_matches:
                new_expanded = []
                for char in match:
                    for item in expanded:
                        new_expanded.append(item.replace(f'[{match}]', char))
                expanded = new_expanded
        return [f"{item}=={rest}" for item in expanded]
    for row in df.itertuples(index=False, name=None):
        if len(row) < 5:
            print(f"Skipping problematic line: {row}")
            continue
        orto, ppos, phon1, phon2, comm, *pronunciations = map(lambda x: x if isinstance(x, str) else "", row)
        if orto == '---':
            current_lemma = None
        elif current_lemma is None:
            current_lemma = orto.replace("ORTO:", "")
            lemmas[current_lemma] = []
            expanded_ppos = expand_ppos(ppos.replace("PPOS:", "") if ppos else "")
            for ep in expanded_ppos:
                lemma_data = {
                    'word': current_lemma,
                    'PPOS': ep,
                    'PHON1': phon1.replace("PHON:", "") if phon1 else "",
                    'PHON2': phon2.replace("PHON:", "") if phon2 else "",
                    'COMM': comm if comm else "",
                    'pronunciations': pronunciations
                }
                lemmas[current_lemma].append(lemma_data)
        else:
            expanded_ppos = expand_ppos(ppos.replace("PPOS:", "") if ppos else "")
            for ep in expanded_ppos:
                lemma_data = {
                    'word': orto.replace("ORTO:", "") if orto else "",
                    'PPOS': ep,
                    'PHON1': phon1.replace("PHON:", "") if phon1 else "",
                    'PHON2': phon2.replace("PHON:", "") if phon2 else "",
                    'COMM': comm if comm else "",
                    'pronunciations': pronunciations
                }
                lemmas[current_lemma].append(lemma_data)

    print("Loaded lemmas:", lemmas)  # Debugging output
    return lemmas
lemmas = load_data()
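
# The keys below are lowercased RAVNlex PPOS tags; they appear to encode gender
# (m/f/n), number, case (n/a/d/g) and, after '==', definiteness
# (i = indefinite / óbundið, d = definite / bundið).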
def create_noun_table(lemma, forms):
    table_data = {
        'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '',
        'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '',
        'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '',
        'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '',
        'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '',
        'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '',
        'ncmpd==iuu': '', 'ncmpd==duu': '', 'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': ''
    }
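
    # Fill the slots: each inflected form whose lowercased PPOS matches a key
    # above is placed in the corresponding cell of the declension table.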
    for form in forms:
        ppos = form['PPOS'].lower()  # Normalize to lowercase
        word = form['word']
        print(f"Processing: word={word}, ppos={ppos}")
        if ppos in table_data:
            table_data[ppos] = word
        else:
            print(f"Unmatched PPOS key: {ppos} for word: {word}")
    print(f"Final table data for {lemma}: {table_data}")  # Debugging output
table = f"""
<table border="1">
<thead>
<tr>
<th colspan="2">Eintal</th>
<th colspan="2">Fleirtal</th>
</tr>
<tr>
<th>Óbundið</th>
<th>Bundið</th>
<th>Óbundið</th>
<th>Bundið</th>
</tr>
</thead>
<tbody>
<tr>
<td>{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']}</td>
<td>{table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']}</td>
<td>{table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
<td>{table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
</tr>
<tr>
<td>{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']}</td>
<td>{table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']}</td>
<td>{table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']}</td>
<td>{table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']}</td>
</tr>
<tr>
<td>{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']}</td>
<td>{table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']}</td>
<td>{table_data['ncmpd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnnn==iuu']}</td>
<td>{table_data['ncmpd==duu'] or table_data['ncfsd==duu'] or table_data['ncnnn==duu']}</td>
</tr>
<tr>
<td>{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']}</td>
<td>{table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']}</td>
<td>{table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']}</td>
<td>{table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']}</td>
</tr>
</tbody>
</table>
"""
return table
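
# Look up a lemma and render its declension table; only nouns are handled so far.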
def search_lemma(lemma):
    results = lemmas.get(lemma, None)
    if not results:
        return f"No results found for {lemma}"
    # Noun PPOS tags begin with 'n', so check the prefix rather than membership.
    if results[0]['PPOS'].lower().startswith('n'):
        table = create_noun_table(lemma, results)
    else:
        table = "Only noun tables are currently supported."
    return table
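
# Gradio UI: a single text box in, the rendered HTML table out.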
iface = gr.Interface(
    fn=search_lemma,
    inputs="text",
    outputs="html",
    title="Lemma Search",
    description="Enter a lemma to search for its declensions and pronunciations."
)
if __name__ == "__main__":
iface.launch()