import gradio as gr
import pandas as pd
import re
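
# Gradio app for looking up lemmas in the RAVNlex Faroese lexicon: it downloads the
# dataset CSV from Hugging Face, groups the entries by lemma, and renders noun
# declension tables as HTML.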

# Load and parse the CSV file from Hugging Face
def load_data():
    url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
    df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
    lemmas = {}
    current_lemma = None

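    # Expand bracketed character alternatives in a PPOS tag: each character inside
    # [...] yields its own tag, so e.g. "ncms[nad]==iuu" would expand to
    # "ncmsn==iuu", "ncmsa==iuu" and "ncmsd==iuu".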
    def expand_ppos(ppos):
        parts = ppos.split('==')
        tag = parts[0]
        rest = parts[1] if len(parts) > 1 else ''
        expanded = [tag]

        bracket_matches = re.findall(r'\[([^\]]+)\]', tag)
        if bracket_matches:
            for match in bracket_matches:
                new_expanded = []
                for char in match:
                    for item in expanded:
                        new_expanded.append(item.replace(f'[{match}]', char))
                expanded = new_expanded

        return [f"{item}=={rest}" for item in expanded]

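    # The CSV is organised in lemma blocks: the first row of a block introduces the
    # lemma, the following rows are its inflected forms, and a row whose ORTO field
    # is "---" closes the block.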
    for row in df.itertuples(index=False, name=None):
        if len(row) < 5:
            print(f"Skipping problematic line: {row}")
            continue
        orto, ppos, phon1, phon2, comm, *pronunciations = map(lambda x: x if isinstance(x, str) else "", row)
        if orto == '---':
            current_lemma = None
        elif current_lemma is None:
            current_lemma = orto.replace("ORTO:", "")
            lemmas[current_lemma] = []
            expanded_ppos = expand_ppos(ppos.replace("PPOS:", "") if ppos else "")
            for ep in expanded_ppos:
                lemma_data = {
                    'word': current_lemma,
                    'PPOS': ep,
                    'PHON1': phon1.replace("PHON:", "") if phon1 else "",
                    'PHON2': phon2.replace("PHON:", "") if phon2 else "",
                    'COMM': comm if comm else "",
                    'pronunciations': pronunciations
                }
                lemmas[current_lemma].append(lemma_data)
        else:
            expanded_ppos = expand_ppos(ppos.replace("PPOS:", "") if ppos else "")
            for ep in expanded_ppos:
                lemma_data = {
                    'word': orto.replace("ORTO:", "") if orto else "",
                    'PPOS': ep,
                    'PHON1': phon1.replace("PHON:", "") if phon1 else "",
                    'PHON2': phon2.replace("PHON:", "") if phon2 else "",
                    'COMM': comm if comm else "",
                    'pronunciations': pronunciations
                }
                lemmas[current_lemma].append(lemma_data)

    print("Loaded lemmas:", lemmas)  # Debugging output
    return lemmas

lemmas = load_data()

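# Render an HTML declension table for a noun lemma. The keys below follow the PPOS
# tags seen in the data, which appear to encode word class, gender, number and case
# (e.g. "ncmsn" = noun, common, masculine, singular, nominative), while the
# ==i.. / ==d.. suffix separates indefinite and definite forms.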
def create_noun_table(lemma, forms):
    table_data = {
        'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '',
        'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '',
        'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '',
        'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '',
        'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '',
        'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '',
        'ncmpd==iuu': '', 'ncmpd==duu': '',
        # Feminine/neuter plural dative keys, assumed by analogy with the plural
        # nominative/accusative keys above.
        'ncfnd==iuu': '', 'ncfnd==duu': '', 'ncnnd==iuu': '', 'ncnnd==duu': '',
        'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': ''
    }
    
    for form in forms:
        ppos = form['PPOS'].lower()  # Normalize to lowercase
        word = form['word']
        print(f"Processing: word={word}, ppos={ppos}, key={ppos}")
        if ppos in table_data:
            table_data[ppos] = word
        else:
            print(f"Unmatched key: {ppos} for word: {word} with PPOS: {ppos}")

    print(f"Final table data for {lemma}: {table_data}")  # Debugging output

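    # Column headers are Faroese: Eintal = singular, Fleirtal = plural,
    # Óbundið = indefinite, Bundið = definite.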
    table = f"""
    <table border="1">
        <thead>
            <tr>
                <th colspan="2">Eintal</th>
                <th colspan="2">Fleirtal</th>
            </tr>
            <tr>
                <th>Óbundið</th>
                <th>Bundið</th>
                <th>Óbundið</th>
                <th>Bundið</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']}</td>
                <td>{table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']}</td>
                <td>{table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
                <td>{table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']}</td>
                <td>{table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']}</td>
                <td>{table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']}</td>
                <td>{table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']}</td>
                <td>{table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']}</td>
                <td>{table_data['ncmpd==iuu'] or table_data['ncfnd==iuu'] or table_data['ncnnd==iuu']}</td>
                <td>{table_data['ncmpd==duu'] or table_data['ncfnd==duu'] or table_data['ncnnd==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']}</td>
                <td>{table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']}</td>
                <td>{table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']}</td>
                <td>{table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']}</td>
            </tr>
        </tbody>
    </table>
    """
    return table

def search_lemma(lemma):
    results = lemmas.get(lemma)
    if not results:
        return f"No results found for {lemma}"

    # Noun PPOS tags start with "n" (e.g. "ncmsn==iuu"); other word classes are not
    # rendered yet.
    if results[0]['PPOS'].lower().startswith('n'):
        table = create_noun_table(lemma, results)
    else:
        table = "Only noun tables are currently supported."

    return table

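# Minimal Gradio UI: a text box for the lemma and an HTML pane for the generated table.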
iface = gr.Interface(
    fn=search_lemma,
    inputs="text",
    outputs="html",
    title="Lemma Search",
    description="Enter a lemma to search for its declensions and pronunciations."
)

if __name__ == "__main__":
    iface.launch()