import gradio as gr
import pandas as pd
import re
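
# Gradio app for looking up lemmas in the RAVNlex Faroese lexicon: it downloads the
# dataset CSV from Hugging Face, groups the entries by lemma, and renders noun
# declension tables as HTML.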

# Load and parse the CSV file from Hugging Face
def load_data():
    url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
    df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
    lemmas = {}
    current_lemma = None

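    # Expand bracketed character alternatives in a PPOS tag: each character inside
    # [...] yields its own tag, so e.g. "ncms[nad]==iuu" would expand to
    # "ncmsn==iuu", "ncmsa==iuu" and "ncmsd==iuu".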
    def expand_ppos(ppos):
        parts = ppos.split('==')
        tag = parts[0]
        rest = parts[1] if len(parts) > 1 else ''
        expanded = [tag]

        bracket_matches = re.findall(r'\[([^\]]+)\]', tag)
        if bracket_matches:
            for match in bracket_matches:
                new_expanded = []
                for char in match:
                    for item in expanded:
                        new_expanded.append(item.replace(f'[{match}]', char))
                expanded = new_expanded

        return [f"{item}=={rest}" for item in expanded]

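    # The CSV is organised in lemma blocks: the first row of a block introduces the
    # lemma, the following rows are its inflected forms, and a row whose ORTO field
    # is "---" closes the block.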
    for row in df.itertuples(index=False, name=None):
        if len(row) < 5:
            print(f"Skipping problematic line: {row}")
            continue
        orto, ppos, phon1, phon2, comm, *pronunciations = map(lambda x: x if isinstance(x, str) else "", row)
        if orto == '---':
            current_lemma = None
        elif current_lemma is None:
            current_lemma = orto.replace("ORTO:", "")
            lemmas[current_lemma] = []
            expanded_ppos = expand_ppos(ppos.replace("PPOS:", "") if ppos else "")
            for ep in expanded_ppos:
                lemma_data = {
                    'word': current_lemma,
                    'PPOS': ep,
                    'PHON1': phon1.replace("PHON:", "") if phon1 else "",
                    'PHON2': phon2.replace("PHON:", "") if phon2 else "",
                    'COMM': comm if comm else "",
                    'pronunciations': pronunciations
                }
                lemmas[current_lemma].append(lemma_data)
        else:
            expanded_ppos = expand_ppos(ppos.replace("PPOS:", "") if ppos else "")
            for ep in expanded_ppos:
                lemma_data = {
                    'word': orto.replace("ORTO:", "") if orto else "",
                    'PPOS': ep,
                    'PHON1': phon1.replace("PHON:", "") if phon1 else "",
                    'PHON2': phon2.replace("PHON:", "") if phon2 else "",
                    'COMM': comm if comm else "",
                    'pronunciations': pronunciations
                }
                lemmas[current_lemma].append(lemma_data)

    print("Loaded lemmas:", lemmas)  # Debugging output
    return lemmas

lemmas = load_data()

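# Render an HTML declension table for a noun lemma. The keys below follow the PPOS
# tags seen in the data, which appear to encode word class, gender, number and case
# (e.g. "ncmsn" = noun, common, masculine, singular, nominative), while the
# ==i.. / ==d.. suffix separates indefinite and definite forms.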
def create_noun_table(lemma, forms):
    table_data = {
        'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '',
        'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '',
        'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '',
        'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '',
        'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '',
        'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '',
        'ncmpd==iuu': '', 'ncmpd==duu': '',
        # Feminine/neuter plural dative keys, assumed by analogy with the plural
        # nominative/accusative keys above.
        'ncfnd==iuu': '', 'ncfnd==duu': '', 'ncnnd==iuu': '', 'ncnnd==duu': '',
        'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': ''
    }
    
    for form in forms:
        ppos = form['PPOS'].lower()  # Normalize to lowercase
        word = form['word']
        print(f"Processing: word={word}, ppos={ppos}, key={ppos}")
        if ppos in table_data:
            table_data[ppos] = word
        else:
            print(f"Unmatched key: {ppos} for word: {word} with PPOS: {ppos}")

    print(f"Final table data for {lemma}: {table_data}")  # Debugging output

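    # Column headers are Faroese: Eintal = singular, Fleirtal = plural,
    # Óbundið = indefinite, Bundið = definite.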
    table = f"""
    <table border="1">
        <thead>
            <tr>
                <th colspan="2">Eintal</th>
                <th colspan="2">Fleirtal</th>
            </tr>
            <tr>
                <th>Óbundið</th>
                <th>Bundið</th>
                <th>Óbundið</th>
                <th>Bundið</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']}</td>
                <td>{table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']}</td>
                <td>{table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
                <td>{table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']}</td>
                <td>{table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']}</td>
                <td>{table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']}</td>
                <td>{table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']}</td>
                <td>{table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']}</td>
                <td>{table_data['ncmpd==iuu'] or table_data['ncfnd==iuu'] or table_data['ncnnd==iuu']}</td>
                <td>{table_data['ncmpd==duu'] or table_data['ncfnd==duu'] or table_data['ncnnd==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']}</td>
                <td>{table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']}</td>
                <td>{table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']}</td>
                <td>{table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']}</td>
            </tr>
        </tbody>
    </table>
    """
    return table

def search_lemma(lemma):
    results = lemmas.get(lemma)
    if not results:
        return f"No results found for {lemma}"

    # Noun PPOS tags start with "n" (e.g. "ncmsn==iuu"); other word classes are not
    # rendered yet.
    if results[0]['PPOS'].lower().startswith('n'):
        table = create_noun_table(lemma, results)
    else:
        table = "Only noun tables are currently supported."

    return table

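# Minimal Gradio UI: a text box for the lemma and an HTML pane for the generated table.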
iface = gr.Interface(
    fn=search_lemma,
    inputs="text",
    outputs="html",
    title="Lemma Search",
    description="Enter a lemma to search for its declensions and pronunciations."
)

if __name__ == "__main__":
    iface.launch()