Spaces:

OttoYu
/

Cantonese-Phonetics

Runtime error

File size: 5,292 Bytes

30cac2e
 
 
c441b12
30cac2e
 
c441b12
30cac2e
 
 
 
c441b12
 
 
 
 
 
 
 
 
30cac2e
 
 
c441b12
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
c441b12
 
30cac2e
 
c441b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
c441b12
30cac2e
c441b12
 
30cac2e
c441b12
30cac2e
c441b12
 
30cac2e
 
c441b12
 
30cac2e
c441b12
 
 
30cac2e
c441b12
30cac2e
c441b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
 
c441b12
 
30cac2e
 
c441b12
 
30cac2e
c441b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
 
 
c441b12
30cac2e
 
 
 
c441b12
30cac2e
 
c441b12

import json
from functools import lru_cache
import gradio as gr
from difflib import SequenceMatcher

@lru_cache(maxsize=8)
def load_json_file(json_file):
    """Load and parse a UTF-8 encoded JSON file, memoising the result per path.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document. Cache hits hand back the very same object,
        so callers should treat the result as read-only.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # More than one data file is loaded through this function in this module;
    # a single-slot cache would evict one document whenever the other is
    # requested, so the cache keeps several paths at once.
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)

def preprocess_jyutping_data(jyutping_data):
    """Invert the syllable-keyed lexicon into a character -> syllable lookup.

    Args:
        jyutping_data: Mapping of syllable -> list of entries, where each
            entry's "漢字" value is iterated to obtain individual characters.

    Returns:
        A dict mapping each character to a syllable. When a character is
        listed more than once, the syllable encountered last wins.
    """
    char_lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for char in entry["漢字"]:
                # Plain assignment: later occurrences overwrite earlier ones.
                char_lookup[char] = syllable
    return char_lookup


def chinese_to_jyutping(text, char_to_jyutping):
    """Transliterate ``text`` one character at a time.

    Args:
        text: The string to convert.
        char_to_jyutping: Mapping of character -> syllable.

    Returns:
        A list with one item per character of ``text``: the mapped syllable,
        or the character itself when the mapping has no entry for it.
    """
    syllables = []
    for symbol in text:
        # Unknown characters pass through unchanged.
        syllables.append(char_to_jyutping.get(symbol, symbol))
    return syllables


def get_similar_initials():
    """Return the table of initials that are treated as similar-sounding.

    Returns:
        A newly built dict mapping an initial to the list of initials that
        count as similar to it. The relation is stored per key and is not
        necessarily symmetric.
    """
    pairs = (
        ('b', ('d', 'p')),
        ('c', ('s',)),
        ('d', ('b', 't')),
        ('f', ('h',)),
        ('g', ('gw',)),
        ('gw', ('g',)),
        ('h', ('f',)),
        ('j', ('z',)),
        ('jw', ('w',)),
        ('l', ('n',)),
        ('n', ('l',)),
        ('ng', ('n',)),
        ('p', ('b',)),
        ('s', ('c',)),
        ('t', ('d',)),
        ('w', ('jw',)),
        ('z', ('j',)),
    )
    # Fresh lists on every call, so callers may mutate the result freely.
    return {initial: list(alternatives) for initial, alternatives in pairs}

def get_lazy_pronunciations():
    """Return the table of "lazy pronunciation" initial substitutions.

    Returns:
        A newly built dict mapping an initial to the list of initials it may
        be replaced by.
    """
    return {
        # 'n' is listed once with both alternatives: repeating a key in a
        # dict literal silently keeps only the last value, which previously
        # dropped the 'n' -> 'l' substitution.
        'n': ['l', 'ng'], 'l': ['n'],
        'gw': ['g'], 'g': ['gw'],
        'k': ['t'], 't': ['k'],
        'ng': ['n'],
    }


def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Decide whether two syllables start with the same or a confusable initial.

    Only the initial is compared. The initial is the first two characters when
    that pair is a key of ``similar_initials``, otherwise the first character.

    Args:
        jyutping1: First syllable string.
        jyutping2: Second syllable string.
        similar_initials: Mapping of initial -> list of similar initials.
        lazy_pronunciations: Mapping of initial -> list of lazy substitutes.

    Returns:
        True when the initials are equal, or when the second initial is listed
        for the first one in either table; False otherwise, including when
        either syllable is empty.
    """
    # An empty syllable has no initial; indexing it would raise IndexError.
    if not jyutping1 or not jyutping2:
        return False

    initial1 = jyutping1[:2] if jyutping1[:2] in similar_initials else jyutping1[0]
    initial2 = jyutping2[:2] if jyutping2[:2] in similar_initials else jyutping2[0]

    # The lookup is directional: only the entries keyed by initial1 are consulted.
    return (initial1 == initial2 or
            initial2 in similar_initials.get(initial1, []) or
            initial2 in lazy_pronunciations.get(initial1, []))


@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Build the character -> syllable lookup from 'lexi-can_key.json'.

    The result is cached, so the lexicon file is read and inverted only on
    the first call.

    Returns:
        The dict produced by ``preprocess_jyutping_data`` for the lexicon.
    """
    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))


def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how phonetically close two syllable sequences are.

    Every syllable of ``user_jyutping`` is compared against every syllable of
    ``result_jyutping`` (not position by position); the number of similar
    pairs is divided by the length of the longer sequence. Because all pairs
    are counted, the score can exceed 1.

    Args:
        user_jyutping: Syllables derived from the user's input.
        result_jyutping: Syllables of the candidate being scored.
        similar_initials: Mapping of initial -> list of similar initials.
        lazy_pronunciations: Mapping of initial -> list of lazy substitutes.

    Returns:
        The similarity score as a float; 0.0 when both sequences are empty.
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare; avoids dividing by zero.
        return 0.0

    similar_count = sum(
        1 for uj in user_jyutping for rj in result_jyutping
        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
    )
    return similar_count / longest


def match_user_input(user_input):
    """Find the saved entry (or entries) that best match ``user_input``.

    The input is transliterated character by character. The first saved entry
    whose "jyutping" contains every input syllable is reported as an exact
    match. Otherwise every saved entry is scored and the three best are
    returned.

    Args:
        user_input: The text to look up.

    Returns:
        A dict with "input_text" and "input_jyutping", plus either
        "match" / "match_type" (exact hit) or "matches", a list of up to
        three dicts with "match", "score" and "match_type".
    """
    char_lookup = get_char_to_jyutping()
    initials_table = get_similar_initials()
    lazy_table = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')

    user_jyutping = chinese_to_jyutping(user_input, char_lookup)

    # Subset test: order and repetition of the syllables are ignored.
    wanted = set(user_jyutping)
    exact_match = None
    for candidate in saved_results:
        if wanted.issubset(candidate["jyutping"]):
            exact_match = candidate
            break

    if exact_match:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    def combined_score(candidate):
        # Weighted blend: 60% phonetic, 30% literal text, 10% length closeness.
        phonetic = calculate_phonetic_similarity(
            user_jyutping, candidate["jyutping"], initials_table, lazy_table)
        textual = SequenceMatcher(None, user_input, candidate["text"]).ratio()
        closeness = 1 / (1 + abs(len(user_input) - len(candidate["text"])))
        return (phonetic * 0.6) + (textual * 0.3) + (closeness * 0.1)

    scored = [(candidate, combined_score(candidate)) for candidate in saved_results]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    best = []
    for candidate, score in ranked[:3]:
        best.append({
            "match": candidate,
            "score": score,
            "match_type": "phonetic_similarity"
        })

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": best
    }


# Ready-made inputs offered in the "Choose a Sample Case" dropdown.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]


def gradio_app(custom_input, sample_case):
    """Gradio callback: look up the chosen text and return the result as JSON.

    A selected sample case takes precedence over the free-text input.

    Args:
        custom_input: Text typed by the user (may be empty or None).
        sample_case: Entry picked from the sample dropdown (may be None).

    Returns:
        A JSON-encoded string. With no input it is an object with a single
        "error" key; for an exact match it is the lookup result as-is; for
        phonetic matches each entry is flattened to "text", "jyutping",
        a score rounded to 4 decimals, and "match_type".
    """
    user_input = sample_case if sample_case else custom_input
    if not user_input:
        # The output component is gr.JSON, which expects string values to be
        # valid JSON; a bare sentence would fail to parse there, so the
        # message is wrapped in a JSON object.
        return json.dumps(
            {"error": "Please enter text or select a sample case."},
            ensure_ascii=False, indent=4)

    result = match_user_input(user_input)

    if "match" not in result:
        # Phonetic matches: flatten each entry and round the score for display.
        result = {
            "input_text": result["input_text"],
            "input_jyutping": result["input_jyutping"],
            "matches": [
                {
                    "text": match["match"]["text"],
                    "jyutping": match["match"]["jyutping"],
                    "score": round(match["score"], 4),
                    "match_type": match["match_type"]
                } for match in result["matches"]
            ]
        }
    return json.dumps(result, ensure_ascii=False, indent=4)


# Input widgets, listed in the positional order gradio_app receives them:
# free text first, then the optional sample selection.
text_input = gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name")
sample_input = gr.Dropdown(choices=[None, *sample_cases], label="Choose a Sample Case")

# gradio_app returns a JSON string, rendered by a JSON viewer.
result_output = gr.JSON(label="Matching Result")

interface = gr.Interface(
    fn=gradio_app,
    inputs=[text_input, sample_input],
    outputs=result_output,
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例，應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。",
)

# Start the web UI as soon as the module is executed.
interface.launch()