File size: 5,292 Bytes
30cac2e
 
 
c441b12
30cac2e
 
c441b12
30cac2e
 
 
 
c441b12
 
 
 
 
 
 
 
 
30cac2e
 
 
c441b12
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
c441b12
 
30cac2e
 
c441b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
c441b12
30cac2e
c441b12
 
30cac2e
c441b12
30cac2e
c441b12
 
30cac2e
 
c441b12
 
30cac2e
c441b12
 
 
30cac2e
c441b12
30cac2e
c441b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
 
c441b12
 
30cac2e
 
c441b12
 
30cac2e
c441b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cac2e
 
 
 
c441b12
30cac2e
 
 
 
c441b12
30cac2e
 
c441b12
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import json
from functools import lru_cache
import gradio as gr
from difflib import SequenceMatcher

@lru_cache(maxsize=1)
def load_json_file(json_file):
    """Read and parse a UTF-8 encoded JSON file.

    The parsed result is memoised by ``lru_cache``. Because ``maxsize`` is 1,
    only the most recently requested path stays cached; requesting a
    different path evicts the previous entry.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The deserialised JSON content.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def preprocess_jyutping_data(jyutping_data):
    """Invert a syllable-keyed table into a character -> syllable lookup.

    Args:
        jyutping_data: Mapping of syllable to a list of entries. Each entry's
            "漢字" value is iterated to obtain the individual characters.

    Returns:
        A dict mapping every character to a syllable. When a character is
        listed under several syllables, the last one encountered wins.
    """
    char_lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for hanzi in entry["漢字"]:
                char_lookup[hanzi] = syllable
    return char_lookup


def chinese_to_jyutping(text, char_to_jyutping):
    """Translate each character of *text* through the lookup table.

    Args:
        text: Iterable of characters (typically a string).
        char_to_jyutping: Mapping of character to its syllable.

    Returns:
        A list with one item per character: the mapped syllable, or the
        character itself when the table has no entry for it.
    """
    converted = []
    for symbol in text:
        converted.append(char_to_jyutping.get(symbol, symbol))
    return converted


def get_similar_initials():
    """Return the table of initials that are treated as similar.

    Each key is an initial and its value lists the other initials accepted
    as similar to it. The table is not fully symmetric (for example 'ng'
    lists 'n', but 'n' does not list 'ng').

    Returns:
        A freshly built dict of initial -> list of similar initials.
    """
    table = {
        'b': ['d', 'p'],
        'c': ['s'],
        'd': ['b', 't'],
        'f': ['h'],
        'g': ['gw'],
        'gw': ['g'],
        'h': ['f'],
        'j': ['z'],
        'jw': ['w'],
        'l': ['n'],
        'n': ['l'],
        'ng': ['n'],
        'p': ['b'],
        's': ['c'],
        't': ['d'],
        'w': ['jw'],
        'z': ['j'],
    }
    return table

def get_lazy_pronunciations():
    """Return the table of initial substitutions accepted as lazy pronunciations.

    Each key is an initial and its value lists the initials it may be
    swapped with. Every key appears exactly once: a dict literal keeps only
    the last value of a repeated key, so the two substitutions for 'n'
    ('l' and 'ng') are listed together in a single entry.

    Returns:
        A freshly built dict of initial -> list of substitute initials.
    """
    return {
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }


def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Tell whether two syllables start with the same or a similar initial.

    The initial of a syllable is its first two characters when that pair is
    a key of *similar_initials* (e.g. 'gw', 'ng'), otherwise its first
    character. Only the initials are compared; the rest of the syllable is
    ignored.

    Args:
        jyutping1: First syllable string.
        jyutping2: Second syllable string.
        similar_initials: Mapping of initial -> list of similar initials.
        lazy_pronunciations: Mapping of initial -> list of substitute initials.

    Returns:
        True when the initials are equal, or when the second initial is
        listed for the first one in either table. False otherwise, including
        when either syllable is empty.
    """
    # An empty syllable has no initial; indexing [0] would raise IndexError.
    if not jyutping1 or not jyutping2:
        return False

    initial1 = jyutping1[:2] if jyutping1[:2] in similar_initials else jyutping1[0]
    initial2 = jyutping2[:2] if jyutping2[:2] in similar_initials else jyutping2[0]

    # The lookup is directional: only the entries of initial1 are consulted.
    return (initial1 == initial2 or
            initial2 in similar_initials.get(initial1, []) or
            initial2 in lazy_pronunciations.get(initial1, []))


@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Build the character -> syllable lookup from 'lexi-can_key.json'.

    The file is loaded with ``load_json_file`` and inverted with
    ``preprocess_jyutping_data``. The result is memoised, so the lookup is
    built only on the first call.

    Returns:
        The dict produced by ``preprocess_jyutping_data``.
    """
    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))


def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how phonetically close two syllable sequences are.

    Every syllable of *user_jyutping* is compared against every syllable of
    *result_jyutping* with ``are_jyutping_similar``; the number of similar
    pairs is divided by the length of the longer sequence.

    NOTE: because all pairs are counted (not just aligned positions), the
    returned value is not capped at 1.

    Args:
        user_jyutping: Sequence of syllables derived from the user's input.
        result_jyutping: Sequence of syllables of a candidate result.
        similar_initials: Mapping passed through to ``are_jyutping_similar``.
        lazy_pronunciations: Mapping passed through to ``are_jyutping_similar``.

    Returns:
        The ratio described above as a float; 0.0 when both sequences are
        empty (instead of raising ZeroDivisionError).
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare on either side; avoid dividing by zero.
        return 0.0

    similar_count = sum(
        1 for uj in user_jyutping for rj in result_jyutping
        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
    )
    return similar_count / longest


def match_user_input(user_input):
    """Look up *user_input* among the saved results by its syllables.

    The input is converted to syllables character by character. The first
    saved result whose "jyutping" contains every input syllable (as a set,
    regardless of order) is returned as an "exact" match. Otherwise every
    saved result is scored and the three best are returned.

    The score is a weighted sum: 0.6 * phonetic similarity, 0.3 * text
    similarity (``SequenceMatcher`` ratio) and 0.1 * a length penalty of
    ``1 / (1 + length difference)``.

    Args:
        user_input: The text to match.

    Returns:
        A dict with "input_text" and "input_jyutping", plus either
        "match"/"match_type" for an exact hit or a "matches" list of up to
        three ``{"match", "score", "match_type"}`` dicts, best first.
    """
    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')

    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)

    # Exact pass: stop at the first candidate covering all input syllables.
    wanted_syllables = set(user_jyutping)
    for candidate in saved_results:
        if wanted_syllables.issubset(candidate["jyutping"]):
            return {
                "input_text": user_input,
                "input_jyutping": user_jyutping,
                "match": candidate,
                "match_type": "exact"
            }

    # Fuzzy pass: score every candidate.
    scored = []
    for candidate in saved_results:
        phonetic = calculate_phonetic_similarity(
            user_jyutping, candidate["jyutping"], similar_initials, lazy_pronunciations
        )
        candidate_text = candidate["text"]
        textual = SequenceMatcher(None, user_input, candidate_text).ratio()
        closeness = 1 / (1 + abs(len(user_input) - len(candidate_text)))
        scored.append((candidate, (phonetic * 0.6) + (textual * 0.3) + (closeness * 0.1)))

    # Stable descending sort keeps file order among equal scores.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": [
            {
                "match": candidate,
                "score": score,
                "match_type": "phonetic_similarity"
            } for candidate, score in ranked[:3]
        ]
    }


# Preset inputs offered to the user as selectable sample cases.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]


def gradio_app(custom_input, sample_case):
    """Gradio callback: match the chosen text and return the result as JSON text.

    A selected sample case takes precedence over the free-text input.

    Args:
        custom_input: Text typed by the user (may be empty or None).
        sample_case: Sample picked from the dropdown (may be empty or None).

    Returns:
        A JSON-formatted string. For an exact match it is the
        ``match_user_input`` result as-is; for fuzzy matches each entry is
        flattened to "text", "jyutping", "score" (rounded to 4 decimals) and
        "match_type". When no input is given, a JSON object with a single
        "error" key is returned.
    """
    user_input = sample_case if sample_case else custom_input
    if not user_input:
        # The output component is gr.JSON, which parses string values as
        # JSON; a bare sentence would fail to decode, so wrap the message.
        return json.dumps(
            {"error": "Please enter text or select a sample case."},
            ensure_ascii=False,
            indent=4,
        )

    result = match_user_input(user_input)

    if "match" in result:
        payload = result
    else:
        payload = {
            "input_text": result["input_text"],
            "input_jyutping": result["input_jyutping"],
            "matches": [
                {
                    "text": match["match"]["text"],
                    "jyutping": match["match"]["jyutping"],
                    "score": round(match["score"], 4),
                    "match_type": match["match_type"]
                } for match in result["matches"]
            ]
        }
    return json.dumps(payload, ensure_ascii=False, indent=4)


# Gradio UI wiring: the two inputs below are passed, in order, to
# gradio_app(custom_input, sample_case); its return value is shown in a
# JSON output component.
interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name"),
        # None is prepended to the choices so the dropdown has a "no sample"
        # option; gradio_app then falls back to the text box value.
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
)

# NOTE: launch() runs at module top level (there is no __main__ guard), so
# the app starts whenever this file is executed or imported.
interface.launch()