Spaces:
Runtime error
Runtime error
File size: 5,292 Bytes
30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 30cac2e c441b12 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import json
from functools import lru_cache
import gradio as gr
from difflib import SequenceMatcher
# The cache must hold more than one entry: this module loads two different
# JSON files through this function, and a single-slot cache lets each load
# evict the other.
@lru_cache(maxsize=8)
def load_json_file(json_file):
    """Load and parse a UTF-8 encoded JSON file, caching the result per path.

    Args:
        json_file: Path of the JSON file to read. It is used as the cache key,
            so it must be hashable (e.g. a ``str``).

    Returns:
        The parsed JSON document. The same object is returned on every cache
        hit, so callers must treat it as read-only.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
def preprocess_jyutping_data(jyutping_data):
    """Invert a syllable-keyed table into a character -> syllable lookup.

    Args:
        jyutping_data: Mapping whose keys are syllables and whose values are
            iterables of mappings, each carrying its characters under the
            ``"漢字"`` key.

    Returns:
        A dict mapping every character found under ``"漢字"`` to the syllable
        key it was listed under. When a character appears under several
        syllables, the one visited last wins.
    """
    lookup = {}
    for syllable, mappings in jyutping_data.items():
        for mapping in mappings:
            for char in mapping["漢字"]:
                # Plain assignment: later syllables overwrite earlier ones.
                lookup[char] = syllable
    return lookup
def chinese_to_jyutping(text, char_to_jyutping):
    """Translate ``text`` element by element through ``char_to_jyutping``.

    Args:
        text: Iterable of characters (typically a ``str``).
        char_to_jyutping: Mapping from a character to its replacement.

    Returns:
        A list with one entry per element of ``text``: the mapped value when
        the character is in the mapping, otherwise the character itself.
    """
    converted = []
    for char in text:
        # Characters missing from the mapping pass through unchanged.
        converted.append(char_to_jyutping.get(char, char))
    return converted
def get_similar_initials():
    """Return the table of initials treated as similar to one another.

    Returns:
        A freshly built dict mapping an initial to the list of initials
        considered similar to it. The relation is stored per direction; a
        key only matches the initials listed in its own value.
    """
    # Every key is a valid identifier, so the keyword form builds the same dict.
    return dict(
        b=['d', 'p'],
        c=['s'],
        d=['b', 't'],
        f=['h'],
        g=['gw'],
        gw=['g'],
        h=['f'],
        j=['z'],
        jw=['w'],
        l=['n'],
        n=['l'],
        ng=['n'],
        p=['b'],
        s=['c'],
        t=['d'],
        w=['jw'],
        z=['j'],
    )
def get_lazy_pronunciations():
    """Return the table of initials that may replace one another.

    Returns:
        A freshly built dict mapping an initial to the list of initials it
        may be pronounced as.
    """
    return {
        # 'n' has two alternatives. Writing them as two separate 'n' entries
        # in one dict literal would keep only the last one.
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }
def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Report whether two syllables have identical or interchangeable initials.

    Only the leading initial of each syllable is compared; the rest of the
    syllable is ignored.

    Args:
        jyutping1: First syllable string.
        jyutping2: Second syllable string.
        similar_initials: Mapping from an initial to the initials considered
            similar to it.
        lazy_pronunciations: Mapping from an initial to the initials it may
            be pronounced as.

    Returns:
        ``True`` when the initials are equal, or when the second initial is
        listed for the first one in either table. The lookup is directional:
        only the entry keyed by the first initial is consulted. ``False`` when
        either syllable is empty.
    """
    # An empty syllable has no initial to compare; indexing it would raise.
    if not jyutping1 or not jyutping2:
        return False

    def leading_initial(syllable):
        # Prefer a two-letter initial when either table knows it, so digraphs
        # listed only in ``lazy_pronunciations`` are recognised as well.
        digraph = syllable[:2]
        if digraph in similar_initials or digraph in lazy_pronunciations:
            return digraph
        return syllable[0]

    initial1 = leading_initial(jyutping1)
    initial2 = leading_initial(jyutping2)
    if initial1 == initial2:
        return True
    if initial2 in similar_initials.get(initial1, []):
        return True
    return initial2 in lazy_pronunciations.get(initial1, [])
@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Build the character -> syllable lookup from ``lexi-can_key.json``.

    The result is cached, so the file is read and inverted only on the first
    call.

    Returns:
        Whatever ``preprocess_jyutping_data`` returns for the loaded JSON
        document.
    """
    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))
def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how phonetically close two syllable sequences are.

    Args:
        user_jyutping: Sequence of syllables derived from the user's input.
        result_jyutping: Sequence of syllables of a candidate record.
        similar_initials: Table passed through to ``are_jyutping_similar``.
        lazy_pronunciations: Table passed through to ``are_jyutping_similar``.

    Returns:
        The number of similar (user syllable, result syllable) pairs divided
        by the length of the longer sequence, or ``0.0`` when both sequences
        are empty.
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    # NOTE(review): every user syllable is compared with every result
    # syllable (cross product, not positional), so the score can exceed 1.0
    # when syllables repeat. Kept as-is because the ranking depends on it.
    similar_count = sum(
        1
        for uj in user_jyutping
        for rj in result_jyutping
        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
    )
    return similar_count / longest
def match_user_input(user_input):
    """Find the saved record(s) that best match ``user_input``.

    The input is converted to syllables. The first record in
    ``jyutping_results_largec.json`` whose ``"jyutping"`` contains every one
    of those syllables (order is not considered) is returned as an exact
    match. Otherwise every record is scored and the three best are returned.

    Args:
        user_input: Text typed or selected by the user.

    Returns:
        For an exact match, a dict with ``"input_text"``, ``"input_jyutping"``,
        ``"match"`` and ``"match_type"`` (``"exact"``). Otherwise a dict with
        ``"input_text"``, ``"input_jyutping"`` and ``"matches"``, a list of up
        to three dicts with ``"match"``, ``"score"`` and ``"match_type"``
        (``"phonetic_similarity"``), best score first.
    """
    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')
    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)

    # Pass 1: the first record containing all of the user's syllables.
    wanted_syllables = set(user_jyutping)
    exact_match = None
    for candidate in saved_results:
        if wanted_syllables.issubset(candidate["jyutping"]):
            exact_match = candidate
            break

    if exact_match:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    def combined_score(candidate):
        # Weighted blend: 60% phonetic, 30% textual, 10% length closeness.
        phonetic = calculate_phonetic_similarity(
            user_jyutping, candidate["jyutping"], similar_initials, lazy_pronunciations
        )
        textual = SequenceMatcher(None, user_input, candidate["text"]).ratio()
        length_gap = abs(len(user_input) - len(candidate["text"]))
        closeness = 1 / (1 + length_gap)
        return (phonetic * 0.6) + (textual * 0.3) + (closeness * 0.1)

    # Pass 2: score every record and keep the three highest. The sort is
    # stable, so ties keep their order from the file.
    scored = [(candidate, combined_score(candidate)) for candidate in saved_results]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": [
            {
                "match": candidate,
                "score": score,
                "match_type": "phonetic_similarity"
            }
            for candidate, score in ranked[:3]
        ]
    }
# Example inputs offered to the user as ready-made choices.
# NOTE(review): these look like deliberately mis-written, homophone-style
# variants of real names -- confirm before "correcting" any of them.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]
def gradio_app(custom_input, sample_case):
    """Run a match for the UI and return the result as a JSON string.

    Args:
        custom_input: Free text typed by the user (may be empty or ``None``).
        sample_case: Selected sample, or a falsy value when none is chosen.
            A selected sample takes precedence over ``custom_input``.

    Returns:
        A JSON document serialised as a string (non-ASCII kept as-is,
        4-space indent):

        * ``{"error": ...}`` when neither input is provided;
        * the exact-match result unchanged when one was found;
        * otherwise the input plus a ``"matches"`` list with ``"text"``,
          ``"jyutping"``, ``"score"`` (rounded to 4 decimals) and
          ``"match_type"`` per candidate.
    """
    user_input = sample_case if sample_case else custom_input
    if not user_input:
        # The output is rendered by a JSON component, so the message must be
        # a valid JSON document; a bare sentence would fail to parse.
        return json.dumps(
            {"error": "Please enter text or select a sample case."},
            ensure_ascii=False,
            indent=4,
        )

    result = match_user_input(user_input)
    if "match" not in result:
        # Fuzzy result: flatten each candidate and round its score for display.
        result = {
            "input_text": result["input_text"],
            "input_jyutping": result["input_jyutping"],
            "matches": [
                {
                    "text": match["match"]["text"],
                    "jyutping": match["match"]["jyutping"],
                    "score": round(match["score"], 4),
                    "match_type": match["match_type"]
                }
                for match in result["matches"]
            ]
        }
    return json.dumps(result, ensure_ascii=False, indent=4)
# Web UI: a free-text box plus a dropdown of sample cases, both fed to
# ``gradio_app``; its JSON string result is rendered by a JSON component.
interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name"),
        # The leading ``None`` choice lets the user leave the sample unselected.
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
)
# Start the server when the module is executed.
interface.launch()