penut85420's picture
handle non-jp char
196fdb8
import subprocess as sp
import sys
import gradio as gr
import romkan
from fugashi import Tagger
# Build the shared tokenizer once at import time.
# If construction fails (presumably because the UniDic dictionary has not been
# downloaded yet -- TODO confirm against fugashi's error behavior), fetch the
# dictionary with the current interpreter and retry once.
try:
    tagger = Tagger()
except Exception:
    # `except Exception` rather than a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit are not mistaken for a missing
    # dictionary and do not kick off a download.
    sp.call([sys.executable, "-m", "unidic", "download"])
    tagger = Tagger()
# The two strings list the same kana in the same order, one per script, so the
# character at index i of ALL_KATA corresponds to index i of ALL_HIRA.
ALL_HIRA = "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ"
ALL_KATA = "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"

# Table for str.translate() mapping each katakana character to its hiragana
# counterpart; characters not listed above pass through unchanged.
kata2hira_table = str.maketrans(ALL_KATA, ALL_HIRA)


def kata2hira(token) -> str:
    """Return the hiragana reading of a tokenizer token.

    Args:
        token: An object exposing ``feature.kana`` (the reading) and
            ``surface`` (the original text).

    Returns:
        ``token.feature.kana`` with katakana translated to hiragana, or
        ``token.surface`` unchanged when the reading is missing or empty.
    """
    reading = token.feature.kana
    if reading:
        return reading.translate(kata2hira_table)
    # No reading available for this token: keep the text as written.
    return token.surface
def get_hira_roma(kanji: str):
    """Convert text to its hiragana reading and a romanized form.

    Args:
        kanji: The text to convert. ``None``, empty, or whitespace-only input
            is accepted and yields empty results.

    Returns:
        A 3-tuple ``(hira, roma, line)`` of strings:
        ``hira`` is the per-token hiragana readings concatenated;
        ``roma`` is the per-token romanizations joined by single spaces;
        ``line`` is ``kanji``, ``hira`` and ``roma`` joined by single spaces.
        For empty input all three are ``""``.
    """
    # Nothing to convert: return empty fields instead of a "line" made up of
    # nothing but separator spaces.
    if not kanji or not kanji.strip():
        return "", "", ""

    readings = [kata2hira(token) for token in tagger(kanji)]
    hira = "".join(readings)
    # Romanize token by token so word boundaries survive as spaces.
    roma = " ".join(romkan.to_roma(reading) for reading in readings)
    return hira, roma, f"{kanji} {hira} {roma}"
def main():
    """Build the single-column Gradio UI and start the web app.

    The page has one input textbox and three output textboxes; submitting the
    input runs ``get_hira_roma`` and fills the outputs with its three results.
    """
    soft_theme = gr.themes.Soft(font=gr.themes.GoogleFont("NotoSans CJK"))

    with gr.Blocks(theme=soft_theme) as app:
        with gr.Row():
            with gr.Column(scale=1):
                source_box = gr.Textbox(
                    label="Text",
                    placeholder="日本語",
                    show_copy_button=True,
                )
                hira_box = gr.Textbox(label="Hiragana", show_copy_button=True)
                roma_box = gr.Textbox(label="Roma", show_copy_button=True)
                copy_box = gr.Textbox(label="Copy", show_copy_button=True)

        # Pressing Enter in the input box triggers the conversion.
        source_box.submit(
            get_hira_roma,
            source_box,
            [hira_box, roma_box, copy_box],
            show_progress="hidden",
        )

    app.launch()
# Start the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()