File size: 1,756 Bytes
73b9806
 
 
 
ae0961d
73b9806
 
ae0961d
73b9806
 
 
 
 
 
 
 
b0ec70c
73b9806
ae0961d
 
 
 
 
 
 
 
 
73b9806
ae0961d
 
 
1a8a93d
 
ae0961d
eaa41b4
73b9806
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import gradio as gr
import os
from transformers import pipeline
import re
import matplotlib.pyplot as plt

def preprocess_text(text):
    """Return *text* with everything outside U+4E00-U+9FFF removed.

    Only characters in the range U+4E00 to U+9FFF are kept; punctuation,
    whitespace, Latin letters, digits and all other symbols are dropped.
    """
    outside_range = re.compile(r'[^\u4e00-\u9fff]')
    return outside_range.sub('', text)

# Copy the value of the `Century_Test` environment variable into `HF_TOKEN`.
# Raises KeyError at import time if `Century_Test` is not set.
# NOTE(review): presumably `Century_Test` is a deployment secret holding a
# Hugging Face access token that the model download should use - confirm.
os.environ['HF_TOKEN'] = os.environ['Century_Test']

# Build the text-classification pipeline once at module load; it is reused by
# every call to `predict_century`. This runs after HF_TOKEN has been set above.
nlp = pipeline('text-classification', model='bdsl/HanmunRoBERTa')

def predict_century(text):
    """Estimate the century of *text* and format the per-century confidences.

    The input is cleaned with ``preprocess_text`` and the cleaned string is
    passed to the module-level ``nlp`` pipeline with ``top_k=None`` so that
    scores for all labels are requested.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        A ``(processed_text, scores_text)`` tuple. ``processed_text`` is the
        cleaned input that was classified. ``scores_text`` holds one
        ``"<century>: <percent>%"`` line per century, or a short notice when
        nothing is left to classify after cleaning (the model is not run in
        that case).
    """
    # The regex in preprocess_text cannot handle None, so normalise it first.
    preprocessed_input = preprocess_text(text or "")

    # If cleaning removed everything there is nothing meaningful to classify;
    # skip the model call instead of reporting scores for an empty string.
    if not preprocessed_input:
        return preprocessed_input, "No Sinitic characters found in the input; nothing to classify."

    result = nlp(preprocessed_input, top_k=None)
    # Highest score first. This only influences the position of labels that
    # fall outside the pre-seeded 15-19 range, which are appended in this order.
    result.sort(key=lambda x: x['score'], reverse=True)

    # Pre-seed the 15th-19th centuries so they are always listed, in
    # chronological order, even if the pipeline omits one of them.
    scores = {f"{i}th century": 0 for i in range(15, 20)}

    for item in result:
        scores[f"{item['label']}th century"] = item['score']

    scores_text = "\n".join(
        f"{century}: {score*100:.2f}%" for century, score in scores.items()
    )

    return preprocessed_input, scores_text

# Web UI: a single text input wired to `predict_century`, whose two return
# values (cleaned text, formatted scores) fill the two output text boxes in
# the same order.
iface = gr.Interface(fn=predict_century,
                     inputs=gr.Textbox(label="Enter your text here:"),
                     outputs=[
                         gr.Textbox(label="Processed Text"),
                         gr.Textbox(label="Confidence Scores")
                     ],
                     description="This Gradio web app uses the March 2024 version of the HanmunRoBERTa model to estimate the century in which the provided text was written. HanmunRoBERTa is a transformer-based model trained exclusively on texts in literary Sinitic authored by Koreans before the 20th century. This version is an early prototype, optimized with data from the Veritable Records and the Diary of the Royal Secretariat. As such, it is prone to overfitting and requires further adjustments and refinement for improved performance. The app automatically removes all non-Sinitic characters and special symbols, including punctuation.")
# Start the app at module load (there is no `__main__` guard in this script).
iface.launch()