File size: 4,034 Bytes
6541f66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f34bfd
6541f66
71b6b00
6541f66
 
71b6b00
 
 
6541f66
 
 
 
 
71b6b00
f1c1edf
6541f66
 
 
 
 
 
 
 
 
 
a8f4b09
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import re
import os

from cleantext import clean
import gradio as gr
from tqdm.auto import tqdm
from transformers import pipeline


# Hugging Face model identifiers for the two stages of the demo.
checker_model_name = "textattack/roberta-base-CoLA"
corrector_model_name = "pszemraj/flan-t5-large-grammar-synthesis"

# Text-classification pipeline used to score the input before correcting it.
checker = pipeline("text-classification", checker_model_name)

# Text2text pipeline that rewrites the input. The ONNX Runtime variant from
# optimum is used unless HF_DEMO_NO_USE_ONNX is present in the environment.
if os.environ.get("HF_DEMO_NO_USE_ONNX") is not None:
    corrector = pipeline("text2text-generation", corrector_model_name)
else:
    # This import rebinds the module-level name `pipeline` to optimum's factory.
    from optimum.pipelines import pipeline

    corrector = pipeline(
        "text2text-generation",
        model=corrector_model_name,
        accelerator="ort",
    )


def split_text(text: str, batch_size: int = 2) -> list:
    """Split *text* into sentences and group them into consecutive batches.

    A sentence boundary is a run of one or more spaces that is preceded by
    ``.`` or ``?`` (where the character two positions before that
    punctuation mark is not an uppercase ``A``-``Z``) and followed by an
    uppercase ``A``-``Z`` letter. The separating spaces are dropped.

    Args:
        text: The text to split.
        batch_size: Maximum number of sentences per batch. Defaults to 2.

    Returns:
        A list of batches in original order, each batch being a list of
        sentence strings. Every batch holds ``batch_size`` sentences except
        possibly the last one. An empty *text* yields ``[[""]]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = re.split(r"(?<=[^A-Z].[.?]) +(?=[A-Z])", text)

    # Chunk by position rather than comparing each sentence with the final
    # one by value: a sentence that merely repeats the last sentence must
    # not close a batch early.
    return [
        sentences[start : start + batch_size]
        for start in range(0, len(sentences), batch_size)
    ]


def correct_text(text: str, checker, corrector, separator: str = " ") -> str:
    """Run *text* through the checker and, where needed, the corrector.

    The text is split into sentence batches with ``split_text``. Each batch
    is joined with single spaces and passed to ``checker``. A batch is kept
    verbatim only when the first checker result is labelled ``"LABEL_1"``
    with a score of at least 0.9 (presumably the "acceptable" class of the
    classifier -- confirm against the model card). Any other batch is
    replaced by the ``"generated_text"`` of the first ``corrector`` result.

    Args:
        text: The text to process.
        checker: Callable returning a sequence of dicts with ``"label"``
            and ``"score"`` keys for a string.
        corrector: Callable returning a sequence of dicts with a
            ``"generated_text"`` key for a string.
        separator: String placed between the processed batches.

    Returns:
        The processed batches joined with ``separator``.
    """
    batches = split_text(text)
    pieces = []

    for sentences in tqdm(batches, total=len(batches), desc="correcting text.."):
        chunk = " ".join(sentences)
        verdict = checker(chunk)[0]

        # Anything that is not a confident LABEL_1 goes to the corrector.
        needs_fix = verdict["label"] != "LABEL_1" or verdict["score"] < 0.9
        if not needs_fix:
            pieces.append(chunk)
            continue

        pieces.append(corrector(chunk)[0]["generated_text"])

    return separator.join(pieces)


def update(text: str):
    """Clean and correct *text* using the module-level pipelines.

    Only the first 4000 characters are processed. They are passed through
    ``clean`` with ``lower=False`` and then handed to ``correct_text``
    together with the global ``checker`` and ``corrector``.
    """
    truncated = text[:4000]
    cleaned = clean(truncated, lower=False)
    return correct_text(cleaned, checker, corrector)


# Gradio UI: an input textbox, a read-only output textbox, and a button that
# runs `update` on the input. The Korean strings below were stored as
# mis-decoded UTF-8 (mojibake) and have been restored to readable Korean.
with gr.Blocks() as demo:
    # Title: "Sturdy grammar checker ▶ uses the FLAN-T5 model"
    gr.Markdown("# <center>튼튼 문법 검사기 ▶ FLAN-T5 모델 사용</center>")
    # Usage: enter the text to fix (only 4000 characters are accepted),
    # then click the '처리' ("Process") button.
    # NOTE(review): the button below is still labelled "Process" in English
    # while this text refers to it as '처리' -- consider aligning the two.
    gr.Markdown(
        "**사용 방법**: 아래의 텍스트 상자에 수정하고자 하는 텍스트를 입력하세요 (텍스트는 4000자까지만 입력됩니다). **'처리'** 버튼을 클릭하여 실행하세요."
    )
    # Models used: one for judging grammar quality, one for correcting it.
    gr.Markdown(
        """사용 모델:
    - `textattack/roberta-base-CoLA` 그래머 퀄리티 감식에 사용
    - `pszemraj/flan-t5-large-grammar-synthesis` 문법 교정을 위해 사용
    """
    )
    with gr.Row():
        inp = gr.Textbox(
            label="input",
            # Placeholder: "Enter the text and have it corrected"
            placeholder="텍스트를 입력하고 교정해주세요",
            value="Put it text",
        )
        out = gr.Textbox(label="output", interactive=False)
    btn = gr.Button("Process")
    # Clicking the button sends the input box content through `update` and
    # shows the result in the output box.
    btn.click(fn=update, inputs=inp, outputs=out)
    gr.Markdown("---")
    gr.Markdown(
        "- see the [model card](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis) for more info"
    )
    gr.Markdown("- if experiencing long wait times, feel free to duplicate the space!")
demo.launch()
# Korean localization in progress