from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, EncoderDecoderModel, AutoTokenizer, AutoModelForSeq2SeqLM

import gradio as gr

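# Load the three pretrained Indonesian checkpoints once at startup.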
tokenizer_t5 = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
model_t5 = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")

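# The BERT2BERT encoder-decoder checkpoint defines no BOS/EOS tokens,
# so reuse BERT's CLS/SEP tokens for generation.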
tokenizer_bert = BertTokenizer.from_pretrained("cahya/bert2bert-indonesian-summarization")
tokenizer_bert.bos_token = tokenizer_bert.cls_token
tokenizer_bert.eos_token = tokenizer_bert.sep_token
model_bert = EncoderDecoderModel.from_pretrained("cahya/bert2bert-indonesian-summarization")

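# IndoT5 paraphraser used to rewrite the T5 summary.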
t5_para_tokenizer = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
t5_para_model = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base-paraphrase")


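# Summarize with the T5 model: beam search (2 beams), capped at 100 tokens.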
def summ_t5(text):
    input_ids = tokenizer_t5.encode(text, return_tensors='pt')
    summary_ids = model_t5.generate(input_ids,
                max_length=100,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
                no_repeat_ngram_size=2,
                use_cache=True)
    summary_text = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text
    
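# Summarize with the BERT2BERT model: beam search (10 beams), capped at 100 tokens.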
def summ_bert(text):
    # Truncate long inputs to BERT's 512-token limit so the encoder's position
    # embeddings are not exceeded.
    input_ids = tokenizer_bert.encode(text, return_tensors="pt", truncation=True, max_length=512)
    summary_ids= model_bert.generate(input_ids,
                max_length=100,
                num_beams=10,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
                no_repeat_ngram_size=2,
                use_cache=True)

    summary_text = tokenizer_bert.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text

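# Generate five paraphrase candidates with top-k/top-p sampling.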
def para_t5(text):
    encoding = t5_para_tokenizer(text, padding='longest', return_tensors='pt')
    outputs = t5_para_model.generate(
                input_ids=encoding["input_ids"],
                attention_mask=encoding["attention_mask"],
                max_length=100,
                do_sample=True,
                top_k=120,
                top_p=0.95,
                early_stopping=True,
                num_return_sequences=5)
    return [
        t5_para_tokenizer.decode(
            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        for output in outputs
    ]

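# Full pipeline: summarize with both models, then paraphrase the T5 summary.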
def summarize(text):
    t5_ = summ_t5(text)
    bert_ = summ_bert(text)
    # para_t5 returns a list of candidates; join them so they display in one Textbox.
    para_ = "\n".join(para_t5(t5_))
    return t5_, bert_, para_

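# Gradio Blocks UI: a single input textbox and one output box per model.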
if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown("""<h1 style="text-align:center">Summary of Summarizer - Indonesia</h1>""")

        gr.Markdown(
            """
            Creator: wiraindrak
            """
            )
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input Text")
                analyze_button = gr.Button("Analyze")
            with gr.Column():
                t5_output = gr.Textbox(label="T5 Base Output")
                bert_output = gr.Textbox(label="Bert2Bert Base Output")
                para_output = gr.Textbox(label="T5 Paraphrase Output")
        analyze_button.click(summarize, inputs=input_text, outputs=[t5_output, bert_output, para_output])
    demo.launch()