from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    BertTokenizer,
    EncoderDecoderModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)

import gradio as gr

# Indonesian T5 summarizer (cased base model).
tokenizer_t5 = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
model_t5 = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")

# Indonesian BERT2BERT summarizer; the encoder-decoder model uses BERT's
# [CLS]/[SEP] tokens as BOS/EOS during generation.
tokenizer_bert = BertTokenizer.from_pretrained("cahya/bert2bert-indonesian-summarization")
tokenizer_bert.bos_token = tokenizer_bert.cls_token
tokenizer_bert.eos_token = tokenizer_bert.sep_token
model_bert = EncoderDecoderModel.from_pretrained("cahya/bert2bert-indonesian-summarization")

# Indonesian T5 paraphraser, used to rephrase the T5 summary.
t5_para_tokenizer = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
t5_para_model = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
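
# Optional device placement (a sketch, not part of the original app): move the
# three models to a GPU when one is available. The encoded input tensors in the
# functions below would then also need a matching .to(device) call.
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# for m in (model_t5, model_bert, t5_para_model):
#     m.to(device)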


def summ_t5(text):
    # Beam-search summary from the T5 model, capped at 100 tokens.
    input_ids = tokenizer_t5.encode(text, return_tensors="pt")
    summary_ids = model_t5.generate(
        input_ids,
        max_length=100,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
        use_cache=True,
    )
    return tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
    
def summ_bert(text):
    # Sampled summary from the BERT2BERT model. Inputs are truncated to
    # BERT's 512-token limit so long articles do not overflow the encoder.
    encoding = tokenizer_bert(
        text, padding="longest", truncation=True, max_length=512, return_tensors="pt"
    )
    outputs = model_bert.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        max_length=512,
        do_sample=True,
        top_k=200,
        top_p=0.95,
        num_return_sequences=1,
    )
    return tokenizer_bert.decode(outputs[0], skip_special_tokens=True)

def para_t5(text):
    # Sampled paraphrase of the input, between 20 and 100 tokens.
    input_ids = t5_para_tokenizer.encode(text, return_tensors="pt")
    outputs = t5_para_model.generate(
        input_ids,
        min_length=20,
        max_length=100,
        num_beams=10,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
        use_cache=True,
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
    )
    # Join candidates so the result renders as plain text in a Textbox;
    # generate() returns one sequence here since num_return_sequences defaults to 1.
    return "\n".join(
        t5_para_tokenizer.decode(
            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        for output in outputs
    )

def summarize(text):
    # Run both summarizers, then paraphrase the T5 summary.
    t5_ = summ_t5(text)
    bert_ = summ_bert(text)
    para_ = para_t5(t5_)
    return t5_, bert_, para_
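
# Quick check without the UI (the input string here is a hypothetical example):
# t5_sum, bert_sum, para_sum = summarize(
#     "Pemerintah mengumumkan rencana pembangunan jalan tol baru ..."
# )
# print(t5_sum, bert_sum, para_sum, sep="\n")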

if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input Text")
                analyze_button = gr.Button("Analyze")
            with gr.Column():
                t5_output = gr.Textbox(label="T5 Base Output")
                bert_output = gr.Textbox(label="Bert Base Output")
                para_output = gr.Textbox(label="T5 Paraphrase Output")
        analyze_button.click(summarize, input_text, [t5_output, bert_output, para_output])
    demo.launch()