File size: 2,853 Bytes
fac3c34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101


# mT5 deployment: summarize text written in other languages into Indonesian (ID)

# library 
import functools

import gradio as gr
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Function run by the Gradio interface, plus its private helpers.
_MODEL_NAME = "csebuetnlp/mT5_m2m_crossSum_enhanced"
_TARGET_LANG = "indonesian"  # key into the checkpoint's "langid_map"
_MAX_TOKENS = 512  # used for both the encoder input and the generated output


@functools.lru_cache(maxsize=1)
def _load_summarizer(model_name):
    """Load the tokenizer and seq2seq model for *model_name* once per process.

    ``from_pretrained`` is expensive, so the result is memoized instead of
    being recomputed on every request.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def run_model(input_text,
              min_length) -> str:
    """Generate an Indonesian summary of *input_text* with the mT5 model.

    Args:
        input_text: Text to summarize. It is converted with ``str()`` and
            every run of whitespace is collapsed to a single space.
        min_length: Passed to ``model.generate`` as ``min_length`` after being
            converted with ``int()`` (UI sliders may deliver a float).

    Returns:
        The decoded generation with its first space-separated piece removed.
    """
    tokenizer, model = _load_summarizer(_MODEL_NAME)

    # The checkpoint config maps each language name to a pair whose second
    # item is the token used as the decoder start token for that language.
    lang_token = model.config.task_specific_params["langid_map"][_TARGET_LANG][1]
    decoder_start_id = tokenizer._convert_token_to_id(lang_token)

    # Normalize whitespace (newlines, tabs, repeated spaces) before encoding.
    cleaned_text = " ".join(str(input_text).split())

    # Encode the input, padded/truncated to exactly _MAX_TOKENS tokens.
    input_ids = tokenizer(
        cleaned_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=_MAX_TOKENS,
    )["input_ids"]

    # Beam-search generation; only the first returned sequence is kept.
    output_ids = model.generate(
        input_ids=input_ids,
        decoder_start_token_id=decoder_start_id,
        min_length=int(min_length),
        max_length=_MAX_TOKENS,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        temperature=0.5,
        early_stopping=True,
        num_beams=4,
    )[0]

    # Decode the generated ids back to text.
    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # NOTE(review): the first space-separated piece is discarded, exactly as
    # before — presumably a leftover language tag; confirm on real output.
    return " ".join(summary.split(" ")[1:])
                             
# end 

# Example inputs (currently disabled).
# contoh = [["TAMPAN"]]

# Page title.
title = "Text Summarization ID"

# Page description.
description = "Demo for Text Summarization ID. Models are mT5"

# Footer (currently disabled).
# article = "<p style='text-align: center'><a href='https://github.com/sultanbst123/Text_summarization-id2id' target='_blank'><u>Untuk penjelasan lihat di repo ku</u> 😁</a></p>"

# Build the Gradio interface around run_model and start serving it.
gr.Interface(
    fn=run_model,
    # Inputs, in the order of run_model's parameters.
    inputs=[
        gr.inputs.Textbox(
            lines=3,
            placeholder="Ketik disini...",
            label="Text",
        ),
        # This value is passed to run_model as ``min_length``, so the label
        # describes a minimum rather than a maximum.
        gr.inputs.Slider(
            minimum=100,
            maximum=400,
            step=10,
            default=150,
            label="Min Length (Minimum Summary Length)",
        ),
    ],
    # Single text output holding the generated summary.
    outputs=gr.outputs.Textbox(
        label="Output text",
    ),
    title=title,
    description=description,
    # article=article,
    # examples=contoh
).launch(debug=True)