File size: 3,904 Bytes
ee90915
 
71dcf62
9cd47c7
ee90915
2f9d0c2
 
0fd3cfb
4b9ceb2
 
 
b6adfce
 
0fd3cfb
ee90915
 
 
 
3995371
ee90915
 
 
 
 
 
 
 
 
 
 
 
 
b020e81
 
0fd3cfb
d8843f7
 
 
 
 
 
 
 
 
 
 
3995371
ee90915
 
 
74899d4
ee90915
 
 
 
 
d3f7143
ee90915
b980ccf
17aedb0
ee90915
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
960336c
 
0fd3cfb
 
1b9db30
ee90915
34b3fd1
 
 
455960a
34b3fd1
 
b309611
 
 
 
ee90915
 
 
455960a
ee90915
b309611
455960a
ee90915
 
17aedb0
2f9d0c2
 
 
 
 
 
1b9db30
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, pipeline
import nltk.data
import pandas as pd
import matplotlib.pyplot as plt

nltk.download('punkt')


import gradio as gr
from gradio.mix import Parallel

# --- Module-level model setup -------------------------------------------
# Runs at import time; downloads/caches the pretrained weights on first use.

# Indonesian T5 summarizer (cased), consumed by summ_t5().
tokenizer_t5 = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
model_t5 = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")

# Hub repo ids for the sentiment classifier and NER tagger pipelines below.
pretrained_sentiment = "w11wo/indonesian-roberta-base-sentiment-classifier"
pretrained_ner = "cahya/bert-base-indonesian-NER"

# Sentence splitter used by sentiment_df().
# NOTE(review): this loads the *English* punkt model but the input is
# Indonesian — punctuation-based splitting mostly works, but confirm an
# Indonesian model isn't available/preferable.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Sentiment pipeline; return_all_scores=True yields a score per label so
# sentiment_analysis() can build a {label: score} dict.
# NOTE(review): return_all_scores is deprecated in newer transformers in
# favor of top_k=None — keep in mind if the dependency is upgraded.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=pretrained_sentiment,
    tokenizer=pretrained_sentiment,
    return_all_scores=True
)

# NER pipeline; grouped_entities=True merges word-piece tokens into whole
# entity spans (exposed as 'entity_group' on each result).
ner_pipeline = pipeline(
    "ner",
    model=pretrained_ner,
    tokenizer=pretrained_ner,
    grouped_entities=True
)


def summ_t5(text):
    """Summarize Indonesian *text* with the pretrained T5 model.

    Encodes the text, runs beam-search generation (2 beams, capped at 100
    tokens, repetition penalties on), and decodes the best sequence back
    to a plain string with special tokens stripped.
    """
    encoded = tokenizer_t5.encode(text, return_tensors='pt')
    generated = model_t5.generate(
        encoded,
        max_length=100,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
        use_cache=True,
    )
    # generate() returns a batch; take the single (best) sequence.
    return tokenizer_t5.decode(generated[0], skip_special_tokens=True)
    
def sentiment_analysis(text):
    """Classify *text* and return a {label: score} mapping.

    The pipeline is configured with return_all_scores=True, so the first
    batch entry holds one {"label", "score"} dict per class.
    """
    per_label = sentiment_pipeline(text)[0]
    scores = {}
    for entry in per_label:
        scores[entry["label"]] = entry["score"]
    return scores

def ner(text):
    """Run named-entity recognition over *text*.

    Returns {"text": ..., "entities": [...]} in the shape expected by
    gr.HighlightedText; each entity dict gains an 'entity' key copied
    from the pipeline's grouped 'entity_group' field.
    """
    spans = ner_pipeline(text)
    for span in spans:
        span['entity'] = span['entity_group']
    return {"text": text, "entities": spans}
    
def sentiment_df(text):
    """Build a sentence-level sentiment table for *text*.

    Splits *text* into sentences, classifies each one, and returns a
    DataFrame with columns:
        Text  - the sentence,
        Label - the top-scoring sentiment label,
        Score - its score rounded to 3 decimal places.

    Fixes vs. original: the comprehension no longer shadows the `text`
    parameter, and the top label is found with max(..., key=...) instead
    of list(...).index(max(...)) — same first-max tie-breaking, one pass.
    """
    sentences = sentence_tokenizer.tokenize(text)
    labels = []
    scores = []
    for sentence in sentences:
        pred = sentiment_analysis(sentence)
        top_label = max(pred, key=pred.get)
        labels.append(top_label)
        scores.append(round(pred[top_label], 3))
    # Construct the frame directly rather than empty-then-assign.
    return pd.DataFrame({'Text': sentences, 'Label': labels, 'Score': scores})
    
def run(text):
    """Full analysis pipeline for one article.

    Returns, in order: summary text, sentiment of the summary, NER of the
    summary, a pie-chart Figure of per-sentence sentiment counts, NER of
    the full article, and the per-sentence sentiment DataFrame — matching
    the output components wired up in the Gradio UI.
    """
    summary = summ_t5(text)
    article_entities = ner(text)
    sentence_df = sentiment_df(text)

    # Draw the label-distribution pie on a fresh figure so Gradio gets a
    # dedicated Figure object.
    fig = plt.figure()
    sentence_df.groupby(["Label"])["Text"].count().plot.pie(autopct="%.1f%%", figsize=(6,6))

    return (
        summary,
        sentiment_analysis(summary),
        ner(summary),
        fig,
        article_entities,
        sentence_df,
    )

if __name__ == "__main__":
    # Gradio Blocks UI: statement order below defines the page layout,
    # so it must not be reordered.
    with gr.Blocks() as demo:
        gr.Markdown("""<h1 style="text-align:center">News Analyzer - Indonesia</h1>""")

        gr.Markdown(
            """
            Creator: Wira Indra Kusuma
            """
            )
        with gr.Row():
            # Left column: input box, trigger button, and summary-level outputs.
            with gr.Column():
                input_text = gr.Textbox(label="Input Text")
                # NOTE(review): gr.Button's text is normally its first
                # positional `value` argument; `label=` here may be ignored
                # depending on the Gradio version — confirm the button text
                # renders as "Analyze".
                analyze_button = gr.Button(label="Analyze")
                
                summ_output = gr.Textbox(label="Article Summary")
                ner_output = gr.HighlightedText(label="NER Summary")
                sent_output = gr.Label(label="Sentiment Summary")
                
            # Right column: whole-article visualizations.
            with gr.Column():
                plot_component = gr.Plot(label="Pie Chart of Sentiments of Article")
                ner_all_output = gr.HighlightedText(label="NER Article")
                
        # Full-width sentence-level sentiment table under both columns.
        # NOTE(review): max_rows=(20,'fixed') and overflow_row_behaviour are
        # legacy Gradio 3.x kwargs removed in 4.x — pins the dependency.
        dataframe_component = gr.DataFrame(type="pandas",
                                                label="Dataframe",
                                                max_rows=(20,'fixed'),
                                                overflow_row_behaviour='paginate',
                                                wrap=True)
                
        # Output order must match run()'s 6-tuple return order.
        analyze_button.click(run, inputs=input_text, outputs=[summ_output, sent_output, ner_output, plot_component, ner_all_output, dataframe_component ])
    demo.launch()