import streamlit as st
import numpy as np
import pandas as pd
import re
import time
import os

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from Scraper import Scrap

st.set_page_config(layout="wide")

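# Hugging Face Hub checkpoints: the fine-tuned hoax classifier, the base IndoBERT
# sentence encoder used for title embeddings, and the reference-article dataset.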
model_checkpoint = "Rifky/indobert-hoax-classification"
base_model_checkpoint = "indobenchmark/indobert-base-p1"
data_checkpoint = "Rifky/indonesian-hoax-news"
label = {0: "valid", 1: "fake"}

@st.cache(show_spinner=False, allow_output_mutation=True)
def load_model():
    # Load and cache the classifier, sentence encoder, tokenizer, and reference
    # data so they are only downloaded and initialised once per app run.
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    base_model = SentenceTransformer(base_model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    data = load_dataset(data_checkpoint, split="train")
    return model, base_model, tokenizer, data

def sigmoid(x):
    # Map a raw logit to a probability in (0, 1).
    return 1 / (1 + np.exp(-x))

input_column, reference_column = st.columns(2)
input_column.write('# Fake News Detection AI')

with st.spinner("Loading Model..."):
    model, base_model, tokenizer, data = load_model()

user_input = input_column.text_input("Article url")
submit = input_column.button("submit")


if submit:
    last_time = time.time()
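    # Scrape the article title and body text from the submitted URL.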
    with st.spinner("Reading Article..."):
        title, text = Scrap(user_input)

    if text:
        text = re.sub(r'\n', ' ', text)

        with st.spinner("Computing..."):
            # Split the article into 512-word chunks so each chunk fits the
            # model's 512-token input limit (longer chunks are truncated).
            token = text.split()
            text_len = len(token)

            sequences = []
            for i in range(text_len // 512):
                sequences.append(" ".join(token[i * 512: (i + 1) * 512]))
            if text_len % 512:  # skip the tail when the length is an exact multiple of 512
                sequences.append(" ".join(token[text_len - (text_len % 512):]))
            sequences = tokenizer(sequences, max_length=512, truncation=True, padding="max_length", return_tensors='pt')

            predictions = model(**sequences)[0].detach().numpy()
            # Average the per-chunk sigmoid scores to get one score per class.
            result = [
                np.sum([sigmoid(i[0]) for i in predictions]) / len(predictions),
                np.sum([sigmoid(i[1]) for i in predictions]) / len(predictions)
            ]

            print(f"\nresult: {result}")
            
            # Embed the scraped title and rank reference articles by cosine similarity.
            title_embeddings = base_model.encode(title)
            similarity_score = cosine_similarity(
                [title_embeddings],
                data["embeddings"]
            ).flatten()
            sorted_indices = np.argsort(similarity_score)[::-1].tolist()
            
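            # Report the verdict, confidence, and compute time in the left column.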
            input_column.markdown(f"<small>Compute Finished in {int(time.time() - last_time)} seconds</small>", unsafe_allow_html=True)
            prediction = np.argmax(result, axis=-1)
            input_column.success(f"This news is {label[prediction]}.")
            input_column.text(f"{int(result[prediction]*100)}% confidence")
            input_column.progress(result[prediction])
            
            # Show the five closest matches from the reference dataset.
            for i in sorted_indices[:5]:
                reference_column.write(f"""
                <small>turnbackhoax.id</small>
                <a href="{data["url"][i]}"><h5>{data["title"][i]}</h5></a>
                """, unsafe_allow_html=True)
                with reference_column.expander("read content"):
                    st.write(data["text"][i])