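"""Streamlit app for Indonesian fake news (hoax) detection.

Fetches an article from a user-supplied URL, classifies it with a fine-tuned
IndoBERT hoax classifier, and lists the most similar reference articles from
turnbackhoax.id.
"""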
import streamlit as st
import numpy as np
import pandas as pd
import re
import time
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from Scraper import Scrap
st.set_page_config(layout="wide")
model_checkpoint = "Rifky/indobert-hoax-classification"
base_model_checkpoint = "indobenchmark/indobert-base-p1"
data_checkpoint = "Rifky/indonesian-hoax-news"
label = {0: "valid", 1: "fake"}
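# Load the classifier, sentence encoder, tokenizer, and reference dataset once
# and cache them across Streamlit reruns.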
@st.cache(show_spinner=False, allow_output_mutation=True)
def load_model():
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    base_model = SentenceTransformer(base_model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, fast=True)
    data = load_dataset(data_checkpoint, split="train")
    return model, base_model, tokenizer, data
def sigmoid(x):
    # Convert a raw logit to a probability in [0, 1].
    return 1 / (1 + np.exp(-x))
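# Two-column layout: article input and verdict on the left, similar reference
# articles on the right.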
input_column, reference_column = st.columns(2)
input_column.write('# Fake News Detection AI')
with st.spinner("Loading Model..."):
model, base_model, tokenizer, data = load_model()
user_input = input_column.text_input("Article url")
submit = input_column.button("submit")
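# When the user submits a URL: scrape the article, classify it in 512-word
# chunks, then retrieve the most similar reference articles.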
if submit:
    last_time = time.time()

    with st.spinner("Reading Article..."):
        title, text = Scrap(user_input)

    if text:
        text = re.sub(r'\n', ' ', text)

        with st.spinner("Computing..."):
            # Split long articles into chunks of at most 512 words so that
            # each chunk fits within the model's maximum sequence length.
            token = text.split()
            text_len = len(token)

            sequences = []
            for i in range(text_len // 512):
                sequences.append(" ".join(token[i * 512: (i + 1) * 512]))
            if text_len % 512:
                sequences.append(" ".join(token[text_len - (text_len % 512): text_len]))

            sequences = tokenizer(sequences, max_length=512, truncation=True, padding="max_length", return_tensors='pt')
            predictions = model(**sequences)[0].detach().numpy()

            # Average the per-class scores over all chunks.
            result = [
                np.sum([sigmoid(i[0]) for i in predictions]) / len(predictions),
                np.sum([sigmoid(i[1]) for i in predictions]) / len(predictions)
            ]

            print(f'\nresult: {result}')

            # Rank reference articles by cosine similarity between the article
            # title embedding and the precomputed dataset embeddings.
            title_embeddings = base_model.encode(title)
            similarity_score = cosine_similarity(
                [title_embeddings],
                data["embeddings"]
            ).flatten()
            sorted_indices = np.argsort(similarity_score)[::-1].tolist()

            input_column.markdown(f"<small>Compute Finished in {int(time.time() - last_time)} seconds</small>", unsafe_allow_html=True)

            prediction = np.argmax(result, axis=-1)
            input_column.success(f"This news is {label[prediction]}.")
            input_column.text(f"{int(result[prediction] * 100)}% confidence")
            input_column.progress(result[prediction])

            # Show the five most similar reference articles from turnbackhoax.id.
            for i in sorted_indices[:5]:
                reference_column.write(f"""
                <small>turnbackhoax.id</small>
                <a href="{data["url"][i]}"><h5>{data["title"][i]}</h5></a>
                """, unsafe_allow_html=True)
                with reference_column.expander("read content"):
                    st.write(data["text"][i])