# Captured from a Hugging Face Spaces page (Space status at capture time: Sleeping).
import numpy as np | |
import torch | |
import requests | |
from bs4 import BeautifulSoup | |
# Select the compute device once at start-up: CUDA when torch reports it is
# available, otherwise the CPU. The choice is announced on stdout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
import numpy as np | |
import gradio as gr | |
from transformers import BertTokenizer, AutoTokenizer | |
from torch.utils.data import TensorDataset, random_split | |
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler | |
from transformers import BertForSequenceClassification, AdamW, BertConfig | |
import random | |
# Load the tokenizer and the sequence-classification weights from the
# 'armansakif/bengali-fake-news' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('armansakif/bengali-fake-news')
model = BertForSequenceClassification.from_pretrained(
    "armansakif/bengali-fake-news",  # Checkpoint identifier to load.
    num_labels=2,  # Two output classes (binary classification).
    output_attentions=False,  # Do not return attention weights.
    output_hidden_states=False,  # Do not return intermediate hidden states.
)
# Inference inputs are moved to `device`, so the weights must live on the same
# device; otherwise a GPU host fails with a device-mismatch error.
model.to(device)
def extract_mainarea_content(url):
    """Download *url* and return the text of its ``<div id="mainArea">``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text inside the ``mainArea`` div, or the string ``"ERROR"`` when
        the page contains no such element.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps an unresponsive server from blocking the app forever.
    response = requests.get(url, timeout=15)
    # Hand BeautifulSoup the raw bytes so it detects the page encoding itself;
    # `response.text` may fall back to a Latin-1 decode when the HTTP header
    # carries no charset, which garbles non-ASCII (e.g. Bengali) text.
    # "html.parser" is named explicitly so the result does not depend on
    # which optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")
    mainarea_div = soup.find("div", id="mainArea")
    if mainarea_div is None:
        return "ERROR"
    return mainarea_div.get_text()
def _is_article_link(text):
    """Return True if *text* is a single http(s) URL on a bangla.hindustantimes host."""
    from urllib.parse import urlparse

    # Article text that merely mentions the site contains whitespace; a link
    # is exactly one token.
    if len(text.split()) != 1:
        return False
    try:
        parsed = urlparse(text)
        host = parsed.hostname or ""
    except ValueError:
        return False
    return parsed.scheme in ("http", "https") and "bangla.hindustantimes" in host


def classify_news(news):
    """Classify a news text, or a Hindustan Times Bangla article link.

    Args:
        news: Either the news text itself, or a single http(s) URL on a
            ``bangla.hindustantimes`` host whose article body is downloaded
            with ``extract_mainarea_content`` and classified instead.

    Returns:
        A dict ``{'fake': p0, 'authentic': p1}`` holding the softmax
        probabilities of the model's two output classes as Python floats.

    Raises:
        ValueError: If a link was given but no article body could be
            extracted from the page.
    """
    candidate = news.strip()
    if _is_article_link(candidate):
        sent = extract_mainarea_content(candidate)
        if sent == "ERROR":
            # Do not classify the sentinel string as if it were an article.
            raise ValueError("Could not extract the article text from the given link.")
    else:
        sent = news

    encoded = tokenizer(
        sent,
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=512,                # Pad and truncate to 512 tokens.
        padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',           # Already batched: shape (1, 512).
    )

    # Run on whichever device the weights actually live on, so inputs and
    # model can never end up on different devices.
    target = next(model.parameters()).device
    model.eval()
    with torch.no_grad():
        outputs = model(
            encoded['input_ids'].to(target),
            token_type_ids=None,
            attention_mask=encoded['attention_mask'].to(target),
        )
    # No labels are passed, so no loss is computed and the first output is
    # the logits tensor.
    logits = outputs[0]

    # Normalise over the class dimension explicitly.
    probs = torch.nn.functional.softmax(logits, dim=1)[0].cpu().tolist()
    print(probs[0])
    print(probs[1])

    # Class index 1 means authentic; argmax resolves ties to index 0 (fake).
    predicted = int(torch.argmax(logits, dim=1)[0])
    result = 'Authentic News' if predicted else 'Fake News'
    print(result)

    labels = ['fake', 'authentic']
    return dict(zip(labels, probs))
# Assemble the web UI: one multi-line text box feeding classify_news, with the
# returned class probabilities shown in a label component.
news_box = gr.Textbox(lines=10, placeholder="News here or Hindustan Times Bangla Article Link")
verdict_label = gr.Label(num_top_classes=2)
demo = gr.Interface(fn=classify_news, inputs=news_box, outputs=verdict_label)

# Start the server with the same launch options as before.
demo.launch(inline=False, share=True)