Spaces:
Sleeping
Sleeping
File size: 4,160 Bytes
6ed3479 5649482 6ed3479 5649482 6ed3479 a0d97a6 5649482 6ed3479 5649482 668d34d 6ed3479 3803d08 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import numpy as np
import torch
import requests
from bs4 import BeautifulSoup
# Pick the compute device once at start-up: the first CUDA GPU when one is
# visible to PyTorch, otherwise the CPU. `device` is read later by the
# inference code when it moves tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
import numpy as np
import gradio as gr
from transformers import BertTokenizer, AutoTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
import random
# Hugging Face Hub repository that provides both the tokenizer files and the
# sequence-classification weights used by this app.
MODEL_NAME = 'armansakif/bengali-fake-news'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # Binary classification: two output logits.
    output_attentions=False,  # Do not return attention weights.
    output_hidden_states=False,  # Do not return per-layer hidden states.
)
# Keep the weights on the device selected at start-up so that inference
# inputs and the model share a device; leaving the model on the CPU while
# inputs go to the GPU fails with a device-mismatch error.
model.to(device)
def extract_mainarea_content(url):
    """Download *url* and return the text of its ``<div id="mainArea">``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text content of the ``mainArea`` div, or the sentinel string
        ``"ERROR"`` when the page contains no such element.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # host, which would block the calling UI handler forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html")
    mainarea_div = soup.find("div", id="mainArea")
    if mainarea_div is None:
        return "ERROR"
    return mainarea_div.get_text()
# Only links whose host is this domain (or a subdomain of it) are fetched.
_HT_BANGLA_HOST = "bangla.hindustantimes.com"


def _hindustantimes_article_url(text):
    """Return *text* as a fetchable URL if it is a Hindustan Times Bangla link, else None."""
    from urllib.parse import urlsplit  # Local import keeps this block self-contained.

    if not isinstance(text, str):
        return None
    candidate = text.strip()
    # Pasted article text contains whitespace; a bare link never does.
    if not candidate or any(ch.isspace() for ch in candidate):
        return None
    if "://" not in candidate:
        candidate = "https://" + candidate
    try:
        parts = urlsplit(candidate)
        host = (parts.hostname or "").lower()
    except ValueError:
        return None
    if parts.scheme not in ("http", "https"):
        return None
    # Compare the parsed hostname rather than searching the raw string, so
    # that e.g. "http://other.example/?bangla.hindustantimes" is not fetched.
    if host == _HT_BANGLA_HOST or host.endswith("." + _HT_BANGLA_HOST):
        return candidate
    return None


def classify_news(news):
    """Classify a piece of news as fake or authentic.

    Args:
        news: Either the news text itself, or a link to a Hindustan Times
            Bangla article whose main text is downloaded and classified.

    Returns:
        A dict ``{'fake': p0, 'authentic': p1}`` holding the softmax
        probabilities of the model's two output classes (index 0 and 1).

    Raises:
        ValueError: If a link was given but no article text could be
            extracted from the page.
    """
    article_url = _hindustantimes_article_url(news)
    if article_url is None:
        sent = news
    else:
        sent = extract_mainarea_content(article_url)
        # "ERROR" is the scraper's failure sentinel; classifying that word
        # would present a meaningless score as if it were a real result.
        if sent == "ERROR":
            raise ValueError("Could not extract article text from the given link.")

    encoded = tokenizer(
        sent,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
        max_length=512,
        padding="max_length",  # Replaces the deprecated pad_to_max_length=True.
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",  # Tensors of shape (1, 512).
    )

    # Follow the model's actual placement so inputs and weights always share
    # a device, wherever the model was loaded.
    model_device = next(model.parameters()).device
    input_ids = encoded["input_ids"].to(model_device)
    attention_mask = encoded["attention_mask"].to(model_device)

    model.eval()
    with torch.no_grad():
        # No labels are passed, so no loss is computed and the first output
        # element is the logits tensor.
        outputs = model(
            input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
        )
    logits = outputs[0]

    # Normalise over the class dimension explicitly; relying on the implicit
    # dim is deprecated.
    probs = torch.nn.functional.softmax(logits, dim=1)[0].cpu().tolist()
    print(probs[0])
    print(probs[1])

    result = 'Fake News'
    if logits.argmax(dim=1)[0].item():
        result = 'Authentic News'
    print(result)

    labels = ['fake', 'authentic']
    return {label: float(prob) for label, prob in zip(labels, probs)}
# Build the web UI: one multi-line text box in, one label widget out showing
# the two class probabilities returned by classify_news.
news_input = gr.Textbox(
    lines=10,
    placeholder="News here or Hindustan Times Bangla Article Link",
)
verdict_output = gr.Label(num_top_classes=2)

demo = gr.Interface(fn=classify_news, inputs=news_input, outputs=verdict_output)
demo.launch(inline=False, share=True)
|