Spaces:

jvedsaqib
/

Bengali_NEWS_DETECT

Sleeping

App Files Files Community

Bengali_NEWS_DETECT / app.py

jvedsaqib

Update app.py

a0d97a6 verified 11 months ago

raw

history blame contribute delete

4.16 kB

	import numpy as np
	import torch
	import requests
	from bs4 import BeautifulSoup

	# Select the compute device once at import time: CUDA when torch reports it
	# as available, otherwise the CPU. The choice is announced on stdout.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	if device.type == "cuda":
	    print('We will use the GPU:', torch.cuda.get_device_name(0))
	else:
	    print('No GPU available, using the CPU instead.')

	import numpy as np
	import gradio as gr
	from transformers import BertTokenizer, AutoTokenizer
	from torch.utils.data import TensorDataset, random_split
	from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
	from transformers import BertForSequenceClassification, AdamW, BertConfig
	import random
	# Load the tokenizer and the sequence-classification weights once at import
	# time so every request reuses them. Both come from the same checkpoint id.
	tokenizer = AutoTokenizer.from_pretrained('armansakif/bengali-fake-news')

	model = BertForSequenceClassification.from_pretrained(
	    "armansakif/bengali-fake-news",
	    num_labels=2,  # Two output labels: binary classification.
	    output_attentions=False,  # Do not return attention weights.
	    output_hidden_states=False,  # Do not return all hidden states.
	)
	# Keep the weights on the selected device. Input tensors that are moved to
	# `device` must meet a model living on the same device, otherwise the
	# forward pass fails with a device-mismatch error whenever CUDA is available.
	model.to(device)

	def extract_mainarea_content(url):
	    """Download a web page and return the text of its ``<div id="mainArea">``.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The text content of the ``mainArea`` div, or the sentinel string
	        ``"ERROR"`` when the server answers with an error status or the
	        page contains no such div.

	    Raises:
	        requests.RequestException: If the request itself fails (connection
	            problem, timeout, malformed URL).
	    """
	    # Without a timeout requests waits indefinitely, so one unresponsive
	    # server would block the caller forever.
	    response = requests.get(url, timeout=15)
	    if not response.ok:
	        # An HTTP error page is not an article; report it via the sentinel.
	        return "ERROR"
	    # Name the parser explicitly so the parse result does not depend on
	    # which optional parser packages happen to be installed.
	    soup = BeautifulSoup(response.text, "html.parser")
	    mainarea_div = soup.find("div", id="mainArea")
	    if mainarea_div is None:
	        return "ERROR"
	    return mainarea_div.get_text()

	def _is_article_link(text):
	    """Return True when *text* is a single http(s) URL on a bangla.hindustantimes host."""
	    from urllib.parse import urlparse

	    candidate = text.strip()
	    # Pasted article text that merely mentions the site is not a link.
	    if len(candidate.split()) != 1:
	        return False
	    try:
	        parsed = urlparse(candidate)
	        host = parsed.hostname or ""
	    except ValueError:
	        # urlparse rejects some malformed inputs (e.g. broken IPv6 brackets).
	        return False
	    # Match the marker against the host only, so a URL that just carries the
	    # marker in its path or query string is not fetched.
	    # NOTE(review): this is still a substring match on the host; pin the
	    # exact domain if the deployment needs a strict allow-list.
	    return parsed.scheme in ("http", "https") and "bangla.hindustantimes" in host


	def classify_news(news):
	    """Classify a news text, or a Hindustan Times Bangla article link, as fake or authentic.

	    Args:
	        news: Either the news text itself, or a single http(s) link to a
	            ``bangla.hindustantimes`` page. For a link, the article text is
	            downloaded with ``extract_mainarea_content`` and classified
	            instead of the link.

	    Returns:
	        A dict mapping ``'fake'`` and ``'authentic'`` to the softmax
	        probability of model output index 0 and index 1 respectively.

	    Raises:
	        ValueError: If the input is empty, or the linked page yielded no
	            article text.
	    """
	    if not news or not news.strip():
	        raise ValueError("Please enter a news text or an article link.")

	    if _is_article_link(news):
	        sent = extract_mainarea_content(news.strip())
	        if sent == "ERROR":
	            # Do not classify the sentinel string as if it were news.
	            raise ValueError("Could not extract the article text from the given link.")
	    else:
	        sent = news

	    encoded = tokenizer.encode_plus(
	        sent,                          # Text to encode.
	        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
	        max_length=512,                # Pad and truncate to this length.
	        padding='max_length',          # Replaces the deprecated pad_to_max_length.
	        truncation=True,
	        return_attention_mask=True,    # Mask out the padding positions.
	        return_tensors='pt',           # Return PyTorch tensors.
	    )

	    # Send the inputs to wherever the weights actually are, so the forward
	    # pass works whether or not the model was moved to a GPU.
	    target = next(model.parameters()).device
	    input_ids = encoded['input_ids'].to(target)
	    attention_mask = encoded['attention_mask'].to(target)

	    model.eval()
	    with torch.no_grad():
	        # No labels are passed: only the logits are needed, and without
	        # labels they are the first element of the model output.
	        outputs = model(
	            input_ids,
	            token_type_ids=None,
	            attention_mask=attention_mask,
	        )
	    logits = outputs[0]

	    # Probabilities over the two classes for the single input row.
	    probs = torch.nn.functional.softmax(logits, dim=1)[0].cpu().tolist()
	    print(probs[0])
	    print(probs[1])

	    predicted = int(torch.argmax(logits, dim=1)[0])
	    result = 'Authentic News' if predicted else 'Fake News'
	    print(result)

	    labels = ['fake', 'authentic']
	    return dict(zip(labels, probs))
	# Gradio front end: a ten-line textbox feeds classify_news, and the returned
	# label-to-probability dict is shown in a label widget limited to two classes.
	_news_input = gr.Textbox(lines=10, placeholder="News here or Hindustan Times Bangla Article Link")
	_verdict_output = gr.Label(num_top_classes=2)
	demo = gr.Interface(fn=classify_news, inputs=_news_input, outputs=_verdict_output)
	demo.launch(inline=False, share=True)