File size: 4,160 Bytes
6ed3479
 
5649482
 
6ed3479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5649482
 
 
 
 
 
 
 
 
6ed3479
 
 
 
a0d97a6
5649482
 
 
6ed3479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5649482
668d34d
6ed3479
3803d08
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import torch
import requests
from bs4 import BeautifulSoup

# Pick the compute device once at import time: the GPU when CUDA is usable,
# otherwise the CPU. The rest of the script reads the module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

import numpy as np
import gradio as gr
from transformers import BertTokenizer, AutoTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
import random
# The tokenizer and the sequence classifier are loaded from the same
# checkpoint identifier so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained('armansakif/bengali-fake-news')

model = BertForSequenceClassification.from_pretrained(
    "armansakif/bengali-fake-news", # Checkpoint identifier passed to from_pretrained.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Put the weights on the same device the inference code sends its input
# tensors to; leaving the model on the CPU while inputs go to the GPU raises
# a device-mismatch error on CUDA machines.
model.to(device)

def extract_mainarea_content(url, timeout=10):
    """Download a page and return the text of its ``<div id="mainArea">``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would hang the calling request handler.

    Returns:
        The text content of the ``mainArea`` div, or the literal string
        ``"ERROR"`` when the page has no such div.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # "html" names a markup type rather than one parser; BeautifulSoup picks
    # an installed HTML parser for it.
    soup = BeautifulSoup(response.text, "html")
    mainarea_div = soup.find("div", id="mainArea")
    if mainarea_div:
        return mainarea_div.get_text()
    return "ERROR"

def classify_news(news):
    """Score a piece of news text as fake or authentic.

    Args:
        news: Either the news text itself, or a string containing
            ``"bangla.hindustantimes"``, in which case it is treated as an
            article URL and the article body is downloaded first.

    Returns:
        A dict ``{'fake': p0, 'authentic': p1}`` holding the softmax
        probabilities of the two model outputs (index 0 and 1 respectively).
    """
    if "bangla.hindustantimes" in news:
        # NOTE(review): extract_mainarea_content returns the string "ERROR"
        # when the page has no mainArea div; that string is then classified
        # like any other text.
        sent = extract_mainarea_content(news)
    else:
        sent = news

    # Encode the single text, padded/truncated to 512 tokens.
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoded = tokenizer(
        sent,
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',           # Return pytorch tensors.
    )

    # Send the inputs to wherever the model's weights actually live, so the
    # forward pass cannot fail with a CPU/GPU device mismatch.
    target_device = model.device
    input_ids = encoded['input_ids'].to(target_device)
    attention_mask = encoded['attention_mask'].to(target_device)

    model.eval()
    with torch.no_grad():
        # No labels are passed, so no loss is computed and the first output
        # element is the logits tensor.
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs[0]

    # Explicit class dimension; calling softmax without `dim` is deprecated.
    probs = torch.nn.functional.softmax(logits, dim=1)[0].cpu().numpy()

    print(probs[0])
    print(probs[1])

    prediction = int(torch.argmax(logits, dim=1)[0])
    result = 'Authentic News' if prediction else 'Fake News'
    print(result)

    labels = ['fake', 'authentic']
    return {label: float(prob) for label, prob in zip(labels, probs)}
# Gradio UI: a multi-line text box feeds classify_news, whose returned
# label -> probability dict is rendered by a Label component.
demo = gr.Interface(
    classify_news,
    gr.Textbox(placeholder="News here or Hindustan Times Bangla Article Link", lines=10),
    gr.Label(num_top_classes=2),
)
demo.launch(share=True, inline=False)