eaglelandsonce committed on
Commit cd3cab0 · verified · 1 Parent(s): 63668c4

Create 21_NLP_Transformer.py

Files changed (1)
  1. pages/21_NLP_Transformer.py +89 -0
pages/21_NLP_Transformer.py ADDED
@@ -0,0 +1,89 @@
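+ # Streamlit page: fine-tune bert-base-uncased on IMDB movie reviews, chart the
+ # training loss, and expose an interactive sentiment predictor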
+ import torch
+ from torch.utils.data import DataLoader
+ from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
+ from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
+ from datasets import load_dataset
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+
+ # Load and preprocess the dataset
+ dataset = load_dataset("imdb")
+ train_dataset = dataset["train"]
+ test_dataset = dataset["test"]
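+ # (IMDB ships 25k labeled train reviews and 25k test reviews; 0 = negative, 1 = positive)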
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
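+ # Pad/truncate every review to BERT's 512-token maximum so batches have a uniform shape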
+ def preprocess_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
+
+ encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
+ encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)
+ # The model expects tensor inputs and a "labels" key, so drop the raw text
+ # column, rename "label", and have the datasets return PyTorch tensors
+ encoded_train_dataset = encoded_train_dataset.remove_columns(["text"]).rename_column("label", "labels")
+ encoded_test_dataset = encoded_test_dataset.remove_columns(["text"]).rename_column("label", "labels")
+ encoded_train_dataset.set_format("torch")
+ encoded_test_dataset.set_format("torch")
+ train_dataloader = DataLoader(encoded_train_dataset, shuffle=True, batch_size=8)
+ test_dataloader = DataLoader(encoded_test_dataset, batch_size=8)
+
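+ # Pretrained BERT with a fresh 2-label classification head, fine-tuned end to end
+ # under a linearly decaying learning rate with no warmup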
+ model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+ optimizer = AdamW(model.parameters(), lr=5e-5)
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dataloader)
+ lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
+
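+ # Prefer a GPU when available; 3 epochs of full BERT fine-tuning on CPU is impractically slow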
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ model.to(device)
+
+ # Training Loop with loss tracking
+ loss_values = []
+ progress_bar = tqdm(range(num_training_steps))
+
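+ # Each step: forward pass yields the loss, backward pass accumulates gradients,
+ # then the optimizer and LR scheduler advance and gradients are zeroed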
+ model.train()
+ for epoch in range(num_epochs):
+     for batch in train_dataloader:
+         batch = {k: v.to(device) for k, v in batch.items()}
+         outputs = model(**batch)
+         loss = outputs.loss
+         loss.backward()
+
+         optimizer.step()
+         lr_scheduler.step()
+         optimizer.zero_grad()
+         progress_bar.update(1)
+         loss_values.append(loss.item())
+
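+ # Accuracy = fraction of test examples whose argmax logit matches the gold label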
+ # Define evaluation function
+ def evaluate(model, dataloader):
+     model.eval()
+     correct = 0
+     total = 0
+     with torch.no_grad():
+         for batch in dataloader:
+             batch = {k: v.to(device) for k, v in batch.items()}
+             outputs = model(**batch)
+             predictions = outputs.logits.argmax(dim=-1)
+             correct += (predictions == batch["labels"]).sum().item()
+             total += batch["labels"].size(0)
+     return correct / total
+
+ # Evaluate the model on the test set
+ accuracy = evaluate(model, test_dataloader)
+
+ # Streamlit Interface
+ st.title("Sentiment Analysis with BERT")
+ st.write(f"Test Accuracy: {accuracy * 100:.2f}%")
+
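+ # One loss value is recorded per optimization step, so the x-axis counts steps, not epochs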
+ # Plot loss values
+ st.write("### Training Loss")
+ fig = plt.figure(figsize=(10, 6))
+ plt.plot(loss_values, label="Training Loss")
+ plt.xlabel("Training Steps")
+ plt.ylabel("Loss")
+ plt.legend()
+ st.pyplot(fig)
+
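+ # Single-example inference path, reusing the training-time tokenizer settings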
+ # Text input for prediction
+ st.write("### Predict Sentiment")
+ user_input = st.text_area("Enter text:", "I loved this movie!")
+ if user_input:
+     inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+     model.eval()
+     with torch.no_grad():
+         outputs = model(**inputs)
+     prediction = outputs.logits.argmax(dim=-1).item()
+     sentiment = "Positive" if prediction == 1 else "Negative"
+     st.write(f"Sentiment: **{sentiment}**")