eaglelandsonce committed on
Commit 53feba4 · verified · 1 Parent(s): da7889f

Update pages/21_NLP_Transformer.py

Files changed (1)
  1. pages/21_NLP_Transformer.py +56 -59
pages/21_NLP_Transformer.py CHANGED
@@ -1,89 +1,86 @@
 import torch
-from torch.utils.data import DataLoader
 from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
 from datasets import load_dataset
-from tqdm.auto import tqdm
+from torch.utils.data import DataLoader
 import streamlit as st
 import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
 
-# Load and preprocess the dataset
-dataset = load_dataset("imdb")
-train_dataset = dataset["train"]
-test_dataset = dataset["test"]
+# Load pre-trained model and tokenizer from Hugging Face
 tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-
-def preprocess_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-
-encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
-encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)
-train_dataloader = DataLoader(encoded_train_dataset, shuffle=True, batch_size=8)
-test_dataloader = DataLoader(encoded_test_dataset, batch_size=8)
-
 model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
-optimizer = AdamW(model.parameters(), lr=5e-5)
-num_epochs = 3
-num_training_steps = num_epochs * len(train_dataloader)
-lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
-
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 model.to(device)
 
-# Training Loop with loss tracking
-loss_values = []
+# Streamlit interface
+st.title("Sentiment Analysis with BERT")
 
-model.train()
-for epoch in range(num_epochs):
-    for batch in train_dataloader:
-        batch = {k: v.to(device) for k, v in batch.items()}
-        outputs = model(**batch)
-        loss = outputs.loss
-        loss.backward()
+# Training setup
+st.sidebar.title("Training Setup")
+num_epochs = st.sidebar.slider("Number of Epochs", 1, 5, 3)
+batch_size = st.sidebar.slider("Batch Size", 4, 32, 8)
+learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 5e-5, format="%.6f")
 
-        optimizer.step()
-        lr_scheduler.step()
-        optimizer.zero_grad()
-        loss_values.append(loss.item())
+# Load and preprocess dataset
+@st.cache(allow_output_mutation=True)
+def load_and_preprocess_data():
+    dataset = load_dataset("imdb", split="train[:1%]")
+    def preprocess_function(examples):
+        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+    encoded_dataset = dataset.map(preprocess_function, batched=True)
+    encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+    return DataLoader(encoded_dataset, shuffle=True, batch_size=batch_size)
 
-# Define evaluation function
-def evaluate(model, dataloader):
-    model.eval()
-    correct = 0
-    total = 0
-    with torch.no_grad():
-        for batch in dataloader:
+train_dataloader = load_and_preprocess_data()
+
+# Training loop
+if st.sidebar.button("Train"):
+    optimizer = AdamW(model.parameters(), lr=learning_rate)
+    num_training_steps = num_epochs * len(train_dataloader)
+    lr_scheduler = get_scheduler(
+        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+    )
+
+    progress_bar = tqdm(range(num_training_steps))
+    loss_values = []
+
+    model.train()
+    for epoch in range(num_epochs):
+        for batch in train_dataloader:
             batch = {k: v.to(device) for k, v in batch.items()}
             outputs = model(**batch)
-            predictions = outputs.logits.argmax(dim=-1)
-            correct += (predictions == batch["labels"]).sum().item()
-            total += batch["labels"].size(0)
-    return correct / total
+            loss = outputs.loss
+            loss.backward()
 
-# Evaluate the model on the test set
-accuracy = evaluate(model, test_dataloader)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+            progress_bar.update(1)
+            loss_values.append(loss.item())
 
-# Streamlit Interface
-st.title("Sentiment Analysis with BERT")
-st.write(f"Test Accuracy: {accuracy * 100:.2f}%")
+    st.sidebar.success("Training completed")
 
-# Plot loss values
-st.write("### Training Loss")
-plt.figure(figsize=(10, 6))
-plt.plot(loss_values, label="Training Loss")
-plt.xlabel("Training Steps")
-plt.ylabel("Loss")
-plt.legend()
-st.pyplot(plt)
+    # Plot loss values
+    st.write("### Training Loss")
+    plt.figure(figsize=(10, 6))
+    plt.plot(loss_values, label="Training Loss")
+    plt.xlabel("Training Steps")
+    plt.ylabel("Loss")
+    plt.legend()
+    st.pyplot(plt)
 
 # Text input for prediction
 st.write("### Predict Sentiment")
 user_input = st.text_area("Enter text:", "I loved this movie!")
+
 if user_input:
-    inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
+    inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
+
     model.eval()
     with torch.no_grad():
         outputs = model(**inputs)
     prediction = outputs.logits.argmax(dim=-1).item()
     sentiment = "Positive" if prediction == 1 else "Negative"
-    st.write(f"Sentiment: **{sentiment}**")
+
+    st.write(f"Sentiment: **{sentiment}**")
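A compatibility note, separate from this commit: two APIs the new revision keeps are deprecated in current library releases. AdamW imported from transformers has been superseded by torch.optim.AdamW, and st.cache(allow_output_mutation=True) is replaced by st.cache_resource as of Streamlit 1.18. A minimal sketch of the drop-in equivalents, assuming recent torch, transformers, and streamlit versions (load_model is an illustrative helper, not from this file):

import streamlit as st
from torch.optim import AdamW  # replaces `from transformers import AdamW`
from transformers import BertForSequenceClassification

@st.cache_resource  # replaces @st.cache(allow_output_mutation=True); caches one shared model object
def load_model():
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model = load_model()
optimizer = AdamW(model.parameters(), lr=5e-5)  # same call signature as the transformers version

The page itself can be previewed standalone with `streamlit run pages/21_NLP_Transformer.py`; the sidebar sliders and the Train button then drive the training loop above.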