import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from datasets import load_dataset
import streamlit as st
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Load the pre-trained model and tokenizer from Hugging Face once; st.cache_resource reuses them across Streamlit reruns, so fine-tuned weights are kept
@st.cache_resource
def load_model_and_tokenizer():
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    return tokenizer, model

tokenizer, model = load_model_and_tokenizer()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Streamlit interface
st.title("Sentiment Analysis with BERT")

# Training setup
st.sidebar.title("Training Setup")
num_epochs = st.sidebar.slider("Number of Epochs", 1, 5, 3)
batch_size = st.sidebar.slider("Batch Size", 4, 32, 8)
learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 5e-5, format="%.6f")

# Custom hash so Streamlit's cache can handle the tokenizer (its AddedToken fields are unhashable)
@st.cache_data(hash_funcs={tokenizer.__class__: id})
def load_and_preprocess_data(batch_size):
    # A small slice of IMDB keeps the demo training run short
    dataset = load_dataset("imdb", split="train[:1%]")
    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    encoded_dataset = dataset.map(preprocess_function, batched=True)
    # Rename 'label' to 'labels' first (the key the model expects), then expose torch tensors
    encoded_dataset = encoded_dataset.rename_column("label", "labels")
    encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return DataLoader(encoded_dataset, shuffle=True, batch_size=batch_size)

train_dataloader = load_and_preprocess_data(batch_size)

# Initialize training status
training_completed = st.sidebar.empty()

# Training loop
if st.sidebar.button("Train"):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_dataloader)
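    # Linear schedule: the learning rate decays from its initial value to zero, with no warmup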
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    progress_bar = tqdm(range(num_training_steps))
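    # Note: tqdm output appears in the terminal running Streamlit, not in the web page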
    loss_values = []

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
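            # Because the batch includes 'labels', the forward pass also returns the cross-entropy loss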
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            loss_values.append(loss.item())

    training_completed.success("Training completed")

    # Plot the per-step training loss
    st.write("### Training Loss")
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(loss_values, label="Training Loss")
    ax.set_xlabel("Training Steps")
    ax.set_ylabel("Loss")
    ax.legend()
    st.pyplot(fig)

# Text input for prediction
st.write("### Predict Sentiment")
user_input = st.text_area("Enter text:", "I loved this movie!")

if user_input:
    inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
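        # Argmax over the two logits; for IMDB, class 1 is positive and class 0 is negative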
        prediction = outputs.logits.argmax(dim=-1).item()
        sentiment = "Positive" if prediction == 1 else "Negative"
        
    st.write(f"Sentiment: **{sentiment}**")