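# Fine-tune BERT for binary sentiment classification inside a Streamlit app.
# To launch (the filename is illustrative): streamlit run sentiment_app.py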
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW  # transformers' bundled AdamW is deprecated; use PyTorch's
from datasets import load_dataset
from torch.utils.data import DataLoader
import streamlit as st
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Load pre-trained model and tokenizer from Hugging Face
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
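# Note: Streamlit re-runs this script top-to-bottom on every interaction, so the
# model is re-initialized from the pre-trained checkpoint each time; fine-tuned
# weights are lost on rerun unless saved (see the note after training below).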

# Streamlit interface
st.title("Sentiment Analysis with BERT")

# Training setup
st.sidebar.title("Training Setup")
num_epochs = st.sidebar.slider("Number of Epochs", 1, 5, 3)
batch_size = st.sidebar.slider("Batch Size", 4, 32, 8)
learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 5e-5, format="%.6f")
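# Learning rates around 2e-5 to 5e-5 are the usual starting range for fine-tuning BERT.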

# Load and preprocess dataset
@st.cache_resource
def load_and_preprocess_data(batch_size: int):
    # A 1% slice of the IMDB train split keeps the demo fast
    dataset = load_dataset("imdb", split="train[:1%]")

    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    encoded_dataset = dataset.map(preprocess_function, batched=True)
    # BertForSequenceClassification's forward() expects the target column to be
    # named "labels", but the IMDB dataset calls it "label"
    encoded_dataset = encoded_dataset.rename_column("label", "labels")
    encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return DataLoader(encoded_dataset, shuffle=True, batch_size=batch_size)

train_dataloader = load_and_preprocess_data(batch_size)

# Training loop
if st.sidebar.button("Train"):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_dataloader)
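    # A linear schedule decays the learning rate from its initial value to zero over
    # the full run; num_warmup_steps=0 means no warmup phase.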
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    # tqdm reports progress in the server console; use st.progress for an in-app bar
    progress_bar = tqdm(range(num_training_steps))
    loss_values = []

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            loss_values.append(loss.item())

    st.sidebar.success("Training completed")

    # Plot loss values
    st.write("### Training Loss")
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(loss_values, label="Training Loss")
    ax.set_xlabel("Training Steps")
    ax.set_ylabel("Loss")
    ax.legend()
    st.pyplot(fig)  # pass an explicit Figure; st.pyplot(plt) relies on deprecated global state
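
    # Optionally persist the fine-tuned weights so they survive Streamlit reruns
    # (the output path "bert-imdb-finetuned" is illustrative):
    # model.save_pretrained("bert-imdb-finetuned")
    # tokenizer.save_pretrained("bert-imdb-finetuned")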

# Text input for prediction
st.write("### Predict Sentiment")
user_input = st.text_area("Enter text:", "I loved this movie!")

if user_input:
    inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = outputs.logits.argmax(dim=-1).item()
        sentiment = "Positive" if prediction == 1 else "Negative"
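        # A softmax over the logits gives a rough confidence score, e.g.:
        # probs = outputs.logits.softmax(dim=-1)
        # st.write(f"Confidence: {probs.max().item():.2%}")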
        
    st.write(f"Sentiment: **{sentiment}**")