import json

import numpy as np
import pandas as pd
import streamlit as st
import torch
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, Trainer, TrainingArguments, LineByLineTextDataset


# --- Page header and user input ---------------------------------------------
# The static copy is kept in named constants so the rendering calls below read
# as a short list of what appears on the page, top to bottom.
_PAGE_TITLE = "### Here is a sentiment model trained on a slice of a twitter dataset"
_BANNER_HTML = "<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>"
_INPUT_PROMPT = "Try typing something here! \n You will see how much better our model is compared to the base model. No kidding"

st.markdown(_PAGE_TITLE)

# The banner is a raw <img> tag, so HTML rendering has to be enabled explicitly.
st.markdown(_BANNER_HTML, unsafe_allow_html=True)

# Whatever the visitor typed into the text area (an empty string until they
# enter something).
text = st.text_area(_INPUT_PROMPT)


# --- Fine-tuning (executed once per server process) --------------------------
# st.cache_resource only exists on newer Streamlit releases; fall back to the
# legacy st.cache API so the script still starts on older installs.
_cache_resource = getattr(st, "cache_resource", None)
if _cache_resource is None:
    _cache_resource = st.cache(allow_output_mutation=True)


@_cache_resource
def _train_sentiment_model():
    """Load the tweets, fine-tune the base checkpoint and return the artefacts.

    Streamlit re-executes the whole script on every widget interaction. As
    bare top-level statements, the dataset tokenization and the training run
    below would therefore be repeated each time the visitor submits new text.
    Wrapping them in a cached function makes them run once and reuses the
    resulting objects on later reruns.

    Returns:
        A tuple ``(data, tokenizer, dataset, model, trainer)``:
        the raw dataset, the tokenizer of the base checkpoint, the tokenized
        dataset with the target column renamed to ``labels``, the fine-tuned
        model, and the ``Trainer`` that produced it.
    """
    checkpoint = "siebert/sentiment-roberta-large-english"

    data = load_dataset("carblacac/twitter-sentiment-analysis")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # batched=True hands the tokenizer a list of tweets per call instead of a
    # single row; the produced columns are the same, it is just much faster.
    dataset = data.map(
        lambda xs: tokenizer(xs["text"], truncation=True, padding='max_length'),
        batched=True,
    )
    # The model's forward pass takes its targets as `labels`, hence the rename.
    dataset = dataset.rename_column("feeling", "labels")

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # Only a slice of the data is used: 10,000 shuffled training rows and the
    # first 5,000 test rows.
    # NOTE(review): 10,000 rows at a per-device batch size of 4 is at most
    # 2,500 steps for the single epoch, so save_steps=10_000 never writes a
    # checkpoint -- confirm that is intended.
    trainer = Trainer(
        model=model,
        train_dataset=dataset["train"].shuffle().select(range(10000)),
        eval_dataset=dataset['test'].select(range(5000)),
        args=TrainingArguments(
            output_dir="./my_saved_model",
            overwrite_output_dir=True,
            num_train_epochs=1,
            per_device_train_batch_size=4,
            save_steps=10_000,
            save_total_limit=2,
        ),
    )
    trainer.train()

    return data, tokenizer, dataset, model, trainer


# Keep the original module-level names bound so the rest of the script can use
# them exactly as before.
data, tokenizer, dataset, model, trainer = _train_sentiment_model()


# --- Inference on the visitor's text -----------------------------------------
# Nothing is predicted (or rendered) until the visitor has typed something.
if text and text.strip():
    # Training leaves the model in train mode; eval() turns dropout off so the
    # same text always yields the same scores.
    model.eval()

    # Tokenize the input as a batch of one and place the tensors on whichever
    # device the model currently lives on.
    encoded = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)

    # Gradients are not needed for a forward-only prediction.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Unnormalised per-class scores for the single input, as plain floats.
    raw_predictions = logits[0].tolist()

    st.markdown(f"{raw_predictions}")