In [1]:
from datasets import load_dataset

# Identifier of the dataset to fetch, kept in a named constant so it is easy to spot and change.
DATASET_ID = "imsoumyaneel/sentiment-analysis-llama2"
dataset = load_dataset(DATASET_ID)

KeyboardInterrupt: 

In [None]:
import pandas as pd
from pandas.core.frame import DataFrame as df

# Materialise the 'train' split as a pandas DataFrame so later cells can
# select columns by name.
train_split = dataset['train']
train_dataset = pd.DataFrame(train_split)

In [None]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
train_dataset

Unnamed: 0,sentence,label,text
0,I'll throw out the garbage .,neutral,###Human:\nyou are a sentiment analist. guess ...
1,"So Dick , how about getting some coffee for to...",joy,###Human:\nyou are a sentiment analist. guess ...
2,"Come on , you can at least try a little , besi...",neutral,###Human:\nyou are a sentiment analist. guess ...
3,What ’ s wrong with that ? Cigarette is the th...,anger,###Human:\nyou are a sentiment analist. guess ...
4,"Not for me , Dick .",neutral,###Human:\nyou are a sentiment analist. guess ...
...,...,...,...
598293,You got banned for participating in a brigade.,sadness,###Human:\nyou are a sentiment analist. guess ...
598294,"A joke is subjective pal, second of all you ne...",joy,###Human:\nyou are a sentiment analist. guess ...
598295,"Well, I'm glad you're out of all that now. How...",joy,###Human:\nyou are a sentiment analist. guess ...
598296,Everyone likes [NAME].,love,###Human:\nyou are a sentiment analist. guess ...


In [None]:
# imports for model creation
import tensorflow as tf
from keras import layers
from keras import losses
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenization of dataset
max_length = 200    # every encoded sentence is padded/truncated to this many tokens
embedding_dim = 50    # TODO: need to adjust accordingly

# Learn the word -> integer index vocabulary from every sentence in the frame.
sentences = train_dataset['sentence']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# One slot per known word plus one extra; the extra slot is presumably for
# index 0, which pad_sequences uses as its fill value (confirm against Keras docs).
vocab_size = 1 + len(tokenizer.word_index)

# Encode each sentence as a list of word indices, then pad at the end ('post')
# so that all rows share the same length.
X = pad_sequences(
  tokenizer.texts_to_sequences(sentences),
  maxlen=max_length,
  padding='post',
)

In [None]:
# Encode the lables
labels = train_dataset['label'].map({'neutral': '1', 'joy': '2', 'sadness': '3', 'anger': '4', 'fear': '5', 'love': '6', 'surprise': '7'}).astype('float32').values

In [None]:
# Build the model
model = keras.Sequential([
  keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(16, activation='relu'),
  keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# split the dataset into train and test
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.3    # share of the samples held out for testing
SPLIT_SEED = 42    # fixed seed so the shuffled split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
  X,
  labels,
  test_size=TEST_FRACTION,
  random_state=SPLIT_SEED,
  shuffle=True,
)

In [None]:
# train the model
# The held-out split is also passed as validation data for the training run.
holdout = (X_test, y_test)
model.fit(
  X_train,
  y_train,
  batch_size=32,
  epochs=100,
  validation_data=holdout,
)

In [None]:
# Evaluate the model
# The result is unpacked as the loss followed by the accuracy metric.
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores
# Last expression of the cell, so the accuracy is shown as the cell output.
accuracy

In [None]:
# save the model
import os

MODEL_PATH = "../models/sentimental-analysis-llama2.keras"

# Create the target directory before saving. The previous try/except fallback
# referenced `os` without importing it and only ran if the save raised exactly
# FileNotFoundError. exist_ok=True makes this a no-op on re-runs.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)