import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
# these imports bring in numpy and the Keras pieces needed to build the model:
# Sequential to stack layers, Embedding to embed tokens into a 16-dimensional vector, and LSTM and Dense for the layers themselves.

dataset = [
    ("I love this product! It's amazing.", "positive"),
    ("This is the worst experience I've ever had.", "negative"),
    ("It's okay, not great but not terrible.", "neutral"),
    ("Absolutely fantastic service!", "positive"),
    ("I'm extremely disappointed.", "negative"),
    ("The item arrived as expected.", "neutral"),
    ("Highly recommend this to everyone!", "positive"),
    ("The quality is awful, do not buy.", "negative")
]
# this is the dataset the model will train on; each statement is labelled positive, negative or neutral.

validation_data = [
    ("This product exceeded my expectations.", "positive"),
    ("I hate this item; it's a waste of money.", "negative"),
    ("It's neither good nor bad, just average.", "neutral"),
    ("What an incredible experience!", "positive"),
    ("Terrible quality and poor service.", "negative"),
    ("The delivery time was acceptable.", "neutral"),
    ("A great choice, I'm very satisfied!", "positive"),
    ("Not worth the price at all.", "negative")
]
# this is the validation data; it introduces new words that do not appear in the training dataset,
# so the model can be checked against examples that use more advanced words.

texts = [sample[0] for sample in dataset]
labels = [sample[1] for sample in dataset]
# These lines extract the texts from the dataset into one list and the labels into another list.

tokenizer = tf.keras.layers.TextVectorization(max_tokens=50)
# a tokenizer converts text into sequences of tokens or numeric indices that can be fed through a machine learning model.
# max_tokens limits the size of the vocabulary:
# only the 50 most common words are included; any other word is out-of-vocabulary (OOV) and is replaced with a special token.

tokenizer.adapt(texts)
# this line passes our list of strings to the tokenizer, which builds a vocabulary of the 50 most common words (since max_tokens is set to 50).

sequences = tokenizer(texts)
# the tokenizer converts the texts into tokenized sequences and assigns the result to the 'sequences' variable.

print(sequences)  # run this to see the sequences
# as you can see, it is a 2-D array of numbers: one row of token indices per sentence.

padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences)
# padding adds zeros by default so every sequence matches the length of the longest sequence in the input.
# it will also truncate a sequence if it is longer than the maximum desired length.
# when this is printed, all the rows of the numpy array have the same length.
print(padded_sequences)
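# To see what the padding step does on its own, here is a small standalone sketch.
# The toy sequences below are made-up values, purely for illustration:
toy = [[4, 7], [3, 9, 12, 5], [8]]
print(tf.keras.preprocessing.sequence.pad_sequences(toy))
# zeros are added at the front by default (padding='pre'), so every row ends up
# with the length of the longest toy sequence:
# [[ 0  0  4  7]
#  [ 3  9 12  5]
#  [ 0  0  0  8]]
print(tf.keras.preprocessing.sequence.pad_sequences(toy, maxlen=3))
# with maxlen=3 the longer sequence is truncated (also from the front by default):
# [[ 0  4  7]
#  [ 9 12  5]
#  [ 0  0  8]]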
model = Sequential([
    Embedding(50, 16),
    LSTM(16),
    Dense(1, activation='sigmoid')
])
# this builds a Sequential model: a model in which the output of one layer is the input to the next.
# The list inside Sequential defines the order of the layers.
# Embedding maps each word (represented as an integer index) to a dense vector of fixed size (an embedding).
# Its first parameter (50) is the size of the vocabulary (the number of unique tokens); this should match the vocabulary size in the tokenizer.
# Its second parameter (16) is the size of the embedding vector: each word is represented by a 16-dimensional vector.
# The second layer is a Long Short-Term Memory (LSTM) layer.
# LSTMs are a type of recurrent neural network that handle sequential data by remembering past information and learning dependencies over time.
# The parameter (16) is the number of hidden units (dimensions) in the LSTM cell; it controls the size of the LSTM's output vector.
# The LSTM processes the input sequence from the embedding layer and produces a single output vector representing the sequence's learned features.
# The last layer is a fully connected Dense layer: the first parameter creates a single output neuron for classification,
# and activation='sigmoid' applies the sigmoid function, which maps the result to a value between 0 and 1.
# An output below 0.5 is read as negative and above 0.5 as positive.

# MODEL ARCHITECTURE
# Input -> Embedding Layer -> LSTM Layer -> Dense (Sigmoid) Layer -> Output

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.compile configures the model for training.
# binary crossentropy is the loss function; it measures the difference between predicted probabilities and actual labels.
# the Adam optimizer adjusts the model's weights based on past gradients, helping to minimize the loss efficiently.
# accuracy is the metric used to evaluate performance during training, showing the percentage of correct predictions.

label_to_id = {'positive': 1, 'negative': 0, 'neutral': 0.5}
numeric_labels = np.array([label_to_id[label] for label in labels])
# The first line maps each text label to a number: positive is 1, negative is 0 and neutral is 0.5.
# For each label in the labels list, the second line retrieves the corresponding numeric value,
# and np.array converts that list of numbers into a numpy array.
# Positive --> 1
# Neutral  --> 0.5
# Negative --> 0

print(len(padded_sequences))
print(len(numeric_labels))

model.fit(padded_sequences, numeric_labels, epochs=100, verbose=1, validation_split=0.2)
# this line trains the model for 100 epochs; validation_split=0.2 holds out 20% of the training examples
# to monitor performance during training.
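# As a follow-up, here is one possible way to try the trained model on the held-out
# sentences in validation_data defined above. This is only an illustrative sketch:
# reading scores near 0.5 as 'neutral' is an assumption, not part of the original script.
val_texts = [sample[0] for sample in validation_data]
val_sequences = tf.keras.preprocessing.sequence.pad_sequences(tokenizer(val_texts))
predictions = model.predict(val_sequences)
for text, score in zip(val_texts, predictions[:, 0]):
    # scores near 1 lean positive, scores near 0 lean negative,
    # and values around 0.5 fall in the 'neutral' region.
    print(f"{score:.2f}  {text}")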