Dataset from Hugging Face

In [None]:
import pandas as pd 

# Neither CSV is read with a header row (header=None), so the column
# names are supplied explicitly and shared by both splits.
column_names = ["id", "place", "label", "text"]

# Train Dataset and Test Dataset, loaded in that order with identical
# parsing options.
train_df, test_df = (
    pd.read_csv(csv_path, names=column_names, header=None)
    for csv_path in ("twitter_training.csv", "twitter_validation.csv")
)

# Quick look at the first rows of the training split.
print(train_df.head())


     id        place     label  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Stopwords
# quiet=True keeps the downloader's progress log (which prints the local
# nltk_data directory) out of the saved cell output; the corpus is still
# fetched when it is missing.
nltk.download("stopwords", quiet=True)
# A set gives constant-time membership checks during token filtering.
stop_words = set(stopwords.words("english"))

# Clean Text
def preprocess_text(text, stopword_set=None):
    """Normalize one raw text value into a lowercase, stopword-free string.

    Steps: lowercase, replace every non-word character with a space,
    split on whitespace, drop stopwords, and re-join with single spaces.

    Parameters
    ----------
    text : object
        The raw value from the text column. Anything that is not a ``str``
        (NaN floats produced by empty CSV cells, ``None``, ``pd.NA``, ...)
        is treated as missing.
    stopword_set : collection of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words``
        is used, so ``Series.apply(preprocess_text)`` keeps working as is.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing / non-string input.
    """
    # Guard on "not a string" rather than "is a float": calling .lower() on
    # None or pd.NA would otherwise raise AttributeError.
    if not isinstance(text, str):
        return ""

    if stopword_set is None:
        stopword_set = stop_words

    # After \W -> " " the only separators left are spaces, and split()
    # already collapses runs and trims the ends, so no separate
    # whitespace-normalization pass is needed.
    tokens = re.sub(r"\W", " ", text.lower()).split()
    return " ".join(token for token in tokens if token not in stopword_set)

# Add a "clean_text" column to both splits by running every raw "text"
# value through preprocess_text. The generator is fully consumed before
# either column is assigned.
train_df["clean_text"], test_df["clean_text"] = (
    frame["text"].apply(preprocess_text) for frame in (train_df, test_df)
)

# Show raw vs. cleaned text side by side for the first training rows.
print("Sample cleaned text:")
display(train_df[["text", "clean_text"]].head())


[nltk_data] Downloading package stopwords to C:\Users\Regino Balogo
[nltk_data]     Jr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Sample cleaned text:


Unnamed: 0,text,clean_text
0,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,I am coming to the borders and I will kill you...,coming borders kill
2,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,im coming on borderlands and i will murder you...,im coming borderlands murder
4,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Labels: taken straight from the "label" column of each split.
y_train, y_test = train_df["label"], test_df["label"]

# TF-IDF features, limited to at most 5000 vocabulary entries.
vectorizer = TfidfVectorizer(max_features=5000)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is projected with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_df["clean_text"])
X_test = vectorizer.transform(test_df["clean_text"])

# Report the resulting matrix dimensions (rows, features).
print("TF-IDF vectorization complete! ✅")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


TF-IDF vectorization complete! ✅
Training data shape: (74682, 5000)
Testing data shape: (1000, 5000)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the TF-IDF training features
# (fit() returns the estimator itself, so the call can be chained).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out split and score them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(
    f"Model Accuracy: {accuracy:.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Model Accuracy: 0.8120

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.82      0.73      0.77       172
    Negative       0.78      0.89      0.83       266
     Neutral       0.85      0.76      0.80       285
    Positive       0.81      0.84      0.82       277

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000



In [13]:
import joblib

# Persist both fitted objects: the trained model first, then the TF-IDF
# vectorizer, each to its own file in the working directory.
for _artifact, _filename in (
    (model, "sentiment_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(_artifact, _filename)

print("Model and vectorizer saved successfully! ✅")


Model and vectorizer saved successfully! ✅
