{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset from hugging face" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id place label \\\n", "0 2401 Borderlands Positive \n", "1 2401 Borderlands Positive \n", "2 2401 Borderlands Positive \n", "3 2401 Borderlands Positive \n", "4 2401 Borderlands Positive \n", "\n", " text \n", "0 im getting on borderlands and i will murder yo... \n", "1 I am coming to the borders and I will kill you... \n", "2 im getting on borderlands and i will kill you ... \n", "3 im coming on borderlands and i will murder you... \n", "4 im getting on borderlands 2 and i will murder ... \n" ] } ], "source": [ "import pandas as pd \n", "\n", "column_names = ['id',\"place\",\"label\", \"text\"]\n", "#Train Dataset\n", "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n", "\n", "#Test Dataset\n", "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n", "\n", "\n", "print(train_df.head())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n", "[nltk_data] Jr\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Sample cleaned text:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textclean_text
0im getting on borderlands and i will murder yo...im getting borderlands murder
1I am coming to the borders and I will kill you...coming borders kill
2im getting on borderlands and i will kill you ...im getting borderlands kill
3im coming on borderlands and i will murder you...im coming borderlands murder
4im getting on borderlands 2 and i will murder ...im getting borderlands 2 murder
\n", "
" ], "text/plain": [ " text \\\n", "0 im getting on borderlands and i will murder yo... \n", "1 I am coming to the borders and I will kill you... \n", "2 im getting on borderlands and i will kill you ... \n", "3 im coming on borderlands and i will murder you... \n", "4 im getting on borderlands 2 and i will murder ... \n", "\n", " clean_text \n", "0 im getting borderlands murder \n", "1 coming borders kill \n", "2 im getting borderlands kill \n", "3 im coming borderlands murder \n", "4 im getting borderlands 2 murder " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import re\n", "import nltk\n", "from nltk.corpus import stopwords\n", "\n", "# Stopwords\n", "nltk.download(\"stopwords\")\n", "stop_words = set(stopwords.words(\"english\"))\n", "\n", "# Clean Text\n", "def preprocess_text(text):\n", " if isinstance(text, float): # Handle missing values\n", " return \"\"\n", " \n", " text = text.lower() # Convert to lowercase\n", " text = re.sub(r\"\\W\", \" \", text) # Remove special characters\n", " text = re.sub(r\"\\s+\", \" \", text).strip() # Remove extra spaces\n", " text = \" \".join([word for word in text.split() if word not in stop_words]) # Remove stopwords\n", " return text\n", "\n", "# Apply preprocessing to the text column\n", "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n", "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n", "\n", "print(\"Sample cleaned text:\")\n", "display(train_df[[\"text\", \"clean_text\"]].head())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TF-IDF vectorization complete! ✅\n", "Training data shape: (74682, 5000)\n", "Testing data shape: (1000, 5000)\n" ] } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# TF-IDF Vectorizer\n", "vectorizer = TfidfVectorizer(max_features=5000)\n", "\n", "# Fit and transform training data, then transform test data\n", "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n", "X_test = vectorizer.transform(test_df[\"clean_text\"])\n", "\n", "# Extract labels\n", "y_train = train_df[\"label\"]\n", "y_test = test_df[\"label\"]\n", "\n", "print(\"TF-IDF vectorization complete! 
✅\")\n", "print(f\"Training data shape: {X_train.shape}\")\n", "print(f\"Testing data shape: {X_test.shape}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Accuracy: 0.8120\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " Irrelevant 0.82 0.73 0.77 172\n", " Negative 0.78 0.89 0.83 266\n", " Neutral 0.85 0.76 0.80 285\n", " Positive 0.81 0.84 0.82 277\n", "\n", " accuracy 0.81 1000\n", " macro avg 0.81 0.81 0.81 1000\n", "weighted avg 0.81 0.81 0.81 1000\n", "\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Train the model\n", "model = LogisticRegression(max_iter=1000)\n", "model.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred = model.predict(X_test)\n", "\n", "# Evaluate the model\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"Model Accuracy: {accuracy:.4f}\")\n", "\n", "# Display classification report\n", "print(\"\\nClassification Report:\")\n", "print(classification_report(y_test, y_pred))\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model and vectorizer saved successfully! ✅\n" ] } ], "source": [ "import joblib\n", "\n", "# Save the trained model\n", "joblib.dump(model, \"sentiment_model.pkl\")\n", "\n", "# Save the TF-IDF vectorizer\n", "joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n", "\n", "print(\"Model and vectorizer saved successfully! ✅\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 2 }