{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset from hugging face" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id place label \\\n", "0 2401 Borderlands Positive \n", "1 2401 Borderlands Positive \n", "2 2401 Borderlands Positive \n", "3 2401 Borderlands Positive \n", "4 2401 Borderlands Positive \n", "\n", " text \n", "0 im getting on borderlands and i will murder yo... \n", "1 I am coming to the borders and I will kill you... \n", "2 im getting on borderlands and i will kill you ... \n", "3 im coming on borderlands and i will murder you... \n", "4 im getting on borderlands 2 and i will murder ... \n" ] } ], "source": [ "import pandas as pd \n", "\n", "column_names = ['id',\"place\",\"label\", \"text\"]\n", "#Train Dataset\n", "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n", "\n", "#Test Dataset\n", "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n", "\n", "\n", "print(train_df.head())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n", "[nltk_data] Jr\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Sample cleaned text:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textclean_text
0im getting on borderlands and i will murder yo...im getting borderlands murder
1I am coming to the borders and I will kill you...coming borders kill
2im getting on borderlands and i will kill you ...im getting borderlands kill
3im coming on borderlands and i will murder you...im coming borderlands murder
4im getting on borderlands 2 and i will murder ...im getting borderlands 2 murder
\n", "
" ], "text/plain": [ " text \\\n", "0 im getting on borderlands and i will murder yo... \n", "1 I am coming to the borders and I will kill you... \n", "2 im getting on borderlands and i will kill you ... \n", "3 im coming on borderlands and i will murder you... \n", "4 im getting on borderlands 2 and i will murder ... \n", "\n", " clean_text \n", "0 im getting borderlands murder \n", "1 coming borders kill \n", "2 im getting borderlands kill \n", "3 im coming borderlands murder \n", "4 im getting borderlands 2 murder " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import re\n", "import nltk\n", "from nltk.corpus import stopwords\n", "\n", "# Stopwords\n", "nltk.download(\"stopwords\")\n", "stop_words = set(stopwords.words(\"english\"))\n", "\n", "# Clean Text\n", "def preprocess_text(text):\n", " if isinstance(text, float): # Handle missing values\n", " return \"\"\n", " \n", " text = text.lower() # Convert to lowercase\n", " text = re.sub(r\"\\W\", \" \", text) # Remove special characters\n", " text = re.sub(r\"\\s+\", \" \", text).strip() # Remove extra spaces\n", " text = \" \".join([word for word in text.split() if word not in stop_words]) # Remove stopwords\n", " return text\n", "\n", "# Apply preprocessing to the text column\n", "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n", "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n", "\n", "print(\"Sample cleaned text:\")\n", "display(train_df[[\"text\", \"clean_text\"]].head())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TF-IDF vectorization complete! ✅\n", "Training data shape: (74682, 5000)\n", "Testing data shape: (1000, 5000)\n" ] } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# TF-IDF Vectorizer\n", "vectorizer = TfidfVectorizer(max_features=5000)\n", "\n", "# Fit and transform training data, then transform test data\n", "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n", "X_test = vectorizer.transform(test_df[\"clean_text\"])\n", "\n", "# Extract labels\n", "y_train = train_df[\"label\"]\n", "y_test = test_df[\"label\"]\n", "\n", "print(\"TF-IDF vectorization complete! 
✅\")\n", "print(f\"Training data shape: {X_train.shape}\")\n", "print(f\"Testing data shape: {X_test.shape}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Accuracy: 0.8120\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " Irrelevant 0.82 0.73 0.77 172\n", " Negative 0.78 0.89 0.83 266\n", " Neutral 0.85 0.76 0.80 285\n", " Positive 0.81 0.84 0.82 277\n", "\n", " accuracy 0.81 1000\n", " macro avg 0.81 0.81 0.81 1000\n", "weighted avg 0.81 0.81 0.81 1000\n", "\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Train the model\n", "model = LogisticRegression(max_iter=1000)\n", "model.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred = model.predict(X_test)\n", "\n", "# Evaluate the model\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"Model Accuracy: {accuracy:.4f}\")\n", "\n", "# Display classification report\n", "print(\"\\nClassification Report:\")\n", "print(classification_report(y_test, y_pred))\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model and vectorizer saved successfully! ✅\n" ] } ], "source": [ "import joblib\n", "\n", "# Save the trained model\n", "joblib.dump(model, \"sentiment_model.pkl\")\n", "\n", "# Save the TF-IDF vectorizer\n", "joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n", "\n", "print(\"Model and vectorizer saved successfully! ✅\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 2 }