Spaces:

Rejeno
/

SentimentAnalysis

Running

App Files Community

Regino committed on Feb 13

Commit

045e883

1 Parent(s): 3f37371

jlsdndfnds

Browse files

Files changed (1) hide show

Train Model.ipynb +17 -20

Train Model.ipynb CHANGED Viewed

@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -33,24 +33,22 @@
     }
    ],
    "source": [
-    "import pandas as pd  \n",
     "\n",
-    "# Define column names manually\n",
-    "column_names = ['id',\"place\",\"label\", \"text\"]  # Change this based on your dataset\n",
-    "\n",
-    "# Load training dataset\n",
     "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
     "\n",
-    "# Load test dataset\n",
     "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
     "\n",
-    "# Display first few rows\n",
     "print(train_df.head())\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -149,11 +147,11 @@
     "import nltk\n",
     "from nltk.corpus import stopwords\n",
     "\n",
-    "# Download stopwords if not already downloaded\n",
     "nltk.download(\"stopwords\")\n",
     "stop_words = set(stopwords.words(\"english\"))\n",
     "\n",
-    "# Function to clean text\n",
     "def preprocess_text(text):\n",
     "    if isinstance(text, float):  # Handle missing values\n",
     "        return \"\"\n",
@@ -168,14 +166,13 @@
     "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
     "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
     "\n",
-    "# Display a sample of the cleaned text\n",
     "print(\"Sample cleaned text:\")\n",
     "display(train_df[[\"text\", \"clean_text\"]].head())\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -191,14 +188,14 @@
    "source": [
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "\n",
-    "# Initialize TF-IDF Vectorizer\n",
-    "vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 most important words\n",
     "\n",
     "# Fit and transform training data, then transform test data\n",
     "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
     "X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
     "\n",
-    "# Extract labels (assuming the sentiment column is named \"label\")\n",
     "y_train = train_df[\"label\"]\n",
     "y_test = test_df[\"label\"]\n",
     "\n",
@@ -209,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -237,11 +234,11 @@
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.metrics import accuracy_score, classification_report\n",
     "\n",
-    "# Initialize and train the model\n",
-    "model = LogisticRegression(max_iter=1000)  # Increase iterations to ensure convergence\n",
     "model.fit(X_train, y_train)\n",
     "\n",
-    "# Make predictions on the test set\n",
     "y_pred = model.predict(X_test)\n",
     "\n",
     "# Evaluate the model\n",

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
     }
    ],
    "source": [
+    "import pandas as pd \n",
     "\n",
+    "column_names = ['id',\"place\",\"label\", \"text\"]\n",
+    "#Train Dataset\n",
     "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
     "\n",
+    "#Test Dataset\n",
     "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
     "\n",
+    "\n",
     "print(train_df.head())\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
     "import nltk\n",
     "from nltk.corpus import stopwords\n",
     "\n",
+    "# Stopwords\n",
     "nltk.download(\"stopwords\")\n",
     "stop_words = set(stopwords.words(\"english\"))\n",
     "\n",
+    "# Clean Text\n",
     "def preprocess_text(text):\n",
     "    if isinstance(text, float):  # Handle missing values\n",
     "        return \"\"\n",
     "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
     "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
     "\n",
     "print(\"Sample cleaned text:\")\n",
     "display(train_df[[\"text\", \"clean_text\"]].head())\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
    "source": [
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "\n",
+    "# TF-IDF Vectorizer\n",
+    "vectorizer = TfidfVectorizer(max_features=5000)\n",
     "\n",
     "# Fit and transform training data, then transform test data\n",
     "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
     "X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
     "\n",
+    "# Extract labels\n",
     "y_train = train_df[\"label\"]\n",
     "y_test = test_df[\"label\"]\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.metrics import accuracy_score, classification_report\n",
     "\n",
+    "# Train the model\n",
+    "model = LogisticRegression(max_iter=1000)\n",
     "model.fit(X_train, y_train)\n",
     "\n",
+    "# Make predictions\n",
     "y_pred = model.predict(X_test)\n",
     "\n",
     "# Evaluate the model\n",