# %% Imports
import pandas as pd
import numpy as np

# %% Configuration
# Path of the raw CSV, relative to the directory the notebook is run from.
DATA_PATH = 'data/Combined_Data.csv'
# Position of the one raw statement that is printed in full further down.
SAMPLE_ROW = 19230

# %% Data loading
data = pd.read_csv(DATA_PATH)

# %% Preview the raw frame
# The recorded output shows three columns: 'Unnamed: 0', 'statement', 'status'.
data.head()

# %% Read one raw statement in full
data['statement'].values[SAMPLE_ROW]

# %% Selecting needed columns
# .copy() detaches df from `data`, so later column assignments on df cannot
# trigger pandas' SettingWithCopyWarning.
df = data[['statement', 'status']].copy()
df.head()

# %% Value counts for the status
# Recorded run: Normal 16351, Depression 15404, Suicidal 10653, Anxiety 3888,
# Bipolar 2877, Stress 2669, Personality disorder 1201.
df['status'].value_counts()

# %% Frame size
# Recorded run: (53043, 2).
df.shape

# %% Checking for NaN values
# Recorded run: statement 362, status 0.
df.isnull().sum()

# %% Dropping NaN values
# dropna() returns a row subset of df. The explicit .copy() makes df_1 an
# independent frame, so adding a column to it later does not raise the
# SettingWithCopyWarning that the original run emitted.
df_1 = df.dropna().copy()
df_1.isna().sum()
# %% NLP imports and NLTK resources
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

# %% Cleaning pipeline for the statement column
# Built once at cell execution instead of on every call: preprocess_text is
# applied to every row of the frame, and none of these objects depend on the
# text being cleaned.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
_DIGITS_RE = re.compile(r'\d+')
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text, use_stemming=False, use_lemmatization=True):
    """Turn one raw statement into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, remove digits, split on whitespace, remove
    punctuation from each token, drop English stopwords, then stem or
    lemmatize what is left.

    Args:
        text: The raw statement. ``None`` and float NaN are treated as an
            empty statement; any other non-string value is converted with
            ``str()``.
        use_stemming: If True, every token is passed through PorterStemmer.
            Takes precedence over ``use_lemmatization``.
        use_lemmatization: If True (and ``use_stemming`` is False), every
            token is passed through ``WordNetLemmatizer.lemmatize``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` if nothing is left.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = _DIGITS_RE.sub('', str(text).lower())

    words = []
    for raw_token in text.split():
        token = _PUNCTUATION_RE.sub('', raw_token)
        if not token:
            # The token consisted only of punctuation.
            continue
        # The stopword list contains apostrophe forms such as "don't". Once
        # punctuation is removed that token reads "dont" and no longer matches,
        # so the token is also checked with only its surrounding punctuation
        # stripped.
        if token in _STOP_WORDS or raw_token.strip(string.punctuation) in _STOP_WORDS:
            continue
        words.append(token)

    if use_stemming:
        words = [_STEMMER.stem(word) for word in words]
    elif use_lemmatization:
        words = [_LEMMATIZER.lemmatize(word) for word in words]

    return ' '.join(words)


# %% Example usage
# Recorded run printed:
# example sentence demonstrate text preprocessing python includes number like punctuation
text = "This is an example sentence to demonstrate text preprocessing in Python. It includes numbers like 123 and punctuation!"
cleaned_text = preprocess_text(text)
print(cleaned_text)

# %% Implementing on the statement column
# assign() returns a new frame instead of writing into df_1 in place; the
# in-place column assignment emitted SettingWithCopyWarning in the original run.
df_1 = df_1.assign(cleaned_statement=df_1['statement'].apply(preprocess_text))
# %% Preview the frame with the cleaned statements
df_1.head()

# %% Keep only the model inputs
# .copy() makes df_2 an independent frame. The original run assigned into a
# plain column selection of df_1 and emitted SettingWithCopyWarning.
df_2 = df_1[['cleaned_statement', 'status']].copy()
df_2.head()

# %% Encoding the status column
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# Fit on the string labels in df_1 rather than on df_2['status']: df_2 has the
# same rows in the same order, and re-running this cell then always fits on the
# class names instead of on integers written by a previous run.
df_2['status'] = encoder.fit_transform(df_1['status'])

# %% Classes seen by the encoder
# Recorded run: ['Anxiety', 'Bipolar', 'Depression', 'Normal',
# 'Personality disorder', 'Stress', 'Suicidal'].
encoder.classes_

# %% Class name -> integer code
# Recorded run: Anxiety 0, Bipolar 1, Depression 2, Normal 3,
# Personality disorder 4, Stress 5, Suicidal 6.
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
label_mapping

# %% Preview the encoded frame
df_2.head()

# %% Splitting the data
from sklearn.model_selection import train_test_split

X = df_2['cleaned_statement']
y = df_2['status']

# stratify=y keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
assert len(X_train) + len(X_test) == len(df_2)

# %% Creating vectors for the cleaned_statement column
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text using TF-IDF. The vocabulary is learned from the training
# part only; the test part is transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# %% Random forest classifier on the TF-IDF features
from sklearn.ensemble import RandomForestClassifier

# Initialize the model. A fixed random_state makes the forest reproducible;
# without it every fit builds different trees and the reported metrics move
# from run to run.
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_tfidf, y_train)

# %% Evaluate the baseline
from sklearn.metrics import classification_report, accuracy_score

# making predictions
y_pred = model.predict(X_test_tfidf)

# checking the accuracy
# Recorded (unseeded) run: accuracy 0.688715953307393 on 10537 test rows,
# macro-average recall 0.50.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# classification report
report = classification_report(y_test, y_pred)
print(report)

# %% Creating a pipeline: custom transformer for text preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans raw statements for a vectorizer.

    Each statement is lowercased, stripped of digits and punctuation, filtered
    against the NLTK English stopword list and lemmatized with
    WordNetLemmatizer. The transformer learns nothing from the data, so
    ``fit`` is a no-op.
    """

    # Compiled once for the class; neither pattern depends on instance state.
    _PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
    _DIGITS_RE = re.compile(r'\d+')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """Return one statement as a space-separated string of cleaned tokens.

        Args:
            text: The raw statement. ``None`` and float NaN are treated as an
                empty statement; any other non-string value is converted with
                ``str()``.

        Returns:
            The cleaned tokens joined by single spaces; ``''`` if nothing is
            left.
        """
        # Missing values would otherwise fail on .lower() with AttributeError.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = self._DIGITS_RE.sub('', str(text).lower())

        words = []
        for raw_token in text.split():
            token = self._PUNCTUATION_RE.sub('', raw_token)
            if not token:
                # The token consisted only of punctuation.
                continue
            # The stopword list contains apostrophe forms such as "don't".
            # Once punctuation is removed that token reads "dont" and no
            # longer matches, so the token is also checked with only its
            # surrounding punctuation stripped.
            if token in self.stop_words or raw_token.strip(string.punctuation) in self.stop_words:
                continue
            words.append(self.lemmatizer.lemmatize(token))

        return ' '.join(words)

    def fit(self, X, y=None):
        """Do nothing and return self; present to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Clean every statement in X and return the results as a list.

        Args:
            X: An iterable of raw statements. A single ``str`` is treated as
                one statement rather than being iterated character by
                character.
            y: Ignored.

        Returns:
            A list with one cleaned string per input statement.
        """
        if isinstance(X, str):
            X = [X]
        return [self.preprocess_text(text) for text in X]


# %% Assemble the pipeline
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    # Seeded for the same reason as the baseline model above.
    ('classifier', RandomForestClassifier(random_state=42))
])

# %% Split the raw statements for the pipeline
X = df_1['statement']
y = df_2['status']
# X and y are taken from two different frames. They describe the same rows only
# while df_2 keeps the index it inherited from df_1, so check that explicitly.
assert X.index.equals(y.index), 'df_1 and df_2 are no longer row-aligned'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# %% Train the model
pipeline.fit(X_train, y_train)

# %% Make predictions
y_pred = pipeline.predict(X_test)

# %% Evaluate the model
# Recorded run: accuracy 0.6797950080668121 on 10537 test rows,
# macro-average recall 0.49.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# %% Query the prediction service
# Needs a service listening on 127.0.0.1:8000; nothing in this notebook starts
# it. The recorded run of this cell has execution_count 10, i.e. it was not
# executed in sequence with the cells above.
# Recorded response: {'text': <the text below>, 'prediction': 'Depression'}.
import requests

# Seconds to wait for the service before giving up; requests has no default
# timeout and would otherwise block indefinitely.
REQUEST_TIMEOUT_S = 10

text = 'A lot of times if I am feeling sad, I immediately think of how others will respond to it. Or I am looking for comfort.. my father is a homophobic, racist, sexist piece of shit and my mother takes care of everything in the house. I hate my dad, when he started saying things like "there is only two genders" and "you are looking for attention" and making things seem like I was in the wrong no matter how much I was right, I realized how much of a shitbag he was and really felt desperate. I felt desperate for love and so I am confusing that with wanting attention.. am I in the wrong for doing this? Am I depressed or wanting attention?'
url = "http://127.0.0.1:8000/predict_sentiment"
# Named `payload` rather than `data` so the DataFrame loaded at the top of the
# notebook is not overwritten.
payload = {"text": text}
response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT_S)
# Surface HTTP errors here instead of failing later while decoding the body.
response.raise_for_status()

print(response.json())

# Metadata recorded in the original notebook file: kernel "python3",
# Python 3.10.14, nbformat 4 (minor 2).