In [13]:
import pandas as pd
import numpy as np

In [16]:
# data loading
data = pd.read_csv('data//Combined_Data.csv')

In [17]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [21]:
data['statement'].values[19230]

'I recently watched my dad die a gruesome death due to cancer this week, and I am sure something similar is in my future, I do not have any real friends and I do not have a home, I have been living in a hotel the past 6 months. I do not want to live anymore I just want to see my dad again and I do not want to suffer like he did I do not want to live anymore'

In [19]:
# selecting needed columns
df = data[['statement', 'status']]
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [5]:
# value counts for the status
df['status'].value_counts()

status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64

In [6]:
df.shape

(53043, 2)

In [7]:
# checking for nan values
df.isnull().sum()

statement    362
status         0
dtype: int64

In [8]:
# dropping nan values
df_1 = df.dropna()
df_1.isna().sum()

statement    0
status       0
dtype: int64

In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timmy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\timmy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# creating a cleaning pipeline for the statement column
def preprocess_text(text, use_stemming=False, use_lemmatization=True):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize the text
    words = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Initialize stemmer and lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    if use_stemming:
        # Apply stemming
        words = [stemmer.stem(word) for word in words]
    elif use_lemmatization:
        # Apply lemmatization
        words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back into a single string
    cleaned_text = ' '.join(words)
    
    return cleaned_text

# Example usage
text = "This is an example sentence to demonstrate text preprocessing in Python. It includes numbers like 123 and punctuation!"
cleaned_text = preprocess_text(text)
print(cleaned_text)


example sentence demonstrate text preprocessing python includes number like punctuation


In [11]:
# implementing on the statement column
df_1['cleaned_statement'] = df_1['statement'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['cleaned_statement'] = df_1['statement'].apply(preprocess_text)


In [12]:
df_1.head()

Unnamed: 0,statement,status,cleaned_statement
0,oh my gosh,Anxiety,oh gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,I've shifted my focus to something else but I'...,Anxiety,ive shifted focus something else im still worried
4,"I'm restless and restless, it's been a month n...",Anxiety,im restless restless month boy mean


In [13]:
df_2 = df_1[['cleaned_statement', 'status']]
df_2.head()

Unnamed: 0,cleaned_statement,status
0,oh gosh,Anxiety
1,trouble sleeping confused mind restless heart ...,Anxiety
2,wrong back dear forward doubt stay restless re...,Anxiety
3,ive shifted focus something else im still worried,Anxiety
4,im restless restless month boy mean,Anxiety


In [14]:
# encoding the status column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df_2['status'] = encoder.fit_transform(df_2['status'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['status'] = encoder.fit_transform(df_2['status'])


In [15]:
encoder.classes_

array(['Anxiety', 'Bipolar', 'Depression', 'Normal',
       'Personality disorder', 'Stress', 'Suicidal'], dtype=object)

In [16]:
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
label_mapping

{'Anxiety': np.int64(0),
 'Bipolar': np.int64(1),
 'Depression': np.int64(2),
 'Normal': np.int64(3),
 'Personality disorder': np.int64(4),
 'Stress': np.int64(5),
 'Suicidal': np.int64(6)}

In [17]:
df_2.head()

Unnamed: 0,cleaned_statement,status
0,oh gosh,0
1,trouble sleeping confused mind restless heart ...,0
2,wrong back dear forward doubt stay restless re...,0
3,ive shifted focus something else im still worried,0
4,im restless restless month boy mean,0


In [20]:
# splitting the data 
from sklearn.model_selection import train_test_split
X = df_2['cleaned_statement']
y = df_2['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
# creating vectors for the cleaned_statement column
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [26]:
# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

# Train the model
model.fit(X_train_tfidf, y_train)


In [27]:
from sklearn.metrics import classification_report, accuracy_score
# making predictions
y_pred = model.predict(X_test_tfidf)

# checking the accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# classification report
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.688715953307393
              precision    recall  f1-score   support

           0       0.90      0.50      0.64       768
           1       0.97      0.37      0.53       556
           2       0.55      0.82      0.66      3081
           3       0.79      0.95      0.86      3269
           4       1.00      0.26      0.41       215
           5       0.97      0.21      0.35       517
           6       0.71      0.40      0.52      2131

    accuracy                           0.69     10537
   macro avg       0.84      0.50      0.57     10537
weighted avg       0.74      0.69      0.67     10537



In [28]:
# creating a pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def preprocess_text(self, text):
        # Lowercase the text
        text = text.lower()
        
        # Remove punctuation
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        
        # Tokenize the text
        words = text.split()
        
        # Remove stopwords and apply lemmatization
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        
        # Join words back into a single string
        cleaned_text = ' '.join(words)
        
        return cleaned_text
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return [self.preprocess_text(text) for text in X]
    
    


In [29]:
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

In [31]:
X = df_1['statement']
y = df_2['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [32]:
# Train the model
pipeline.fit(X_train, y_train)

In [33]:
# Make predictions
y_pred = pipeline.predict(X_test)

In [34]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.6797950080668121
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.49      0.63       768
           1       0.98      0.36      0.52       556
           2       0.54      0.82      0.65      3081
           3       0.79      0.95      0.86      3269
           4       1.00      0.26      0.41       215
           5       0.97      0.21      0.34       517
           6       0.69      0.38      0.49      2131

    accuracy                           0.68     10537
   macro avg       0.84      0.49      0.56     10537
weighted avg       0.73      0.68      0.66     10537



In [10]:
import requests
text = 'A lot of times if I am feeling sad, I immediately think of how others will respond to it. Or I am looking for comfort.. my father is a homophobic, racist, sexist piece of shit and my mother takes care of everything in the house. I hate my dad, when he started saying things like "there is only two genders" and "you are looking for attention" and making things seem like I was in the wrong no matter how much I was right, I realized how much of a shitbag he was and really felt desperate. I felt desperate for love and so I am confusing that with wanting attention.. am I in the wrong for doing this? Am I depressed or wanting attention?'
url = "http://127.0.0.1:8000/predict_sentiment"
data = {"text": text}
response = requests.post(url, json=data)

print(response.json())


{'text': 'A lot of times if I am feeling sad, I immediately think of how others will respond to it. Or I am looking for comfort.. my father is a homophobic, racist, sexist piece of shit and my mother takes care of everything in the house. I hate my dad, when he started saying things like "there is only two genders" and "you are looking for attention" and making things seem like I was in the wrong no matter how much I was right, I realized how much of a shitbag he was and really felt desperate. I felt desperate for love and so I am confusing that with wanting attention.. am I in the wrong for doing this? Am I depressed or wanting attention?', 'prediction': 'Depression'}
