Regino committed on
Commit 0e876c8 · 1 parent: 7ef995c

first commit

Train Model.ipynb ADDED
@@ -0,0 +1,303 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Dataset from Hugging Face"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "     id        place    label \\\n",
+ "0  2401  Borderlands  Positive \n",
+ "1  2401  Borderlands  Positive \n",
+ "2  2401  Borderlands  Positive \n",
+ "3  2401  Borderlands  Positive \n",
+ "4  2401  Borderlands  Positive \n",
+ "\n",
+ "                                                text \n",
+ "0  im getting on borderlands and i will murder yo... \n",
+ "1  I am coming to the borders and I will kill you... \n",
+ "2  im getting on borderlands and i will kill you ... \n",
+ "3  im coming on borderlands and i will murder you... \n",
+ "4  im getting on borderlands 2 and i will murder ... \n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Define column names manually\n",
+ "column_names = [\"id\", \"place\", \"label\", \"text\"]  # Change this to match your dataset\n",
+ "\n",
+ "# Load training dataset\n",
+ "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
+ "\n",
+ "# Load test dataset\n",
+ "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
+ "\n",
+ "# Display first few rows\n",
+ "print(train_df.head())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n",
+ "[nltk_data] Jr\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sample cleaned text:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "  .dataframe tbody tr th:only-of-type {\n",
+ "    vertical-align: middle;\n",
+ "  }\n",
+ "\n",
+ "  .dataframe tbody tr th {\n",
+ "    vertical-align: top;\n",
+ "  }\n",
+ "\n",
+ "  .dataframe thead th {\n",
+ "    text-align: right;\n",
+ "  }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>text</th>\n",
+ "      <th>clean_text</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>im getting on borderlands and i will murder yo...</td>\n",
+ "      <td>im getting borderlands murder</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>I am coming to the borders and I will kill you...</td>\n",
+ "      <td>coming borders kill</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>im getting on borderlands and i will kill you ...</td>\n",
+ "      <td>im getting borderlands kill</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>im coming on borderlands and i will murder you...</td>\n",
+ "      <td>im coming borderlands murder</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>im getting on borderlands 2 and i will murder ...</td>\n",
+ "      <td>im getting borderlands 2 murder</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ "                                                text \\\n",
+ "0  im getting on borderlands and i will murder yo... \n",
+ "1  I am coming to the borders and I will kill you... \n",
+ "2  im getting on borderlands and i will kill you ... \n",
+ "3  im coming on borderlands and i will murder you... \n",
+ "4  im getting on borderlands 2 and i will murder ... \n",
+ "\n",
+ "                        clean_text \n",
+ "0    im getting borderlands murder \n",
+ "1              coming borders kill \n",
+ "2      im getting borderlands kill \n",
+ "3     im coming borderlands murder \n",
+ "4  im getting borderlands 2 murder "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "\n",
+ "# Download stopwords if not already downloaded\n",
+ "nltk.download(\"stopwords\")\n",
+ "stop_words = set(stopwords.words(\"english\"))\n",
+ "\n",
+ "# Function to clean text\n",
+ "def preprocess_text(text):\n",
+ "    if isinstance(text, float):  # Handle missing values\n",
+ "        return \"\"\n",
+ "\n",
+ "    text = text.lower()  # Convert to lowercase\n",
+ "    text = re.sub(r\"\\W\", \" \", text)  # Remove special characters\n",
+ "    text = re.sub(r\"\\s+\", \" \", text).strip()  # Remove extra spaces\n",
+ "    text = \" \".join([word for word in text.split() if word not in stop_words])  # Remove stopwords\n",
+ "    return text\n",
+ "\n",
+ "# Apply preprocessing to the text column\n",
+ "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
+ "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
+ "\n",
+ "# Display a sample of the cleaned text\n",
+ "print(\"Sample cleaned text:\")\n",
+ "display(train_df[[\"text\", \"clean_text\"]].head())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TF-IDF vectorization complete! ✅\n",
+ "Training data shape: (74682, 5000)\n",
+ "Testing data shape: (1000, 5000)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "\n",
+ "# Initialize TF-IDF vectorizer\n",
+ "vectorizer = TfidfVectorizer(max_features=5000)  # Limit to the 5000 most important words\n",
+ "\n",
+ "# Fit and transform training data, then transform test data\n",
+ "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
+ "X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
+ "\n",
+ "# Extract labels (the sentiment column is named \"label\")\n",
+ "y_train = train_df[\"label\"]\n",
+ "y_test = test_df[\"label\"]\n",
+ "\n",
+ "print(\"TF-IDF vectorization complete! ✅\")\n",
+ "print(f\"Training data shape: {X_train.shape}\")\n",
+ "print(f\"Testing data shape: {X_test.shape}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model Accuracy: 0.8120\n",
+ "\n",
+ "Classification Report:\n",
+ "              precision    recall  f1-score   support\n",
+ "\n",
+ "  Irrelevant       0.82      0.73      0.77       172\n",
+ "    Negative       0.78      0.89      0.83       266\n",
+ "     Neutral       0.85      0.76      0.80       285\n",
+ "    Positive       0.81      0.84      0.82       277\n",
+ "\n",
+ "    accuracy                           0.81      1000\n",
+ "   macro avg       0.81      0.81      0.81      1000\n",
+ "weighted avg       0.81      0.81      0.81      1000\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "\n",
+ "# Initialize and train the model\n",
+ "model = LogisticRegression(max_iter=1000)  # Increase iterations to ensure convergence\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# Make predictions on the test set\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(f\"Model Accuracy: {accuracy:.4f}\")\n",
+ "\n",
+ "# Display classification report\n",
+ "print(\"\\nClassification Report:\")\n",
+ "print(classification_report(y_test, y_pred))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model and vectorizer saved successfully! ✅\n"
+ ]
+ }
+ ],
+ "source": [
+ "import joblib\n",
+ "\n",
+ "# Save the trained model\n",
+ "joblib.dump(model, \"sentiment_model.pkl\")\n",
+ "\n",
+ "# Save the TF-IDF vectorizer\n",
+ "joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n",
+ "\n",
+ "print(\"Model and vectorizer saved successfully! ✅\")\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
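Once the notebook has run end to end, the two saved artifacts can be reused outside Jupyter. Below is a minimal inference sketch, assuming sentiment_model.pkl and tfidf_vectorizer.pkl sit in the working directory; preprocess_text reproduces the notebook's cleaning steps, since new text must be cleaned the same way it was at training time.

import re
import joblib
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Reload the artifacts written by the notebook's final cell
model = joblib.load("sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

def preprocess_text(text):
    # Same cleaning the notebook applies before fitting the vectorizer
    text = str(text).lower()
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return " ".join(w for w in text.split() if w not in stop_words)

sample = "im getting on borderlands and i will murder you all"
prediction = model.predict(vectorizer.transform([preprocess_text(sample)]))[0]
print(prediction)  # One of: Positive, Neutral, Negative, Irrelevant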
app.py ADDED
@@ -0,0 +1,154 @@
+ import joblib
+ import streamlit as st
+ import pandas as pd
+ import re
+ import nltk
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from wordcloud import WordCloud
+ from nltk.corpus import stopwords
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+
+ # Download stopwords if not already available
+ nltk.download("stopwords")
+ stop_words = set(stopwords.words("english"))
+
+ # Load the trained model and TF-IDF vectorizer
+ model = joblib.load("sentiment_model.pkl")
+ vectorizer = joblib.load("tfidf_vectorizer.pkl")
+
+ # Load dataset with manually defined headers
+ column_names = ["id", "place", "label", "text"]
+ df = pd.read_csv("twitter_training.csv", names=column_names, header=None)
+
+ # Preprocess text; must mirror the cleaning used when the vectorizer was fitted
+ def preprocess_text(text):
+     text = str(text).lower()
+     text = re.sub(r"\W", " ", text)  # Remove special characters
+     text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
+     text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
+     return text
+
+ # Load test dataset and compute model metrics
+ try:
+     test_df = pd.read_csv("twitter_validation.csv", names=column_names, header=None)
+     # Clean the text before vectorizing, matching how the model was trained
+     X_test = vectorizer.transform(test_df["text"].astype(str).apply(preprocess_text))
+     y_test = test_df["label"]
+     y_pred = model.predict(X_test)
+
+     # Model metrics
+     accuracy = accuracy_score(y_test, y_pred)
+     classification_report_text = classification_report(y_test, y_pred, output_dict=True)
+     class_report_df = pd.DataFrame(classification_report_text).T.round(2)
+
+     # Compute confusion matrix over all four classes in the dataset
+     cm_labels = ["Positive", "Neutral", "Negative", "Irrelevant"]
+     cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
+
+ except Exception:
+     accuracy = None
+     class_report_df = None
+     cm = None
+
+ # Predict the sentiment of a single user-supplied string
+ def predict_sentiment(user_input):
+     cleaned_text = preprocess_text(user_input)
+     text_vector = vectorizer.transform([cleaned_text])
+     prediction = model.predict(text_vector)[0]
+     return prediction
+
+ # Sidebar navigation
+ st.sidebar.title("🔍 Sentiment Analysis App")
+ st.sidebar.markdown(
+     "This app performs **Sentiment Analysis** on text using **Machine Learning**. "
+     "It classifies text as **Positive, Neutral, or Negative** based on its sentiment."
+ )
+
+ st.sidebar.header("📌 Navigation")
+ page = st.sidebar.radio(
+     "Go to:",
+     ["📂 Dataset", "📊 Visualizations", "📈 Model Metrics", "🤖 Sentiment Predictor"]
+ )
+
+ # App title and explanation
+ st.title("📒 Twitter Sentiment Analysis")
+ st.markdown(
+     "This application uses **Natural Language Processing (NLP)** and "
+     "**Logistic Regression** to analyze the sentiment of tweets. The model is trained on a dataset "
+     "of tweets labeled as **Positive, Neutral, or Negative**."
+ )
+
+ # 📂 Dataset page
+ if page == "📂 Dataset":
+     st.header("📂 Dataset Preview")
+     st.markdown("### Displaying rows **50-55** from the training data:")
+     st.dataframe(df.iloc[49:55])
+
+ # 📊 Visualization page
+ elif page == "📊 Visualizations":
+     st.header("📊 Data Visualizations")
+
+     # Pie chart of sentiments; reindex so each label always gets the same color
+     st.subheader("🥧 Sentiment Distribution")
+     fig, ax = plt.subplots(figsize=(5, 5))
+     counts = df["label"].value_counts().reindex(["Positive", "Neutral", "Negative", "Irrelevant"])
+     counts.plot(kind="pie", autopct="%1.1f%%", colors=["green", "gray", "red", "blue"], ax=ax)
+     plt.title("Sentiment Distribution")
+     plt.ylabel("")
+     st.pyplot(fig)
+
+     # Bar chart of sentiment counts
+     st.subheader("📊 Sentiment Count (Bar Chart)")
+     fig, ax = plt.subplots(figsize=(6, 4))
+     sns.countplot(x=df["label"], palette={"Positive": "green", "Neutral": "gray", "Negative": "red", "Irrelevant": "blue"}, ax=ax)
+     plt.xlabel("Sentiment Type")
+     plt.ylabel("Count")
+     plt.title("Distribution of Sentiments")
+     st.pyplot(fig)
+
+     # Word cloud of the most frequent words
+     st.subheader("☁️ Word Cloud of Most Common Words")
+     text_data = " ".join(df["text"].astype(str))
+     wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text_data)
+     fig, ax = plt.subplots(figsize=(8, 4))
+     ax.imshow(wordcloud, interpolation="bilinear")
+     ax.axis("off")
+     st.pyplot(fig)
+
+ # 📈 Model metrics page
+ elif page == "📈 Model Metrics":
+     st.header("📈 Model Performance")
+
+     if accuracy is not None:
+         st.write(f"✅ **Accuracy:** {accuracy * 100:.2f}%")
+     else:
+         st.warning("⚠️ Could not calculate accuracy. Please check the test dataset.")
+
+     if class_report_df is not None and not class_report_df.empty:
+         st.subheader("📌 Classification Report")
+         st.dataframe(class_report_df)
+     else:
+         st.warning("⚠️ Classification report is empty.")
+
+     if cm is not None and cm.any():
+         st.subheader("🔥 Confusion Matrix")
+         fig, ax = plt.subplots(figsize=(6, 5))
+         sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=cm_labels, yticklabels=cm_labels, ax=ax)
+         plt.xlabel("Predicted")
+         plt.ylabel("Actual")
+         plt.title("Confusion Matrix")
+         st.pyplot(fig)
+     else:
+         st.warning("⚠️ Confusion matrix could not be generated.")
+
+ # 🤖 Sentiment Predictor page
+ elif page == "🤖 Sentiment Predictor":
+     st.header("🤖 Sentiment Analysis")
+     st.markdown("Enter a sentence below, and the model will predict whether it is **Positive, Neutral, or Negative**.")
+
+     user_input = st.text_area("Type your sentence here:", "")
+
+     if st.button("Analyze Sentiment"):
+         if user_input.strip():
+             sentiment_result = predict_sentiment(user_input)
+             st.markdown(f"### 🔍 Prediction: **{sentiment_result}**")
+         else:
+             st.warning("Please enter some text to analyze.")
confusion_matrix.png ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ streamlit
+ joblib
+ pandas
+ nltk
+ matplotlib
+ seaborn
+ wordcloud
+ scikit-learn
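The dependencies are unpinned, so each deployment installs the latest release of every package. If reproducible builds matter, versions could be pinned in the same file format; the numbers below are illustrative placeholders only, not the versions this commit was tested against:

# Illustrative pins only - replace with the versions actually used
streamlit==1.32.0
joblib==1.3.2
pandas==2.2.0
nltk==3.8.1
matplotlib==3.8.2
seaborn==0.13.2
wordcloud==1.9.3
scikit-learn==1.4.0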
sentiment_distribution.png ADDED
sentiment_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5061ba50ae5dfc7b3f1415eade952be7b8764ade9d1945e2ec27f5ad85e63092
+ size 161127
tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24722296250083368688b553d01fb5b3723364fea155b7d64820200e681c149f
+ size 181291
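Both .pkl files are stored as Git LFS pointers rather than raw bytes. After cloning with Git LFS installed, the downloaded artifacts can be checked against the pointer metadata above; a minimal sketch:

import hashlib
import os

def verify_lfs_artifact(path, expected_oid, expected_size):
    # Compare a local file against the sha256 oid and size from its LFS pointer
    assert os.path.getsize(path) == expected_size, f"{path}: size mismatch"
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    assert digest == expected_oid, f"{path}: sha256 mismatch"

verify_lfs_artifact(
    "tfidf_vectorizer.pkl",
    "24722296250083368688b553d01fb5b3723364fea155b7d64820200e681c149f",
    181291,
)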
twitter_training.csv ADDED
The diff for this file is too large to render. See raw diff
 
twitter_validation.csv ADDED
The diff for this file is too large to render. See raw diff
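Neither CSV renders inline, but the code above fixes their expected shape: headerless, four columns (id, place, label, text), with 74,682 training rows and 1,000 validation rows per the TF-IDF cell's output. A quick structural check, assuming both files are present locally:

import pandas as pd

cols = ["id", "place", "label", "text"]
for path in ("twitter_training.csv", "twitter_validation.csv"):
    frame = pd.read_csv(path, names=cols, header=None)
    # Expect four sentiment classes: Positive, Neutral, Negative, Irrelevant
    print(path, frame.shape, sorted(frame["label"].dropna().unique()))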