Spaces:
Sleeping
Sleeping
Regino
committed on
Commit
·
045e883
1
Parent(s):
3f37371
jlsdndfnds
Browse files- Train Model.ipynb +17 -20
Train Model.ipynb
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
-
"execution_count":
|
13 |
"metadata": {},
|
14 |
"outputs": [
|
15 |
{
|
@@ -33,24 +33,22 @@
|
|
33 |
}
|
34 |
],
|
35 |
"source": [
|
36 |
-
"import pandas as pd
|
37 |
"\n",
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"\n",
|
41 |
-
"# Load training dataset\n",
|
42 |
"train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
|
43 |
"\n",
|
44 |
-
"#
|
45 |
"test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
|
46 |
"\n",
|
47 |
-
"
|
48 |
"print(train_df.head())\n"
|
49 |
]
|
50 |
},
|
51 |
{
|
52 |
"cell_type": "code",
|
53 |
-
"execution_count":
|
54 |
"metadata": {},
|
55 |
"outputs": [
|
56 |
{
|
@@ -149,11 +147,11 @@
|
|
149 |
"import nltk\n",
|
150 |
"from nltk.corpus import stopwords\n",
|
151 |
"\n",
|
152 |
-
"#
|
153 |
"nltk.download(\"stopwords\")\n",
|
154 |
"stop_words = set(stopwords.words(\"english\"))\n",
|
155 |
"\n",
|
156 |
-
"#
|
157 |
"def preprocess_text(text):\n",
|
158 |
" if isinstance(text, float): # Handle missing values\n",
|
159 |
" return \"\"\n",
|
@@ -168,14 +166,13 @@
|
|
168 |
"train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
|
169 |
"test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
|
170 |
"\n",
|
171 |
-
"# Display a sample of the cleaned text\n",
|
172 |
"print(\"Sample cleaned text:\")\n",
|
173 |
"display(train_df[[\"text\", \"clean_text\"]].head())\n"
|
174 |
]
|
175 |
},
|
176 |
{
|
177 |
"cell_type": "code",
|
178 |
-
"execution_count":
|
179 |
"metadata": {},
|
180 |
"outputs": [
|
181 |
{
|
@@ -191,14 +188,14 @@
|
|
191 |
"source": [
|
192 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
193 |
"\n",
|
194 |
-
"#
|
195 |
-
"vectorizer = TfidfVectorizer(max_features=5000)
|
196 |
"\n",
|
197 |
"# Fit and transform training data, then transform test data\n",
|
198 |
"X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
|
199 |
"X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
|
200 |
"\n",
|
201 |
-
"# Extract labels
|
202 |
"y_train = train_df[\"label\"]\n",
|
203 |
"y_test = test_df[\"label\"]\n",
|
204 |
"\n",
|
@@ -209,7 +206,7 @@
|
|
209 |
},
|
210 |
{
|
211 |
"cell_type": "code",
|
212 |
-
"execution_count":
|
213 |
"metadata": {},
|
214 |
"outputs": [
|
215 |
{
|
@@ -237,11 +234,11 @@
|
|
237 |
"from sklearn.linear_model import LogisticRegression\n",
|
238 |
"from sklearn.metrics import accuracy_score, classification_report\n",
|
239 |
"\n",
|
240 |
-
"#
|
241 |
-
"model = LogisticRegression(max_iter=1000)
|
242 |
"model.fit(X_train, y_train)\n",
|
243 |
"\n",
|
244 |
-
"# Make predictions
|
245 |
"y_pred = model.predict(X_test)\n",
|
246 |
"\n",
|
247 |
"# Evaluate the model\n",
|
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
"metadata": {},
|
14 |
"outputs": [
|
15 |
{
|
|
|
33 |
}
|
34 |
],
|
35 |
"source": [
|
36 |
+
"import pandas as pd \n",
|
37 |
"\n",
|
38 |
+
"column_names = ['id',\"place\",\"label\", \"text\"]\n",
|
39 |
+
"#Train Dataset\n",
|
|
|
|
|
40 |
"train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
|
41 |
"\n",
|
42 |
+
"#Test Dataset\n",
|
43 |
"test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
|
44 |
"\n",
|
45 |
+
"\n",
|
46 |
"print(train_df.head())\n"
|
47 |
]
|
48 |
},
|
49 |
{
|
50 |
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
"metadata": {},
|
53 |
"outputs": [
|
54 |
{
|
|
|
147 |
"import nltk\n",
|
148 |
"from nltk.corpus import stopwords\n",
|
149 |
"\n",
|
150 |
+
"# Stopwords\n",
|
151 |
"nltk.download(\"stopwords\")\n",
|
152 |
"stop_words = set(stopwords.words(\"english\"))\n",
|
153 |
"\n",
|
154 |
+
"# Clean Text\n",
|
155 |
"def preprocess_text(text):\n",
|
156 |
" if isinstance(text, float): # Handle missing values\n",
|
157 |
" return \"\"\n",
|
|
|
166 |
"train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
|
167 |
"test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
|
168 |
"\n",
|
|
|
169 |
"print(\"Sample cleaned text:\")\n",
|
170 |
"display(train_df[[\"text\", \"clean_text\"]].head())\n"
|
171 |
]
|
172 |
},
|
173 |
{
|
174 |
"cell_type": "code",
|
175 |
+
"execution_count": null,
|
176 |
"metadata": {},
|
177 |
"outputs": [
|
178 |
{
|
|
|
188 |
"source": [
|
189 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
190 |
"\n",
|
191 |
+
"# TF-IDF Vectorizer\n",
|
192 |
+
"vectorizer = TfidfVectorizer(max_features=5000)\n",
|
193 |
"\n",
|
194 |
"# Fit and transform training data, then transform test data\n",
|
195 |
"X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
|
196 |
"X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
|
197 |
"\n",
|
198 |
+
"# Extract labels\n",
|
199 |
"y_train = train_df[\"label\"]\n",
|
200 |
"y_test = test_df[\"label\"]\n",
|
201 |
"\n",
|
|
|
206 |
},
|
207 |
{
|
208 |
"cell_type": "code",
|
209 |
+
"execution_count": null,
|
210 |
"metadata": {},
|
211 |
"outputs": [
|
212 |
{
|
|
|
234 |
"from sklearn.linear_model import LogisticRegression\n",
|
235 |
"from sklearn.metrics import accuracy_score, classification_report\n",
|
236 |
"\n",
|
237 |
+
"# Train the model\n",
|
238 |
+
"model = LogisticRegression(max_iter=1000)\n",
|
239 |
"model.fit(X_train, y_train)\n",
|
240 |
"\n",
|
241 |
+
"# Make predictions\n",
|
242 |
"y_pred = model.predict(X_test)\n",
|
243 |
"\n",
|
244 |
"# Evaluate the model\n",
|