Regino committed on
Commit
045e883
·
1 Parent(s): 3f37371

jlsdndfnds

Browse files
Files changed (1) hide show
  1. Train Model.ipynb +17 -20
Train Model.ipynb CHANGED
@@ -9,7 +9,7 @@
9
  },
10
  {
11
  "cell_type": "code",
12
- "execution_count": 5,
13
  "metadata": {},
14
  "outputs": [
15
  {
@@ -33,24 +33,22 @@
33
  }
34
  ],
35
  "source": [
36
- "import pandas as pd \n",
37
  "\n",
38
- "# Define column names manually\n",
39
- "column_names = ['id',\"place\",\"label\", \"text\"] # Change this based on your dataset\n",
40
- "\n",
41
- "# Load training dataset\n",
42
  "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
43
  "\n",
44
- "# Load test dataset\n",
45
  "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
46
  "\n",
47
- "# Display first few rows\n",
48
  "print(train_df.head())\n"
49
  ]
50
  },
51
  {
52
  "cell_type": "code",
53
- "execution_count": 10,
54
  "metadata": {},
55
  "outputs": [
56
  {
@@ -149,11 +147,11 @@
149
  "import nltk\n",
150
  "from nltk.corpus import stopwords\n",
151
  "\n",
152
- "# Download stopwords if not already downloaded\n",
153
  "nltk.download(\"stopwords\")\n",
154
  "stop_words = set(stopwords.words(\"english\"))\n",
155
  "\n",
156
- "# Function to clean text\n",
157
  "def preprocess_text(text):\n",
158
  " if isinstance(text, float): # Handle missing values\n",
159
  " return \"\"\n",
@@ -168,14 +166,13 @@
168
  "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
169
  "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
170
  "\n",
171
- "# Display a sample of the cleaned text\n",
172
  "print(\"Sample cleaned text:\")\n",
173
  "display(train_df[[\"text\", \"clean_text\"]].head())\n"
174
  ]
175
  },
176
  {
177
  "cell_type": "code",
178
- "execution_count": 11,
179
  "metadata": {},
180
  "outputs": [
181
  {
@@ -191,14 +188,14 @@
191
  "source": [
192
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
193
  "\n",
194
- "# Initialize TF-IDF Vectorizer\n",
195
- "vectorizer = TfidfVectorizer(max_features=5000) # Limit to 5000 most important words\n",
196
  "\n",
197
  "# Fit and transform training data, then transform test data\n",
198
  "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
199
  "X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
200
  "\n",
201
- "# Extract labels (assuming the sentiment column is named \"label\")\n",
202
  "y_train = train_df[\"label\"]\n",
203
  "y_test = test_df[\"label\"]\n",
204
  "\n",
@@ -209,7 +206,7 @@
209
  },
210
  {
211
  "cell_type": "code",
212
- "execution_count": 12,
213
  "metadata": {},
214
  "outputs": [
215
  {
@@ -237,11 +234,11 @@
237
  "from sklearn.linear_model import LogisticRegression\n",
238
  "from sklearn.metrics import accuracy_score, classification_report\n",
239
  "\n",
240
- "# Initialize and train the model\n",
241
- "model = LogisticRegression(max_iter=1000) # Increase iterations to ensure convergence\n",
242
  "model.fit(X_train, y_train)\n",
243
  "\n",
244
- "# Make predictions on the test set\n",
245
  "y_pred = model.predict(X_test)\n",
246
  "\n",
247
  "# Evaluate the model\n",
 
9
  },
10
  {
11
  "cell_type": "code",
12
+ "execution_count": null,
13
  "metadata": {},
14
  "outputs": [
15
  {
 
33
  }
34
  ],
35
  "source": [
36
+ "import pandas as pd \n",
37
  "\n",
38
+ "column_names = ['id',\"place\",\"label\", \"text\"]\n",
39
+ "#Train Dataset\n",
 
 
40
  "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
41
  "\n",
42
+ "#Test Dataset\n",
43
  "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
44
  "\n",
45
+ "\n",
46
  "print(train_df.head())\n"
47
  ]
48
  },
49
  {
50
  "cell_type": "code",
51
+ "execution_count": null,
52
  "metadata": {},
53
  "outputs": [
54
  {
 
147
  "import nltk\n",
148
  "from nltk.corpus import stopwords\n",
149
  "\n",
150
+ "# Stopwords\n",
151
  "nltk.download(\"stopwords\")\n",
152
  "stop_words = set(stopwords.words(\"english\"))\n",
153
  "\n",
154
+ "# Clean Text\n",
155
  "def preprocess_text(text):\n",
156
  " if isinstance(text, float): # Handle missing values\n",
157
  " return \"\"\n",
 
166
  "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
167
  "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
168
  "\n",
 
169
  "print(\"Sample cleaned text:\")\n",
170
  "display(train_df[[\"text\", \"clean_text\"]].head())\n"
171
  ]
172
  },
173
  {
174
  "cell_type": "code",
175
+ "execution_count": null,
176
  "metadata": {},
177
  "outputs": [
178
  {
 
188
  "source": [
189
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
190
  "\n",
191
+ "# TF-IDF Vectorizer\n",
192
+ "vectorizer = TfidfVectorizer(max_features=5000)\n",
193
  "\n",
194
  "# Fit and transform training data, then transform test data\n",
195
  "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
196
  "X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
197
  "\n",
198
+ "# Extract labels\n",
199
  "y_train = train_df[\"label\"]\n",
200
  "y_test = test_df[\"label\"]\n",
201
  "\n",
 
206
  },
207
  {
208
  "cell_type": "code",
209
+ "execution_count": null,
210
  "metadata": {},
211
  "outputs": [
212
  {
 
234
  "from sklearn.linear_model import LogisticRegression\n",
235
  "from sklearn.metrics import accuracy_score, classification_report\n",
236
  "\n",
237
+ "# Train the model\n",
238
+ "model = LogisticRegression(max_iter=1000)\n",
239
  "model.fit(X_train, y_train)\n",
240
  "\n",
241
+ "# Make predictions\n",
242
  "y_pred = model.predict(X_test)\n",
243
  "\n",
244
  "# Evaluate the model\n",