{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# csv1 = pd.read_csv(\n",
    "#     \"yt.csv\",\n",
    "#     delimiter=\";\"\n",
    "# )\n",
    "# df1 = csv1[[\"Comment\", \"Spam\"]].copy()\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# df1['Comment'] = df1['Comment'].replace('\\n', ' ', regex=True)\n",
    "\n",
    "# df1= df1.fillna(0)\n",
    "# df1['Spam'] = df1['Spam'].astype(int)\n",
    "# df1 = df1.dropna()\n",
    "\n",
    "# df1 =df1.drop_duplicates()\n",
    "\n",
    "\n",
    "\n",
    "csv2 = pd.read_csv(\n",
    "    \"../spam_or_not_spam.csv\",\n",
    "    delimiter=\",\"\n",
    ")\n",
    "df2 = csv2[[\"text\", \"label_num\"]].copy()\n",
    "\n",
    "# df2[\"v1\"] = df2['v1'].map( {'spam': 1, 'ham': 0} )\n",
    "\n",
    "\n",
    "# df2['label_num'] = df2['label_num'].fillna(0).astype(int)\n",
    "\n",
    "df2 = df2.dropna()\n",
    "df2 =df2.drop_duplicates()\n",
    "\n",
    "# df2 = df2.reindex(columns=['v2', 'v1'])\n",
    "\n",
    "df2['text'] = df2['text'].replace('\\n', ' ', regex=True).replace('^ ', '', regex=True).replace('$ ', '', regex=True).replace('Subject: ', '', regex=True)\n",
    "\n",
    "\n",
    "df2\n",
    "\n",
    "\n",
    "\n",
    "df2.to_csv(\"./spam-dataset2.csv\",index=0)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "23bd4cdf",
   "metadata": {},
   "source": [
    "# 1->spam\n",
    "# 0-> not spam"
   ]
  },
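  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the label encoding above. This is only a sketch: it assumes the first cell has already written `./spam-dataset2.csv`, reloads that file into an illustrative `check` frame, and counts how many rows carry each `label_num` value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Sanity-check sketch: reload the cleaned dataset written above.\n",
    "# Assumes ./spam-dataset2.csv exists (produced by the first cell);\n",
    "# `check` is just an illustrative variable name.\n",
    "check = pd.read_csv(\"./spam-dataset2.csv\")\n",
    "\n",
    "# label_num should contain only 0 (not spam) and 1 (spam).\n",
    "print(check[\"label_num\"].value_counts())"
   ]
  },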
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ad83ce85",
   "metadata": {},
   "outputs": [
    {
     "ename": "ParserError",
     "evalue": "Error tokenizing data. C error: Expected 3 fields in line 6048, saw 4\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mParserError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[15], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m csv2 \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\n\u001b[1;32m      3\u001b[0m     \u001b[39m\"\u001b[39;49m\u001b[39mspam.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m      4\u001b[0m     delimiter\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m,\u001b[39;49m\u001b[39m\"\u001b[39;49m\n\u001b[1;32m      5\u001b[0m )\n\u001b[1;32m      8\u001b[0m df2 \u001b[39m=\u001b[39m csv2[[\u001b[39m\"\u001b[39m\u001b[39mBody\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mLabel\u001b[39m\u001b[39m\"\u001b[39m]]\n\u001b[1;32m      9\u001b[0m df2\u001b[39m=\u001b[39m df2\u001b[39m.\u001b[39mcopy()\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m    899\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    900\u001b[0m     dialect,\n\u001b[1;32m    901\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    908\u001b[0m     dtype_backend\u001b[39m=\u001b[39mdtype_backend,\n\u001b[1;32m    909\u001b[0m )\n\u001b[1;32m    910\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:583\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    580\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\n\u001b[1;32m    582\u001b[0m \u001b[39mwith\u001b[39;00m parser:\n\u001b[0;32m--> 583\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\u001b[39m.\u001b[39;49mread(nrows)\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1704\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m   1697\u001b[0m nrows \u001b[39m=\u001b[39m validate_integer(\u001b[39m\"\u001b[39m\u001b[39mnrows\u001b[39m\u001b[39m\"\u001b[39m, nrows)\n\u001b[1;32m   1698\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m   1699\u001b[0m     \u001b[39m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m   1700\u001b[0m     (\n\u001b[1;32m   1701\u001b[0m         index,\n\u001b[1;32m   1702\u001b[0m         columns,\n\u001b[1;32m   1703\u001b[0m         col_dict,\n\u001b[0;32m-> 1704\u001b[0m     ) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_engine\u001b[39m.\u001b[39;49mread(  \u001b[39m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m   1705\u001b[0m         nrows\n\u001b[1;32m   1706\u001b[0m     )\n\u001b[1;32m   1707\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m   1708\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py:234\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m    233\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlow_memory:\n\u001b[0;32m--> 234\u001b[0m         chunks \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_reader\u001b[39m.\u001b[39;49mread_low_memory(nrows)\n\u001b[1;32m    235\u001b[0m         \u001b[39m# destructive to chunks\u001b[39;00m\n\u001b[1;32m    236\u001b[0m         data \u001b[39m=\u001b[39m _concatenate_chunks(chunks)\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:812\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:873\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:848\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:859\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:2025\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 3 fields in line 6048, saw 4\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "csv2 = pd.read_csv(\n",
    "    \"spam.csv\",\n",
    "    delimiter=\",\"\n",
    ")\n",
    "\n",
    "\n",
    "df2 = csv2[[\"Body\", \"Label\"]]\n",
    "df2= df2.copy()\n",
    "\n",
    "df2['Label'] = df2['Label'].astype(int)\n",
    "df2 = df2.dropna()\n",
    "df2 =df2.drop_duplicates()\n",
    "\n",
    "df2['Body'] = df2['Body'].replace('\\n', ' ', regex=True)\n",
    "df2['Body'] = df2['Body'].replace('empty', '', regex=True)\n",
    "\n",
    "df2"
   ]
  },
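  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If skipping bad rows is not acceptable, looking at the raw line the parser complained about can help. The sketch below is only a diagnostic: it assumes `spam.csv` sits in the working directory and prints line 6048, the line number reported by the ParserError, so the extra field can be located by eye."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Diagnostic sketch: print the raw line that pandas could not tokenize.\n",
    "# 6048 is the line number reported by the ParserError above.\n",
    "with open(\"spam.csv\", encoding=\"utf-8\", errors=\"replace\") as f:\n",
    "    for lineno, raw_line in enumerate(f, start=1):\n",
    "        if lineno == 6048:\n",
    "            print(raw_line)\n",
    "            break"
   ]
  },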
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "423c6595",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}