Spaces:
Runtime error
Runtime error
File size: 8,601 Bytes
5c052bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# csv1 = pd.read_csv(\n",
"# \"yt.csv\",\n",
"# delimiter=\";\"\n",
"# )\n",
"# df1 = csv1[[\"Comment\", \"Spam\"]].copy()\n",
"\n",
"\n",
"\n",
"\n",
"# df1['Comment'] = df1['Comment'].replace('\\n', ' ', regex=True)\n",
"\n",
"# df1= df1.fillna(0)\n",
"# df1['Spam'] = df1['Spam'].astype(int)\n",
"# df1 = df1.dropna()\n",
"\n",
"# df1 =df1.drop_duplicates()\n",
"\n",
"\n",
"\n",
"csv2 = pd.read_csv(\n",
" \"../spam_or_not_spam.csv\",\n",
" delimiter=\",\"\n",
")\n",
"df2 = csv2[[\"text\", \"label_num\"]].copy()\n",
"\n",
"# df2[\"v1\"] = df2['v1'].map( {'spam': 1, 'ham': 0} )\n",
"\n",
"\n",
"# df2['label_num'] = df2['label_num'].fillna(0).astype(int)\n",
"\n",
"df2 = df2.dropna()\n",
"df2 =df2.drop_duplicates()\n",
"\n",
"# df2 = df2.reindex(columns=['v2', 'v1'])\n",
"\n",
"df2['text'] = df2['text'].replace('\\n', ' ', regex=True).replace('^ ', '', regex=True).replace('$ ', '', regex=True).replace('Subject: ', '', regex=True)\n",
"\n",
"\n",
"df2\n",
"\n",
"\n",
"\n",
"df2.to_csv(\"./spam-dataset2.csv\",index=0)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "23bd4cdf",
"metadata": {},
"source": [
"# 1->spam\n",
"# 0-> not spam"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ad83ce85",
"metadata": {},
"outputs": [
{
"ename": "ParserError",
"evalue": "Error tokenizing data. C error: Expected 3 fields in line 6048, saw 4\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[15], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m csv2 \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\n\u001b[1;32m 3\u001b[0m \u001b[39m\"\u001b[39;49m\u001b[39mspam.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 4\u001b[0m delimiter\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m,\u001b[39;49m\u001b[39m\"\u001b[39;49m\n\u001b[1;32m 5\u001b[0m )\n\u001b[1;32m 8\u001b[0m df2 \u001b[39m=\u001b[39m csv2[[\u001b[39m\"\u001b[39m\u001b[39mBody\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mLabel\u001b[39m\u001b[39m\"\u001b[39m]]\n\u001b[1;32m 9\u001b[0m df2\u001b[39m=\u001b[39m df2\u001b[39m.\u001b[39mcopy()\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[39m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:583\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n\u001b[1;32m 582\u001b[0m \u001b[39mwith\u001b[39;00m parser:\n\u001b[0;32m--> 583\u001b[0m \u001b[39mreturn\u001b[39;00m parser\u001b[39m.\u001b[39;49mread(nrows)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1704\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1697\u001b[0m nrows \u001b[39m=\u001b[39m validate_integer(\u001b[39m\"\u001b[39m\u001b[39mnrows\u001b[39m\u001b[39m\"\u001b[39m, nrows)\n\u001b[1;32m 1698\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1699\u001b[0m \u001b[39m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m 1700\u001b[0m (\n\u001b[1;32m 1701\u001b[0m index,\n\u001b[1;32m 1702\u001b[0m columns,\n\u001b[1;32m 1703\u001b[0m col_dict,\n\u001b[0;32m-> 1704\u001b[0m ) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_engine\u001b[39m.\u001b[39;49mread( \u001b[39m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m 1705\u001b[0m nrows\n\u001b[1;32m 1706\u001b[0m )\n\u001b[1;32m 1707\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 1708\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py:234\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 233\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlow_memory:\n\u001b[0;32m--> 234\u001b[0m chunks \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_reader\u001b[39m.\u001b[39;49mread_low_memory(nrows)\n\u001b[1;32m 235\u001b[0m \u001b[39m# destructive to chunks\u001b[39;00m\n\u001b[1;32m 236\u001b[0m data \u001b[39m=\u001b[39m _concatenate_chunks(chunks)\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:812\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:873\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:848\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:859\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:2025\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 3 fields in line 6048, saw 4\n"
]
}
],
"source": [
"import pandas as pd\n",
"csv2 = pd.read_csv(\n",
" \"spam.csv\",\n",
" delimiter=\",\"\n",
")\n",
"\n",
"\n",
"df2 = csv2[[\"Body\", \"Label\"]]\n",
"df2= df2.copy()\n",
"\n",
"df2['Label'] = df2['Label'].astype(int)\n",
"df2 = df2.dropna()\n",
"df2 =df2.drop_duplicates()\n",
"\n",
"df2['Body'] = df2['Body'].replace('\\n', ' ', regex=True)\n",
"df2['Body'] = df2['Body'].replace('empty', '', regex=True)\n",
"\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "423c6595",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|