{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# csv1 = pd.read_csv(\n", "# \"yt.csv\",\n", "# delimiter=\";\"\n", "# )\n", "# df1 = csv1[[\"Comment\", \"Spam\"]].copy()\n", "\n", "\n", "\n", "\n", "# df1['Comment'] = df1['Comment'].replace('\\n', ' ', regex=True)\n", "\n", "# df1= df1.fillna(0)\n", "# df1['Spam'] = df1['Spam'].astype(int)\n", "# df1 = df1.dropna()\n", "\n", "# df1 =df1.drop_duplicates()\n", "\n", "\n", "\n", "csv2 = pd.read_csv(\n", " \"../spam_or_not_spam.csv\",\n", " delimiter=\",\"\n", ")\n", "df2 = csv2[[\"text\", \"label_num\"]].copy()\n", "\n", "# df2[\"v1\"] = df2['v1'].map( {'spam': 1, 'ham': 0} )\n", "\n", "\n", "# df2['label_num'] = df2['label_num'].fillna(0).astype(int)\n", "\n", "df2 = df2.dropna()\n", "df2 =df2.drop_duplicates()\n", "\n", "# df2 = df2.reindex(columns=['v2', 'v1'])\n", "\n", "df2['text'] = df2['text'].replace('\\n', ' ', regex=True).replace('^ ', '', regex=True).replace('$ ', '', regex=True).replace('Subject: ', '', regex=True)\n", "\n", "\n", "df2\n", "\n", "\n", "\n", "df2.to_csv(\"./spam-dataset2.csv\",index=0)\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "23bd4cdf", "metadata": {}, "source": [ "# 1->spam\n", "# 0-> not spam" ] }, { "cell_type": "code", "execution_count": 15, "id": "ad83ce85", "metadata": {}, "outputs": [ { "ename": "ParserError", "evalue": "Error tokenizing data. C error: Expected 3 fields in line 6048, saw 4\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[15], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m csv2 \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\n\u001b[1;32m 3\u001b[0m \u001b[39m\"\u001b[39;49m\u001b[39mspam.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 4\u001b[0m delimiter\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m,\u001b[39;49m\u001b[39m\"\u001b[39;49m\n\u001b[1;32m 5\u001b[0m )\n\u001b[1;32m 8\u001b[0m df2 \u001b[39m=\u001b[39m csv2[[\u001b[39m\"\u001b[39m\u001b[39mBody\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mLabel\u001b[39m\u001b[39m\"\u001b[39m]]\n\u001b[1;32m 9\u001b[0m df2\u001b[39m=\u001b[39m df2\u001b[39m.\u001b[39mcopy()\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[39m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:583\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n\u001b[1;32m 582\u001b[0m \u001b[39mwith\u001b[39;00m parser:\n\u001b[0;32m--> 583\u001b[0m \u001b[39mreturn\u001b[39;00m parser\u001b[39m.\u001b[39;49mread(nrows)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1704\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1697\u001b[0m nrows \u001b[39m=\u001b[39m validate_integer(\u001b[39m\"\u001b[39m\u001b[39mnrows\u001b[39m\u001b[39m\"\u001b[39m, nrows)\n\u001b[1;32m 1698\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1699\u001b[0m \u001b[39m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m 1700\u001b[0m (\n\u001b[1;32m 1701\u001b[0m index,\n\u001b[1;32m 1702\u001b[0m columns,\n\u001b[1;32m 1703\u001b[0m col_dict,\n\u001b[0;32m-> 1704\u001b[0m ) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_engine\u001b[39m.\u001b[39;49mread( \u001b[39m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m 1705\u001b[0m nrows\n\u001b[1;32m 1706\u001b[0m )\n\u001b[1;32m 1707\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 1708\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py:234\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 233\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlow_memory:\n\u001b[0;32m--> 234\u001b[0m chunks \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_reader\u001b[39m.\u001b[39;49mread_low_memory(nrows)\n\u001b[1;32m 235\u001b[0m \u001b[39m# destructive to chunks\u001b[39;00m\n\u001b[1;32m 236\u001b[0m data \u001b[39m=\u001b[39m _concatenate_chunks(chunks)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:812\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:873\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:848\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:859\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx:2025\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 3 fields in line 6048, saw 4\n" ] } ], "source": [ "import pandas as pd\n", "csv2 = pd.read_csv(\n", " \"spam.csv\",\n", " delimiter=\",\"\n", ")\n", "\n", "\n", "df2 = csv2[[\"Body\", \"Label\"]]\n", "df2= df2.copy()\n", "\n", "df2['Label'] = df2['Label'].astype(int)\n", "df2 = df2.dropna()\n", "df2 =df2.drop_duplicates()\n", "\n", "df2['Body'] = df2['Body'].replace('\\n', ' ', regex=True)\n", "df2['Body'] = df2['Body'].replace('empty', '', regex=True)\n", "\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "id": "423c6595", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 5 }