{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 1.Overview of the Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from matplotlib import pyplot as plt\n", "%matplotlib inline\n", "import matplotlib\n", "matplotlib.rcParams[\"figure.figsize\"] = (20,10)\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: 'C:/Users/vijay/OneDrive/Desktop/Data Analysis Project/BHV.csv'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df1 \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mC:/Users/vijay/OneDrive/Desktop/Data Analysis Project/BHV.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m df1\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m5\u001b[39m)\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.._deprecate_kwarg..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 935\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 936\u001b[0m dialect,\n\u001b[0;32m 937\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 946\u001b[0m defaults\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelimiter\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[0;32m 947\u001b[0m )\n\u001b[0;32m 948\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 602\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 604\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[0;32m 607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 608\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1439\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1441\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1733\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[0;32m 1734\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1736\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1737\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1738\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1739\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1740\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1741\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1742\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1743\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1746\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", "File \u001b[1;32mc:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 851\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 852\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 853\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 854\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 855\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m 857\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 858\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 859\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 863\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 864\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 865\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:/Users/vijay/OneDrive/Desktop/Data Analysis Project/BHV.csv'" ] } ], "source": [ "df1 = pd.read_csv(\"C:/Users/vijay/OneDrive/Desktop/Data Analysis Project/BHV.csv\")\n", "df1.head(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13320, 9)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "area_type\n", "Built-up Area 2418\n", "Carpet Area 87\n", "Plot Area 2025\n", "Super built-up Area 8790\n", "Name: area_type, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.groupby('area_type')['area_type'].agg('count')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['area_type', 'availability', 'location', 'size', 'society',\n", " 'total_sqft', 'bath', 'balcony', 'price'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Super built-up Area', 'Plot Area', 'Built-up Area',\n", " 'Carpet Area'], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['area_type'].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Super built-up Area 8790\n", "Built-up Area 2418\n", "Plot Area 2025\n", "Carpet Area 87\n", "Name: area_type, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['area_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13320, 5)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df1.drop(['area_type','society','balcony','availability'], axis = 'columns')\n", "df2.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2.Data Cleaning (Handling NA values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location 1\n", "size 16\n", "total_sqft 0\n", "bath 73\n", "price 0\n", "dtype: int64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13320, 5)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location 0\n", "size 0\n", "total_sqft 0\n", "bath 0\n", "price 0\n", "dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = df2.dropna()\n", "df3.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13246, 5)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.Feature Engineering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Adding new feature (in integer format) for bhk (Bedroom,Hall,Kitchen)\n", "2. Add new feature called price per square feet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Adding new feature (in integer format) for bhk (Bedroom,Hall,Kitchen)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vijay\\AppData\\Local\\Temp\\ipykernel_7480\\2716584372.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))\n" ] }, { "data": { "text/plain": [ "array([ 2, 4, 3, 6, 1, 8, 7, 5, 11, 9, 27, 10, 19, 16, 43, 14, 12,\n", " 13, 18], dtype=int64)" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))\n", "df3.bhk.unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "def is_float(x):\n", " try:\n", " float(x)\n", " except:\n", " return False\n", " return True" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhk
30Yelahanka4 BHK2100 - 28504.0186.0004
122Hebbal4 BHK3067 - 81564.0477.0004
1378th Phase JP Nagar2 BHK1042 - 11052.054.0052
165Sarjapur2 BHK1145 - 13402.043.4902
188KR Puram2 BHK1015 - 15402.056.8002
410Kengeri1 BHK34.46Sq. Meter1.018.5001
549Hennur Road2 BHK1195 - 14402.063.7702
648Arekere9 Bedroom4125Perch9.0265.0009
661Yelahanka2 BHK1120 - 11452.048.1302
672Bettahalsoor4 Bedroom3090 - 50024.0445.0004
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk\n", "30 Yelahanka 4 BHK 2100 - 2850 4.0 186.000 4\n", "122 Hebbal 4 BHK 3067 - 8156 4.0 477.000 4\n", "137 8th Phase JP Nagar 2 BHK 1042 - 1105 2.0 54.005 2\n", "165 Sarjapur 2 BHK 1145 - 1340 2.0 43.490 2\n", "188 KR Puram 2 BHK 1015 - 1540 2.0 56.800 2\n", "410 Kengeri 1 BHK 34.46Sq. Meter 1.0 18.500 1\n", "549 Hennur Road 2 BHK 1195 - 1440 2.0 63.770 2\n", "648 Arekere 9 Bedroom 4125Perch 9.0 265.000 9\n", "661 Yelahanka 2 BHK 1120 - 1145 2.0 48.130 2\n", "672 Bettahalsoor 4 Bedroom 3090 - 5002 4.0 445.000 4" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3[~df3['total_sqft'].apply(is_float)].head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Above shows that total_sqft can be a range (e.g. 2100-2850). For such case we can just take average of min and max value in the range. There are other cases such as 34.46Sq. Meter which one can convert to square ft using unit conversion. I am going to just drop such corner cases to keep things simple" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "def convert_sqft_to_num(x):\n", " tokens = x.split('-')\n", " if len(tokens) == 2:\n", " return (float(tokens[0])+float(tokens[1]))/2\n", " try:\n", " return float(x)\n", " except:\n", " return None " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhk
0Electronic City Phase II2 BHK1056.02.039.072
1Chikka Tirupathi4 Bedroom2600.05.0120.004
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk\n", "0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2\n", "1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "df4 = df3.copy()\n", "df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)\n", "df4 = df4[df4.total_sqft.notnull()]\n", "df4.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location Yelahanka\n", "size 4 BHK\n", "total_sqft 2475.0\n", "bath 4.0\n", "price 186.0\n", "bhk 4\n", "Name: 30, dtype: object" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "df4.loc[30]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2. Add new feature called price per square feet" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
0Electronic City Phase II2 BHK1056.02.039.0723699.810606
1Chikka Tirupathi4 Bedroom2600.05.0120.0044615.384615
2Uttarahalli3 BHK1440.02.062.0034305.555556
3Lingadheeranahalli3 BHK1521.03.095.0036245.890861
4Kothanur2 BHK1200.02.051.0024250.000000
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk \\\n", "0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2 \n", "1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4 \n", "2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3 \n", "3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3 \n", "4 Kothanur 2 BHK 1200.0 2.0 51.00 2 \n", "\n", " price_per_sqft \n", "0 3699.810606 \n", "1 4615.384615 \n", "2 4305.555556 \n", "3 6245.890861 \n", "4 4250.000000 " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5 = df4.copy()\n", "df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']\n", "df5.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.320000e+04\n", "mean 7.920759e+03\n", "std 1.067272e+05\n", "min 2.678298e+02\n", "25% 4.267701e+03\n", "50% 5.438331e+03\n", "75% 7.317073e+03\n", "max 1.200000e+07\n", "Name: price_per_sqft, dtype: float64" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5_stats = df5['price_per_sqft'].describe()\n", "df5_stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df5.to_csv(\"BHV.csv\",index=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Examine locations which is a categorical variable. We need to apply dimensionality reduction technique here to reduce number of locations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Whitefield 533\n", "Sarjapur Road 392\n", "Electronic City 304\n", "Kanakpura Road 264\n", "Thanisandra 235\n", " ... \n", "Rajanna Layout 1\n", "Subramanyanagar 1\n", "Lakshmipura Vidyaanyapura 1\n", "Malur Hosur Road 1\n", "Abshot Layout 1\n", "Name: location, Length: 1287, dtype: int64" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.location = df5.location.apply(lambda x: x.strip())\n", "location_stats = df5['location'].value_counts(ascending=False)\n", "location_stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "13200" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "location_stats.values.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "240" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(location_stats[location_stats>10])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1287" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(location_stats)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1047" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(location_stats[location_stats<=10])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Dimensionality Reduction" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BTM 1st Stage 10\n", "Gunjur Palya 10\n", "Nagappa Reddy Layout 10\n", "Sector 1 HSR Layout 10\n", "Thyagaraja Nagar 10\n", " ..\n", "Rajanna Layout 1\n", "Subramanyanagar 1\n", "Lakshmipura Vidyaanyapura 1\n", "Malur Hosur Road 1\n", "Abshot Layout 1\n", "Name: location, Length: 1047, dtype: int64" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "location_stats_less_than_10 = location_stats[location_stats<=10]\n", "location_stats_less_than_10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1287" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "len(df5.location.unique())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "241" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)\n", "len(df5.location.unique())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
0Electronic City Phase II2 BHK1056.02.039.0723699.810606
1Chikka Tirupathi4 Bedroom2600.05.0120.0044615.384615
2Uttarahalli3 BHK1440.02.062.0034305.555556
3Lingadheeranahalli3 BHK1521.03.095.0036245.890861
4Kothanur2 BHK1200.02.051.0024250.000000
5Whitefield2 BHK1170.02.038.0023247.863248
6Old Airport Road4 BHK2732.04.0204.0047467.057101
7Rajaji Nagar4 BHK3300.04.0600.00418181.818182
8Marathahalli3 BHK1310.03.063.2534828.244275
9other6 Bedroom1020.06.0370.00636274.509804
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk \\\n", "0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2 \n", "1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4 \n", "2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3 \n", "3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3 \n", "4 Kothanur 2 BHK 1200.0 2.0 51.00 2 \n", "5 Whitefield 2 BHK 1170.0 2.0 38.00 2 \n", "6 Old Airport Road 4 BHK 2732.0 4.0 204.00 4 \n", "7 Rajaji Nagar 4 BHK 3300.0 4.0 600.00 4 \n", "8 Marathahalli 3 BHK 1310.0 3.0 63.25 3 \n", "9 other 6 Bedroom 1020.0 6.0 370.00 6 \n", "\n", " price_per_sqft \n", "0 3699.810606 \n", "1 4615.384615 \n", "2 4305.555556 \n", "3 6245.890861 \n", "4 4250.000000 \n", "5 3247.863248 \n", "6 7467.057101 \n", "7 18181.818182 \n", "8 4828.244275 \n", "9 36274.509804 " ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.head(10)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Outlier Removal Using Business Logic" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Outlier Removal Using Standard Deviation and Mean\n", "2. Outlier Removal Using Bathroom Features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Outlier Removal Using Standard Deviation and Mean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "As a data scientist when you have a conversation with your business manager (who has expertise in real estate), he will tell you that normally square ft per bedroom is 300 (i.e. 2 bhk apartment is minimum 600 sqft. If you have for example 400 sqft apartment with 2 bhk than that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum thresold per bhk to be 300 sqft" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
9other6 Bedroom1020.06.0370.0636274.509804
45HSR Layout8 Bedroom600.09.0200.0833333.333333
58Murugeshpalya6 Bedroom1407.04.0150.0610660.980810
68Devarachikkanahalli8 Bedroom1350.07.085.086296.296296
70other3 Bedroom500.03.0100.0320000.000000
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk \\\n", "9 other 6 Bedroom 1020.0 6.0 370.0 6 \n", "45 HSR Layout 8 Bedroom 600.0 9.0 200.0 8 \n", "58 Murugeshpalya 6 Bedroom 1407.0 4.0 150.0 6 \n", "68 Devarachikkanahalli 8 Bedroom 1350.0 7.0 85.0 8 \n", "70 other 3 Bedroom 500.0 3.0 100.0 3 \n", "\n", " price_per_sqft \n", "9 36274.509804 \n", "45 33333.333333 \n", "58 10660.980810 \n", "68 6296.296296 \n", "70 20000.000000 " ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5[df5.total_sqft/df5.bhk<300].head()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13200, 7)" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(12456, 7)" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6 = df5[~(df5.total_sqft/df5.bhk<300)]\n", "df6.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Outlier removal using standard deviation and mean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 12456.000000\n", "mean 6308.502826\n", "std 4168.127339\n", "min 267.829813\n", "25% 4210.526316\n", "50% 5294.117647\n", "75% 6916.666667\n", "max 176470.588235\n", "Name: price_per_sqft, dtype: float64" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6.price_per_sqft.describe()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we find that min price per sqft is 267 rs/sqft whereas max is 12000000, this shows a wide variation in property prices. We should remove outliers per location using mean and one standard deviation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10242, 7)" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def remove_pps_outliers(df):\n", " df_out = pd.DataFrame()\n", " for key, subdf in df.groupby('location'):\n", " m = np.mean(subdf.price_per_sqft)\n", " st = np.std(subdf.price_per_sqft)\n", " reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]\n", " df_out = pd.concat([df_out,reduced_df],ignore_index=True)\n", " return df_out\n", "df7 = remove_pps_outliers(df6)\n", "df7.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's check if for a given location how does the 2 BHK and 3 BHK property prices look like\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def plot_scatter_chart(df,location):\n", " bhk2 = df[(df.location==location) & (df.bhk==2)]\n", " bhk3 = df[(df.location==location) & (df.bhk==3)]\n", " matplotlib.rcParams['figure.figsize'] = (15,10)\n", " plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)\n", " plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)\n", " plt.xlabel(\"Total Square Feet Area\")\n", " plt.ylabel(\"Price (Lakh Indian Rupees)\")\n", " plt.title(location)\n", " plt.legend()\n", " \n", "plot_scatter_chart(df7,\"Rajaji Nagar\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plot_scatter_chart(df7,\"Hebbal\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We should also remove properties where for same location, the price of (for example) 3 bedroom apartment is less than 2 bedroom apartment (with same square ft area). What we will do is for a given location, we will build a dictionary of stats per bhk, i.e.\n", "\n", "{\n", " '1' : {\n", "\n", " \n", " 'mean': 4000,\n", " 'std: 2000,\n", " 'count': 34\n", " },\n", " '2' : {\n", " 'mean': 4300,\n", " 'std: 2300,\n", " 'count': 22\n", " }, \n", "}\n", "Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7317, 7)" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def remove_bhk_outliers(df):\n", " exclude_indices = np.array([])\n", " for location, location_df in df.groupby('location'):\n", " bhk_stats = {}\n", " for bhk, bhk_df in location_df.groupby('bhk'):\n", " bhk_stats[bhk] = {\n", " 'mean': np.mean(bhk_df.price_per_sqft),\n", " 'std': np.std(bhk_df.price_per_sqft),\n", " 'count': bhk_df.shape[0]\n", " }\n", " for bhk, bhk_df in location_df.groupby('bhk'):\n", " stats = bhk_stats.get(bhk-1)\n", " if stats and stats['count']>5:\n", " exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)\n", " return df.drop(exclude_indices,axis='index')\n", "df8 = remove_bhk_outliers(df7)\n", "# df8 = df7.copy()\n", "df8.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Plot same scatter chart again to visualize price_per_sqft for 2 BHK and 3 BHK properties\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plot_scatter_chart(df8,\"Rajaji Nagar\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plot_scatter_chart(df8,\"Hebbal\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Based on above charts we can see that data points highlighted in red below are outliers and they are being removed due to remove_bhk_outliers function" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Count')" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "import matplotlib\n", "matplotlib.rcParams[\"figure.figsize\"] = (20,10)\n", "plt.hist(df8.price_per_sqft,rwidth=0.8)\n", "plt.xlabel(\"Price Per Square Feet\")\n", "plt.ylabel(\"Count\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2. Outlier Removal Using Bathrooms Features" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 4., 3., 2., 5., 8., 1., 6., 7., 9., 12., 16., 13.])" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df8.bath.unique()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Count')" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.hist(df8.bath,rwidth=0.8)\n", "plt.xlabel(\"Number of bathrooms\")\n", "plt.ylabel(\"Count\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
5277Neeladri Nagar10 BHK4000.012.0160.0104000.000000
8483other10 BHK12000.012.0525.0104375.000000
8572other16 BHK10000.016.0550.0165500.000000
9306other11 BHK6000.012.0150.0112500.000000
9637other13 BHK5425.013.0275.0135069.124424
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk price_per_sqft\n", "5277 Neeladri Nagar 10 BHK 4000.0 12.0 160.0 10 4000.000000\n", "8483 other 10 BHK 12000.0 12.0 525.0 10 4375.000000\n", "8572 other 16 BHK 10000.0 16.0 550.0 16 5500.000000\n", "9306 other 11 BHK 6000.0 12.0 150.0 11 2500.000000\n", "9637 other 13 BHK 5425.0 13.0 275.0 13 5069.124424" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df8[df8.bath>10]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is unusual to have 2 more bathrooms than number of bedrooms in a home\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
1626Chikkabanavar4 Bedroom2460.07.080.043252.032520
5238Nagasandra4 Bedroom7000.08.0450.046428.571429
6711Thanisandra3 BHK1806.06.0116.036423.034330
8408other6 BHK11338.09.01000.068819.897689
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk price_per_sqft\n", "1626 Chikkabanavar 4 Bedroom 2460.0 7.0 80.0 4 3252.032520\n", "5238 Nagasandra 4 Bedroom 7000.0 8.0 450.0 4 6428.571429\n", "6711 Thanisandra 3 BHK 1806.0 6.0 116.0 3 6423.034330\n", "8408 other 6 BHK 11338.0 9.0 1000.0 6 8819.897689" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df8[df8.bath>df8.bhk+2]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Again the business manager has a conversation with you (i.e. a data scientist) that if you have 4 bedroom home and even if you have bathroom in all 4 rooms plus one guest bathroom, you will have total bath = total bed + 1 max. Anything above that is an outlier or a data error and can be removed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7239, 7)" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df9 = df8[df8.bath\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
01st Block Jayanagar4 BHK2850.04.0428.0415017.543860
11st Block Jayanagar3 BHK1630.03.0194.0311901.840491
\n", "" ], "text/plain": [ " location size total_sqft bath price bhk price_per_sqft\n", "0 1st Block Jayanagar 4 BHK 2850.0 4.0 428.0 4 15017.543860\n", "1 1st Block Jayanagar 3 BHK 1630.0 3.0 194.0 3 11901.840491" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "df9.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationtotal_sqftbathpricebhk
01st Block Jayanagar2850.04.0428.04
11st Block Jayanagar1630.03.0194.03
21st Block Jayanagar1875.02.0235.03
\n", "
" ], "text/plain": [ " location total_sqft bath price bhk\n", "0 1st Block Jayanagar 2850.0 4.0 428.0 4\n", "1 1st Block Jayanagar 1630.0 3.0 194.0 3\n", "2 1st Block Jayanagar 1875.0 2.0 235.0 3" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df10 = df9.drop(['size','price_per_sqft'],axis='columns')\n", "df10.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. One Hot Encoding for location" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout5th Phase JP Nagar6th Phase JP Nagar7th Phase JP Nagar8th Phase JP Nagar9th Phase JP Nagar...Vishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpurother
01000000000...0000000000
11000000000...0000000000
21000000000...0000000000
\n", "

3 rows × 241 columns

\n", "
" ], "text/plain": [ " 1st Block Jayanagar 1st Phase JP Nagar 2nd Phase Judicial Layout \\\n", "0 1 0 0 \n", "1 1 0 0 \n", "2 1 0 0 \n", "\n", " 2nd Stage Nagarbhavi 5th Block Hbr Layout 5th Phase JP Nagar \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", " 6th Phase JP Nagar 7th Phase JP Nagar 8th Phase JP Nagar \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", " 9th Phase JP Nagar ... Vishveshwarya Layout Vishwapriya Layout \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "\n", " Vittasandra Whitefield Yelachenahalli Yelahanka Yelahanka New Town \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "\n", " Yelenahalli Yeshwanthpur other \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", "[3 rows x 241 columns]" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummies = pd.get_dummies(df10.location)\n", "dummies.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationtotal_sqftbathpricebhk1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout...VijayanagarVishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpur
01st Block Jayanagar2850.04.0428.0410000...0000000000
11st Block Jayanagar1630.03.0194.0310000...0000000000
21st Block Jayanagar1875.02.0235.0310000...0000000000
31st Block Jayanagar1200.02.0130.0310000...0000000000
41st Block Jayanagar1235.02.0148.0210000...0000000000
\n", "

5 rows × 245 columns

\n", "
" ], "text/plain": [ " location total_sqft bath price bhk 1st Block Jayanagar \\\n", "0 1st Block Jayanagar 2850.0 4.0 428.0 4 1 \n", "1 1st Block Jayanagar 1630.0 3.0 194.0 3 1 \n", "2 1st Block Jayanagar 1875.0 2.0 235.0 3 1 \n", "3 1st Block Jayanagar 1200.0 2.0 130.0 3 1 \n", "4 1st Block Jayanagar 1235.0 2.0 148.0 2 1 \n", "\n", " 1st Phase JP Nagar 2nd Phase Judicial Layout 2nd Stage Nagarbhavi \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " 5th Block Hbr Layout ... Vijayanagar Vishveshwarya Layout \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "3 0 ... 0 0 \n", "4 0 ... 0 0 \n", "\n", " Vishwapriya Layout Vittasandra Whitefield Yelachenahalli Yelahanka \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 \n", "\n", " Yelahanka New Town Yelenahalli Yeshwanthpur \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", "[5 rows x 245 columns]" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis='columns')\n", "df11.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_sqftbathpricebhk1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout5th Phase JP Nagar...VijayanagarVishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpur
02850.04.0428.04100000...0000000000
11630.03.0194.03100000...0000000000
\n", "

2 rows × 244 columns

\n", "
" ], "text/plain": [ " total_sqft bath price bhk 1st Block Jayanagar 1st Phase JP Nagar \\\n", "0 2850.0 4.0 428.0 4 1 0 \n", "1 1630.0 3.0 194.0 3 1 0 \n", "\n", " 2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr Layout \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "\n", " 5th Phase JP Nagar ... Vijayanagar Vishveshwarya Layout \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "\n", " Vishwapriya Layout Vittasandra Whitefield Yelachenahalli Yelahanka \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "\n", " Yelahanka New Town Yelenahalli Yeshwanthpur \n", "0 0 0 0 \n", "1 0 0 0 \n", "\n", "[2 rows x 244 columns]" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12 = df11.drop('location',axis='columns')\n", "df12.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. Model Building" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7239, 244)" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12.shape\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_sqftbathbhk1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout5th Phase JP Nagar6th Phase JP Nagar...VijayanagarVishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpur
02850.04.041000000...0000000000
11630.03.031000000...0000000000
21875.02.031000000...0000000000
\n", "

3 rows × 243 columns

\n", "
" ], "text/plain": [ " total_sqft bath bhk 1st Block Jayanagar 1st Phase JP Nagar \\\n", "0 2850.0 4.0 4 1 0 \n", "1 1630.0 3.0 3 1 0 \n", "2 1875.0 2.0 3 1 0 \n", "\n", " 2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr Layout \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", " 5th Phase JP Nagar 6th Phase JP Nagar ... Vijayanagar \\\n", "0 0 0 ... 0 \n", "1 0 0 ... 0 \n", "2 0 0 ... 0 \n", "\n", " Vishveshwarya Layout Vishwapriya Layout Vittasandra Whitefield \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "\n", " Yelachenahalli Yelahanka Yelahanka New Town Yelenahalli Yeshwanthpur \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "\n", "[3 rows x 243 columns]" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = df12.drop(['price'],axis='columns')\n", "X.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7239, 243)" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 428.0\n", "1 194.0\n", "2 235.0\n", "Name: price, dtype: float64" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y = df12.price\n", "y.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7239" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8629132245229447" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "lr_clf = LinearRegression()\n", "lr_clf.fit(X_train,y_train)\n", "lr_clf.score(X_test,y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 8. Accuracy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.82702546, 0.86027005, 0.85322178, 0.8436466 , 0.85481502])" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import ShuffleSplit\n", "from sklearn.model_selection import cross_val_score\n", "\n", "cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)\n", "\n", "cross_val_score(LinearRegression(), X, y, cv=cv)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that in 5 iterations we get a score above 80% all the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Find best model using GridSearchCV" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vijay\\AppData\\Local\\Temp\\ipykernel_7480\\980312241.py:50: RuntimeWarning: overflow encountered in exp\n", " y_pred = np.exp(gs.predict(X))\n", "c:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\metrics\\_regression.py:1196: RuntimeWarning: overflow encountered in square\n", " numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "The best fit model is: linear_regression\n", "The R-squared score for the best fit model is: 0.8543503020740678\n", " model best_score \\\n", "0 linear_regression -2.106080e+14 \n", "1 exponential_regression -2.106080e+14 \n", "2 logarithmic_regression -2.106080e+14 \n", "\n", " best_params r2_score \n", "0 {'copy_X': True, 'fit_intercept': False, 'n_jo... 0.854350 \n", "1 {'copy_X': True, 'fit_intercept': False, 'n_jo... -inf \n", "2 {'copy_X': True, 'fit_intercept': False, 'n_jo... -1.162146 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n", " warnings.warn(\n" ] } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.model_selection import GridSearchCV\n", "import pandas as pd\n", "from sklearn.metrics import r2_score\n", "import numpy as np\n", "import math\n", "\n", "# Define the algorithms and their parameters\n", "algos = {\n", " 'linear_regression': {\n", " 'model': LinearRegression(),\n", " 'params': {\n", " 'copy_X': [True, False],\n", " 'fit_intercept': [True, False],\n", " 'n_jobs': [None],\n", " 'positive': [False]\n", " }\n", " },\n", " 'exponential_regression': {\n", " 'model': LinearRegression(),\n", " 'params': {\n", " 'copy_X': [True, False],\n", " 'fit_intercept': [True, False],\n", " 'n_jobs': [None],\n", " 'positive': [False]\n", " }\n", " },\n", " 'logarithmic_regression': {\n", " 'model': LinearRegression(),\n", " 'params': {\n", " 'copy_X': [True, False],\n", " 'fit_intercept': [True, False],\n", " 'n_jobs': [None],\n", " 'positive': [False]\n", " }\n", " }\n", "}\n", "\n", "# Your function to find the best model using GridSearchCV\n", "def find_best_model_using_gridsearchcv(X, y):\n", " scores = []\n", " cv = 5 # Number of cross-validation folds\n", "\n", " for algo_name, config in algos.items():\n", " gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)\n", " gs.fit(X, y)\n", " \n", " # Exponential Regression\n", " if algo_name == 'exponential_regression':\n", " y_pred = np.exp(gs.predict(X))\n", " y_pred = np.where(y_pred == np.inf, np.nan, y_pred) # Replace infinity with NaN\n", " y_pred = np.nan_to_num(y_pred) # Replace NaN with 0\n", " r2_exp = r2_score(y, y_pred)\n", " scores.append({\n", " 'model': algo_name,\n", " 'best_score': gs.best_score_,\n", " 'best_params': gs.best_params_,\n", " 'r2_score': r2_exp\n", " })\n", " \n", " # Logarithmic Regression \n", " elif algo_name == 'logarithmic_regression':\n", " X_log = np.where(X <= 0, 1, X) # Replace non-positive values with 1\n", " y_pred = gs.predict(np.log(X_log))\n", " r2_log = r2_score(y, y_pred)\n", " scores.append({\n", " 'model': algo_name,\n", " 'best_score': gs.best_score_,\n", " 'best_params': gs.best_params_,\n", " 'r2_score': r2_log\n", " })\n", " \n", " # Linear Regression\n", " else:\n", " y_pred = gs.predict(X)\n", " r2_lin = r2_score(y, y_pred)\n", " scores.append({\n", " 'model': algo_name,\n", " 'best_score': gs.best_score_,\n", " 'best_params': gs.best_params_,\n", " 'r2_score': r2_lin\n", " })\n", "\n", " # Create the results DataFrame\n", " results = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params', 'r2_score'])\n", " \n", " # Determine the best fit model\n", " best_fit_model = results.loc[results['r2_score'].idxmax()]['model']\n", " best_r2_score = results['r2_score'].max()\n", " \n", " print(\"The best fit model is:\", best_fit_model)\n", " print(\"The R-squared score for the best fit model is:\", best_r2_score)\n", " \n", " return results\n", "\n", "# Call your function with X and y\n", "result = find_best_model_using_gridsearchcv(X, y)\n", "print(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 9. Testing Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "def predict_price(location,sqft,bath,bhk): \n", " loc_index = np.where(X.columns==location)[0][0]\n", "\n", " x = np.zeros(len(X.columns))\n", " x[0] = sqft\n", " x[1] = bath\n", " x[2] = bhk\n", " if loc_index >= 0:\n", " x[loc_index] = 1\n", "\n", " return lr_clf.predict([x])[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "83.86570258311501" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_price('1st Phase JP Nagar',1000, 2, 2)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\vijay\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "86.08062284986265" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_price('1st Phase JP Nagar',1000, 3, 3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 10. Export the tested model to a pickle file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "with open('banglore_home_prices_model.pickle','wb') as f:\n", " pickle.dump(lr_clf,f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "columns = {\n", " 'data_columns' : [col.lower() for col in X.columns]\n", "}\n", "with open(\"columns.json\",\"w\") as f:\n", " f.write(json.dumps(columns))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 2 }