{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "from os.path import join, dirname, exists\n", "import concurrent\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "import pdfplumber\n", "from datetime import datetime\n", "import requests\n", "import shutil\n", "import sys" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def check_exists(date):\n", " str_date = date.strftime('%Y%m%d')\n", " file_name = f\"AQI_Bulletin_{str_date}.pdf\"\n", " file_path = f\"AQI_data/{file_name}\"\n", " return exists(file_path), file_path, file_name\n", "\n", "def download(date):\n", " file_exists, file_path, file_name = check_exists(date)\n", " if file_exists:\n", " return file_path\n", "\n", " os.makedirs(\"AQI_data\", exist_ok=True)\n", "\n", " url = f\"https://cpcb.nic.in//upload/Downloads/{file_name}\"\n", " response = requests.get(url)\n", " if response.status_code == 200:\n", " with open(file_path, 'wb') as f:\n", " f.write(response.content)\n", " return file_path\n", " else:\n", " print(f\"Failed to download {url} with status code {response.status_code}\")\n", " return None" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',\n", " '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',\n", " '2016-01-09', '2016-01-10',\n", " ...\n", " '2025-02-17', '2025-02-18', '2025-02-19', '2025-02-20',\n", " '2025-02-21', '2025-02-22', '2025-02-23', '2025-02-24',\n", " '2025-02-25', '2025-02-26'],\n", " dtype='datetime64[ns]', length=3345, freq='D')\n" ] }, { "data": { "text/plain": [ "(None, 3345)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range('2016-01-01', datetime.today() - pd.Timedelta(days=1), freq='D')\n", "# dates = pd.date_range('2024-01-01', '2024-02-01', freq='D')\n", "print(dates), len(dates)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20160606.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20170618.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20171014.pdf with status code 404\n", "Failed to download https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_20250101.pdf with status code 404\n" ] } ], "source": [ "with concurrent.futures.ThreadPoolExecutor(48) as executor:\n", " files = list(executor.map(download, dates))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3345\n" ] }, { "data": { "text/plain": [ "3341" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(len(files))\n", "files = list(filter(None, files))\n", "len(files)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "438a2a0c07fb4367b18a4deff93364e9", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3345 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def check_valid(value):\n", " if value is None:\n", " return False\n", " if value == \"\":\n", " 
return False\n", " return True\n", "\n", "def process_pattern_1(table, i, key):\n", " # second line\n", " if (key is not None) and (not check_valid(table[i][0])):\n", " air_quality = table[i][2]\n", " return air_quality\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " key = table[i][0]\n", " aqi = int(table[i][4])\n", " pollutant = table[i][5]\n", " air_quality = process_pattern_1(table, i+1, key)\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": None}}\n", " \n", "def process_pattern_2(table, i, key):\n", " # second line\n", " try:\n", " if (key is not None) and (not check_valid(table[i][0])):\n", " air_quality = table[i][2]\n", " return air_quality\n", " except Exception as e:\n", " print(table[i-1])\n", " raise e\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " try:\n", " key = table[i][0]\n", " if check_valid(table[i][1]):\n", " air_quality = table[i][1]\n", " need_for_second_line = False\n", " else:\n", " need_for_second_line = True\n", " aqi = int(table[i][4])\n", " pollutant = table[i][5] # p2\n", " n_stations = table[i][6]\n", " except Exception as e:\n", " print(table[i])\n", " print(table)\n", " raise e\n", " if need_for_second_line:\n", " air_quality = process_pattern_2(table, i+1, key)\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n", " \n", "def process_pattern_3(table, i, key):\n", " # second line\n", " if (key is not None) and (not check_valid(table[i][0])):\n", " air_quality = table[i][2]\n", " return air_quality\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " key = table[i][0]\n", " if check_valid(table[i][1]):\n", " air_quality = table[i][1]\n", " second_line_needed = False\n", " else:\n", " second_line_needed = True\n", " aqi = int(table[i][2])\n", " pollutant = table[i][3]\n", " n_stations = table[i][4]\n", " if second_line_needed:\n", " air_quality = process_pattern_3(table, i+1, key)\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n", "\n", "def process_pattern_4(table, i, key):\n", " # ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'Based on Number\\nof Monitoring\\nStations'], ['1', 'Agra', 'Moderate', '138', 'PM\\n2.5', '1'], [None, None, '', None, None, None], ['2', 'Ahmedabad', 'Satisfactory', '77', 'PM\\n10', '1'], [None, None, '', None, None, None], ['3', 'Aizawl', 'Satisfactory', '53', 'PM\\n2.5', '1'], [None, None, '', None, None, None], ['4', 'Ajmer', 'Satisfa\n", " # # invalid line\n", " # if (key is not None) and (not check_valid(table[i][0])):\n", " # air_quality = table[i][2]\n", " # return air_quality\n", " \n", " # first line\n", " if check_valid(table[i][0]):\n", " key = table[i][1]\n", " air_quality = table[i][2]\n", " aqi = int(table[i][3].split(\"\\n\")[0])\n", " pollutant = table[i][4]\n", " n_stations = table[i][5]\n", " return {key: {\"AQI\": aqi, \"Pollutant\": pollutant, \"Air Quality\": air_quality, \"Based on number of monitoring stations\": n_stations}}\n", " \n", "\n", "def process_table(table, start):\n", " data_dict = {}\n", " if table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant']:\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_1(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_1(table, 
i, None))\n", " \n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', None, None, 'Prominent\\nPollutant', 'Based on\\nnumber of\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nmonitoring\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', None, None, 'Prominent\\nPollutant', 'Based on number of\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nmonitoring\\nstations']) or (table[0] == ['City', 'Air Quality', None, None, 'Index\\nValue', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']) or (table[0] == ['City', 'Air Quality', None, None, None, None, 'Index Value', 'Prominent\\nPollutant', 'Based on\\nnumber of\\nstations']):\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_2(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_2(table, i, None))\n", " \n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif table[0] == ['City', 'Air Quality', 'Index Value', 'Prominent\\nPollutant', 'Based on number of\\nmonitoring stations']:\n", " # print(\"Pattern 3\")\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_3(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_3(table, i, None))\n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif (table[0] == ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Polluta\\nnt', 'Based on number\\nof monitoring\\nstations']) or (table[0] == ['S.No', 'City', 'Air Quality', 'Index Value', 'Prominent Pollutant', 'Based on Number\\nof Monitoring\\nStations']) or (table[0] == ['S.No', 'City', 'Air Quality', 'Index\\nValue', 'Prominent Pollutant', 'No. 
of Stations\\nParticipated/\\nTotal Stations']):\n", " # print(\"Pattern 4\")\n", " table = table[1:]\n", " for i in range(len(table)):\n", " data = process_pattern_4(table, i, None)\n", " if data:\n", " data_dict.update(process_pattern_4(table, i, None))\n", " \n", " df = pd.DataFrame(data_dict).T\n", " df.index.name = \"City\"\n", " df.reset_index(inplace=True, drop=False)\n", " return df, None\n", " elif (table[0] == ['Good', 'Minimal impact']) or (table[0] == ['Good', 'Minimal Impact']) or (table[0] == ['AQI', 'Category', 'Color Code', 'Possible Health Impacts']):\n", " # print(\"Not a data table\")\n", " return None, None\n", " else:\n", " print(table)\n", " raise ValueError(\"Table pattern not recognized\")\n", "\n", "def process_file(date):\n", " folders = [\"AQI_data\", \"AQI_data_csv\"]\n", "\n", " for folder in folders:\n", " if not os.path.exists(folder):\n", " os.makedirs(folder)\n", " file_exists, file_path, file_name = check_exists(date)\n", " if not file_exists:\n", " print(f\"File {file_name} does not exist\")\n", " return None\n", " save_path = file_name.replace(\".pdf\",\".csv\")\n", " if exists(f\"AQI_data_csv/{save_path}\"):\n", " try:\n", " pd.read_csv(f\"AQI_data_csv/{save_path}\")\n", " # print(f\"File {save_path} already exists\")\n", " return None\n", " except Exception as e:\n", " print(f\"File {save_path} is corrupted and will be overwritten\")\n", " \n", " tables = []\n", " with pdfplumber.open(file_path) as pdf:\n", " for page in pdf.pages:\n", " table = page.extract_table()\n", " if table:\n", " tables.append(table)\n", " try:\n", " assert len(tables) > 0, f\"No tables found in {file_path}\"\n", " except AssertionError:\n", " print(f\"No tables found in {file_path}\")\n", " return None\n", "\n", " df_list = []\n", " for table in tables:\n", " try:\n", " df, _ = process_table(table, 0)\n", " if df is not None:\n", " df_list.append(df)\n", " except Exception as e:\n", " print(f\"Ignoring a table for {file_name}\")\n", " # print(table)\n", " print(\"Error message:\", e)\n", " \n", " if len(df_list) == 0:\n", " print(f\"No valid tables found in {file_name}\")\n", " return None\n", " \n", " df = pd.concat(df_list, ignore_index=True)\n", " df['Date'] = date\n", " df.to_csv(f\"AQI_data_csv/{save_path}\", index=False)\n", "\n", "from joblib import Parallel, delayed\n", "\n", "# dfs = {}\n", "# for file_path in tqdm(files[1000:]):\n", " # print(file_path)\n", " # df = process_file(file_path)\n", " # dfs[file_path] s= df\n", "# print(dates[15:16])\n", "_ = Parallel(48)(delayed(process_file)(file_path) for file_path in tqdm(dates))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating Merged DataFrame" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merged CSV saved as AQI_data_csv/merged.csv\n" ] } ], "source": [ "import os\n", "import pandas as pd\n", "\n", "def merge_csv_files(folder_path, output_file):\n", " csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]\n", " \n", " if not csv_files:\n", " print(\"No CSV files found in the folder.\")\n", " return\n", "\n", " df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]\n", " merged_df = pd.concat(df_list, ignore_index=True)\n", "\n", " merged_df.to_csv(output_file, index=False)\n", " print(f\"Merged CSV saved as {output_file}\")\n", "\n", "# Example usage\n", "folder_path = \"AQI_data_csv\"\n", "output_file = \"AQI_data_csv/merged.csv\"\n", "merge_csv_files(folder_path, 
output_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Postprocessing" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "397732" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df = pd.read_csv(\"AQI_data_csv/merged.csv\")\n", "len(merged_df)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Agartala', 'Agra', 'Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola', 'Alwar', 'Ambala', 'Amravati', 'Amritsar', 'Anantapur', 'Angul', 'Ankleshwar', 'Araria', 'Ariyalur', 'Arrah', 'Asansol', 'Aurangabad (Bihar)', 'Aurangabad(Maharashtra)', 'Baddi', 'Badlapur', 'Bagalkot', 'Baghpat', 'Bahadurgarh', 'Balasore', 'Ballabgarh', 'Banswara', 'Baran', 'Barbil', 'Bareilly', 'Baripada', 'Barmer', 'Barrackpore', 'Bathinda', 'Begusarai', 'Belapur', 'Belgaum', 'Bengaluru', 'Bettiah', 'Bhagalpur', 'Bharatpur', 'Bhilai', 'Bhilwara', 'Bhiwadi', 'Bhiwandi', 'Bhiwani', 'Bhopal', 'Bhubaneswar', 'Bidar', 'Bihar Sharif', 'Bikaner', 'Bilaspur', 'Bileipada', 'Boisar', 'Brajrajnagar', 'Bulandshahr', 'Bundi', 'Buxar', 'Byasanagar', 'Byrnihat', 'Chamarajanagar', 'Chandigarh', 'Chandrapur', 'Charkhi Dadri', 'Chengalpattu', 'Chennai', 'Chhal', 'Chhapra', 'Chikkaballapur', 'Chikkamagaluru', 'Chittoor', 'Chittorgarh', 'Churu', 'Coimbtore', 'Cuddalore', 'Cuttack', 'Damoh', 'Darbhanga', 'Dausa', 'Davanagere', 'Dehradun', 'Delhi', 'Dewas', 'Dhanbad', 'Dharuhera', 'Dharwad', 'Dholpur', 'Dhule', 'Dindigul', 'Dungarpur', 'Durgapur', 'Eloor', 'Ernakulam', 'Faridabad', 'Fatehabad', 'Firozabad', 'Gadag', 'Gandhinagar', 'Gangtok', 'Gaya', 'Ghaziabad', 'Gorakhpur', 'Greater_Noida', 'Gummidipoondi', 'Gurugram', 'Guwahati', 'Gwalior', 'Hajipur', 'Haldia', 'Hanumangarh', 'Hapur', 'Hassan', 'Haveri', 'Hisar', 'Hosur', 'Howrah', 'Hubballi', 'Hyderabad', 'Imphal', 'Indore', 'Jabalpur', 'Jaipur', 'Jaisalmer', 'Jalandhar', 'Jalgaon', 'Jalna', 'Jalore', 'Jhalawar', 'Jhansi', 'Jharsuguda', 'Jhunjhunu', 'Jind', 'Jodhpur', 'Jorapokhar', 'Kadapa', 'Kaithal', 'Kalaburgi', 'Kalyan', 'Kanchipuram', 'Kannur', 'Kanpur', 'Karauli', 'Karnal', 'Karur', 'Karwar', 'Kashipur', 'Katihar', 'Katni', 'Keonjhar', 'Khanna', 'Khurja', 'Kishanganj', 'Kochi', 'Kohima', 'Kolar', 'Kolhapur', 'Kolkata', 'Kollam', 'Koppal', 'Korba', 'Kota', 'Kozhikode', 'Kunjemura', 'Kurushketra', 'Latur', 'Loni_Ghaziabad', 'Lucknow', 'Ludhiana', 'Madurai', 'Mahad', 'Maihar', 'Malegaon', 'Mandi Gobindgarh', 'Mandideep', 'Mandikhera', 'Manesar', 'Mangalore', 'Manguraha', 'Medikeri', 'Meerut', 'Milupara', 'Mira-Bhayandar', 'Moradabad', 'Motihari', 'Mumbai', 'Munger', 'Muzaffarnagar', 'Muzaffarpur', 'Mysuru', 'NOIDA', 'Nagaon', 'Nagapattinam', 'Nagaur', 'Nagpur', 'Naharlagun', 'Nalbari', 'Nanded', 'Nandesari', 'Narnaul', 'Nashik', 'Navi Mumbai', 'Nayagarh', 'Noida', 'Ooty', 'Pali', 'Palkalaiperur', 'Palwal', 'Panchkula', 'Panipat', 'Parbhani', 'Pathardih', 'Patiala', 'Patna', 'Pimpri-Chinchwad', 'Pithampur', 'Pratapgarh', 'Prayagraj', 'Puducherry', 'Pudukottai', 'Pune', 'Purnia', 'Raichur', 'Raipur', 'Rairangpur', 'Rajamahendravaram', 'Rajgir', 'Rajsamand', 'Ramanagara', 'Ramanathapuram', 'Ranipet', 'Ratlam', 'Rishikesh', 'Rohtak', 'Rourkela', 'Rupnagar', 'Sagar', 'Saharsa', 'Salem', 'Samastipur', 'Sangli', 'Sasaram', 'Satna', 'Sawai Madhopur', 'Shillong', 'Shivamogga', 'Sikar', 'Silchar', 'Siliguri', 'Singrauli', 'Sirohi', 'Sirsa', 'Sivasagar', 'Siwan', 'Solapur', 
'Sonipat', 'Sri Ganganagar', 'Sri Vijaya Puram', 'Srinagar', 'Suakati', 'Surat', 'Talcher', 'Tensa', 'Thane', 'Thanjavur', 'Thiruvananthapuram', 'Thoothukudi', 'Thrissur', 'Tiruchirappalli', 'Tirunelveli', 'Tirupati', 'Tirupur', 'Tonk', 'Tumidih', 'Udaipur', 'Udupi', 'Ujjain', 'Ulhasnagar', 'Vapi', 'Varanasi', 'Vatva', 'Vellore', 'Vijayapura', 'Vijayawada', 'Virar', 'Virudhunagar', 'Visakhapatnam', 'Vrindavan', 'Yadgir', 'Yamunanagar']\n" ] } ], "source": [ "city_mapping = {\n", " \"Amaravati\": \"Amravati\",\n", " \"Asanol\": \"Asansol\",\n", " \"Greater Noida\": \"Greater_Noida\",\n", " \"GandhiNagar\": \"Gandhinagar\",\n", " \"Gurgaon\": \"Gurugram\",\n", " \"Coimbatore\": \"Coimbtore\",\n", " \"Kalaburagi\": \"Kalaburgi\",\n", " \"Kurukshetra\": \"Kurushketra\",\n", " \"Loni_Dehat\": \"Loni_Ghaziabad\",\n", " \"Madikeri\": \"Medikeri\",\n", " \"Manglore\": \"Mangalore\",\n", " \"Pimpri Chinchwad\": \"Pimpri-Chinchwad\",\n", " \"Tumakuru\": \"Tumidih\",\n", " \"Tirumala\": \"Tirupati\",\n", " \"Tiruppur\": \"Tirupur\",\n", " \"Yamuna Nagar\": \"Yamunanagar\",\n", " \"vellore\": \"Vellore\" # duplicate, can map to itself or be handled separately\n", "}\n", "def replace_it(x):\n", " x = x.strip().replace(\"\\n\",\"\")\n", " if x in city_mapping:\n", " return city_mapping[x]\n", " else:\n", " return x\n", "\n", "merged_df['City'] = merged_df['City'].apply(lambda x: replace_it(x))\n", "merged_df = merged_df[merged_df.City != \"Aurangabad\"]\n", "print(merged_df['City'].value_counts().sort_index().index.tolist())" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "State\n", "Andaman and Nicobar 6\n", "Andhra Pradesh 11546\n", "Arunachal Pradesh 614\n", "Assam 5099\n", "Bihar 28633\n", "Chandigarh 1980\n", "Chhattisgarh 5357\n", "Delhi 3330\n", "Gujarat 12195\n", "Haryana 50177\n", "Himachal Pradesh 1022\n", "Jammu and Kashmir 822\n", "Jharkhand 2076\n", "Karnataka 35248\n", "Kerala 11549\n", "Madhya Pradesh 31326\n", "Maharashtra 39193\n", "Manipur 790\n", "Meghalaya 1956\n", "Mizoram 1535\n", "Nagaland 1398\n", "Odisha 12363\n", "Puducherry 1433\n", "Punjab 19676\n", "Rajasthan 37729\n", "Sikkim 812\n", "Tamil Nadu 14080\n", "Telangana 3322\n", "Tripura 1442\n", "Uttar Pradesh 41800\n", "Uttarakhand 2156\n", "West Bengal 15406\n", "Name: count, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_to_state = {\n", " 'Agartala': 'Tripura', 'Agra': 'Uttar Pradesh', 'Ahmedabad': 'Gujarat', 'Ahmednagar': 'Maharashtra',\n", " 'Aizawl': 'Mizoram', 'Ajmer': 'Rajasthan', 'Akola': 'Maharashtra', 'Alwar': 'Rajasthan', \n", " 'Ambala': 'Haryana', 'Amravati': 'Maharashtra', 'Amritsar': 'Punjab', 'Anantapur': 'Andhra Pradesh', \n", " 'Angul': 'Odisha', 'Ankleshwar': 'Gujarat', 'Araria': 'Bihar', 'Ariyalur': 'Tamil Nadu', \n", " 'Arrah': 'Bihar', 'Asansol': 'West Bengal', 'Aurangabad (Bihar)': 'Bihar', \n", " 'Aurangabad(Maharashtra)': 'Maharashtra', 'Baddi': 'Himachal Pradesh', 'Badlapur': 'Maharashtra', \n", " 'Bagalkot': 'Karnataka', 'Baghpat': 'Uttar Pradesh', 'Bahadurgarh': 'Haryana', 'Balasore': 'Odisha', \n", " 'Ballabgarh': 'Haryana', 'Banswara': 'Rajasthan', 'Baran': 'Rajasthan', 'Barbil': 'Odisha', \n", " 'Bareilly': 'Uttar Pradesh', 'Baripada': 'Odisha', 'Barmer': 'Rajasthan', 'Barrackpore': 'West Bengal', \n", " 'Bathinda': 'Punjab', 'Begusarai': 'Bihar', 'Belapur': 'Maharashtra', 'Belgaum': 'Karnataka', \n", " 'Bengaluru': 'Karnataka', 'Bettiah': 'Bihar', 'Bhagalpur': 'Bihar', 
'Bharatpur': 'Rajasthan', \n", " 'Bhilai': 'Chhattisgarh', 'Bhilwara': 'Rajasthan', 'Bhiwadi': 'Rajasthan', 'Bhiwandi': 'Maharashtra', \n", " 'Bhiwani': 'Haryana', 'Bhopal': 'Madhya Pradesh', 'Bhubaneswar': 'Odisha', 'Bidar': 'Karnataka', \n", " 'Bihar Sharif': 'Bihar', 'Bikaner': 'Rajasthan', 'Bilaspur': 'Chhattisgarh', 'Bileipada': 'Odisha', \n", " 'Boisar': 'Maharashtra', 'Brajrajnagar': 'Odisha', 'Bulandshahr': 'Uttar Pradesh', 'Bundi': 'Rajasthan', \n", " 'Buxar': 'Bihar', 'Byasanagar': 'Odisha', 'Byrnihat': 'Meghalaya', 'Chamarajanagar': 'Karnataka', \n", " 'Chandigarh': 'Chandigarh', 'Chandrapur': 'Maharashtra', 'Charkhi Dadri': 'Haryana', \n", " 'Chengalpattu': 'Tamil Nadu', 'Chennai': 'Tamil Nadu', 'Chhal': 'Chhattisgarh', 'Chhapra': 'Bihar', \n", " 'Chikkaballapur': 'Karnataka', 'Chikkamagaluru': 'Karnataka', 'Chittoor': 'Andhra Pradesh', \n", " 'Chittorgarh': 'Rajasthan', 'Churu': 'Rajasthan', 'Coimbtore': 'Tamil Nadu', 'Cuddalore': 'Tamil Nadu', \n", " 'Cuttack': 'Odisha', 'Damoh': 'Madhya Pradesh', 'Darbhanga': 'Bihar', 'Dausa': 'Rajasthan', \n", " 'Davanagere': 'Karnataka', 'Dehradun': 'Uttarakhand', 'Delhi': 'Delhi', 'Dewas': 'Madhya Pradesh', \n", " 'Dhanbad': 'Jharkhand', 'Dharuhera': 'Haryana', 'Dharwad': 'Karnataka', 'Dholpur': 'Rajasthan', \n", " 'Dhule': 'Maharashtra', 'Dindigul': 'Tamil Nadu', 'Dungarpur': 'Rajasthan', 'Durgapur': 'West Bengal', \n", " 'Eloor': 'Kerala', 'Ernakulam': 'Kerala', 'Faridabad': 'Haryana', 'Fatehabad': 'Haryana', \n", " 'Firozabad': 'Uttar Pradesh', 'Gadag': 'Karnataka', 'Gandhinagar': 'Gujarat', 'Gangtok': 'Sikkim', \n", " 'Gaya': 'Bihar', 'Ghaziabad': 'Uttar Pradesh', 'Gorakhpur': 'Uttar Pradesh', 'Greater_Noida': 'Uttar Pradesh', \n", " 'Gummidipoondi': 'Tamil Nadu', 'Gurugram': 'Haryana', 'Guwahati': 'Assam', 'Gwalior': 'Madhya Pradesh', \n", " 'Hajipur': 'Bihar', 'Haldia': 'West Bengal', 'Hanumangarh': 'Rajasthan', 'Hapur': 'Uttar Pradesh', \n", " 'Hassan': 'Karnataka', 'Haveri': 'Karnataka', 'Hisar': 'Haryana', 'Hosur': 'Tamil Nadu', 'Howrah': 'West Bengal', \n", " 'Hubballi': 'Karnataka', 'Hyderabad': 'Telangana', 'Imphal': 'Manipur', 'Indore': 'Madhya Pradesh', \n", " 'Jabalpur': 'Madhya Pradesh', 'Jaipur': 'Rajasthan', 'Jaisalmer': 'Rajasthan', 'Jalandhar': 'Punjab', \n", " 'Jalgaon': 'Maharashtra', 'Jalna': 'Maharashtra', 'Jalore': 'Rajasthan', 'Jhalawar': 'Rajasthan', \n", " 'Jhansi': 'Uttar Pradesh', 'Jharsuguda': 'Odisha', 'Jhunjhunu': 'Rajasthan', 'Jind': 'Haryana', \n", " 'Jodhpur': 'Rajasthan', 'Jorapokhar': 'Jharkhand', 'Kadapa': 'Andhra Pradesh', 'Kaithal': 'Haryana', \n", " 'Kalaburgi': 'Karnataka', 'Kalyan': 'Maharashtra', 'Kanchipuram': 'Tamil Nadu', 'Kannur': 'Kerala', \n", " 'Kanpur': 'Uttar Pradesh', 'Karauli': 'Rajasthan', 'Karnal': 'Haryana', 'Karur': 'Tamil Nadu', \n", " 'Karwar': 'Karnataka', 'Kashipur': 'Uttarakhand', 'Katihar': 'Bihar', 'Katni': 'Madhya Pradesh', \n", " 'Keonjhar': 'Odisha', 'Khanna': 'Punjab', 'Khurja': 'Uttar Pradesh', 'Kishanganj': 'Bihar', \n", " 'Kochi': 'Kerala', 'Kohima': 'Nagaland', 'Kolar': 'Karnataka', 'Kolhapur': 'Maharashtra', \n", " 'Kolkata': 'West Bengal', 'Kollam': 'Kerala', 'Koppal': 'Karnataka', 'Korba': 'Chhattisgarh', \n", " 'Kota': 'Rajasthan', 'Kozhikode': 'Kerala', 'Kunjemura': 'Jharkhand', 'Kurushketra': 'Haryana', \n", " 'Latur': 'Maharashtra', 'Loni_Ghaziabad': 'Uttar Pradesh', 'Lucknow': 'Uttar Pradesh', 'Ludhiana': 'Punjab', \n", " 'Madurai': 'Tamil Nadu', 'Mahad': 'Maharashtra', 'Maihar': 'Madhya Pradesh', 'Malegaon': 'Maharashtra', \n", " 'Mandi Gobindgarh': 
'Punjab', 'Mandideep': 'Madhya Pradesh', 'Mandikhera': 'Haryana', 'Manesar': 'Haryana', \n", " 'Mangalore': 'Karnataka', 'Manguraha': 'Bihar', 'Medikeri': 'Karnataka', 'Meerut': 'Uttar Pradesh', \n", " 'Milupara': 'Chhattisgarh', 'Mira-Bhayandar': 'Maharashtra', 'Moradabad': 'Uttar Pradesh', \n", " 'Motihari': 'Bihar', 'Mumbai': 'Maharashtra', 'Munger': 'Bihar', 'Muzaffarnagar': 'Uttar Pradesh', \n", " 'Muzaffarpur': 'Bihar', 'Mysuru': 'Karnataka', 'NOIDA': 'Uttar Pradesh', 'Nagaon': 'Assam', \n", " 'Nagapattinam': 'Tamil Nadu', 'Nagaur': 'Rajasthan', 'Nagpur': 'Maharashtra', 'Naharlagun': 'Arunachal Pradesh', \n", " 'Nalbari': 'Assam', 'Nanded': 'Maharashtra', 'Nandesari': 'Gujarat', 'Narnaul': 'Haryana', 'Nashik': 'Maharashtra',\n", " 'Navi Mumbai': 'Maharashtra',\n", " 'Nayagarh': 'Odisha',\n", " 'Noida': 'Uttar Pradesh',\n", " 'Ooty': 'Tamil Nadu',\n", " 'Pali': 'Rajasthan',\n", " 'Palkalaiperur': 'Tamil Nadu',\n", " 'Palwal': 'Haryana',\n", " 'Panchkula': 'Haryana',\n", " 'Panipat': 'Haryana',\n", " 'Parbhani': 'Maharashtra',\n", " 'Pathardih': 'Jharkhand',\n", " 'Patiala': 'Punjab',\n", " 'Patna': 'Bihar',\n", " 'Pimpri-Chinchwad': 'Maharashtra',\n", " 'Pithampur': 'Madhya Pradesh',\n", " 'Pratapgarh': 'Rajasthan',\n", " 'Prayagraj': 'Uttar Pradesh',\n", " 'Puducherry': 'Puducherry',\n", " 'Pudukottai': 'Tamil Nadu',\n", " 'Pune': 'Maharashtra',\n", " 'Purnia': 'Bihar',\n", " 'Raichur': 'Karnataka',\n", " 'Raipur': 'Chhattisgarh',\n", " 'Rairangpur': 'Odisha',\n", " 'Rajamahendravaram': 'Andhra Pradesh',\n", " 'Rajgir': 'Bihar',\n", " 'Rajsamand': 'Rajasthan',\n", " 'Ramanagara': 'Karnataka',\n", " 'Ramanathapuram': 'Tamil Nadu',\n", " 'Ranipet': 'Tamil Nadu',\n", " 'Ratlam': 'Madhya Pradesh',\n", " 'Rishikesh': 'Uttarakhand',\n", " 'Rohtak': 'Haryana',\n", " 'Rourkela': 'Odisha',\n", " 'Rupnagar': 'Punjab',\n", " 'Sagar': 'Madhya Pradesh',\n", " 'Saharsa': 'Bihar',\n", " 'Salem': 'Tamil Nadu',\n", " 'Samastipur': 'Bihar',\n", " 'Sangli': 'Maharashtra',\n", " 'Sasaram': 'Bihar',\n", " 'Satna': 'Madhya Pradesh',\n", " 'Sawai Madhopur': 'Rajasthan',\n", " 'Shillong': 'Meghalaya',\n", " 'Shivamogga': 'Karnataka',\n", " 'Sikar': 'Rajasthan',\n", " 'Silchar': 'Assam',\n", " 'Siliguri': 'West Bengal',\n", " 'Singrauli': 'Madhya Pradesh',\n", " 'Sirohi': 'Rajasthan',\n", " 'Sirsa': 'Haryana',\n", " 'Sivasagar': 'Assam',\n", " 'Siwan': 'Bihar',\n", " 'Solapur': 'Maharashtra',\n", " 'Sonipat': 'Haryana',\n", " 'Sri Ganganagar': 'Rajasthan',\n", " 'Sri Vijaya Puram': 'Andaman and Nicobar',\n", " 'Srinagar': 'Jammu and Kashmir',\n", " 'Suakati': 'Odisha',\n", " 'Surat': 'Gujarat',\n", " 'Talcher': 'Odisha',\n", " 'Tensa': 'Odisha',\n", " 'Thane': 'Maharashtra',\n", " 'Thanjavur': 'Tamil Nadu',\n", " 'Thiruvananthapuram': 'Kerala',\n", " 'Thoothukudi': 'Tamil Nadu',\n", " 'Thrissur': 'Kerala',\n", " 'Tiruchirappalli': 'Tamil Nadu',\n", " 'Tirunelveli': 'Tamil Nadu',\n", " 'Tirupati': 'Andhra Pradesh',\n", " 'Tirupur': 'Tamil Nadu',\n", " 'Tonk': 'Rajasthan',\n", " 'Tumidih': 'Chhattisgarh',\n", " 'Udaipur': 'Rajasthan',\n", " 'Udupi': 'Karnataka',\n", " 'Ujjain': 'Madhya Pradesh',\n", " 'Ulhasnagar': 'Maharashtra',\n", " 'Vapi': 'Gujarat',\n", " 'Varanasi': 'Uttar Pradesh', \n", " 'Vatva': 'Gujarat', 'Vellore': 'Tamil Nadu',\n", " 'Vijayapura': 'Karnataka',\n", " 'Vijayawada': 'Andhra Pradesh',\n", " 'Virar': 'Maharashtra',\n", " 'Virudhunagar': 'Tamil Nadu',\n", " 'Visakhapatnam': 'Andhra Pradesh',\n", " 'Vrindavan': 'Uttar Pradesh',\n", " 'Yadgir': 'Karnataka',\n", " 'Yamunanagar': 
'Haryana'\n", "}\n", "merged_df['State'] = merged_df['City'].apply(lambda x: city_to_state[x])\n", "merged_df['State'].value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | City | \n", "AQI | \n", "Pollutant | \n", "Air Quality | \n", "Based on number of monitoring stations | \n", "Date | \n", "State | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Agra | \n", "417 | \n", "PM\\n2.5 | \n", "Severe | \n", "1 | \n", "2016-01-01 | \n", "Uttar Pradesh | \n", "
1 | \n", "Bengaluru | \n", "95 | \n", "PM , PM\\n2.5 10 | \n", "Satisfactory | \n", "5 | \n", "2016-01-01 | \n", "Karnataka | \n", "