{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 82, "metadata": { "cellView": "form", "id": "AXI2uCSkxx7m" }, "outputs": [], "source": [ "#@title Setup\n", "\n", "%%capture\n", "!pip install networkx pulp numpy pandas\n", "\n", "!rm -rf ./data/\n", "!mkdir -p ./data/\n", "!wget -c -O ./data/lastfm_asia.zip \"https://snap.stanford.edu/data/lastfm_asia.zip\"\n", "!unzip -q ./data/lastfm_asia.zip -d ./data/" ] }, { "cell_type": "code", "source": [ "#@title Problem 3: Linear Programming\n", "\n", "\n", "from pulp import *\n", "from IPython.display import HTML, display\n", "\n", "def display_table(table):\n", " display(HTML(\n", " '{}
'.format(\n", " ''.join(\n", " '{}'.format(''.join(str(_) for _ in row)) for row in table)\n", " )\n", " ))\n", "\n", "problem = LpProblem(\"MSML_602_Midterm_Q3\", LpMaximize)\n", "\n", "X = LpVariable(\"X\", cat=\"Integer\")\n", "Y = LpVariable(\"Y\", cat=\"Integer\")\n", "\n", "problem += (5 * X) + (3 * Y), \"Objective\"\n", "problem += X + (2 * Y) <= 14, \"Constraint 1\"\n", "problem += (3* X) - Y >= 0, \"Constraint 2\"\n", "problem += X - Y <= 2, \"Constraint 3\"\n", "\n", "problem.solve()\n", "print(\"Solution:\\n\")\n", "\n", "data = [[\"Variable\", \"Value\"]] + [[v.name, v.varValue] for v in problem.variables()]\n", "data += [[\"Max value for objective function: \", problem.objective.value()]]\n", "display_table(data)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 140 }, "cellView": "form", "id": "ALmlZnbcx-9e", "outputId": "45e2c507-3265-4b22-e21a-2d6dbd72f05f" }, "execution_count": 83, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Solution:\n", "\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "
VariableValue
X6.0
Y4.0
Max value for objective function: 42.0
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "#@title Problem 5: Graph Metrics\n", "\n", "import pandas as pd \n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", "\n", "df = pd.read_csv(\"/content/data/lasftm_asia/lastfm_asia_edges.csv\")\n", "G = nx.from_pandas_edgelist(df, source=\"node_1\", target=\"node_2\")\n", "shortest_path = nx.shortest_path_length(G, 0)\n", "del shortest_path[0]\n", "num = len(shortest_path)\n", "total_length = sum([shortest_path[k] for k in shortest_path])\n", "avg_shortest_path = total_length / num\n", "print(f\"The average shortest path length from node 0 to all other nodes is: {avg_shortest_path}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "cellView": "form", "id": "87mMC-B1yJoq", "outputId": "83ff28ec-7d51-4f6a-ced6-358538a58f83" }, "execution_count": 84, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The average shortest path length from node 0 to all other nodes is: 5.651974288337925\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Problem 6: Extracting Webpage Data" ], "metadata": { "id": "mAiJRhb5iW5O" } }, { "cell_type": "code", "source": [ "#@title Scraping result\n", "\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd \n", "import numpy as np\n", "\n", "page = requests.get(\"https://www.worldometers.info/coronavirus/#countries\")\n", "html = page.content\n", "\n", "soup = BeautifulSoup(html, 'html.parser')\n", "table = soup.find(\"table\", {\"id\": \"main_table_countries_today\"})\n", "\n", "cols = [\n", " '#', 'Country', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered',\n", " 'NewRecovered','ActiveCases','Serious,Critical','TotalCases/1M pop','Deaths/1M pop', \n", " 'TotalTests', 'Tests/1M pop', 'Population', 'Continent', '1 Case every X ppl', '1 Death every X ppl',\n", " '1 Test every X ppl', 'New Cases/1M pop', 'New Deaths/1M pop', 'Active Cases/1M pop'\n", "]\n", "\n", "tbody = table.find(\"tbody\")\n", "rows = tbody.find_all(\"tr\")\n", "\n", "data = []\n", "for row in rows:\n", " cells = row.find_all(\"td\")\n", " values = [c.text for c in cells]\n", " data.append(values)\n", "\n", "def sanitize_country_number(row):\n", " val = row[\"#\"]\n", " if not val.strip():\n", " return np.NaN\n", " else:\n", " return val\n", "\n", "def fill_active_cases(row):\n", " val = row[\"ActiveCases\"]\n", " if not np.isnan(val):\n", " return val\n", " active_per_1_mil = row[\"Active Cases/1M pop\"]\n", " if np.isnan(active_per_1_mil):\n", " return np.nan\n", " population = row[\"Population\"]\n", " return (active_per_1_mil/1000000) * population\n", "\n", "def to_float(col):\n", " def mapper(row):\n", " if row[col] == \"N/A\":\n", " return np.NaN\n", " val = row[col]\n", " val = val.replace(\",\", \"\").strip()\n", " if not val:\n", " return np.NaN\n", " return float(val)\n", " return mapper \n", "\n", "df = pd.DataFrame(data, columns=cols)\n", "df.replace(r\"\\n\", \"\", regex=True, inplace=True)\n", "\n", "df.head()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 386 }, "cellView": "form", "id": "Ay-ceRkkzcVg", "outputId": "3b1e8535-f211-45ba-9b90-85c279e522ec" }, "execution_count": 85, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " # Country TotalCases NewCases TotalDeaths NewDeaths \\\n", "0 North America 118,308,960 +16,354 1,557,219 +76 \n", "1 Asia 195,343,819 +168,618 1,491,630 +230 \n", "2 Europe 235,496,414 +41,389 1,948,669 +165 \n", "3 South America 64,557,158 +10,126 1,333,737 +79 \n", "4 Oceania 12,691,699 +3,057 21,779 +9 \n", "\n", " TotalRecovered NewRecovered ActiveCases Serious,Critical ... TotalTests \\\n", "0 113,762,872 +16,362 2,988,869 7,881 ... \n", "1 188,186,652 +78,736 5,665,537 9,159 ... \n", "2 229,427,346 +175,758 4,120,399 7,685 ... \n", "3 62,884,992 +7,699 338,429 10,119 ... \n", "4 12,512,305 157,615 97 ... \n", "\n", " Tests/1M pop Population Continent 1 Case every X ppl \\\n", "0 North America \n", "1 Asia \n", "2 Europe \n", "3 South America \n", "4 Australia/Oceania \n", "\n", " 1 Death every X ppl 1 Test every X ppl New Cases/1M pop New Deaths/1M pop \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "\n", " Active Cases/1M pop \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "\n", "[5 rows x 22 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#CountryTotalCasesNewCasesTotalDeathsNewDeathsTotalRecoveredNewRecoveredActiveCasesSerious,Critical...TotalTestsTests/1M popPopulationContinent1 Case every X ppl1 Death every X ppl1 Test every X pplNew Cases/1M popNew Deaths/1M popActive Cases/1M pop
0North America118,308,960+16,3541,557,219+76113,762,872+16,3622,988,8697,881...North America
1Asia195,343,819+168,6181,491,630+230188,186,652+78,7365,665,5379,159...Asia
2Europe235,496,414+41,3891,948,669+165229,427,346+175,7584,120,3997,685...Europe
3South America64,557,158+10,1261,333,737+7962,884,992+7,699338,42910,119...South America
4Oceania12,691,699+3,05721,779+912,512,305157,61597...Australia/Oceania
\n", "

5 rows × 22 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 85 } ] }, { "cell_type": "code", "source": [ "#@title Data sanitization / generation\n", "\n", "#@markdown Some of the countries (actually ships, in this case) did not have any population data, so I excluded those records from the dataset.\n", "\n", "#@markdown Some countries didn't have data for exact active cases, but had data for **active cases per 1 million population**. \n", "#@markdown For these countries, I calculated their active cases by using the active cases per 1 million population data as follows: \n", "\n", "#@markdown ```Active Cases = (Active cases per 1 million population / 1,000,000) * Population```\n", "\n", "df[\"country_number\"] = df.apply(sanitize_country_number, axis=1)\n", "\n", "data_by_country = df[df[\"country_number\"].notna()].copy()\n", "data_by_country[\"ActiveCases\"] = data_by_country.apply(to_float(\"ActiveCases\"), axis=1)\n", "data_by_country[\"Active Cases/1M pop\"] = data_by_country.apply(to_float(\"Active Cases/1M pop\"), axis=1)\n", "data_by_country[\"Population\"] = data_by_country.apply(to_float(\"Population\"), axis=1)\n", "data_by_country[\"ActiveCases\"] = data_by_country.apply(fill_active_cases, axis=1)\n", "data_by_country[data_by_country[\"ActiveCases\"] == \"N/A\"].head(20)\n", "aggregated = data_by_country.groupby(\"Country\").agg({'ActiveCases':'mean', 'Population':'sum'}, as_index=False)\n", "aggregated.reset_index(inplace=True)\n", "dropped_countries = aggregated[aggregated[\"Population\"] == 0 ]\n", "aggregated = aggregated[aggregated[\"Population\"] != 0 ]\n", "aggregated[\"PercentageInfected\"] = aggregated.apply(lambda x: x[\"ActiveCases\"]/x[\"Population\"], axis=1)\n", "aggregated.sort_values([\"PercentageInfected\"], ascending=False, inplace=True)\n", "\n", "print(\"These were the countries(ships) that didn't have population data:\\n\")\n", "print(dropped_countries)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "cellView": "form", "id": "Roitzj22-VO5", "outputId": "5dc61fa2-31c9-4fa4-828c-c9f88fb449e2" }, "execution_count": 86, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "These were the countries(ships) that didn't have population data:\n", "\n", " Country ActiveCases Population\n", "56 Diamond Princess 0.0 0.0\n", "120 MS Zaandam 0.0 0.0\n" ] } ] }, { "cell_type": "code", "source": [ "#@title Average active cases & the proportion of the total population affected\n", "\n", "from IPython.display import HTML, display\n", "\n", "def display_table(table):\n", " display(HTML(\n", " '{}
'.format(\n", " ''.join(\n", " '{}'.format(''.join(str(_) for _ in row)) for row in table)\n", " )\n", " ))\n", "\n", "avg_active_cases = aggregated[\"ActiveCases\"].mean()\n", "\n", "aggr = aggregated.agg({\"ActiveCases\": \"sum\", \"Population\": \"sum\"}, as_index=False)\n", "final_df = aggr.to_frame().T\n", "final_df[\"PercentageInfected\"] = final_df.apply(lambda x: (x[\"ActiveCases\"]/x[\"Population\"]) * 100, axis=1)\n", "percentage_infected = final_df[\"PercentageInfected\"].to_numpy()[0]\n", "\n", "display(HTML(\n", " \"\"\"\n", "

Result:

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Average active cases:{0:.2f}
Proportion of total
population currently infected:
{1:.2f}%
\n", "
\n", " \"\"\".format(avg_active_cases, percentage_infected))\n", ")\n", "\n", "\n", "print(\"\"\"\n", "I was unsure whether the problem wanted the percentage of the population\n", "affected for each country, so I have included the percentage for each country \n", "as well, just in case:\n", "\"\"\")\n", "aggregated.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 439 }, "cellView": "form", "id": "lRtwSfqSAPAY", "outputId": "a7037d5d-fbd6-48b3-e47b-32090720dfd1" }, "execution_count": 87, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "

Result:

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Average active cases:60038.20
Proportion of total
population currently infected:
0.17%
\n", "
\n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "I was unsure whether the problem wanted the percentage of the population\n", "affected for each country, so I have included the percentage for each country \n", "as well, just in case:\n", "\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " Country ActiveCases Population PercentageInfected\n", "129 Martinique 222576.901869 374087.0 0.594987\n", "68 Faeroe Islands 26936.998989 49233.0 0.547133\n", "195 St. Barth 4854.999825 9945.0 0.488185\n", "84 Guadeloupe 193026.939904 399794.0 0.482816\n", "93 Iceland 130899.111498 345393.0 0.378986" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryActiveCasesPopulationPercentageInfected
129Martinique222576.901869374087.00.594987
68Faeroe Islands26936.99898949233.00.547133
195St. Barth4854.9998259945.00.488185
84Guadeloupe193026.939904399794.00.482816
93Iceland130899.111498345393.00.378986
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 87 } ] } ] }