{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5fcd59bc-72a4-4de7-9cdb-1b6eca9407fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-12-02 22:03:04.459 No runtime found, using MemoryCacheStorageManager\n"
     ]
    }
   ],
   "source": [
    "import ibis\n",
    "from ibis import _\n",
    "import streamlit as st\n",
    "\n",
    "# DuckDB connection with the httpfs, spatial and h3 extensions.\n",
    "con = ibis.duckdb.connect(\"duck.db\", extensions=['httpfs', 'spatial', 'h3'])\n",
    "\n",
    "\n",
    "def sql_quote(value):\n",
    "    \"\"\"Return `value` with single quotes doubled, so it is safe inside a SQL '...' literal.\"\"\"\n",
    "    return str(value).replace(\"'\", \"''\")\n",
    "\n",
    "\n",
    "# Local cloud: register S3 credentials for the MinIO endpoint, scoped to s3://cboettig/gbif.\n",
    "# The credentials are escaped because they are interpolated into the SQL text;\n",
    "# a key containing a single quote would otherwise break the statement.\n",
    "minio_key = st.secrets[\"MINIO_KEY\"]\n",
    "minio_secret = st.secrets[\"MINIO_SECRET\"]\n",
    "query = f'''\n",
    "CREATE OR REPLACE SECRET secret2 (\n",
    "    TYPE S3,\n",
    "    KEY_ID '{sql_quote(minio_key)}',\n",
    "    SECRET '{sql_quote(minio_secret)}',\n",
    "    ENDPOINT 'minio.carlboettiger.info',\n",
    "    URL_STYLE 'path',\n",
    "    SCOPE 's3://cboettig/gbif'\n",
    ");\n",
    "'''\n",
    "con.raw_sql(query)\n",
    "\n",
    "## Limits are sometimes good\n",
    "#con.raw_sql(\"SET memory_limit = '20GB';\")\n",
    "#con.raw_sql(\"set threads=40;\")\n",
    "\n",
    "gbif = con.read_parquet(\"s3://cboettig/gbif/2024-10-01/**\")\n",
    "\n",
    "# TODO: can/should we add an explicit spatial index to gbif first? Using RTree takes too much memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e1f1ee1-8048-4f2b-83a0-ca8cc8fc9067",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the mappinginequality polygons to the local cloud as parquet.\n",
    "\n",
    "(con\n",
    "    .read_geo(\"/vsicurl/https://data.source.coop/cboettig/us-boundaries/mappinginequality.json\")\n",
    "    .to_parquet(\"s3://cboettig/gbif/mappinginequality.parquet\")\n",
    ")\n",
    "\n",
    "# TODO: can/should we add an explicit spatial index to the mappinginequality polygons first?\n",
    "# TODO: would a local duckdb table version be even better/faster?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3891abb6-3652-4217-8615-106d354ff131",
   "metadata": {},
   "source": [
    "We iterate through the city list one city at a time: cities whose output already exists are skipped (so the job can resume), and each city's join only considers points inside that city's bounding box.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "32a2b4c1-e08b-4fbb-b891-ac19053a4585",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Select cities from the list we haven't yet written (allows resume).\n",
    "import minio\n",
    "import re\n",
    "\n",
    "minio_key = st.secrets[\"MINIO_KEY\"]\n",
    "minio_secret = st.secrets[\"MINIO_SECRET\"]\n",
    "mc = minio.Minio(\"minio.carlboettiger.info\", minio_key, minio_secret)\n",
    "obj = mc.list_objects(\"cboettig\", \"gbif/redlined/\", recursive=True)\n",
    "\n",
    "# Reduce each object name to its \"state=.../city=...\" key by stripping the\n",
    "# leading directory and the .parquet suffix. The prefix is anchored with ^ so\n",
    "# only the leading directory is removed, never a match in the middle of a name.\n",
    "pattern = r\"^gbif/redlined/|\\.parquet$\"\n",
    "finished_cities = [re.sub(pattern, \"\", i.object_name) for i in obj if not i.is_dir]\n",
    "\n",
    "# Build the same \"state=.../city=...\" key for every distinct (city, state) pair.\n",
    "mappinginequality = con.read_parquet(\"s3://cboettig/gbif/mappinginequality.parquet\")\n",
    "all_cities = (mappinginequality.select(_.city, _.state)\n",
    "    .distinct()\n",
    "    .mutate(index = \"state=\" + _.state + \"/\" + \"city=\" + _.city)\n",
    "    .execute()[\"index\"]\n",
    ")\n",
    "\n",
    "remaining_cities = set(all_cities) - set(finished_cities)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "58e4a062-8f7e-45d2-a41f-fdc49b1c9f03",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'state=SC/city=Sumter'}"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Only the last expression of a cell is displayed, so the count is printed explicitly.\n",
    "print(f\"{len(remaining_cities)} cities remaining\")\n",
    "remaining_cities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c3a4005c-1e8c-4f2a-a93c-1c158c9c26ab",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "state=SC/city=Sumter\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9dee1bc3c5c34370892812ff0894d8e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 15min 54s, sys: 4min 38s, total: 20min 33s\n",
      "Wall time: 48.8 
s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "## And here we go, long-running loop over each city.\n",
    "\n",
    "# The \"state=.../city=...\" key does not depend on the loop variable, so it is\n",
    "# built once here instead of on every iteration.\n",
    "keyed = mappinginequality.mutate(index = \"state=\" + _.state + \"/\" + \"city=\" + _.city)\n",
    "\n",
    "# Sorted so the progress output and write order are the same on every run\n",
    "# (set iteration order is not); key=str keeps the sort well-defined even if a\n",
    "# key is missing (None/NaN).\n",
    "for i in sorted(remaining_cities, key=str):\n",
    "    gdf = (keyed\n",
    "        .filter(_.index == i)\n",
    "        .mutate(area = _.geom.area())\n",
    "    )\n",
    "\n",
    "    print(i)\n",
    "\n",
    "    # Restrict gbif to the city's bounding box before the spatial join;\n",
    "    # bounds[0]/bounds[2] bound longitude and bounds[1]/bounds[3] bound latitude.\n",
    "    bounds = gdf.execute().total_bounds\n",
    "    points = (gbif\n",
    "        .filter(_.decimallongitude >= bounds[0],\n",
    "                _.decimallongitude < bounds[2],\n",
    "                _.decimallatitude >= bounds[1],\n",
    "                _.decimallatitude < bounds[3])\n",
    "    )\n",
    "\n",
    "    # One parquet object per city; the object name is what the resume check lists.\n",
    "    (gdf\n",
    "        .join(points, gdf.geom.intersects(points.geom))\n",
    "        .to_parquet(f\"s3://cboettig/gbif/redlined/{i}.parquet\")\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e0ed68d-e6ba-44c8-b7a0-1f284b11cb96",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "976eb48e-eaf1-45db-9034-84ac33eeb506",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5bed6c7dc2ae4962994a4bf8920d12fb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# standardize grades and consolidate to single parquet?\n",
    "# Grades are stripped of surrounding whitespace first; any value other than\n",
    "# A, B, C or D is then replaced by the string \"None\".\n",
    "\n",
    "(con.read_parquet(\"s3://cboettig/gbif/redlined/**\")\n",
    " .mutate(grade=_.grade.strip())\n",
    " .mutate(grade=_.grade.cases((('A', 'A'), ('B', 'B'), ('C', 'C'), ('D', 'D')), default = \"None\" ) )\n",
    " .to_parquet(\"s3://cboettig/gbif/redlined_cities_gbif.parquet\")\n",
    ")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "5ce74a60-c30b-4a78-9653-b34e788227b3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.59 s, sys: 22.9 s, total: 25.5 s\n",
      "Wall time: 700 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", " | grade | \n", "n | \n", "area | \n", "density | \n", "
---|---|---|---|---|
0 | \n", "A | \n", "3048483 | \n", "1008.034173 | \n", "3024.186167 | \n", "
1 | \n", "B | \n", "5506468 | \n", "2517.018555 | \n", "2187.694639 | \n", "
2 | \n", "D | \n", "3491760 | \n", "1896.670719 | \n", "1840.994309 | \n", "
3 | \n", "C | \n", "6353620 | \n", "3539.963360 | \n", "1794.826487 | \n", "
4 | \n", "None | \n", "6021724 | \n", "16665.329460 | \n", "361.332431 | \n", "