{ "cells": [ { "cell_type": "markdown", "id": "4b4adc2a-bf0c-4ace-87be-dbaf90be0125", "metadata": {}, "source": [ "# Pre-processing script" ] }, { "cell_type": "code", "execution_count": null, "id": "f7e6298c-d886-432a-a1b7-c3fee914c24f", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "import ibis\n", "from ibis import _\n", "import geopandas as gpd\n", "import duckdb\n", "from cng.utils import ST_MakeValid\n", "\n", "con = ibis.duckdb.connect(extensions=[\"spatial\"])\n", "path = '../data/ca-layers/'\n", "\n", "# CA Nature data\n", "ca_raw_parquet = \"https://data.source.coop/cboettig/ca30x30/ca_areas.parquet\"\n", "\n", "# Boundary of CA, used to compute 'non-conserved' areas\n", "ca_boundary_shape = \"../data/ca_shape\"\n", "ca_boundary_parquet = path + \"ca_boundary.parquet\"\n", "\n", "# Ecoregions\n", "ca_ecoregions_shape = \"../data/ecoregions/ACE_Ecoregions_BaileyDerived_2022.shp\"\n", "ca_ecoregions_parquet = path + \"ace_ecoregions.parquet\"\n", "\n", "# files to save non-conserved areas; computing them is costly, so we save the results\n", "ca_nonconserved_parquet = path + \"ca-30x30-nonconserved-500m-simplified.parquet\"\n", "ca_nonconserved_eco_parquet = path + \"ca-30x30-nonconserved-500m-simplified-eco.parquet\"\n", "\n", "# temp file used to compute zonal stats: has conserved + non-conserved areas\n", "ca_temp_parquet = path + \"ca-30x30-temp.parquet\"\n", "\n", "# final files: conserved + non-conserved areas + zonal stats\n", "ca_parquet = path + \"ca-30x30.parquet\"\n", "ca_pmtiles = path + \"ca-30x30.pmtiles\" # excludes non-conserved geometries\n", "\n", "# vector data\n", "svi = path + 'SVI2022_US_tract' # EPSG:4326\n", "justice40 = path + 'disadvantaged-communities' # ESRI:102039\n", "fire = path + 'calfire-2023' # EPSG:4326\n", "rxburn = path + 'calfire-rxburn-2023' # EPSG:4326\n", "\n", "# raster data\n", "irrecoverable_c = path + 'ca_irrecoverable_c_2018_cog' # EPSG:3857\n", "manageable_c = path + 'ca_manageable_c_2018_cog' # EPSG:3857\n", "richness = path + 'SpeciesRichness_All' # EPSG:3857\n", "rsr = path + 'RSR_All' # EPSG:3857" ] },
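{ "cell_type": "markdown", "id": "added-raw-schema-note", "metadata": {}, "source": [ "Optional, not part of the original pipeline: a quick preview of the raw CA Nature schema before processing. A minimal sketch, assuming the remote parquet at `ca_raw_parquet` is reachable from this session." ] }, { "cell_type": "code", "execution_count": null, "id": "added-raw-schema-check", "metadata": {}, "outputs": [], "source": [ "# Optional sanity check, not part of the original pipeline: preview the raw\n", "# CA Nature columns before processing (assumes the remote parquet is reachable).\n", "con.read_parquet(ca_raw_parquet).schema()" ] },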
{ "cell_type": "markdown", "id": "907235f6-48a5-4c55-b779-3bb6839acf2b", "metadata": {}, "source": [ "# Step 1: Computing all \"non-conserved\" areas" ] }, { "cell_type": "markdown", "id": "c6c1cbf5-bc6e-4238-ab87-c467067235c0", "metadata": {}, "source": [ "#### Convert CA boundary to parquet" ] }, { "cell_type": "code", "execution_count": null, "id": "38091012-586e-4091-8f0d-a0aa868a04cf", "metadata": {}, "outputs": [], "source": [ "# Read a shapefile of the CA boundary and convert it to a parquet file\n", "ca_boundary = gpd.read_file(ca_boundary_shape).to_crs(epsg = 3310)\n", "ca_boundary.to_parquet(ca_boundary_parquet)" ] }, { "cell_type": "markdown", "id": "3dfcb35b-e6a9-4a89-af05-c65909191f2b", "metadata": {}, "source": [ "#### Computing difference: Non-conserved areas = CA Boundary - Conserved Areas" ] }, { "cell_type": "code", "execution_count": null, "id": "7aedc147-6601-4ca7-9316-ddea5cab154a", "metadata": {}, "outputs": [], "source": [ "# This chunk will take ~2 hours to run\n", "conn = ibis.duckdb.connect(\"tmp\", extensions=[\"spatial\"]) # save to disk\n", "\n", "# CA Boundary\n", "ca_all_tbl = (\n", "    conn.read_parquet(ca_boundary_parquet)\n", "    .rename(geom = \"geometry\")\n", "    .cast({\"geom\": \"geometry\"})\n", ")\n", "\n", "# CA Nature data / protected areas\n", "tbl = (\n", "    conn.read_parquet(ca_raw_parquet)\n", "    .cast({\"SHAPE\": \"geometry\"})\n", "    .rename(geom = \"SHAPE\", gid = \"OBJECTID\")\n", ")\n", "\n", "conn.create_table(\"t1\", ca_all_tbl, overwrite = True)\n", "conn.create_table(\"t2\", tbl.filter(_.Release_Year == 2024), overwrite = True)\n", "\n", "# simplify all geometries to 500m so the kernel doesn't crash,\n", "# then compute the difference\n", "conn.con.execute('''\n", "CREATE TABLE not_in_pad AS\n", "WITH t2_simplified AS (\n", "    SELECT ST_Simplify(geom, 500) AS geom\n", "    FROM t2\n", "),\n", "t2_union AS (\n", "    SELECT ST_Union_Agg(geom) AS geom\n", "    FROM t2_simplified\n", ")\n", "SELECT\n", "    ST_Difference(t1.geom, t2_union.geom) AS geom\n", "FROM\n", "    t1, t2_union;\n", "''')\n", "\n", "# save to parquet file so we don't have to run this again\n", "nonconserved = conn.table(\"not_in_pad\")\n", "nonconserved.execute().to_parquet(ca_nonconserved_parquet)" ] },
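{ "cell_type": "markdown", "id": "added-nonconserved-check-note", "metadata": {}, "source": [ "Optional sanity check, not part of the original pipeline: confirm the non-conserved layer written above loads back and contains valid polygonal geometries. A minimal sketch, assuming the parquet was saved with GeoParquet metadata." ] }, { "cell_type": "code", "execution_count": null, "id": "added-nonconserved-check", "metadata": {}, "outputs": [], "source": [ "# Optional sanity check, not part of the original pipeline. Assumes the file\n", "# written above is GeoParquet (i.e. execute() returned a GeoDataFrame).\n", "import geopandas as gpd\n", "\n", "check = gpd.read_parquet(ca_nonconserved_parquet)\n", "print(len(check), check.geom_type.unique(), check.is_valid.all())" ] },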
{ "cell_type": "markdown", "id": "845eb0ed-3392-4346-9b7e-959cd97a274f", "metadata": {}, "source": [ "#### Get ecoregions and convert them to parquet" ] }, { "cell_type": "code", "execution_count": null, "id": "b43fee2c-b8c4-4076-b87b-089676031165", "metadata": {}, "outputs": [], "source": [ "eco = gpd.read_file(ca_ecoregions_shape)\n", "eco.to_parquet(ca_ecoregions_parquet)" ] }, { "cell_type": "markdown", "id": "b5689850-80fa-4cc9-87e9-46074b8d9107", "metadata": {}, "source": [ "#### Compute ecoregion for non-conserved areas" ] }, { "cell_type": "code", "execution_count": null, "id": "070bbdde-b141-4a63-8f8a-984dd01fd51a", "metadata": {}, "outputs": [], "source": [ "con = ibis.duckdb.connect(extensions=[\"spatial\"])\n", "\n", "eco = con.read_parquet(ca_ecoregions_parquet)\n", "non = con.read_parquet(ca_nonconserved_parquet)\n", "\n", "con.create_table(\"eco\", eco.select(\"ECOREGION_\", \"geometry\"), overwrite = True)\n", "con.create_table(\"non\", non, overwrite = True)\n", "\n", "# split up the non-conserved areas by ecoregion\n", "con.con.execute('''\n", "CREATE TABLE non_conserved_eco AS\n", "SELECT\n", "    non.*,\n", "    eco.ECOREGION_ AS ecoregion,\n", "    ST_Intersection(non.geom, eco.geometry) AS geom -- split non into ecoregions\n", "FROM non\n", "JOIN eco\n", "ON ST_Intersects(non.geom, eco.geometry)\n", "WHERE ST_GeometryType(ST_Intersection(non.geom, eco.geometry)) IN ('POLYGON', 'MULTIPOLYGON');\n", "''')\n", "\n", "# save to parquet file so we don't have to run this again\n", "non_eco = (con.table(\"non_conserved_eco\")\n", "    .drop('geom')\n", "    .rename(geom = \"geom_1\")\n", "    .mutate(geom = ST_MakeValid(_.geom))\n", "    .mutate(id = ibis.row_number().over())\n", ")\n", "\n", "non_conserved_eco = non_eco.execute()\n", "non_conserved_eco.to_parquet(ca_nonconserved_eco_parquet)" ] }, { "cell_type": "markdown", "id": "ce52b1e0-027e-4915-9e7b-e51e946560ed", "metadata": {}, "source": [ "#### Non-conserved areas need to match the CA Nature schema when merging" ] }, { "cell_type": "code", "execution_count": null, "id": "0f9666d1-7c2b-45af-9399-e4189bba34f5", "metadata": {}, "outputs": [], "source": [ "# match CA Nature schema\n", "nonconserved_clean = (\n", "    con.read_parquet(ca_nonconserved_eco_parquet)\n", "    .cast({\"geom\": \"geometry\"})\n", "    .mutate(established = ibis.null(), gap_code = 0, name = ibis.literal(\"Non-Conserved Areas\"),\n", "            access_type = ibis.null(), manager = ibis.null(), manager_type = ibis.null(),\n", "            easement = ibis.null(), type = ibis.literal(\"Land\"),\n", "            status = ibis.literal(\"non-conserved\"),\n", "            acres = _.geom.area() / 4046.8564224 # convert sq meters to acres\n", "    )\n", "    .cast({\"established\": \"string\", \"gap_code\": \"int16\", \"status\": \"string\", \"name\": \"string\",\n", "           \"access_type\": \"string\", \"manager\": \"string\", \"manager_type\": \"string\",\n", "           \"ecoregion\": \"string\", \"easement\": \"string\", \"id\": \"int64\", \"type\": \"string\",\n", "           \"acres\": \"float32\"}) # match schema to CA Nature\n", ")" ] }, { "cell_type": "markdown", "id": "104254ef-f6e9-4f03-8797-de55091774d5", "metadata": {}, "source": [ "# Step 2: Isolate pre-2024 from 2024 polygons" ] }, { "cell_type": "code", "execution_count": null, "id": "a3d4f189-1563-4868-9f1f-64d67569df27", "metadata": {}, "outputs": [], "source": [ "# negative buffer so polygons that merely share a boundary don't count as overlapping\n", "buffer = -30 # 30m buffer\n", "\n", "tbl = (\n", "    con.read_parquet(ca_raw_parquet)\n", "    .cast({\"SHAPE\": \"geometry\"})\n", "    .rename(geom = \"SHAPE\")\n", "    .filter(_.reGAP < 3) # only gap 1 and 2 count towards 30x30\n", ")\n", "\n", "# polygons with release_year 2024 are a superset of release_year 2023.\n", "# use anti_join to isolate the objects that are in release_year 2024 but not\n", "# release_year 2023 (aka newly established); see the illustration below.\n", "tbl_2023 = tbl.filter(_.Release_Year == 2023).mutate(geom = _.geom.buffer(buffer))\n", "tbl_2024 = tbl.filter(_.Release_Year == 2024)\n", "intersects = tbl_2024.anti_join(tbl_2023, _.geom.intersects(tbl_2023.geom))" ] },
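{ "cell_type": "markdown", "id": "added-buffer-note", "metadata": {}, "source": [ "Why the negative buffer? Adjacent parcels often share boundaries, so a plain `intersects` anti-join would treat a 2024 polygon that merely touches a 2023 polygon as already present. Shrinking the 2023 geometries by 30m removes those edge-only matches. A toy illustration with shapely, not part of the pipeline:" ] }, { "cell_type": "code", "execution_count": null, "id": "added-buffer-demo", "metadata": {}, "outputs": [], "source": [ "# Toy illustration, not part of the pipeline: a negative buffer shrinks a\n", "# polygon, so a neighbor that only shares an edge no longer intersects it.\n", "from shapely.geometry import box\n", "\n", "a = box(0, 0, 100, 100)    # stand-in for a 2023 polygon\n", "b = box(100, 0, 200, 100)  # stand-in for a 2024 polygon sharing only an edge\n", "print(a.intersects(b))               # True: touching edges count as intersecting\n", "print(a.buffer(-30).intersects(b))   # False: the shrunk polygon no longer touches" ] },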
{ "cell_type": "markdown", "id": "1f335433-ff89-4966-bf98-c11a0b233686", "metadata": {}, "source": [ "# Step 3: Join all protected land data into a single parquet file" ] }, { "cell_type": "code", "execution_count": null, "id": "a59c976b-3c36-40f9-a15b-cefcd155c647", "metadata": {}, "outputs": [], "source": [ "# %%time\n", "new2024 = intersects.select(\"OBJECTID\").mutate(established = ibis.literal(\"2024\")) # saving IDs to join on\n", "\n", "ca_merged = (con\n", "    .read_parquet(ca_raw_parquet)\n", "    .cast({\"SHAPE\": \"geometry\"})\n", "    .mutate(area = _.SHAPE.area())\n", "    .filter(_.Release_Year == 2024) # having both 2023 and 2024 is redundant since 2024 is the superset\n", "    .left_join(new2024, \"OBJECTID\") # newly established 2024 polygons\n", "    .mutate(established = _.established.fill_null(\"pre-2024\"))\n", "    .rename(name = \"cpad_PARK_NAME\", access_type = \"cpad_ACCESS_TYP\", manager = \"cpad_MNG_AGENCY\",\n", "            manager_type = \"cpad_MNG_AG_LEV\", id = \"OBJECTID\", type = \"TYPE\",\n", "            ecoregion = \"CA_Ecoregion_Name\", acres = \"Acres\", gap_code = \"reGAP\", geom = \"SHAPE\")\n", "    .cast({\"gap_code\": \"int16\"})\n", "    .cast({\"id\": \"int64\"})\n", "    .mutate(manager = _.manager.substitute({\"\": \"Unknown\"}))\n", "    .mutate(manager_type = _.manager_type.substitute({\"\": \"Unknown\"}))\n", "    .mutate(access_type = _.access_type.substitute({\"\": \"Unknown Access\"}))\n", "    .mutate(name = _.name.substitute({\"\": \"Unknown\"}))\n", "    .mutate(manager_type = _.manager_type.substitute({\"Home Owners Association\": \"HOA\"}))\n", "    .mutate(easement = _.Easement.cast(\"string\").substitute({\"0\": \"False\", \"1\": \"True\"}))\n", "    .mutate(status = _.gap_code.cast(\"string\")\n", "            .substitute({\"1\": \"30x30-conserved\", \"2\": \"30x30-conserved\", \"3\": \"other-conserved\",\n", "                         \"4\": \"unknown\"}))\n", "    .select(_.established, _.gap_code, _.status, _.name, _.access_type, _.manager, _.manager_type,\n", "            _.ecoregion, _.easement, _.acres, _.id, _.type, _.geom)\n", "    .union(nonconserved_clean)\n", "    .mutate(acres = _.acres.round(4))\n", "    .mutate(geom = ST_MakeValid(_.geom))\n", "    .drop_null(['geom'], how = \"any\")\n", ")\n", "\n", "gdf = ca_merged.execute()\n", "gdf.set_crs(\"epsg:3310\").to_parquet(ca_temp_parquet) # saving to temp file to compute zonal stats" ] }, { "cell_type": "markdown", "id": "44d64f2b-a65b-4ac1-9943-2d96f5c91e1d", "metadata": {}, "source": [ "# Step 4: Compute zonal stats" ] }, { "cell_type": "markdown", "id": "e129b0cc-ee7d-4e58-a8d8-d6f2476bd62c", "metadata": {}, "source": [ "#### Functions: Reproject and compute overlap for vector data" ] }, { "cell_type": "code", "execution_count": null, "id": "fdeeb7ac-efa0-4a7b-9143-72d8ec911809", "metadata": {}, "outputs": [], "source": [ "def reproject_vectors(file, vec): # change data layer projections to match CA Nature data\n", "    vec = vec.rename_geometry('geom')\n", "    vec[\"geom\"] = vec[\"geom\"].make_valid()\n", "    vec = vec.to_crs(\"EPSG:3310\")\n", "    vec.to_parquet(file + '-epsg3310.parquet')\n", "    return\n", "\n", "def vector_vector_stats(base, data_layer):\n", "    t1 = con.read_parquet(base).select(_.id, _.geom)\n", "    t2 = con.read_parquet(data_layer).select(_.geom, _.value)\n", "    expr = (t1\n", "            .left_join(t2, t1.geom.intersects(t2.geom))\n", "            .group_by(t1.id, t1.geom)\n", "            .agg(overlap_fraction = (t1.geom.intersection(t2.geom).area() / t1.geom.area() * t2.value)\n", "                 .sum().coalesce(0).round(3)) # weighted overlap, based on t2.value\n", "    )\n", "    # ibis.to_sql(expr) # uncomment to inspect the generated SQL\n", "    stats = expr.execute()\n", "    return stats[['id', 'overlap_fraction']]" ] },
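{ "cell_type": "markdown", "id": "added-overlap-note", "metadata": {}, "source": [ "`vector_vector_stats` computes a value-weighted overlap: for each polygon it sums `area(polygon ∩ feature) / area(polygon) × feature.value` over the intersecting features, so a binary layer (value = 1) reduces to the plain covered fraction. A worked toy example of the same formula with shapely; the numbers are made up:" ] }, { "cell_type": "code", "execution_count": null, "id": "added-overlap-demo", "metadata": {}, "outputs": [], "source": [ "# Worked toy example, not part of the pipeline, of the weighted-overlap formula:\n", "# overlap_fraction = sum over features of area(zone ∩ feature)/area(zone) * value\n", "from shapely.geometry import box\n", "\n", "zone = box(0, 0, 10, 10)   # a 100 m^2 protected-area polygon\n", "tract = box(5, 0, 15, 10)  # a census tract covering half the zone\n", "value = 0.8                # made-up SVI score for that tract\n", "print(zone.intersection(tract).area / zone.area * value)  # 0.4" ] },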
{ "cell_type": "markdown", "id": "f45a0f52-6d18-45b4-8585-af3f1190b000", "metadata": {}, "source": [ "#### Compute zonal stats with vector data" ] }, { "cell_type": "code", "execution_count": null, "id": "b110da15-d2ac-4457-9241-f02f44dc436a", "metadata": {}, "outputs": [], "source": [ "%%time\n", "vectors = [svi, justice40, fire, rxburn]\n", "names = ['svi', 'disadvantaged_communities', 'fire', 'rxburn']\n", "\n", "# read in data if it isn't already in memory\n", "if 'gdf' not in locals():\n", "    gdf_stats = gpd.read_parquet(ca_temp_parquet)\n", "else:\n", "    gdf_stats = gdf\n", "\n", "# set the index to the col we are joining on for gpd.join()\n", "gdf_stats = gdf_stats.set_index('id')\n", "\n", "for file, name in zip(vectors, names):\n", "    vec = gpd.read_parquet(file + '.parquet') # load in vector data layer\n", "\n", "    # filter: keep only the 10-year range (2013 onward) for fire data\n", "    if name in ['fire', 'rxburn']:\n", "        vec = vec[vec['YEAR_'] >= 2013]\n", "        vec['value'] = 1 # used in overlap calculation, 1 = fire occurred\n", "\n", "    # filter: only want CA data, not nationwide\n", "    if name == 'svi':\n", "        vec = vec[(vec['STATE'] == \"California\") & (vec['RPL_THEMES'] != -999)] # removing empty values\n", "        vec['value'] = vec['RPL_THEMES'] # overlap calculation is weighted by the SVI index\n", "\n", "    # filter: only want CA, and only disadvantaged communities\n", "    if name == 'disadvantaged_communities':\n", "        vec = vec[(vec['StateName'] == \"California\") & (vec['Disadvan'] == 1)]\n", "        vec['value'] = 1 # used in overlap calculation, 1 = disadvantaged\n", "\n", "    # change projection to match CA Nature data\n", "    reproject_vectors(file, vec)\n", "\n", "    # compute zonal stats\n", "    vector_stats = vector_vector_stats(ca_temp_parquet, file + '-epsg3310.parquet')\n", "    vector_stats = vector_stats.rename(columns={'overlap_fraction': name})\n", "\n", "    # join the new zonal stats column with the CA Nature data\n", "    gdf_stats = gdf_stats.join(vector_stats.set_index('id'))\n", "\n", "gdf_stats = gdf_stats.reset_index()\n", "gdf_stats.to_parquet(ca_parquet) # save CA Nature + zonal stats" ] }, { "cell_type": "markdown", "id": "e0fccaf3-50a8-4324-82fa-34838987334b", "metadata": {}, "source": [ "#### Function: Reproject raster data" ] }, { "cell_type": "code", "execution_count": null, "id": "aade11d9-87b9-403d-bad1-3069663807a9", "metadata": {}, "outputs": [], "source": [ "import subprocess\n", "\n", "def raster_reprojection(input_file, output_file, epsg=\"EPSG:3310\"):\n", "    cmd = [\n", "        \"gdalwarp\",\n", "        \"-t_srs\", epsg,\n", "        input_file,\n", "        output_file\n", "    ]\n", "    try:\n", "        subprocess.run(cmd, check=True)\n", "        print(f\"Reprojection successful! Output saved to: {output_file}\")\n", "    except subprocess.CalledProcessError as e:\n", "        print(f\"Error occurred during reprojection: {e}\")" ] }, { "cell_type": "markdown", "id": "94e924fd-d927-4458-ba1f-670b4047d149", "metadata": {}, "source": [ "#### Compute zonal stats with raster data" ] }, { "cell_type": "code", "execution_count": null, "id": "3ce1bc61-eabd-4a73-ba34-a1707bc14f74", "metadata": {}, "outputs": [], "source": [ "%%time\n", "import rasterio\n", "from exactextract import exact_extract\n", "\n", "rasters = [irrecoverable_c, manageable_c, richness, rsr]\n", "names = ['irrecoverable_carbon', 'manageable_carbon', 'richness', 'rsr']\n", "\n", "if 'gdf_stats' not in locals():\n", "    gdf_stats = gpd.read_parquet(ca_parquet) # read in data if it isn't already in memory\n", "\n", "# changes needed for exact_extract() to work:\n", "gdf_stats = gdf_stats.rename(columns={'id': 'ca_id'}) # rename 'id' because it conflicts with a raster field\n", "gdf_stats.to_parquet(ca_parquet) # save updated parquet to disk to use with exact_extract()\n", "\n", "for file, name in zip(rasters, names):\n", "    raster_reprojection(file + '.tif', file + '_epsg3310.tif') # reproject rasters to match CA Nature\n", "    raster_stats = exact_extract(file + '_epsg3310.tif', ca_parquet, [\"mean\"], include_cols=[\"ca_id\"], output = 'pandas') # zonal stats\n", "\n", "    # these rasters have multiple bands; the column we want is 'band_1_mean'\n", "    if name in ['irrecoverable_carbon', 'manageable_carbon']:\n", "        raster_stats = raster_stats[['ca_id', 'band_1_mean']]\n", "        raster_stats = raster_stats.rename(columns={'band_1_mean': name})\n", "\n", "    # these rasters have only 1 band, so the zonal stats column is 'mean'\n", "    elif name in ['richness', 'rsr']:\n", "        raster_stats = raster_stats[['ca_id', 'mean']]\n", "        raster_stats = raster_stats.rename(columns={'mean': name})\n", "\n", "    raster_stats[name] = raster_stats[name].round(3) # round stats\n", "\n", "    # joining with gpd.join(), need to set an index\n", "    gdf_stats = gdf_stats.set_index(\"ca_id\").join(raster_stats.set_index(\"ca_id\"))\n", "\n", "    # exact_extract() can't handle an index column, so reset it once the join is done\n", "    gdf_stats = gdf_stats.reset_index()\n", "\n", "gdf_stats = gdf_stats.rename(columns={'ca_id': 'id'}) # revert to the \"id\" col name now that we are finished with exact_extract()\n", "\n", "# reproject to EPSG:4326, since that's what PMTiles requires\n", "gdf_stats = gdf_stats.to_crs(\"epsg:4326\")\n", "gdf_stats.to_parquet(ca_parquet) # save results" ] },
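{ "cell_type": "markdown", "id": "added-crs-check-note", "metadata": {}, "source": [ "Optional check, not part of the original pipeline: verify that a reprojected raster really is in EPSG:3310 before trusting the zonal stats. A minimal sketch, assuming the loop above has already written the `_epsg3310.tif` files; shown here for the `richness` layer." ] }, { "cell_type": "code", "execution_count": null, "id": "added-crs-check", "metadata": {}, "outputs": [], "source": [ "# Optional check, not part of the pipeline: inspect the reprojected raster's CRS\n", "# (assumes the loop above already wrote the _epsg3310.tif files).\n", "import rasterio\n", "\n", "with rasterio.open(richness + '_epsg3310.tif') as src:\n", "    print(src.crs, src.count, src.shape)" ] },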
{ "cell_type": "markdown", "id": "ec619f4e-1338-492a-a334-a7796f4f55a1", "metadata": {}, "source": [ "# Step 5: Upload file + Generate PMTiles" ] }, { "cell_type": "code", "execution_count": null, "id": "30f47b26-cd18-4e8c-a19b-9d1f19b10873", "metadata": {}, "outputs": [], "source": [ "from cng.utils import hf_upload, s3_cp, set_secrets, to_pmtiles\n", "\n", "# upload parquet to minio and HF\n", "hf_upload('ca-30x30.parquet', ca_parquet)\n", "s3_cp(ca_parquet, \"s3://public-ca30x30/ca-30x30.parquet\", \"minio\")\n", "\n", "# to use PMTiles, we need to convert to geojson\n", "ca_geojson = (con\n", "    .read_parquet(ca_parquet)\n", "    # .filter(_.status != 'non-conserved') # uncomment to omit non-conserved areas from the pmtiles\n", ")\n", "\n", "# can't go directly from parquet -> pmtiles; need to go parquet -> geojson -> pmtiles\n", "ca_geojson.execute().to_file(path + 'ca-30x30.geojson')\n", "pmtiles = to_pmtiles(path + 'ca-30x30.geojson', ca_pmtiles, options = ['--extend-zooms-if-still-dropping'])\n", "\n", "# upload pmtiles to minio and HF\n", "hf_upload('ca-30x30.pmtiles', ca_pmtiles)\n", "s3_cp(ca_pmtiles, \"s3://public-ca30x30/ca-30x30.pmtiles\", \"minio\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 5 }