# Pre-processing script

In [None]:
import ibis
from ibis import _
import geopandas as gpd
import duckdb
from cng.utils import ST_MakeValid

# Folder that holds every locally stored layer used by this notebook.
path = '../data/ca-layers/'

# DuckDB connection (spatial extension loaded); no database file is given.
con = ibis.duckdb.connect(extensions=["spatial"])

# ---- Inputs ----------------------------------------------------------------

# CA Nature data (remote parquet)
ca_raw_parquet = "https://data.source.coop/cboettig/ca30x30/ca_areas.parquet"

# Boundary of CA, used to compute the 'non-conserved' areas
ca_boundary_shape = "../data/ca_shape"
ca_boundary_parquet = path + "ca_boundary.parquet"

# Ecoregions: shapefile input and its parquet copy
ca_ecoregions_shape = "../data/ecoregions/ACE_Ecoregions_BaileyDerived_2022.shp"
ca_ecoregions_parquet = path + "ace_ecoregions.parquet"

# ---- Intermediate files ----------------------------------------------------

# Non-conserved areas; computing them is costly, so the results are cached
ca_nonconserved_parquet = path + "ca-30x30-nonconserved-500m-simplified.parquet"
ca_nonconserved_eco_parquet = path + "ca-30x30-nonconserved-500m-simplified-eco.parquet"

# Temp file used to compute zonal stats: conserved + non-conserved areas
ca_temp_parquet = path + "ca-30x30-temp.parquet"

# ---- Final outputs ---------------------------------------------------------

# Conserved + non-conserved areas + zonal stats
ca_parquet = path + "ca-30x30.parquet"
# Original note: "excludes non-conserved geometries".
# NOTE(review): the filter that would drop them is commented out where the
# tiles are generated -- confirm which behaviour is intended.
ca_pmtiles = path + "ca-30x30.pmtiles"

# ---- Zonal-stat layers (file stems, extension added at use site) ------------

# Vector data (source CRS as noted by the author)
svi = path + 'SVI2022_US_tract'  # EPSG:4326
justice40 = path + 'disadvantaged-communities'  # ESRI:102039
fire = path + 'calfire-2023'  # EPSG:4326
rxburn = path + 'calfire-rxburn-2023'  # EPSG:4326

# Raster data (source CRS as noted by the author)
irrecoverable_c = path + 'ca_irrecoverable_c_2018_cog'  # EPSG:3857
manageable_c = path + 'ca_manageable_c_2018_cog'  # EPSG:3857
richness = path + 'SpeciesRichness_All'  # EPSG:3857
rsr = path + 'RSR_All'  # EPSG:3857

# Step 1: Computing all "non-conserved" areas

#### Convert CA boundary to parquet 

In [None]:
# Read the CA boundary shapefile, reproject it to EPSG:3310, and store the
# result as a parquet file for the DuckDB steps that follow.
ca_boundary = gpd.read_file(ca_boundary_shape)
ca_boundary = ca_boundary.to_crs(epsg=3310)
ca_boundary.to_parquet(ca_boundary_parquet)

#### Computing difference: Non-conserved areas = CA Boundary - Conserved Areas

In [None]:
# This chunk will take ~2 hours to run.
# File-backed DuckDB database named "tmp" (saved to disk rather than memory).
conn = ibis.duckdb.connect("tmp", extensions=["spatial"])

# CA boundary: rename the geometry column to `geom` and type it as geometry.
ca_all_tbl = (
    conn.read_parquet(ca_boundary_parquet)
    .rename(geom="geometry")
    .cast({"geom": "geometry"})
)

# CA Nature data / protected areas.
tbl = (
    conn.read_parquet(ca_raw_parquet)
    .cast({"SHAPE": "geometry"})
    .rename(geom="SHAPE", gid="OBJECTID")
)

# Materialize both inputs so the raw SQL below can refer to them by name.
# Only the 2024 release is used for the protected areas.
conn.create_table("t1", ca_all_tbl, overwrite=True)
conn.create_table("t2", tbl.filter(_.Release_Year == 2024), overwrite=True)

# Non-conserved = CA boundary minus the union of all protected polygons.
# The protected polygons are simplified with a tolerance of 500 first so the
# kernel doesn't crash (tolerance is in CRS units -- presumably meters, given
# the boundary is reprojected to EPSG:3310; confirm for the CA Nature layer).
#
# Fixes relative to the earlier version of this cell:
#   * the raw DuckDB connection of an ibis backend is `.con` (as used for the
#     ecoregion query elsewhere in this notebook), not `.conn`;
#   * CREATE OR REPLACE keeps the cell re-runnable against the on-disk
#     database, matching the overwrite=True used for t1/t2.
conn.con.execute('''
CREATE OR REPLACE TABLE not_in_pad AS
WITH t2_simplified AS (
    SELECT ST_Simplify(geom, 500) AS geom
    FROM t2
),
t2_union AS (
    SELECT ST_Union_Agg(geom) AS geom
    FROM t2_simplified
)
SELECT
    ST_Difference(t1.geom, t2_union.geom) AS geom
FROM
    t1, t2_union;
''')

# Save to a parquet file so we don't have to run this again.
nonconserved = conn.table("not_in_pad")
nonconserved.execute().to_parquet(ca_nonconserved_parquet)

#### Get ecoregions - convert them to parquet

In [None]:
# Load the ecoregion shapefile and write it back out as parquet so the
# following cells can read it through DuckDB.
# NOTE(review): no reprojection is applied here (the CA boundary is explicitly
# converted to EPSG:3310) -- presumably this shapefile is already in the same
# CRS; confirm.
eco = gpd.read_file(ca_ecoregions_shape)
eco.to_parquet(ca_ecoregions_parquet)

#### Compute ecoregion for non-conserved areas

In [None]:
# Fresh DuckDB connection for this step (spatial extension loaded).
con = ibis.duckdb.connect(extensions=["spatial"])

eco = con.read_parquet(ca_ecoregions_parquet)
non = con.read_parquet(ca_nonconserved_parquet)

# Register both inputs as named tables for the raw SQL below; only the
# ecoregion label and geometry are kept from the ecoregion layer.
eco_label_and_geometry = eco.select("ECOREGION_", "geometry")
con.create_table("eco", eco_label_and_geometry, overwrite=True)
con.create_table("non", non, overwrite=True)

# Cut the non-conserved geometry along ecoregion boundaries, keeping only
# polygonal pieces. `non.*` already carries a `geom` column, so the table ends
# up with both the original geometry and the per-ecoregion intersection.
con.con.execute('''
CREATE TABLE non_conserved_eco AS
SELECT 
 non.*, 
 eco.ECOREGION_ AS ecoregion,
 ST_Intersection(non.geom, eco.geometry) AS geom -- Split non into ecoregions
FROM non
JOIN eco 
ON ST_Intersects(non.geom, eco.geometry)
WHERE ST_GeometryType(ST_Intersection(non.geom, eco.geometry)) IN ('POLYGON', 'MULTIPOLYGON');
''')

# Discard the un-split geometry, promote the split one (`geom_1`) to `geom`,
# repair invalid shapes, and give each piece a sequential id.
split_by_eco = con.table("non_conserved_eco")
split_by_eco = split_by_eco.drop('geom').rename(geom="geom_1")
non_eco = (
    split_by_eco
    .mutate(geom=ST_MakeValid(_.geom))
    .mutate(id=ibis.row_number().over())
)

# Cache the result as parquet so this step doesn't have to be re-run.
non_conserved_eco = non_eco.execute()
non_conserved_eco.to_parquet(ca_nonconserved_eco_parquet)

#### Non-conserved areas need to match CA Nature schema when merging

In [None]:
# Give the non-conserved pieces the same columns and types as the CA Nature
# table so the two can be unioned later.
sq_m_per_acre = 4046.8564224  # square meters in one acre

# Target column types (match schema to CA Nature).
ca_nature_types = {
    "established": "string",
    "gap_code": "int16",
    "status": "string",
    "name": "string",
    "access_type": "string",
    "manager": "string",
    "manager_type": "string",
    "ecoregion": "string",
    "easement": "string",
    "id": "int64",
    "type": "string",
    "acres": "float32",
}

nonconserved_clean = (
    con.read_parquet(ca_nonconserved_eco_parquet)
    .cast({"geom": "geometry"})
    .mutate(
        # attributes that don't apply to non-conserved land stay null
        established=ibis.null(),
        gap_code=0,
        name=ibis.literal("Non-Conserved Areas"),
        access_type=ibis.null(),
        manager=ibis.null(),
        manager_type=ibis.null(),
        easement=ibis.null(),
        type=ibis.literal("Land"),
        status=ibis.literal("non-conserved"),
        # geometry area converted from square meters to acres
        acres=_.geom.area() / sq_m_per_acre,
    )
    .cast(ca_nature_types)
)

# Step 2: Isolate pre-2024 from 2024 polygons

In [None]:
# Shrink the 2023 polygons by 30 m (negative buffer) so that polygons which
# merely share or slightly overlap a boundary are not counted as intersecting.
buffer = -30

# CA Nature polygons restricted to GAP 1 and 2 (the ones that count towards 30x30).
ca_nature_raw = con.read_parquet(ca_raw_parquet)
tbl = (
    ca_nature_raw
    .cast({"SHAPE": "geometry"})
    .rename(geom="SHAPE")
    .filter(_.reGAP < 3)
)

# The 2024 release is a superset of the 2023 release. An anti-join on spatial
# intersection keeps the 2024 polygons that touch no (shrunken) 2023 polygon,
# i.e. the newly established ones.
tbl_2023 = (
    tbl
    .filter(_.Release_Year == 2023)
    .mutate(geom=_.geom.buffer(buffer))
)
tbl_2024 = tbl.filter(_.Release_Year == 2024)
intersects = tbl_2024.anti_join(tbl_2023, _.geom.intersects(tbl_2023.geom))

# Step 3: Join all protected land data into single parquet file 

In [None]:
# %%time
# IDs of the polygons that are new in the 2024 release, tagged so the
# left join below can mark them.
new2024 = intersects.select("OBJECTID").mutate(established=ibis.literal("2024"))

# Build the merged layer: 2024 CA Nature polygons (renamed and recoded to the
# app's schema) plus the non-conserved areas.
# The previously computed `area` column was never selected, so it is no
# longer created.
# NOTE(review): `id` comes from OBJECTID for conserved rows and from a
# row_number() for non-conserved rows -- the two ranges may overlap; confirm
# whether downstream joins on `id` rely on uniqueness.
ca_merged = (
    con.read_parquet(ca_raw_parquet)
    .cast({"SHAPE": "geometry"})
    # 2024 is a superset of 2023, so keeping both releases would be redundant
    .filter(_.Release_Year == 2024)
    # rows without a match in `new2024` existed before 2024
    .left_join(new2024, "OBJECTID")
    .mutate(established=_.established.fill_null("pre-2024"))
    .rename(
        name="cpad_PARK_NAME",
        access_type="cpad_ACCESS_TYP",
        manager="cpad_MNG_AGENCY",
        manager_type="cpad_MNG_AG_LEV",
        id="OBJECTID",
        type="TYPE",
        ecoregion="CA_Ecoregion_Name",
        acres="Acres",
        gap_code="reGAP",
        geom="SHAPE",
    )
    .cast({"gap_code": "int16", "id": "int64"})
    .mutate(
        # replace empty strings with readable placeholders
        manager=_.manager.substitute({"": "Unknown"}),
        manager_type=_.manager_type.substitute(
            {"": "Unknown", "Home Owners Association": "HOA"}
        ),
        access_type=_.access_type.substitute({"": "Unknown Access"}),
        name=_.name.substitute({"": "Unknown"}),
        easement=_.Easement.cast("string").substitute({"0": "False", "1": "True"}),
        # GAP code -> conservation status label
        status=_.gap_code.cast("string").substitute(
            {
                "1": "30x30-conserved",
                "2": "30x30-conserved",
                "3": "other-conserved",
                "4": "unknown",
            }
        ),
    )
    .select(
        _.established, _.gap_code, _.status, _.name, _.access_type, _.manager,
        _.manager_type, _.ecoregion, _.easement, _.acres, _.id, _.type, _.geom,
    )
    .union(nonconserved_clean)
    .mutate(acres=_.acres.round(4))
    .mutate(geom=ST_MakeValid(_.geom))
    .drop_null(['geom'], how="any")
)

gdf = ca_merged.execute()
# Saved to a temp file that the zonal-stats step reads back.
gdf.set_crs("epsg:3310").to_parquet(ca_temp_parquet)

# Step 4: Compute zonal stats

#### Functions: Reproject and compute overlap for vector data 

In [None]:
def reproject_vectors(file, vec):
    """Write an EPSG:3310 copy of a vector layer next to its source file.

    The geometry column is renamed to ``geom``, geometries are repaired with
    ``make_valid()``, the layer is reprojected to EPSG:3310 (to match the
    CA Nature data), and the result is written to
    ``file + '-epsg3310.parquet'``.

    Args:
        file: Path stem of the layer (no extension); used to build the output name.
        vec: Layer to reproject; must support ``rename_geometry``, ``to_crs``
            and ``to_parquet`` (a GeoDataFrame at the call site).

    Returns:
        None. The only effect is the parquet file written to disk.
    """
    layer = vec.rename_geometry('geom')
    layer["geom"] = layer["geom"].make_valid()
    layer.to_crs("EPSG:3310").to_parquet(file + '-epsg3310.parquet')

def vector_vector_stats(base, data_layer):
    """Compute the value-weighted overlap of each base polygon with a data layer.

    For every polygon in ``base``, sums over all intersecting ``data_layer``
    polygons: ``area(intersection) / area(base polygon) * value``. Polygons
    with no intersecting feature get 0. Results are rounded to 3 decimals.

    Uses the module-level DuckDB connection ``con``.

    Args:
        base: Path to a parquet file with ``id`` and ``geom`` columns.
        data_layer: Path to a parquet file with ``geom`` and ``value`` columns.

    Returns:
        A DataFrame with columns ``id`` and ``overlap_fraction``, one row per
        (``id``, ``geom``) group of ``base``.
    """
    t1 = con.read_parquet(base).select(_.id, _.geom)
    t2 = con.read_parquet(data_layer).select(_.geom, _.value)

    # Share of the base polygon covered by a data-layer polygon, weighted by
    # that polygon's value.
    weighted_overlap = t1.geom.intersection(t2.geom).area() / t1.geom.area() * t2.value

    expr = (
        t1.left_join(t2, t1.geom.intersects(t2.geom))  # left join keeps unmatched base polygons
        .group_by(t1.id, t1.geom)
        .agg(overlap_fraction=weighted_overlap.sum().coalesce(0).round(3))
    )
    # The earlier `ibis.to_sql(expr)` call discarded its result and has been removed.
    stats = expr.execute()
    return stats[['id', 'overlap_fraction']]

#### Compute zonal stats with vector data 

In [None]:
%%time
# Vector layers (file stems) and the output column name for each.
vectors = [svi, justice40, fire, rxburn]
names = ['svi', 'disadvantaged_communities', 'fire', 'rxburn']

# Reuse the in-memory merged layer when this session built it; otherwise read
# it back from the temp file.
if 'gdf' not in locals():
    gdf_stats = gpd.read_parquet(ca_temp_parquet)
else:
    gdf_stats = gdf

# Index on the column used by DataFrame.join() below.
gdf_stats = gdf_stats.set_index('id')

for file, name in zip(vectors, names):
    vec = gpd.read_parquet(file + '.parquet')  # load the vector data layer

    # Each filter result is copied explicitly so that adding the `value`
    # column modifies an independent frame, not a slice of the loaded layer
    # (avoids pandas' SettingWithCopyWarning).
    if name in ['fire', 'rxburn']:
        # only the 10-year range is wanted for fire
        vec = vec[vec['YEAR_'] >= 2013].copy()
        vec['value'] = 1  # used in overlap calculation, 1 = fire occurred
    elif name == 'svi':
        # CA only (the layer is nationwide); -999 marks empty values
        vec = vec[(vec['STATE'] == "California") & (vec['RPL_THEMES'] != -999)].copy()
        vec['value'] = vec['RPL_THEMES']  # overlap is weighted by the SVI index
    elif name == 'disadvantaged_communities':
        # CA only, and only disadvantaged communities
        vec = vec[(vec['StateName'] == "California") & (vec['Disadvan'] == 1)].copy()
        vec['value'] = 1  # used in overlap calculation, 1 = disadvantaged

    # Change projection to match CA Nature data (writes file + '-epsg3310.parquet').
    reproject_vectors(file, vec)

    # Compute zonal stats against the reprojected layer.
    vector_stats = vector_vector_stats(ca_temp_parquet, file + '-epsg3310.parquet')
    vector_stats = vector_stats.rename(columns={'overlap_fraction': name})

    # Attach the new zonal-stats column to the CA Nature data.
    # NOTE(review): this join assumes `id` is unique in the merged layer;
    # duplicate ids would multiply rows -- confirm.
    gdf_stats = gdf_stats.join(vector_stats.set_index('id'))

gdf_stats = gdf_stats.reset_index()
gdf_stats.to_parquet(ca_parquet)  # save CA Nature + zonal stats

#### Function: Reproject raster data

In [None]:
import subprocess

def raster_reprojection(input_file, output_file, epsg="EPSG:3310"):
    """Reproject a raster by invoking the ``gdalwarp`` command-line tool.

    Runs ``gdalwarp -t_srs <epsg> <input_file> <output_file>`` and prints
    whether it succeeded. A non-zero exit status is reported with ``print``
    and not re-raised.

    Args:
        input_file: Path of the raster to reproject.
        output_file: Path the reprojected raster is written to.
        epsg: Target spatial reference passed to ``-t_srs``.

    Returns:
        None.
    """
    gdalwarp_args = ["gdalwarp", "-t_srs", epsg, input_file, output_file]
    try:
        # check=True turns a non-zero exit status into CalledProcessError
        subprocess.run(gdalwarp_args, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred during reprojection: {e}")
    else:
        print(f"Reprojection successful! Output saved to: {output_file}")

#### Compute zonal stats with raster data

In [None]:
%%time
import rasterio
from exactextract import exact_extract

# Raster layers (file stems) and the output column name for each.
rasters = [irrecoverable_c, manageable_c, richness, rsr]
names = ['irrecoverable_carbon','manageable_carbon','richness','rsr']

# Read the data back from disk if an earlier cell hasn't created it in this session.
if 'gdf_stats' not in locals():
    gdf_stats = gpd.read_parquet(ca_parquet)

# exact_extract() reads the polygons from the parquet file, and a column named
# 'id' conflicts with a raster field, so rename it and re-save before extracting.
gdf_stats = gdf_stats.rename(columns={'id': 'ca_id'})
gdf_stats.to_parquet(ca_parquet)

# Name of the "mean" column exact_extract() returns for each layer:
# the carbon rasters have multiple bands (we want band 1), the others have one band.
mean_column = {
    'irrecoverable_carbon': 'band_1_mean',
    'manageable_carbon': 'band_1_mean',
    'richness': 'mean',
    'rsr': 'mean',
}

for file, name in zip(rasters, names):
    # Reproject the raster to match CA Nature.
    reprojected_tif = file + '_epsg3310.tif'
    raster_reprojection(file + '.tif', reprojected_tif)

    # Zonal stats: mean raster value per polygon, keyed by ca_id.
    raster_stats = exact_extract(
        reprojected_tif, ca_parquet, ["mean"], include_cols=["ca_id"], output='pandas'
    )

    # Keep only the id and the mean, renamed to this layer's column name.
    source_col = mean_column[name]
    raster_stats = raster_stats[['ca_id', source_col]]
    raster_stats = raster_stats.rename(columns={source_col: name})
    raster_stats[name] = raster_stats[name].round(3)  # rounding stats

    # DataFrame.join() matches on the index; reset it afterwards so `ca_id`
    # is a regular column again for the next iteration.
    joined = gdf_stats.set_index("ca_id").join(raster_stats.set_index("ca_id"))
    gdf_stats = joined.reset_index()

# Finished with exact_extract(), so revert to the "id" column name.
gdf_stats = gdf_stats.rename(columns={'ca_id': 'id'})

# Reproject to epsg:4326 since that's what pmtiles requires and we want to match that.
gdf_stats = gdf_stats.to_crs("epsg:4326")
gdf_stats.to_parquet(ca_parquet)  # save results

# Step 5: Upload file + Generate PMTiles

In [None]:
from cng.utils import hf_upload, s3_cp,set_secrets, to_pmtiles

# Publish the final parquet to Hugging Face and to the minio bucket.
hf_upload('ca-30x30.parquet', ca_parquet)
s3_cp(ca_parquet, "s3://public-ca30x30/ca-30x30.parquet", "minio")

# PMTiles can't be built straight from parquet: go parquet -> geojson -> pmtiles.
# All rows are exported; a filter on `status != 'non-conserved'` could be added
# here to leave the non-conserved polygons out of the tiles.
ca_geojson = con.read_parquet(ca_parquet)

geojson_file = path + 'ca-30x30.geojson'
ca_geojson.execute().to_file(geojson_file)
pmtiles = to_pmtiles(
    geojson_file,
    ca_pmtiles,
    options=['--extend-zooms-if-still-dropping'],
)

# Publish the PMTiles to Hugging Face and to the minio bucket.
hf_upload('ca-30x30.pmtiles', ca_pmtiles)
s3_cp(ca_pmtiles, "s3://public-ca30x30/ca-30x30.pmtiles", "minio")