cboettig commited on
Commit
8bd32d3
·
1 Parent(s): 4c44a48
Files changed (4) hide show
  1. app.py +187 -181
  2. pad-stats.parquet +3 -0
  3. pad.duckdb +3 -0
  4. preprocess.py +164 -151
app.py CHANGED
@@ -11,7 +11,6 @@
11
  # See the License for the specific language governing permissions and
12
  # limitations under the License.
13
 
14
- # +
15
  import leafmap.foliumap as leafmap
16
  import streamlit as st
17
  import altair as alt
@@ -19,73 +18,62 @@ import ibis
19
  from ibis import _
20
  import ibis.selectors as s
21
 
22
- # defaults
23
  private_color = "#DE881E" # orange #"#850101" # red
24
  tribal_color = "#BF40BF" # purple
25
  mixed_color = "#005a00" # green
26
  public_color = "#3388ff" # blue
27
 
 
28
  low = 2
29
  high = 3
30
  alpha = .5
31
  style_choice = "Manager Type"
 
32
 
33
 
34
- # +
35
  st.set_page_config(layout="wide", page_title="Protected Areas Explorer", page_icon=":globe:")
36
 
37
  '''
38
  # US Protected Area Database Explorer
39
 
40
  '''
41
- # -
42
-
43
- pad_pmtiles = "https://data.source.coop/cboettig/pad-us-3/pad-mobi.pmtiles"
44
-
45
- # +
46
- #parquet = "https://data.source.coop/cboettig/pad-us-3/pad-mobi.parquet"
47
- parquet = "https://minio.carlboettiger.info/public-biodiversity/pad-us-3/pad-mobi.parquet"
48
-
49
- @st.cache_resource
50
- def ibis_connection(parquet):
51
- return ibis.read_parquet(parquet)
52
-
53
- pad_data = ibis_connection(parquet)
54
 
 
 
 
 
55
 
56
- # +
57
- @st.cache_data
58
- def pad_stats(_pad_data):
59
- return (_pad_data
60
- .aggregate(min_richness = _.richness.min(),
61
- max_richness = _.richness.max(),
62
- mean_richness = _.richness.mean(),
63
- sd_richness = _.richness.std(),
64
- min_rsr = _.rsr.min(),
65
- max_rsr = _.rsr.max(),
66
- mean_rsr = _.rsr.mean(),
67
- sd_rsr = _.rsr.std())
68
- .to_pandas()
69
- )
70
-
71
- stats = pad_stats(pad_data)
72
- upper_rsr = stats["mean_rsr"][0] + stats["sd_rsr"][0]
73
- upper_richness = stats["mean_richness"][0] + stats["sd_richness"][0]
74
- # -
75
-
76
- m = leafmap.Map(center=[35, -100], zoom=4, layers_control=True, fullscreen_control=True)
77
 
78
- metadata = leafmap.pmtiles_metadata(pad_pmtiles)
79
- #print(f"layer names: {metadata['layer_names']}")
80
 
81
- # +
82
  custom_style = '''
83
  "blue"
84
  '''
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  manager = {
88
- 'property': 'bucket',
89
  'type': 'categorical',
90
  'stops': [
91
  ['public', public_color],
@@ -95,7 +83,7 @@ manager = {
95
  ]
96
  }
97
  easement = {
98
- 'property': 'FeatClass',
99
  'type': 'categorical',
100
  'stops': [
101
  ['Fee', public_color],
@@ -104,35 +92,43 @@ easement = {
104
  ]
105
  }
106
 
 
 
 
 
 
 
 
 
 
 
 
107
  gap = {
108
- 'property': 'GAP_Sts',
109
  'type': 'categorical',
110
  'stops': [
111
- ['1', "#26633d"],
112
- ['2', "#879647"],
113
- ['3', "#BBBBBB"],
114
- ['4', "#F8F8F8"]
115
  ]
116
  }
117
 
118
  iucn = {
119
- 'property': 'IUCN_Cat',
120
  'type': 'categorical',
121
  'stops': [
122
- ["Ia", "#4B0082"],
123
- ["Ib", "#663399"],
124
- ["II", "#7B68EE"],
125
- ["III", "#9370DB"],
126
- ["IV", "#8A2BE2"],
127
- ["V", "#9932CC"],
128
- ["VI", "#9400D3"],
129
- ["Other Conservation Area", "#DDA0DD"],
130
- ["Unassigned", "#F8F8F8"]
131
  ]
132
- }
133
-
134
-
135
-
136
 
137
  thresholds = ['case',
138
  ['<', ['get', 'richness'], low],
@@ -156,7 +152,6 @@ rsr = ["interpolate",
156
  0.006, "#850101"
157
  ]
158
 
159
-
160
  def pad_style(paint, alpha):
161
  return {
162
  "version": 8,
@@ -168,7 +163,7 @@ def pad_style(paint, alpha):
168
  "layers": [{
169
  "id": "public",
170
  "source": "pad",
171
- "source-layer": "pad-mobi",
172
  "type": "fill",
173
  "paint": {
174
  "fill-color": paint,
@@ -201,6 +196,7 @@ with st.sidebar:
201
  "IUCN Status Code": iucn,
202
  "Manager Type": manager,
203
  "Fee/Easement": easement,
 
204
  "Mean Richness": richness,
205
  "Mean RSR": rsr,
206
  "custom": eval(custom)}
@@ -244,6 +240,20 @@ with st.sidebar:
244
  hi="https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
245
  m.add_cog_layer(hi, palette="purples", name="Human Impact", transparent_bg=True, opacity = 0.8, zoom_to_layer=False)
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  if st.toggle("Custom map layers"):
248
 
249
  code = st.text_area(label = "leafmap code:",
@@ -280,162 +290,158 @@ with st.sidebar:
280
  "source": "source1",
281
  "source-layer": "mtbs_perims_DD",
282
  "type": "fill",
283
- "paint": {"fill-color": "#FFA500", "fill-opacity": 0.2}}]}
284
  m.add_pmtiles(usgs, name="Fire", style=combined_style, overlay=True, show=True, zoom_to_layer=False)
285
 
286
- # +
287
- # And here we go!
288
- m.to_streamlit(height=700)
289
- # -
290
-
291
-
292
  select_column = {
293
- "GAP Status Code": "GAP_Sts",
294
- "IUCN Status Code": "IUCN_Cat",
295
- "Manager Type": "bucket",
296
- "Fee/Easement": "FeatClass",
297
- "Mean Richness": "bucket",
298
- "Mean RSR": "bucket",
299
- "custom": "bucket"}
 
300
  column = select_column[style_choice]
301
 
 
302
  select_colors = {
303
  "GAP Status Code": gap["stops"],
304
  "IUCN Status Code": iucn["stops"],
305
  "Manager Type": manager["stops"],
306
  "Fee/Easement": easement["stops"],
 
307
  "Mean Richness": manager["stops"],
308
  "Mean RSR": manager["stops"],
309
- "custom": manager["stops"]}
310
- colors = ibis.memtable(select_colors[style_choice], columns = [column, "color"]).to_pandas()
311
-
312
- st.divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
 
 
314
 
315
- # +
316
- us_lower_48_area_m2 = 7.8e+12
317
 
318
- @st.cache_data()
319
- def summary_table(column = column, colors = colors):
320
- df = (pad_data.
321
- group_by(_[column]).
322
- aggregate(percent_protected = 100 * _.area.sum() / us_lower_48_area_m2,
323
- mean_richness = (_.richness * _.area).sum() / _.area.sum(),
324
- mean_rsr = (_.rsr * _.area).sum() / _.area.sum()
325
- ).
326
- mutate(percent_protected = _.percent_protected.round())
327
- ).inner_join(colors, column)
328
- return df.to_pandas()
329
-
330
- df = summary_table(column, colors)
331
- total_percent = df.percent_protected.sum()
332
-
333
-
334
- base = alt.Chart(df).encode(
335
- alt.Theta("percent_protected:Q").stack(True),
336
- alt.Color("color:N").scale(None).legend(None)
337
- )
338
 
339
- area_chart = (
340
- base.mark_arc(innerRadius=50, outerRadius=120) +
341
- base.mark_text(radius=165, size=20).encode(text=column) +
342
- base.mark_text(radius=135, size=20).encode(text="percent_protected:N")
343
- )
344
 
345
- # area_chart
346
 
347
- # +
348
- richness_chart = alt.Chart(df).mark_bar().encode(
349
- x=column,
350
- y='mean_richness',
351
- color=alt.Color('color').scale(None)
352
- )
353
 
 
 
 
 
 
 
 
354
 
355
- # +
356
- rsr_chart = alt.Chart(df).mark_bar().encode(
357
- x=column,
358
- y='mean_rsr',
359
- color=alt.Color('color').scale(None)
360
- )
361
- # -
362
 
363
- '''
364
- ## Summary Statistics
 
 
 
365
 
366
- '''
367
 
368
- # +
369
- col1, col2, col3 = st.columns(3)
370
 
371
- with col1:
372
- f"#### {total_percent} Percent of Continental US Area Covered"
373
- st.altair_chart(area_chart, use_container_width=True)
374
-
375
- # -
376
 
377
- with col2:
378
- "#### Mean Species Richness"
379
- st.altair_chart(richness_chart, use_container_width=True)
380
 
 
 
 
381
 
382
- with col3:
383
- "#### Mean Range-Size Rarity"
384
- st.altair_chart(rsr_chart, use_container_width=True)
385
 
386
- # +
387
- '''
388
- ## Custom queries
389
 
390
- Input custom python code below to interactively explore the data.
391
 
392
- '''
393
 
394
- col2_1, col2_2 = st.columns(2)
395
 
 
 
 
 
 
396
 
397
- sample_q = '''(
398
- ibis.read_parquet('https://minio.carlboettiger.info/public-biodiversity/pad-us-3/pad-mobi.parquet').
399
- group_by(_.bucket).
400
- aggregate(percent_protected = 100 * _.area.sum() / us_lower_48_area_m2,
401
- mean_richness = (_.richness * _.area).sum() / _.area.sum(),
402
- mean_rsr = (_.rsr * _.area).sum() / _.area.sum()
403
- ).
404
- mutate(percent_protected = _.percent_protected.round())
405
- )
406
- '''
407
-
408
- with col2_1:
409
- query = st.text_area(
410
- label = "Python code:",
411
- value = sample_q,
412
- height = 300)
413
 
414
- with col2_2:
415
- "Output table:"
416
- df = eval(query)
417
- st.write(df.to_pandas())
418
 
 
 
419
 
420
- # +
 
421
 
422
- '''
423
- ## Credits
424
 
425
- Author: Carl Boettiger, UC Berkeley
426
- License: BSD-2-clause
 
 
 
427
 
428
- ### Data sources
429
 
430
- - US Protected Areas Database v3 by USGS, data hosted on https://beta.source.coop/cboettig/us-pad-3. Citation: https://doi.org/10.5066/P9Q9LQ4B, License: Public Domain
431
- - Carbon-loss by Vizzuality, on https://beta.source.coop/repositories/vizzuality/lg-land-carbon-data. Citation: https://doi.org/10.1101/2023.11.01.565036, License: CC-BY
432
- - Human Footprint by Vizzuality, on https://beta.source.coop/repositories/vizzuality/hfp-100. Citation: https://doi.org/10.3389/frsen.2023.1130896, License: Public Domain
433
- - Fire polygons by USGS, reprocessed to PMTiles on https://beta.source.coop/cboettig/fire/. License: Public Domain.
434
- - Irrecoverable Carbon from Conservation International, reprocessed to COG on https://beta.source.coop/cboettig/carbon, citation: https://doi.org/10.1038/s41893-021-00803-6, License: CC-BY-NC
435
-
436
- ### Software
437
-
438
- Proudly built with a free and Open Source software stack: Streamlit (reactive application), HuggingFace (application hosting), Source.Coop (data hosting),
439
- using cloud-native data serializations in COG, PMTiles, and GeoParquet. Coded in pure python using leafmap and duckdb. Map styling with [MapLibre](https://maplibre.org/).
440
- '''
441
 
 
11
  # See the License for the specific language governing permissions and
12
  # limitations under the License.
13
 
 
14
  import leafmap.foliumap as leafmap
15
  import streamlit as st
16
  import altair as alt
 
18
  from ibis import _
19
  import ibis.selectors as s
20
 
21
+ # defaults, consider user palette via st.color_picker()
22
  private_color = "#DE881E" # orange #"#850101" # red
23
  tribal_color = "#BF40BF" # purple
24
  mixed_color = "#005a00" # green
25
  public_color = "#3388ff" # blue
26
 
27
+ # default color breaks, consider tool via st.slider()
28
  low = 2
29
  high = 3
30
  alpha = .5
31
  style_choice = "Manager Type"
32
+ us_lower_48_area_m2 = 7.8e+12
33
 
34
 
 
35
  st.set_page_config(layout="wide", page_title="Protected Areas Explorer", page_icon=":globe:")
36
 
37
  '''
38
  # US Protected Area Database Explorer
39
 
40
  '''
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ #pad_pmtiles = "https://data.source.coop/cboettig/pad-us-3/pad-stats.pmtiles"
43
+ #parquet = "https://data.source.coop/cboettig/pad-us-3/pad-stats.parquet"
44
+ pad_pmtiles = "https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/pad-stats.pmtiles"
45
+ parquet = "https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/pad-stats.parquet"
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ m = leafmap.Map(center=[35, -100], zoom=4, layers_control=True)
 
49
 
 
50
  custom_style = '''
51
  "blue"
52
  '''
53
 
54
 
55
+ sample_q = '''(
56
+ ibis.read_parquet('https://minio.carlboettiger.info/public-biodiversity/pad-us-3/pad-mobi.parquet').
57
+ group_by(_.bucket).
58
+ aggregate(percent_protected = 100 * _.area.sum() / us_lower_48_area_m2,
59
+ mean_richness = (_.richness * _.area).sum() / _.area.sum(),
60
+ mean_rsr = (_.rsr * _.area).sum() / _.area.sum()
61
+ ).
62
+ mutate(percent_protected = _.percent_protected.round())
63
+ )
64
+ '''
65
+
66
+ def bar_chart(df, x, y):
67
+ chart = alt.Chart(df).mark_bar().encode(
68
+ x=x,
69
+ y=y,
70
+ color=alt.Color('color').scale(None)
71
+ ).properties(width="container", height=200)
72
+ return chart
73
+
74
+
75
  manager = {
76
+ 'property': 'manager_group',
77
  'type': 'categorical',
78
  'stops': [
79
  ['public', public_color],
 
83
  ]
84
  }
85
  easement = {
86
+ 'property': 'category',
87
  'type': 'categorical',
88
  'stops': [
89
  ['Fee', public_color],
 
92
  ]
93
  }
94
 
95
+ access = {
96
+ 'property': 'public_access',
97
+ 'type': 'categorical',
98
+ 'stops': [
99
+ ['Open Access', public_color],
100
+ ['Closed', private_color],
101
+ ['Unknown', "grey"],
102
+ ['Restricted Access', tribal_color]
103
+ ]
104
+ }
105
+
106
  gap = {
107
+ 'property': 'gap_code',
108
  'type': 'categorical',
109
  'stops': [
110
+ [1, "#26633d"],
111
+ [2, "#879647"],
112
+ [3, "#BBBBBB"],
113
+ [4, "#F8F8F8"]
114
  ]
115
  }
116
 
117
  iucn = {
118
+ 'property': 'iucn_category',
119
  'type': 'categorical',
120
  'stops': [
121
+ ["Ia: Strict nature reserves", "#4B0082"],
122
+ ["Ib: Wilderness areas", "#663399"],
123
+ ["II: National park", "#7B68EE"],
124
+ ["III: Natural monument or feature", "#9370DB"],
125
+ ["IV: Habitat / species management", "#8A2BE2"],
126
+ ["V: Protected landscape / seascape", "#9932CC"],
127
+ ["VI: Protected area with sustainable use of natural resources", "#9400D3"],
128
+ ["Other Conservation Area", "#DDA0DD"],
129
+ ["Unassigned", "#F8F8F8"],
130
  ]
131
+ }
 
 
 
132
 
133
  thresholds = ['case',
134
  ['<', ['get', 'richness'], low],
 
152
  0.006, "#850101"
153
  ]
154
 
 
155
  def pad_style(paint, alpha):
156
  return {
157
  "version": 8,
 
163
  "layers": [{
164
  "id": "public",
165
  "source": "pad",
166
+ "source-layer": "pad-stats",
167
  "type": "fill",
168
  "paint": {
169
  "fill-color": paint,
 
196
  "IUCN Status Code": iucn,
197
  "Manager Type": manager,
198
  "Fee/Easement": easement,
199
+ "Public Access": access,
200
  "Mean Richness": richness,
201
  "Mean RSR": rsr,
202
  "custom": eval(custom)}
 
240
  hi="https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
241
  m.add_cog_layer(hi, palette="purples", name="Human Impact", transparent_bg=True, opacity = 0.8, zoom_to_layer=False)
242
 
243
+ if st.toggle("cropland expansion"):
244
+ m.add_cog_layer("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif",
245
+ palette="greens", name="cropland expansion", transparent_bg=True, opacity = 0.8, zoom_to_layer=False)
246
+
247
+ if st.toggle("Biodiversity Intactness Loss"):
248
+ m.add_cog_layer("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif",
249
+ palette="reds", name="biodiversity intactness loss", transparent_bg=True, opacity = 0.8, zoom_to_layer=False)
250
+
251
+ if st.toggle("Forest Integrity Loss"):
252
+ m.add_cog_layer("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif",
253
+ palette="reds", name="forest integrity loss", transparent_bg=True, opacity = 0.8, zoom_to_layer=False)
254
+
255
+
256
+
257
  if st.toggle("Custom map layers"):
258
 
259
  code = st.text_area(label = "leafmap code:",
 
290
  "source": "source1",
291
  "source-layer": "mtbs_perims_DD",
292
  "type": "fill",
293
+ "paint": {"fill-color": "#FFA500", "fill-opacity": 0.4}}]}
294
  m.add_pmtiles(usgs, name="Fire", style=combined_style, overlay=True, show=True, zoom_to_layer=False)
295
 
296
+ # Map radio buttons to corresponding column:
 
 
 
 
 
297
  select_column = {
298
+ "GAP Status Code": "gap_code",
299
+ "IUCN Status Code": "iucn_category",
300
+ "Manager Type": "manager_group",
301
+ "Fee/Easement": "category",
302
+ "Public Access": "public_access",
303
+ "Mean Richness": "manager_group",
304
+ "Mean RSR": "manager_group",
305
+ "custom": "gap_code"}
306
  column = select_column[style_choice]
307
 
308
+ # Map radio buttons to corresponding color-scheme:
309
  select_colors = {
310
  "GAP Status Code": gap["stops"],
311
  "IUCN Status Code": iucn["stops"],
312
  "Manager Type": manager["stops"],
313
  "Fee/Easement": easement["stops"],
314
+ "Public Access": access["stops"],
315
  "Mean Richness": manager["stops"],
316
  "Mean RSR": manager["stops"],
317
+ "custom": manager["stops"]}
318
+ colors = (ibis
319
+ .memtable(select_colors[style_choice], columns = [column, "color"])
320
+ .to_pandas()
321
+ )
322
+
323
+
324
+ main = st.container()
325
+
326
+ with main:
327
+ map_col, stats_col = st.columns([2,1])
328
+
329
+ with map_col:
330
+ m.to_streamlit(height=700)
331
+
332
+
333
+ @st.cache_resource
334
+ def ibis_connection(parquet):
335
+ return ibis.read_parquet(parquet)
336
+ pad_data = ibis_connection(parquet)
337
+
338
+ @st.cache_data()
339
+ def summary_table(column = column, colors = colors):
340
+ df = (pad_data
341
+ .rename(area = "area_square_meters")
342
+ .group_by(_[column])
343
+ .aggregate(percent_protected = 100 * _.area.sum() / us_lower_48_area_m2,
344
+ mean_richness = (_.richness * _.area).sum() / _.area.sum(),
345
+ mean_rsr = (_.rsr * _.area).sum() / _.area.sum(),
346
+ carbon_lost = (_.deforest_carbon * _.area).sum() / _.area.sum(),
347
+ crop_expansion = (_.crop_expansion * _.area).sum() / _.area.sum(),
348
+ human_impact = (_.human_impact * _.area).sum() / _.area.sum(),
349
+ )
350
+ .mutate(percent_protected = _.percent_protected.round())
351
+ .inner_join(colors, column)
352
+ )
353
+ df = df.to_pandas()
354
+ df[column] = df[column].astype(str)
355
+ return df
356
 
357
+ df = summary_table(column, colors)
358
+ total_percent = df.percent_protected.sum()
359
 
 
 
360
 
361
+ base = alt.Chart(df).encode(
362
+ alt.Theta("percent_protected:Q").stack(True),
363
+ alt.Color("color:N").scale(None).legend(None)
364
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
+ area_chart = (
367
+ base.mark_arc(innerRadius=40, outerRadius=70)
368
+ ).properties(width=180, height=180)
 
 
369
 
 
370
 
371
+ richness_chart = bar_chart(df, column, 'mean_richness')
372
+ rsr_chart = bar_chart(df, column, 'mean_rsr')
373
+ carbon_lost = bar_chart(df, column, 'carbon_lost')
374
+ crop_expansion = bar_chart(df, column, 'crop_expansion')
375
+ human_impact = bar_chart(df, column, 'human_impact')
 
376
 
377
+ with stats_col:
378
+ col1, col2, col3 = st.columns(3)
379
+ with col1:
380
+ f"{total_percent}% Continental US Covered"
381
+ st.altair_chart(area_chart, use_container_width=False)
382
+ "Carbon Lost ('02-'22)"
383
+ st.altair_chart(carbon_lost, use_container_width=True)
384
 
385
+ with col2:
386
+ "Species Richness"
387
+ st.altair_chart(richness_chart, use_container_width=True)
388
+ "Crop expansion"
389
+ st.altair_chart(crop_expansion, use_container_width=True)
 
 
390
 
391
+ with col3:
392
+ "Range-Size Rarity"
393
+ st.altair_chart(rsr_chart, use_container_width=True)
394
+ "Human Impact"
395
+ st.altair_chart(human_impact, use_container_width=True)
396
 
 
397
 
398
+ st.divider()
 
399
 
400
+ footer = st.container()
 
 
 
 
401
 
 
 
 
402
 
403
+ with footer:
404
+ '''
405
+ ## Custom queries
406
 
407
+ Input custom python code below to interactively explore the data.
 
 
408
 
409
+ '''
 
 
410
 
411
+ col2_1, col2_2 = st.columns(2)
412
 
 
413
 
 
414
 
415
+ with col2_1:
416
+ query = st.text_area(
417
+ label = "Python code:",
418
+ value = sample_q,
419
+ height = 300)
420
 
421
+ with col2_2:
422
+ "Output table:"
423
+ df = eval(query)
424
+ st.write(df.to_pandas())
 
 
 
 
 
 
 
 
 
 
 
 
425
 
426
+ st.divider()
 
 
 
427
 
428
+ '''
429
+ ## Credits
430
 
431
+ Author: Carl Boettiger, UC Berkeley
432
+ License: BSD-2-clause
433
 
434
+ ### Data sources
 
435
 
436
+ - US Protected Areas Database v3 by USGS, data hosted on https://beta.source.coop/cboettig/us-pad-3. Citation: https://doi.org/10.5066/P9Q9LQ4B, License: Public Domain
437
+ - Carbon-loss by Vizzuality, on https://beta.source.coop/repositories/vizzuality/lg-land-carbon-data. Citation: https://doi.org/10.1101/2023.11.01.565036, License: CC-BY
438
+ - Human Footprint by Vizzuality, on https://beta.source.coop/repositories/vizzuality/hfp-100. Citation: https://doi.org/10.3389/frsen.2023.1130896, License: Public Domain
439
+ - Fire polygons by USGS, reprocessed to PMTiles on https://beta.source.coop/cboettig/fire/. License: Public Domain.
440
+ - Irrecoverable Carbon from Conservation International, reprocessed to COG on https://beta.source.coop/cboettig/carbon, citation: https://doi.org/10.1038/s41893-021-00803-6, License: CC-BY-NC
441
 
442
+ ### Software
443
 
444
+ Proudly built with a free and Open Source software stack: Streamlit (reactive application), HuggingFace (application hosting), Source.Coop (data hosting),
445
+ using cloud-native data serializations in COG, PMTiles, and GeoParquet. Coded in pure python using leafmap and duckdb. Map styling with [MapLibre](https://maplibre.org/).
446
+ '''
 
 
 
 
 
 
 
 
447
 
pad-stats.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9585c3c27d05039ff30faae0c9f5244a1a061e722b47ba30b2e45c4f51df8dc
3
+ size 882042199
pad.duckdb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f01cd8a18be80c44cea7b5cb7f731a50cbc67c6285ddb80c8ae195adf7a770
3
+ size 1079783424
preprocess.py CHANGED
@@ -1,21 +1,29 @@
1
- # +
2
  import ibis
 
3
  from ibis import _
4
- import xarray
5
- from shapely.geometry import box
6
- from geocube.api.core import make_geocube
7
- import geopandas
8
- import fiona
9
 
10
- import multiprocessing.popen_spawn_posix
11
- from dask.distributed import Client, LocalCluster, Lock
12
- import rioxarray
13
 
 
 
 
 
 
 
14
 
15
 
16
  # +
17
- fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
18
- parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 
 
 
 
 
 
 
19
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
20
 
21
  # fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
@@ -27,24 +35,13 @@ nrow = len(meta)
27
  r = rioxarray.open_rasterio(cog)
28
  bounds = box(*r.rio.transform_bounds(crs))
29
 
30
- # +
31
- #import leafmap
32
- #leafmap.cog_validate(cog)
33
- # -
34
-
35
- con = ibis.duckdb.connect()
36
- # We could just read the flatgeobuf with ibis.read_geo() but it is not as fast as working with the (Geo)Parquet
37
- # pad = con.read_geo(fgb)
38
-
39
- # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
40
- con.load_extension("spatial")
41
- con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
42
- pad = con.table("pad")
43
-
44
  # +
45
  # Now we can do all the usual SQL queries to subset the data. Note the `geom.within()` spatial filter!
46
- focal_columns = ["bucket", "FeatClass", "Mang_Name", "Mang_Type", "Des_Tp",
47
- "Pub_Access", "GAP_Sts", "IUCN_Cat", "Unit_Nm", "geom"]
 
 
 
48
  public = ["DIST", "LOC", "FED", "STAT", "JNT"]
49
 
50
  case = (
@@ -56,171 +53,187 @@ case = (
56
  .end()
57
  )
58
 
59
- pad_labeled = (
60
- pad.
61
- filter((_.FeatClass.isin(["Easement", "Fee"])) | (
62
  (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
63
- ).
64
- filter(_.Mang_Type.notin(["UNK", "TERR"])).
65
- filter(_.geom.within(bounds)).
66
- mutate(GAP_Sts = _.GAP_Sts.cast("int")).
67
- mutate(bucket = case).
68
- select(focal_columns).
69
- mutate(row_n=ibis.row_number())
 
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # +
74
- # # %%time
75
- # smoke test -- only the lower 48 states!
76
- # (pad.filter(_.geom.within(bounds)).group_by([_.State_Nm]).aggregate(n = _.count()).to_pandas())
77
- # -
 
 
 
78
 
79
- # Or be bold!
80
- df = pad_labeled.to_pandas()
81
- geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
82
- geo.to_parquet("pad-filtered.parquet")
83
-
84
-
85
- def zonal_stats(cog, geo, band_name = "mean", row_n = "row_n"):
86
- # https://corteva.github.io/geocube/html/examples/zonal_statistics.html
87
- raster = (rioxarray.
88
- open_rasterio('/vsicurl/'+cog, masked=True, chunks=True, lock=False).
89
- rio.clip_box(*geo.total_bounds, crs=geo.crs).
90
- rio.clip(geo.geometry.values, crs=geo.crs, from_disk=True).
91
- sel(band=1).drop_vars("band")
92
- )
93
- out_grid = make_geocube(
94
- vector_data=geo,
95
- measurements=[row_n],
96
- like=raster, # ensure the data are on the same grid
97
- )
98
- # merge the two together
99
- out_grid["values"] = (raster.dims, raster.values, raster.attrs, raster.encoding)
100
- grouped_raster = out_grid.drop_vars("spatial_ref").groupby(out_grid.row_n)
101
- # can add other stats
102
- grid_mean = grouped_raster.mean().rename({"values": band_name})
103
- zonal_stats = xarray.merge([grid_mean]).to_dataframe()
104
- geo = geo.merge(zonal_stats, how="left", on=row_n)
105
- return geo
106
-
107
-
108
- import numpy as np
109
- # consider doing multiple cogs per slice
110
- def piecewise_zonal2(cog, geo, band_name = "mean", dirname = "pad_parquet", n = 10000, row_n = "row_n"):
111
- total = len(geo)
112
- for i in range(0,total,n):
113
- k = i // n
114
- path = f"{dirname}/part_{k}.parquet"
115
- print(f"processing {path}")
116
- end = np.min([i + n,total])
117
- geo_slice = geo.iloc[i:end]
118
- geo_slice = zonal_stats(cog, geo_slice, band_name, row_n)
119
- geo_slice.to_parquet(path)
120
 
 
121
 
122
- # %%time
123
- piecewise_zonal2(cog, geo, "richness", dirname = "pad_mobi", n = 50000) # 6 min
 
 
124
 
 
125
 
126
- # # Manual approach
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- # +
129
- import geopandas
130
- import multiprocessing.popen_spawn_posix
131
- from dask.distributed import Client, LocalCluster, Lock
132
- import rioxarray
133
 
134
- geo = geopandas.read_parquet("pad_mobi") # ~ 4.8 GB RAM
135
 
136
  # +
137
  # %%time
138
- band_name = "human_impact"
139
- row_n = "row_n"
140
- cog = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
141
 
142
- raster = (rioxarray.
143
- open_rasterio('/vsicurl/'+cog, masked=True, chunks=True, lock=False).
144
- rio.clip_box(*geo.total_bounds, crs=geo.crs).
145
- rio.clip(geo.geometry.values, geo.crs, from_disk=True).
146
- sel(band=1).drop_vars("band")
147
- )
148
 
149
  # +
150
  # %%time
151
 
152
- band_name = "human_impact"
153
- row_n = "row_n"
154
- cog = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
155
 
156
-
157
- with LocalCluster() as cluster, Client(cluster) as client:
158
- raster = (rioxarray.
159
- open_rasterio('/vsicurl/'+cog, masked=True, chunks=True, lock=False).
160
- rio.clip(geo.geometry.values, geo.crs, from_disk=True).
161
- sel(band=1).drop_vars("band")
162
- )
163
 
164
  # +
165
  # %%time
166
 
167
- out_grid = make_geocube(
168
- vector_data=geo,
169
- measurements=['row_n'],
170
- like=raster, # ensure the data are on the same grid
171
- )
172
- # ~ +1 Gb, 1.2s
173
 
174
  # +
175
  # %%time
176
- # 100 ~ 30s, 1000 ~ 30s
177
 
178
- out_grid["values"] = (raster.dims, raster.values, raster.attrs, raster.encoding)
179
- grouped_raster = out_grid.drop_vars("spatial_ref").groupby(out_grid.row_n) # ~ +3 Gb
 
 
 
180
 
181
  # +
182
  # %%time
183
- grid_mean = grouped_raster.mean().rename({"values": band_name})
184
- zonal_stats = xarray.merge([grid_mean]).to_dataframe()
185
- geo = geo.merge(zonal_stats, how="left", on=row_n)
186
- geo.to_parquet("test.parquet")
187
- len(geo)
188
 
189
- # 1.2 s
190
- # -
191
 
 
 
192
 
 
 
193
 
194
- # Lastly we need to convert to PMTiles:
195
- #
196
- # ```
197
- # ogr2ogr -dsco MAX_SIZE=90000000 -dsco MAX_FEATURES=50000000 -dsco MAXZOOM=10 pad-mobi.pmtiles pad-mobi.parquet
198
- # ```
199
 
200
- geo.plot(column="richness", legend=True)
 
201
 
202
- import geopandas
203
- gdf = geopandas.read_parquet("pad-mobi.parquet")
204
- cog = "https://data.source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif"
 
 
205
 
 
 
206
 
207
- human_impacts_2021 = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
208
- geo = zonal_stats(human_impacts_2021, geo, "human_impacts_2021")
209
 
 
 
210
 
211
- geo.to_parquet("pad-extended.parquet")
 
212
 
213
  # +
214
- # %%time
215
- geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif", geo, "deforest_carbon")
216
- geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif", geo, "fii")
217
- geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif", geo, "bii")
218
- geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif", geo, "crop_expansion")
219
- geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif", geo, "crop_reduction")
220
-
221
- geo = zonal_stats("https://data.source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif", geo, "irrecoverable_c_total_2018")
222
- geo = zonal_stats("https://data.source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif", geo, "manageable_c_total_2018")
223
-
224
- cog = "https://data.source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif"
225
-
226
- geo.to_parquet("pad-extended.parquet")
 
 
1
  import ibis
2
+ import ibis.selectors as s
3
  from ibis import _
 
 
 
 
 
4
 
5
+ # +
6
+ fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
7
+ parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
8
 
9
+ #pad = ibis.read_parquet(parquet)
10
+ # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
11
+ con = ibis.duckdb.connect()
12
+ con.load_extension("spatial")
13
+ con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
14
+ pad = con.table("pad")
15
 
16
 
17
  # +
18
+ #pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
19
+ # pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
20
+ #pad.select("Comments").distinct().head(100).to_pandas()
21
+
22
+ # +
23
+ import fiona
24
+ import rioxarray
25
+ from shapely.geometry import box
26
+
27
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
28
 
29
  # fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
 
35
  r = rioxarray.open_rasterio(cog)
36
  bounds = box(*r.rio.transform_bounds(crs))
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # +
39
  # Now we can do all the usual SQL queries to subset the data. Note the `geom.within()` spatial filter!
40
+ focal_columns = ["bucket", "FeatClass", "Mang_Name",
41
+ "Mang_Type", "Des_Tp", "Pub_Access",
42
+ "GAP_Sts", "IUCN_Cat", "Unit_Nm",
43
+ "State_Nm", "EsmtHldr", "Date_Est",
44
+ "SHAPE_Area", "geom"]
45
  public = ["DIST", "LOC", "FED", "STAT", "JNT"]
46
 
47
  case = (
 
53
  .end()
54
  )
55
 
56
+ pad_parquet = (
57
+ pad
58
+ .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
59
  (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
60
+ )
61
+ .filter(_.Mang_Type.notin(["UNK", "TERR"]))
62
+ .filter(_.geom.within(bounds))
63
+ .mutate(GAP_Sts = _.GAP_Sts.cast("int"))
64
+ .mutate(bucket = case)
65
+ .mutate(row_n=ibis.row_number())
66
+ .select(focal_columns)
67
+ .rename(geometry="geom")
68
  )
69
 
70
+ #pad_parquet.to_parquet("pad-processed.parquet")
71
+ # -
72
+
73
# PAD-US code lookup tables (code -> human-readable domain value), read with
# the duckdb/ibis connection `con` established earlier in this script.
# NOTE(review): paths are machine-specific absolute paths; "pad-desgination-type"
# appears to be the actual on-disk filename spelling — do not "correct" it here.
agency_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
agency_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
desig_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
public_access = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
state_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-state-name.parquet").select(state_id = "Code", state = "Dom")
iucn = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-iucn.parquet").select(iucn_id = "CODE", iucn_category = "DOM")
+
80
# Rename the terse PAD-US columns to readable *_id names, then left-join each
# lookup table to resolve the codes into human-readable values.  The final two
# selects drop the duplicated join keys ("*_right") and the now-redundant raw
# code columns ("*_id"), leaving only resolved, readable columns.
pad_processed = (pad_parquet
    .rename(manager_name_id = "Mang_Name",
            manager_type_id = "Mang_Type",
            manager_group="bucket",
            designation_type_id = "Des_Tp",
            public_access_id = "Pub_Access",
            category = "FeatClass",
            iucn_id = "IUCN_Cat",
            gap_code = "GAP_Sts",
            state_id = "State_Nm",
            easement_holder = "EsmtHldr",
            date_established = "Date_Est",
            area_square_meters = "SHAPE_Area",
            name = "Unit_Nm")
    .left_join(agency_name, "manager_name_id")
    .left_join(agency_type, "manager_type_id")
    .left_join(desig_type, "designation_type_id")
    .left_join(public_access, "public_access_id")
    .left_join(state_name, "state_id")
    .left_join(iucn, "iucn_id")
    .select(~s.contains("_right"))
    .select(~s.contains("_id"))
)
# pad_processed.to_parquet("pad-processed.parquet")
104
 
105
# +
# If we keep the original geoparquet WKB 'geometry' column, to_pandas() (or
# execute) gives us only a plain pandas DataFrame and geopandas doesn't see
# the metadata.  If we instead use duckdb's native 'geometry' type,
# to_pandas() hands back a GeoDataFrame directly — but requires reading the
# whole table into RAM.
import geopandas as gpd

# Materialize, re-attach the CRS (stripped during the round-trip), and write
# the processed polygons for the zonal-stats passes below.
gdf = pad_processed.to_pandas()
gdf = gdf.set_crs(crs)
gdf.to_parquet("pad-processed.parquet")
113
 
114
+ # +
115
+ import rasterio
116
+ from rasterstats import zonal_stats
117
+ import geopandas as gpd
118
+ import pandas as pd
119
+ from joblib import Parallel, delayed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=1000):
    """Compute per-polygon zonal statistics of a raster, in parallel.

    Parameters
    ----------
    vec_file : str
        Path to a GeoParquet file of polygon features.
    tif_file : str
        Path to the raster to summarize.
    stats : list of str
        Statistic names understood by ``rasterstats.zonal_stats``
        (e.g. ``['mean']``).
    col_name : str
        Name given to the computed 'mean' column in the result.
    n_jobs : int
        Number of joblib workers (-1 = all available cores).
    verbose, timeout
        Passed straight through to ``joblib.Parallel``.

    Returns
    -------
    pandas.DataFrame
        The input attribute table with the new statistic column, merged
        back on the per-feature ``row_n`` key.
    """
    # Read the vector data and reproject it to match the raster's CRS.
    with rasterio.open(tif_file) as src:
        raster_profile = src.profile
    gdf = gpd.read_parquet(vec_file).to_crs(raster_profile['crs'])

    # 1-based row id used as the merge key after the parallel scatter/gather.
    gdf["row_n"] = gdf.index + 1

    # Zonal stats for a single feature, tagged with its row id.
    # (The original rebound its `stats` argument with the result; use a
    # distinct name so the parameter is never shadowed.)
    def get_stats(geom_slice, tif_file, requested_stats):
        result = zonal_stats(geom_slice.geometry, tif_file, stats=requested_stats)
        result[0]['row_n'] = geom_slice.row_n
        return result[0]

    # One delayed job per feature.
    jobs = [delayed(get_stats)(row, tif_file, stats) for row in gdf.itertuples()]

    # And here we go
    output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)

    # Reattach the computed statistic to the original attribute table.
    df_zonal_stats = (
        pd.DataFrame(output)
        .rename(columns={'mean': col_name})
        .merge(gdf, how='right', on='row_n')
    )
    return df_zonal_stats
151
 
 
 
 
 
 
152
 
 
153
 
154
# +
# %%time

# Zonal mean of each raster layer, appended one column at a time.
# The first pass reads the processed polygons; every later pass re-reads the
# accumulating 'pad-stats.parquet' so each new column is layered onto the
# previous results.  (Replaces eight near-identical copy-pasted cells; the
# sequence of calls and file reads/writes is unchanged.)
zonal_layers = [
    ('./hfp_2021_100m_v1-2_cog.tif',
     'human_impact'),
    ('/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif',
     'richness'),
    ('/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif',
     'rsr'),
    ('/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif',
     'deforest_carbon'),
    ('/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif',
     'biodiversity_intactness'),
    ('/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif',
     'forest_integrity'),
    ('/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif',
     'crop_expansion'),
    ('/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif',
     'crop_reduction'),
]

vec_file = './pad-processed.parquet'
for tif_file, col_name in zonal_layers:
    df = big_zonal_stats(vec_file, tif_file, stats=['mean'],
                         col_name=col_name, n_jobs=-1, verbose=0)
    gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
    # After the first layer, keep stacking onto the stats file.
    vec_file = './pad-stats.parquet'
 
229
# +
from sqlalchemy import create_engine
from sqlalchemy import text

# Load the final stats table into a local duckdb database for the app.
db_uri = "duckdb:///pad.duckdb"
engine = create_engine(db_uri)

# SQLAlchemy 2.x rejects raw SQL strings passed to Connection.execute(); wrap
# the statement in text() (it was imported above but never used).  engine.begin()
# opens a transaction that commits on success and always closes the connection,
# even if the statement raises — the original leaked the connection on error.
with engine.begin() as con:
    con.execute(text("create or replace table pad as select * from 'pad-stats.parquet'"))

# pad_stats = ibis.read_parquet("pad-stats.parquet")
# pad_stats.head(20).to_pandas()