cboettig committed on
Commit
2e4f09d
·
1 Parent(s): b15781c
Files changed (1) hide show
  1. preprocess.py +24 -22
preprocess.py CHANGED
@@ -1,29 +1,31 @@
 
1
  import ibis
2
  import ibis.selectors as s
3
  from ibis import _
4
  con = ibis.duckdb.connect()
5
  con.load_extension("spatial")
6
 
 
 
7
  # +
8
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
9
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
10
  # "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
11
 
12
- # pad = ibis.read_parquet(parquet)
13
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
14
- # con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
15
- # pad = con.table("pad")
16
 
17
- # or read the fgb version
18
- pad = con.read_geo(fgb)
19
 
20
 
21
- # -
22
-
23
-
24
- pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
25
  # pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
26
  #pad.select("Comments").distinct().head(100).to_pandas()
 
27
 
28
  import fiona
29
  meta = fiona.open(fgb)
@@ -132,7 +134,7 @@ import geopandas as gpd
132
  import pandas as pd
133
  from joblib import Parallel, delayed
134
 
135
- def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=1000):
136
 
137
  # read in vector as geopandas, match CRS to raster
138
  with rasterio.open(tif_file) as src:
@@ -178,7 +180,7 @@ tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
178
  vec_file = './pad-processed.parquet'
179
 
180
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
181
- col_name = "human_impact", n_jobs=-1, verbose=0)
182
  gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
183
 
184
  # +
@@ -187,7 +189,7 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
187
  tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
188
  vec_file = './pad-stats.parquet'
189
 
190
- big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
191
 
192
 
193
  # +
@@ -197,7 +199,7 @@ tif_file = '/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_Al
197
  vec_file = './pad-stats.parquet'
198
 
199
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
200
- col_name = "rsr", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
201
 
202
  # +
203
  # %%time
@@ -206,7 +208,7 @@ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_ca
206
  vec_file = './pad-stats.parquet'
207
 
208
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
209
- col_name = "deforest_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
210
 
211
  # +
212
  # %%time
@@ -215,7 +217,7 @@ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii
215
  vec_file = './pad-stats.parquet'
216
 
217
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
218
- col_name = "biodiversity_intactness_loss", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
219
 
220
  # +
221
  # %%time
@@ -224,7 +226,7 @@ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii
224
  vec_file = './pad-stats.parquet'
225
 
226
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
227
- col_name = "forest_integrity_loss", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
228
 
229
  # +
230
  # %%time
@@ -232,7 +234,7 @@ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
232
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif'
233
  vec_file = './pad-stats.parquet'
234
 
235
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_expansion", n_jobs=-1, verbose=0)
236
  gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
237
 
238
  # +
@@ -240,35 +242,35 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
240
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
241
  vec_file = './pad-stats.parquet'
242
 
243
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
244
 
245
  # +
246
  # %%time
247
  tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
248
  vec_file = './pad-stats.parquet'
249
 
250
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
251
 
252
  # +
253
  # %%time
254
  tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
255
  vec_file = './pad-stats.parquet'
256
 
257
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
258
 
259
  # +
260
  # %%time
261
  tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
262
  vec_file = './pad-stats.parquet'
263
 
264
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
265
 
266
  # +
267
  # %%time
268
  tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
269
  vec_file = './pad-stats.parquet'
270
 
271
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
272
 
273
  # +
274
  columns = '''
 
1
+ # +
2
  import ibis
3
  import ibis.selectors as s
4
  from ibis import _
5
  con = ibis.duckdb.connect()
6
  con.load_extension("spatial")
7
 
8
+ threads = 2
9
+
10
  # +
11
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
12
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
13
  # "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
14
 
15
+ pad = ibis.read_parquet(parquet)
16
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
17
+ con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
18
+ pad = con.table("pad")
19
 
20
+ # or read the fgb version, much slower
21
+ # pad = con.read_geo(fgb)
22
 
23
 
24
+ # +
25
+ # pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
 
 
26
  # pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
27
  #pad.select("Comments").distinct().head(100).to_pandas()
28
+ # -
29
 
30
  import fiona
31
  meta = fiona.open(fgb)
 
134
  import pandas as pd
135
  from joblib import Parallel, delayed
136
 
137
+ def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=10000):
138
 
139
  # read in vector as geopandas, match CRS to raster
140
  with rasterio.open(tif_file) as src:
 
180
  vec_file = './pad-processed.parquet'
181
 
182
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
183
+ col_name = "human_impact", n_jobs=threads, verbose=0)
184
  gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
185
 
186
  # +
 
189
  tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
190
  vec_file = './pad-stats.parquet'
191
 
192
+ big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
193
 
194
 
195
  # +
 
199
  vec_file = './pad-stats.parquet'
200
 
201
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
202
+ col_name = "rsr", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
203
 
204
  # +
205
  # %%time
 
208
  vec_file = './pad-stats.parquet'
209
 
210
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
211
+ col_name = "deforest_carbon", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
212
 
213
  # +
214
  # %%time
 
217
  vec_file = './pad-stats.parquet'
218
 
219
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
220
+ col_name = "biodiversity_intactness_loss", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
221
 
222
  # +
223
  # %%time
 
226
  vec_file = './pad-stats.parquet'
227
 
228
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
229
+ col_name = "forest_integrity_loss", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
230
 
231
  # +
232
  # %%time
 
234
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif'
235
  vec_file = './pad-stats.parquet'
236
 
237
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_expansion", n_jobs=threads, verbose=0)
238
  gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
239
 
240
  # +
 
242
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
243
  vec_file = './pad-stats.parquet'
244
 
245
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
246
 
247
  # +
248
  # %%time
249
  tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
250
  vec_file = './pad-stats.parquet'
251
 
252
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
253
 
254
  # +
255
  # %%time
256
  tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
257
  vec_file = './pad-stats.parquet'
258
 
259
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
260
 
261
  # +
262
  # %%time
263
  tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
264
  vec_file = './pad-stats.parquet'
265
 
266
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
267
 
268
  # +
269
  # %%time
270
  tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
271
  vec_file = './pad-stats.parquet'
272
 
273
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
274
 
275
  # +
276
  columns = '''