Spaces:
Sleeping
Sleeping
...
Browse files- preprocess.py +24 -22
preprocess.py
CHANGED
@@ -1,29 +1,31 @@
|
|
|
|
1 |
import ibis
|
2 |
import ibis.selectors as s
|
3 |
from ibis import _
|
4 |
con = ibis.duckdb.connect()
|
5 |
con.load_extension("spatial")
|
6 |
|
|
|
|
|
7 |
# +
|
8 |
fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
|
9 |
parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
|
10 |
# "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
|
11 |
|
12 |
-
|
13 |
# Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
|
14 |
-
|
15 |
-
|
16 |
|
17 |
-
# or read the fgb version
|
18 |
-
pad = con.read_geo(fgb)
|
19 |
|
20 |
|
21 |
-
#
|
22 |
-
|
23 |
-
|
24 |
-
pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
|
25 |
# pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
|
26 |
#pad.select("Comments").distinct().head(100).to_pandas()
|
|
|
27 |
|
28 |
import fiona
|
29 |
meta = fiona.open(fgb)
|
@@ -132,7 +134,7 @@ import geopandas as gpd
|
|
132 |
import pandas as pd
|
133 |
from joblib import Parallel, delayed
|
134 |
|
135 |
-
def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=
|
136 |
|
137 |
# read in vector as geopandas, match CRS to raster
|
138 |
with rasterio.open(tif_file) as src:
|
@@ -178,7 +180,7 @@ tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
|
|
178 |
vec_file = './pad-processed.parquet'
|
179 |
|
180 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
181 |
-
col_name = "human_impact", n_jobs
|
182 |
gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
183 |
|
184 |
# +
|
@@ -187,7 +189,7 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
187 |
tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
|
188 |
vec_file = './pad-stats.parquet'
|
189 |
|
190 |
-
big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs
|
191 |
|
192 |
|
193 |
# +
|
@@ -197,7 +199,7 @@ tif_file = '/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_Al
|
|
197 |
vec_file = './pad-stats.parquet'
|
198 |
|
199 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
200 |
-
col_name = "rsr", n_jobs
|
201 |
|
202 |
# +
|
203 |
# %%time
|
@@ -206,7 +208,7 @@ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_ca
|
|
206 |
vec_file = './pad-stats.parquet'
|
207 |
|
208 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
209 |
-
col_name = "deforest_carbon", n_jobs
|
210 |
|
211 |
# +
|
212 |
# %%time
|
@@ -215,7 +217,7 @@ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii
|
|
215 |
vec_file = './pad-stats.parquet'
|
216 |
|
217 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
218 |
-
col_name = "biodiversity_intactness_loss", n_jobs
|
219 |
|
220 |
# +
|
221 |
# %%time
|
@@ -224,7 +226,7 @@ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii
|
|
224 |
vec_file = './pad-stats.parquet'
|
225 |
|
226 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
227 |
-
col_name = "forest_integrity_loss", n_jobs
|
228 |
|
229 |
# +
|
230 |
# %%time
|
@@ -232,7 +234,7 @@ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
|
232 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif'
|
233 |
vec_file = './pad-stats.parquet'
|
234 |
|
235 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_expansion", n_jobs
|
236 |
gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
237 |
|
238 |
# +
|
@@ -240,35 +242,35 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
240 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
|
241 |
vec_file = './pad-stats.parquet'
|
242 |
|
243 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs
|
244 |
|
245 |
# +
|
246 |
# %%time
|
247 |
tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
|
248 |
vec_file = './pad-stats.parquet'
|
249 |
|
250 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs
|
251 |
|
252 |
# +
|
253 |
# %%time
|
254 |
tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
|
255 |
vec_file = './pad-stats.parquet'
|
256 |
|
257 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs
|
258 |
|
259 |
# +
|
260 |
# %%time
|
261 |
tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
|
262 |
vec_file = './pad-stats.parquet'
|
263 |
|
264 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs
|
265 |
|
266 |
# +
|
267 |
# %%time
|
268 |
tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
|
269 |
vec_file = './pad-stats.parquet'
|
270 |
|
271 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs
|
272 |
|
273 |
# +
|
274 |
columns = '''
|
|
|
1 |
+
# +
|
2 |
import ibis
|
3 |
import ibis.selectors as s
|
4 |
from ibis import _
|
5 |
con = ibis.duckdb.connect()
|
6 |
con.load_extension("spatial")
|
7 |
|
8 |
+
threads = 2
|
9 |
+
|
10 |
# +
|
11 |
fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
|
12 |
parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
|
13 |
# "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
|
14 |
|
15 |
+
pad = ibis.read_parquet(parquet)
|
16 |
# Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
|
17 |
+
con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
|
18 |
+
pad = con.table("pad")
|
19 |
|
20 |
+
# or read the fgb version, much slower
|
21 |
+
# pad = con.read_geo(fgb)
|
22 |
|
23 |
|
24 |
+
# +
|
25 |
+
# pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
|
|
|
|
|
26 |
# pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
|
27 |
#pad.select("Comments").distinct().head(100).to_pandas()
|
28 |
+
# -
|
29 |
|
30 |
import fiona
|
31 |
meta = fiona.open(fgb)
|
|
|
134 |
import pandas as pd
|
135 |
from joblib import Parallel, delayed
|
136 |
|
137 |
+
def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=10000):
|
138 |
|
139 |
# read in vector as geopandas, match CRS to raster
|
140 |
with rasterio.open(tif_file) as src:
|
|
|
180 |
vec_file = './pad-processed.parquet'
|
181 |
|
182 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
183 |
+
col_name = "human_impact", n_jobs=threads, verbose=0)
|
184 |
gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
185 |
|
186 |
# +
|
|
|
189 |
tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
|
190 |
vec_file = './pad-stats.parquet'
|
191 |
|
192 |
+
big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
193 |
|
194 |
|
195 |
# +
|
|
|
199 |
vec_file = './pad-stats.parquet'
|
200 |
|
201 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
202 |
+
col_name = "rsr", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
203 |
|
204 |
# +
|
205 |
# %%time
|
|
|
208 |
vec_file = './pad-stats.parquet'
|
209 |
|
210 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
211 |
+
col_name = "deforest_carbon", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
212 |
|
213 |
# +
|
214 |
# %%time
|
|
|
217 |
vec_file = './pad-stats.parquet'
|
218 |
|
219 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
220 |
+
col_name = "biodiversity_intactness_loss", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
221 |
|
222 |
# +
|
223 |
# %%time
|
|
|
226 |
vec_file = './pad-stats.parquet'
|
227 |
|
228 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
229 |
+
col_name = "forest_integrity_loss", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
230 |
|
231 |
# +
|
232 |
# %%time
|
|
|
234 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif'
|
235 |
vec_file = './pad-stats.parquet'
|
236 |
|
237 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_expansion", n_jobs=threads, verbose=0)
|
238 |
gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
239 |
|
240 |
# +
|
|
|
242 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
|
243 |
vec_file = './pad-stats.parquet'
|
244 |
|
245 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
246 |
|
247 |
# +
|
248 |
# %%time
|
249 |
tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
|
250 |
vec_file = './pad-stats.parquet'
|
251 |
|
252 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
253 |
|
254 |
# +
|
255 |
# %%time
|
256 |
tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
|
257 |
vec_file = './pad-stats.parquet'
|
258 |
|
259 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
260 |
|
261 |
# +
|
262 |
# %%time
|
263 |
tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
|
264 |
vec_file = './pad-stats.parquet'
|
265 |
|
266 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
267 |
|
268 |
# +
|
269 |
# %%time
|
270 |
tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
|
271 |
vec_file = './pad-stats.parquet'
|
272 |
|
273 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=threads, verbose=0).to_parquet("pad-stats.parquet")
|
274 |
|
275 |
# +
|
276 |
columns = '''
|