cboettig committed on
Commit
9f29cb1
·
1 Parent(s): 898bb75

update preprocess

Browse files
Files changed (1) hide show
  1. preprocess.py +101 -31
preprocess.py CHANGED
@@ -46,9 +46,11 @@ public = ["DIST", "LOC", "FED", "STAT", "JNT"]
46
 
47
  case = (
48
  ibis.case()
49
- .when( (_.Mang_Type.isin(public) & (_.GAP_Sts <= 2)), "public")
50
- .when( (_.Mang_Type.isin(public) & (_.GAP_Sts > 2)), "mixed")
51
- .when( (_.Mang_Type.isin(["PVT", "NGO"])), "private")
 
 
52
  .when( (_.Mang_Type == "TRIB"), "tribal")
53
  .end()
54
  )
@@ -60,7 +62,7 @@ pad_parquet = (
60
  )
61
  .filter(_.Mang_Type.notin(["UNK", "TERR"]))
62
  .filter(_.geom.within(bounds))
63
- .mutate(GAP_Sts = _.GAP_Sts.cast("int"))
64
  .mutate(bucket = case)
65
  .mutate(row_n=ibis.row_number())
66
  .select(focal_columns)
@@ -74,8 +76,8 @@ agency_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parq
74
  agency_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
75
  desig_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
76
  public_access = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
77
- state_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-state-name.parquet").select(state_id = "Code", state = "Dom")
78
- iucn = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-iucn.parquet").select(iucn_id = "CODE", iucn_category = "DOM")
79
 
80
  pad_processed = (pad_parquet
81
  .rename(manager_name_id = "Mang_Name",
@@ -84,21 +86,21 @@ pad_processed = (pad_parquet
84
  designation_type_id = "Des_Tp",
85
  public_access_id = "Pub_Access",
86
  category = "FeatClass",
87
- iucn_id = "IUCN_Cat",
88
  gap_code = "GAP_Sts",
89
- state_id = "State_Nm",
90
  easement_holder = "EsmtHldr",
91
  date_established = "Date_Est",
92
  area_square_meters = "SHAPE_Area",
93
- name = "Unit_Nm")
94
  .left_join(agency_name, "manager_name_id")
95
  .left_join(agency_type, "manager_type_id")
96
  .left_join(desig_type, "designation_type_id")
97
  .left_join(public_access, "public_access_id")
98
- .left_join(state_name, "state_id")
99
- .left_join(iucn, "iucn_id")
100
  .select(~s.contains("_right"))
101
- .select(~s.contains("_id"))
102
  )
103
  # pad_processed.to_parquet("pad-processed.parquet")
104
 
@@ -142,19 +144,25 @@ def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, t
142
  output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)
143
 
144
  # reshape output
145
- df_zonal_stats = (
146
  pd.DataFrame(output)
147
  .rename(columns={'mean': col_name})
148
  .merge(gdf, how='right', on = 'row_n')
149
  )
150
- return df_zonal_stats
 
151
 
152
 
153
 
 
 
 
 
 
 
154
  # +
155
  # %%time
156
-
157
- tif_file = './hfp_2021_100m_v1-2_cog.tif'
158
  vec_file = './pad-processed.parquet'
159
 
160
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
@@ -167,9 +175,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
167
  tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
168
  vec_file = './pad-stats.parquet'
169
 
170
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
171
- col_name = "richness", n_jobs=-1, verbose=0)
172
- gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
173
 
174
  # +
175
  # %%time
@@ -178,8 +185,7 @@ tif_file = '/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_Al
178
  vec_file = './pad-stats.parquet'
179
 
180
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
181
- col_name = "rsr", n_jobs=-1, verbose=0)
182
- gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
183
 
184
  # +
185
  # %%time
@@ -187,8 +193,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
187
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif'
188
  vec_file = './pad-stats.parquet'
189
 
190
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "deforest_carbon", n_jobs=-1, verbose=0)
191
- gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
192
 
193
  # +
194
  # %%time
@@ -196,8 +202,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
196
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif'
197
  vec_file = './pad-stats.parquet'
198
 
199
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "biodiversity_intactness", n_jobs=-1, verbose=0)
200
- gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
201
 
202
  # +
203
  # %%time
@@ -205,8 +211,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
205
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif'
206
  vec_file = './pad-stats.parquet'
207
 
208
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "forest_integrity", n_jobs=-1, verbose=0)
209
- gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
210
 
211
  # +
212
  # %%time
@@ -219,20 +225,84 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
219
 
220
  # +
221
  # %%time
222
-
223
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
224
  vec_file = './pad-stats.parquet'
225
 
226
- df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=-1, verbose=0)
227
- gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
 
 
 
 
 
 
228
 
229
  # +
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  from sqlalchemy import create_engine
231
  from sqlalchemy import text
232
  db_uri = "duckdb:///pad.duckdb"
233
  engine = create_engine(db_uri)
234
  con = engine.connect()
235
- con.execute("create or replace table pad as select * from 'pad-stats.parquet'")
236
  con.close()
237
 
238
  # pad_stats = ibis.read_parquet("pad-stats.parquet")
 
46
 
47
  case = (
48
  ibis.case()
49
+ .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public protected")
50
+ .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["3"])), "mixed")
51
+ .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["4"])), "public other")
52
+ .when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["1","2", "3"]))), "private protected")
53
+ .when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["4"]))), "private other")
54
  .when( (_.Mang_Type == "TRIB"), "tribal")
55
  .end()
56
  )
 
62
  )
63
  .filter(_.Mang_Type.notin(["UNK", "TERR"]))
64
  .filter(_.geom.within(bounds))
65
+ .mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
66
  .mutate(bucket = case)
67
  .mutate(row_n=ibis.row_number())
68
  .select(focal_columns)
 
76
  agency_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
77
  desig_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
78
  public_access = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
79
+ state_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
80
+ iucn = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
81
 
82
  pad_processed = (pad_parquet
83
  .rename(manager_name_id = "Mang_Name",
 
86
  designation_type_id = "Des_Tp",
87
  public_access_id = "Pub_Access",
88
  category = "FeatClass",
89
+ iucn_code = "IUCN_Cat",
90
  gap_code = "GAP_Sts",
91
+ state = "State_Nm",
92
  easement_holder = "EsmtHldr",
93
  date_established = "Date_Est",
94
  area_square_meters = "SHAPE_Area",
95
+ area_name = "Unit_Nm")
96
  .left_join(agency_name, "manager_name_id")
97
  .left_join(agency_type, "manager_type_id")
98
  .left_join(desig_type, "designation_type_id")
99
  .left_join(public_access, "public_access_id")
100
+ .left_join(state_name, "state")
101
+ .left_join(iucn, "iucn_code")
102
  .select(~s.contains("_right"))
103
+ # .select(~s.contains("_id"))
104
  )
105
  # pad_processed.to_parquet("pad-processed.parquet")
106
 
 
144
  output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)
145
 
146
  # reshape output
147
+ df = (
148
  pd.DataFrame(output)
149
  .rename(columns={'mean': col_name})
150
  .merge(gdf, how='right', on = 'row_n')
151
  )
152
+ gdf = gpd.GeoDataFrame(df, geometry="geometry")
153
+ return gdf
154
 
155
 
156
 
157
+ # -
158
+
159
+ import geopandas as gpd
160
+ test = gpd.read_parquet("pad-processed.parquet")
161
+ test.columns
162
+
163
  # +
164
  # %%time
165
+ tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
 
166
  vec_file = './pad-processed.parquet'
167
 
168
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
 
175
  tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
176
  vec_file = './pad-stats.parquet'
177
 
178
+ big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
179
+
 
180
 
181
  # +
182
  # %%time
 
185
  vec_file = './pad-stats.parquet'
186
 
187
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
188
+ col_name = "rsr", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
 
189
 
190
  # +
191
  # %%time
 
193
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif'
194
  vec_file = './pad-stats.parquet'
195
 
196
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
197
+ col_name = "deforest_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
198
 
199
  # +
200
  # %%time
 
202
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif'
203
  vec_file = './pad-stats.parquet'
204
 
205
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
206
+ col_name = "biodiversity_intactness_loss", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
207
 
208
  # +
209
  # %%time
 
211
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif'
212
  vec_file = './pad-stats.parquet'
213
 
214
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
215
+ col_name = "forest_integrity_loss", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
216
 
217
  # +
218
  # %%time
 
225
 
226
  # +
227
  # %%time
 
228
  tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
229
  vec_file = './pad-stats.parquet'
230
 
231
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
232
+
233
+ # +
234
+ # %%time
235
+ tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
236
+ vec_file = './pad-stats.parquet'
237
+
238
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
239
 
240
  # +
241
+ # %%time
242
+ tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
243
+ vec_file = './pad-stats.parquet'
244
+
245
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
246
+
247
+ # +
248
+ # %%time
249
+ tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
250
+ vec_file = './pad-stats.parquet'
251
+
252
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
253
+
254
+ # +
255
+ # %%time
256
+ tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
257
+ vec_file = './pad-stats.parquet'
258
+
259
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
260
+ # -
261
+
262
+ columns = '''
263
+ area_name,
264
+ manager_name,
265
+ manager_type,
266
+ manager_group,
267
+ designation_type,
268
+ public_access,
269
+ category,
270
+ iucn_code,
271
+ iucn_category,
272
+ gap_code,
273
+ state,
274
+ state_name,
275
+ easement_holder,
276
+ date_established,
277
+ area_square_meters,
278
+ geometry,
279
+ all_species_richness,
280
+ all_species_rwr,
281
+ manageable_carbon,
282
+ irrecoverable_carbon,
283
+ crop_reduction,
284
+ crop_expansion,
285
+ deforest_carbon,
286
+ richness,
287
+ rsr,
288
+ forest_integrity_loss,
289
+ biodiversity_intactness_loss
290
+ '''
291
+
292
+ import ibis
293
+ df = ibis.read_parquet("pad-stats.parquet")
294
+ df.columns
295
+
296
+ # +
297
+
298
+
299
+ ## create pad.duckdb
300
  from sqlalchemy import create_engine
301
  from sqlalchemy import text
302
  db_uri = "duckdb:///pad.duckdb"
303
  engine = create_engine(db_uri)
304
  con = engine.connect()
305
+ con.execute(f"create or replace table pad as select {columns} from 'pad-stats.parquet'")
306
  con.close()
307
 
308
  # pad_stats = ibis.read_parquet("pad-stats.parquet")