Spaces:
Sleeping
Sleeping
update preprocess
Browse files- preprocess.py +101 -31
preprocess.py
CHANGED
@@ -46,9 +46,11 @@ public = ["DIST", "LOC", "FED", "STAT", "JNT"]
|
|
46 |
|
47 |
case = (
|
48 |
ibis.case()
|
49 |
-
.when( (_.Mang_Type.isin(public) &
|
50 |
-
.when( (_.Mang_Type.isin(public) &
|
51 |
-
.when( (_.Mang_Type.isin(["
|
|
|
|
|
52 |
.when( (_.Mang_Type == "TRIB"), "tribal")
|
53 |
.end()
|
54 |
)
|
@@ -60,7 +62,7 @@ pad_parquet = (
|
|
60 |
)
|
61 |
.filter(_.Mang_Type.notin(["UNK", "TERR"]))
|
62 |
.filter(_.geom.within(bounds))
|
63 |
-
.mutate(GAP_Sts = _.GAP_Sts
|
64 |
.mutate(bucket = case)
|
65 |
.mutate(row_n=ibis.row_number())
|
66 |
.select(focal_columns)
|
@@ -74,8 +76,8 @@ agency_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parq
|
|
74 |
agency_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
|
75 |
desig_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
|
76 |
public_access = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
|
77 |
-
state_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-state-name.parquet").select(
|
78 |
-
iucn = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-iucn.parquet").select(
|
79 |
|
80 |
pad_processed = (pad_parquet
|
81 |
.rename(manager_name_id = "Mang_Name",
|
@@ -84,21 +86,21 @@ pad_processed = (pad_parquet
|
|
84 |
designation_type_id = "Des_Tp",
|
85 |
public_access_id = "Pub_Access",
|
86 |
category = "FeatClass",
|
87 |
-
|
88 |
gap_code = "GAP_Sts",
|
89 |
-
|
90 |
easement_holder = "EsmtHldr",
|
91 |
date_established = "Date_Est",
|
92 |
area_square_meters = "SHAPE_Area",
|
93 |
-
|
94 |
.left_join(agency_name, "manager_name_id")
|
95 |
.left_join(agency_type, "manager_type_id")
|
96 |
.left_join(desig_type, "designation_type_id")
|
97 |
.left_join(public_access, "public_access_id")
|
98 |
-
.left_join(state_name, "
|
99 |
-
.left_join(iucn, "
|
100 |
.select(~s.contains("_right"))
|
101 |
-
|
102 |
)
|
103 |
# pad_processed.to_parquet("pad-processed.parquet")
|
104 |
|
@@ -142,19 +144,25 @@ def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, t
|
|
142 |
output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)
|
143 |
|
144 |
# reshape output
|
145 |
-
|
146 |
pd.DataFrame(output)
|
147 |
.rename(columns={'mean': col_name})
|
148 |
.merge(gdf, how='right', on = 'row_n')
|
149 |
)
|
150 |
-
|
|
|
151 |
|
152 |
|
153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
# +
|
155 |
# %%time
|
156 |
-
|
157 |
-
tif_file = './hfp_2021_100m_v1-2_cog.tif'
|
158 |
vec_file = './pad-processed.parquet'
|
159 |
|
160 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
@@ -167,9 +175,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
167 |
tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
|
168 |
vec_file = './pad-stats.parquet'
|
169 |
|
170 |
-
|
171 |
-
|
172 |
-
gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
173 |
|
174 |
# +
|
175 |
# %%time
|
@@ -178,8 +185,7 @@ tif_file = '/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_Al
|
|
178 |
vec_file = './pad-stats.parquet'
|
179 |
|
180 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
181 |
-
col_name = "rsr", n_jobs=-1, verbose=0)
|
182 |
-
gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
183 |
|
184 |
# +
|
185 |
# %%time
|
@@ -187,8 +193,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
187 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif'
|
188 |
vec_file = './pad-stats.parquet'
|
189 |
|
190 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
191 |
-
|
192 |
|
193 |
# +
|
194 |
# %%time
|
@@ -196,8 +202,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
196 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif'
|
197 |
vec_file = './pad-stats.parquet'
|
198 |
|
199 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
200 |
-
|
201 |
|
202 |
# +
|
203 |
# %%time
|
@@ -205,8 +211,8 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
205 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif'
|
206 |
vec_file = './pad-stats.parquet'
|
207 |
|
208 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
209 |
-
|
210 |
|
211 |
# +
|
212 |
# %%time
|
@@ -219,20 +225,84 @@ gpd.GeoDataFrame(df, geometry="geometry").to_parquet("pad-stats.parquet")
|
|
219 |
|
220 |
# +
|
221 |
# %%time
|
222 |
-
|
223 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
|
224 |
vec_file = './pad-stats.parquet'
|
225 |
|
226 |
-
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=-1, verbose=0)
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
|
229 |
# +
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
from sqlalchemy import create_engine
|
231 |
from sqlalchemy import text
|
232 |
db_uri = "duckdb:///pad.duckdb"
|
233 |
engine = create_engine(db_uri)
|
234 |
con = engine.connect()
|
235 |
-
con.execute("create or replace table pad as select
|
236 |
con.close()
|
237 |
|
238 |
# pad_stats = ibis.read_parquet("pad-stats.parquet")
|
|
|
46 |
|
47 |
case = (
|
48 |
ibis.case()
|
49 |
+
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public protected")
|
50 |
+
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["3"])), "mixed")
|
51 |
+
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["4"])), "public other")
|
52 |
+
.when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["1","2", "3"]))), "private protected")
|
53 |
+
.when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["4"]))), "private other")
|
54 |
.when( (_.Mang_Type == "TRIB"), "tribal")
|
55 |
.end()
|
56 |
)
|
|
|
62 |
)
|
63 |
.filter(_.Mang_Type.notin(["UNK", "TERR"]))
|
64 |
.filter(_.geom.within(bounds))
|
65 |
+
.mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
|
66 |
.mutate(bucket = case)
|
67 |
.mutate(row_n=ibis.row_number())
|
68 |
.select(focal_columns)
|
|
|
76 |
agency_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
|
77 |
desig_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
|
78 |
public_access = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
|
79 |
+
state_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
|
80 |
+
iucn = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
|
81 |
|
82 |
pad_processed = (pad_parquet
|
83 |
.rename(manager_name_id = "Mang_Name",
|
|
|
86 |
designation_type_id = "Des_Tp",
|
87 |
public_access_id = "Pub_Access",
|
88 |
category = "FeatClass",
|
89 |
+
iucn_code = "IUCN_Cat",
|
90 |
gap_code = "GAP_Sts",
|
91 |
+
state = "State_Nm",
|
92 |
easement_holder = "EsmtHldr",
|
93 |
date_established = "Date_Est",
|
94 |
area_square_meters = "SHAPE_Area",
|
95 |
+
area_name = "Unit_Nm")
|
96 |
.left_join(agency_name, "manager_name_id")
|
97 |
.left_join(agency_type, "manager_type_id")
|
98 |
.left_join(desig_type, "designation_type_id")
|
99 |
.left_join(public_access, "public_access_id")
|
100 |
+
.left_join(state_name, "state")
|
101 |
+
.left_join(iucn, "iucn_code")
|
102 |
.select(~s.contains("_right"))
|
103 |
+
# .select(~s.contains("_id"))
|
104 |
)
|
105 |
# pad_processed.to_parquet("pad-processed.parquet")
|
106 |
|
|
|
144 |
output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)
|
145 |
|
146 |
# reshape output
|
147 |
+
df = (
|
148 |
pd.DataFrame(output)
|
149 |
.rename(columns={'mean': col_name})
|
150 |
.merge(gdf, how='right', on = 'row_n')
|
151 |
)
|
152 |
+
gdf = gpd.GeoDataFrame(df, geometry="geometry")
|
153 |
+
return gdf
|
154 |
|
155 |
|
156 |
|
157 |
+
# -
|
158 |
+
|
159 |
+
import geopandas as gpd
|
160 |
+
test = gpd.read_parquet("pad-processed.parquet")
|
161 |
+
test.columns
|
162 |
+
|
163 |
# +
|
164 |
# %%time
|
165 |
+
tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
|
|
|
166 |
vec_file = './pad-processed.parquet'
|
167 |
|
168 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
|
|
175 |
tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
|
176 |
vec_file = './pad-stats.parquet'
|
177 |
|
178 |
+
big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
179 |
+
|
|
|
180 |
|
181 |
# +
|
182 |
# %%time
|
|
|
185 |
vec_file = './pad-stats.parquet'
|
186 |
|
187 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
188 |
+
col_name = "rsr", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
|
|
189 |
|
190 |
# +
|
191 |
# %%time
|
|
|
193 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif'
|
194 |
vec_file = './pad-stats.parquet'
|
195 |
|
196 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
197 |
+
col_name = "deforest_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
198 |
|
199 |
# +
|
200 |
# %%time
|
|
|
202 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif'
|
203 |
vec_file = './pad-stats.parquet'
|
204 |
|
205 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
206 |
+
col_name = "biodiversity_intactness_loss", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
207 |
|
208 |
# +
|
209 |
# %%time
|
|
|
211 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif'
|
212 |
vec_file = './pad-stats.parquet'
|
213 |
|
214 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
|
215 |
+
col_name = "forest_integrity_loss", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
216 |
|
217 |
# +
|
218 |
# %%time
|
|
|
225 |
|
226 |
# +
|
227 |
# %%time
|
|
|
228 |
tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
|
229 |
vec_file = './pad-stats.parquet'
|
230 |
|
231 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
232 |
+
|
233 |
+
# +
|
234 |
+
# %%time
|
235 |
+
tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
|
236 |
+
vec_file = './pad-stats.parquet'
|
237 |
+
|
238 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
239 |
|
240 |
# +
|
241 |
+
# %%time
|
242 |
+
tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
|
243 |
+
vec_file = './pad-stats.parquet'
|
244 |
+
|
245 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
246 |
+
|
247 |
+
# +
|
248 |
+
# %%time
|
249 |
+
tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
|
250 |
+
vec_file = './pad-stats.parquet'
|
251 |
+
|
252 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
253 |
+
|
254 |
+
# +
|
255 |
+
# %%time
|
256 |
+
tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
|
257 |
+
vec_file = './pad-stats.parquet'
|
258 |
+
|
259 |
+
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
260 |
+
# -
|
261 |
+
|
262 |
+
columns = '''
|
263 |
+
area_name,
|
264 |
+
manager_name,
|
265 |
+
manager_type,
|
266 |
+
manager_group,
|
267 |
+
designation_type,
|
268 |
+
public_access,
|
269 |
+
category,
|
270 |
+
iucn_code,
|
271 |
+
iucn_category,
|
272 |
+
gap_code,
|
273 |
+
state,
|
274 |
+
state_name,
|
275 |
+
easement_holder,
|
276 |
+
date_established,
|
277 |
+
area_square_meters,
|
278 |
+
geometry,
|
279 |
+
all_species_richness,
|
280 |
+
all_species_rwr,
|
281 |
+
manageable_carbon,
|
282 |
+
irrecoverable_carbon,
|
283 |
+
crop_reduction,
|
284 |
+
crop_expansion,
|
285 |
+
deforest_carbon,
|
286 |
+
richness,
|
287 |
+
rsr,
|
288 |
+
forest_integrity_loss,
|
289 |
+
biodiversity_intactness_loss
|
290 |
+
'''
|
291 |
+
|
292 |
+
import ibis
|
293 |
+
df = ibis.read_parquet("pad-stats.parquet")
|
294 |
+
df.columns
|
295 |
+
|
296 |
+
# +
|
297 |
+
|
298 |
+
|
299 |
+
## create pad.duckdb
|
300 |
from sqlalchemy import create_engine
|
301 |
from sqlalchemy import text
|
302 |
db_uri = "duckdb:///pad.duckdb"
|
303 |
engine = create_engine(db_uri)
|
304 |
con = engine.connect()
|
305 |
+
con.execute(f"create or replace table pad as select {columns} from 'pad-stats.parquet'")
|
306 |
con.close()
|
307 |
|
308 |
# pad_stats = ibis.read_parquet("pad-stats.parquet")
|