Spaces:
Sleeping
Sleeping
working on pre-process
Browse files- preprocess.py +47 -23
preprocess.py
CHANGED
@@ -1,34 +1,44 @@
|
|
1 |
import ibis
|
2 |
import ibis.selectors as s
|
3 |
from ibis import _
|
|
|
|
|
4 |
|
5 |
# +
|
6 |
fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
|
7 |
parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
|
|
|
8 |
|
9 |
-
#pad = ibis.read_parquet(parquet)
|
10 |
# Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
|
11 |
-
con
|
12 |
-
con.
|
13 |
-
con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
|
14 |
-
pad = con.table("pad")
|
15 |
|
|
|
|
|
16 |
|
17 |
-
|
18 |
-
#
|
|
|
|
|
|
|
19 |
# pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
|
20 |
#pad.select("Comments").distinct().head(100).to_pandas()
|
21 |
|
22 |
-
# +
|
23 |
import fiona
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
import rioxarray
|
25 |
from shapely.geometry import box
|
26 |
|
27 |
cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
|
28 |
|
29 |
-
# fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
|
30 |
-
|
31 |
-
crs = meta.crs
|
32 |
nrow = len(meta)
|
33 |
|
34 |
# extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
|
@@ -44,13 +54,15 @@ focal_columns = ["bucket", "FeatClass", "Mang_Name",
|
|
44 |
"SHAPE_Area", "geom"]
|
45 |
public = ["DIST", "LOC", "FED", "STAT", "JNT"]
|
46 |
|
|
|
|
|
47 |
case = (
|
48 |
ibis.case()
|
49 |
-
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public
|
50 |
-
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["3"])), "mixed")
|
51 |
-
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["4"])), "public
|
52 |
-
.when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["1","2", "3"]))), "private
|
53 |
-
.when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["4"]))), "private
|
54 |
.when( (_.Mang_Type == "TRIB"), "tribal")
|
55 |
.end()
|
56 |
)
|
@@ -60,8 +72,8 @@ pad_parquet = (
|
|
60 |
.filter((_.FeatClass.isin(["Easement", "Fee"])) | (
|
61 |
(_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
|
62 |
)
|
63 |
-
.filter(_.Mang_Type.notin(["UNK", "TERR"]))
|
64 |
-
.filter(_.geom.within(bounds))
|
65 |
.mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
|
66 |
.mutate(bucket = case)
|
67 |
.mutate(row_n=ibis.row_number())
|
@@ -257,8 +269,8 @@ tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022
|
|
257 |
vec_file = './pad-stats.parquet'
|
258 |
|
259 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
260 |
-
# -
|
261 |
|
|
|
262 |
columns = '''
|
263 |
area_name,
|
264 |
manager_name,
|
@@ -289,13 +301,18 @@ forest_integrity_loss,
|
|
289 |
biodiversity_intactness_loss
|
290 |
'''
|
291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
import ibis
|
293 |
-
|
294 |
-
df.
|
|
|
295 |
|
296 |
# +
|
297 |
-
|
298 |
-
|
299 |
## create pad.duckdb
|
300 |
from sqlalchemy import create_engine
|
301 |
from sqlalchemy import text
|
@@ -307,3 +324,10 @@ con.close()
|
|
307 |
|
308 |
# pad_stats = ibis.read_parquet("pad-stats.parquet")
|
309 |
# pad_stats.head(20).to_pandas()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import ibis
|
2 |
import ibis.selectors as s
|
3 |
from ibis import _
|
4 |
+
con = ibis.duckdb.connect()
|
5 |
+
con.load_extension("spatial")
|
6 |
|
7 |
# +
|
8 |
fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
|
9 |
parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
|
10 |
+
# "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
|
11 |
|
12 |
+
# pad = ibis.read_parquet(parquet)
|
13 |
# Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
|
14 |
+
# con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
|
15 |
+
# pad = con.table("pad")
|
|
|
|
|
16 |
|
17 |
+
# or read the fgb version
|
18 |
+
pad = con.read_geo(fgb)
|
19 |
|
20 |
+
|
21 |
+
# -
|
22 |
+
|
23 |
+
|
24 |
+
pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
|
25 |
# pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
|
26 |
#pad.select("Comments").distinct().head(100).to_pandas()
|
27 |
|
|
|
28 |
import fiona
|
29 |
+
meta = fiona.open(fgb)
|
30 |
+
crs = meta.crs
|
31 |
+
|
32 |
+
# +
|
33 |
+
## optional: getting bounds
|
34 |
+
|
35 |
import rioxarray
|
36 |
from shapely.geometry import box
|
37 |
|
38 |
cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
|
39 |
|
40 |
+
# fiona is not built with parquet support. ideally duckdb's st_read_meta would do this.
|
41 |
+
|
|
|
42 |
nrow = len(meta)
|
43 |
|
44 |
# extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
|
|
|
54 |
"SHAPE_Area", "geom"]
|
55 |
public = ["DIST", "LOC", "FED", "STAT", "JNT"]
|
56 |
|
57 |
+
# Add our custom bucket categories:
|
58 |
+
# really could be done separately.
|
59 |
case = (
|
60 |
ibis.case()
|
61 |
+
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public conservation")
|
62 |
+
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["3"])), "mixed use")
|
63 |
+
.when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["4"])), "public unprotected")
|
64 |
+
.when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["1","2", "3"]))), "private conservation")
|
65 |
+
.when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["4"]))), "private unprotected")
|
66 |
.when( (_.Mang_Type == "TRIB"), "tribal")
|
67 |
.end()
|
68 |
)
|
|
|
72 |
.filter((_.FeatClass.isin(["Easement", "Fee"])) | (
|
73 |
(_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
|
74 |
)
|
75 |
+
# .filter(_.Mang_Type.notin(["UNK", "TERR"]))
|
76 |
+
# .filter(_.geom.within(bounds))
|
77 |
.mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
|
78 |
.mutate(bucket = case)
|
79 |
.mutate(row_n=ibis.row_number())
|
|
|
269 |
vec_file = './pad-stats.parquet'
|
270 |
|
271 |
df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
|
|
|
272 |
|
273 |
+
# +
|
274 |
columns = '''
|
275 |
area_name,
|
276 |
manager_name,
|
|
|
301 |
biodiversity_intactness_loss
|
302 |
'''
|
303 |
|
304 |
+
items = columns.split(',')
|
305 |
+
# Remove empty strings and whitespace
|
306 |
+
items = [item.strip() for item in items if item.strip()]
|
307 |
+
items
|
308 |
+
# -
|
309 |
+
|
310 |
import ibis
|
311 |
+
from ibis import _
|
312 |
+
df = ibis.read_parquet("pad-stats.parquet").select(items)
|
313 |
+
df.group_by(_.manager_group).aggregate(n = _.manager_group.count()).to_pandas()
|
314 |
|
315 |
# +
|
|
|
|
|
316 |
## create pad.duckdb
|
317 |
from sqlalchemy import create_engine
|
318 |
from sqlalchemy import text
|
|
|
324 |
|
325 |
# pad_stats = ibis.read_parquet("pad-stats.parquet")
|
326 |
# pad_stats.head(20).to_pandas()
|
327 |
+
# -
|
328 |
+
|
329 |
+
import pandas as pd
|
330 |
+
db_uri = "duckdb:///pad.duckdb"
|
331 |
+
engine = create_engine(db_uri)
|
332 |
+
con = engine.connect()
|
333 |
+
pd.DataFrame(con.execute("select * from pad limit 1").fetchall())
|