cboettig committed on
Commit
b15781c
·
1 Parent(s): 9f29cb1

working on pre-process

Browse files
Files changed (1) hide show
  1. preprocess.py +47 -23
preprocess.py CHANGED
@@ -1,34 +1,44 @@
1
  import ibis
2
  import ibis.selectors as s
3
  from ibis import _
 
 
4
 
5
  # +
6
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
7
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 
8
 
9
- #pad = ibis.read_parquet(parquet)
10
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
11
- con = ibis.duckdb.connect()
12
- con.load_extension("spatial")
13
- con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
14
- pad = con.table("pad")
15
 
 
 
16
 
17
- # +
18
- #pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
 
 
 
19
  # pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
20
  #pad.select("Comments").distinct().head(100).to_pandas()
21
 
22
- # +
23
  import fiona
 
 
 
 
 
 
24
  import rioxarray
25
  from shapely.geometry import box
26
 
27
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
28
 
29
- # fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
30
- meta = fiona.open(fgb)
31
- crs = meta.crs
32
  nrow = len(meta)
33
 
34
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
@@ -44,13 +54,15 @@ focal_columns = ["bucket", "FeatClass", "Mang_Name",
44
  "SHAPE_Area", "geom"]
45
  public = ["DIST", "LOC", "FED", "STAT", "JNT"]
46
 
 
 
47
  case = (
48
  ibis.case()
49
- .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public protected")
50
- .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["3"])), "mixed")
51
- .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["4"])), "public other")
52
- .when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["1","2", "3"]))), "private protected")
53
- .when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["4"]))), "private other")
54
  .when( (_.Mang_Type == "TRIB"), "tribal")
55
  .end()
56
  )
@@ -60,8 +72,8 @@ pad_parquet = (
60
  .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
61
  (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
62
  )
63
- .filter(_.Mang_Type.notin(["UNK", "TERR"]))
64
- .filter(_.geom.within(bounds))
65
  .mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
66
  .mutate(bucket = case)
67
  .mutate(row_n=ibis.row_number())
@@ -257,8 +269,8 @@ tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022
257
  vec_file = './pad-stats.parquet'
258
 
259
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
260
- # -
261
 
 
262
  columns = '''
263
  area_name,
264
  manager_name,
@@ -289,13 +301,18 @@ forest_integrity_loss,
289
  biodiversity_intactness_loss
290
  '''
291
 
 
 
 
 
 
 
292
  import ibis
293
- df = ibis.read_parquet("pad-stats.parquet")
294
- df.columns
 
295
 
296
  # +
297
-
298
-
299
  ## create pad.duckdb
300
  from sqlalchemy import create_engine
301
  from sqlalchemy import text
@@ -307,3 +324,10 @@ con.close()
307
 
308
  # pad_stats = ibis.read_parquet("pad-stats.parquet")
309
  # pad_stats.head(20).to_pandas()
 
 
 
 
 
 
 
 
1
  import ibis
2
  import ibis.selectors as s
3
  from ibis import _
4
+ con = ibis.duckdb.connect()
5
+ con.load_extension("spatial")
6
 
7
  # +
8
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
9
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
10
+ # "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
11
 
12
+ # pad = ibis.read_parquet(parquet)
13
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
14
+ # con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
15
+ # pad = con.table("pad")
 
 
16
 
17
+ # or read the fgb version
18
+ pad = con.read_geo(fgb)
19
 
20
+
21
+ # -
22
+
23
+
24
+ pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
25
  # pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
26
  #pad.select("Comments").distinct().head(100).to_pandas()
27
 
 
28
  import fiona
29
+ meta = fiona.open(fgb)
30
+ crs = meta.crs
31
+
32
+ # +
33
+ ## optional getting bounds
34
+
35
  import rioxarray
36
  from shapely.geometry import box
37
 
38
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
39
 
40
+ # fiona is not built with parquet support. ideally duckdb's st_read_meta would do this.
41
+
 
42
  nrow = len(meta)
43
 
44
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
 
54
  "SHAPE_Area", "geom"]
55
  public = ["DIST", "LOC", "FED", "STAT", "JNT"]
56
 
57
+ # Add our custom bucket categories:
58
+ # really could be done separately.
59
  case = (
60
  ibis.case()
61
+ .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public conservation")
62
+ .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["3"])), "mixed use")
63
+ .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["4"])), "public unprotected")
64
+ .when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["1","2", "3"]))), "private conservation")
65
+ .when( (_.Mang_Type.isin(["PVT", "NGO"]) & (_.GAP_Sts.isin(["4"]))), "private unprotected")
66
  .when( (_.Mang_Type == "TRIB"), "tribal")
67
  .end()
68
  )
 
72
  .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
73
  (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
74
  )
75
+ # .filter(_.Mang_Type.notin(["UNK", "TERR"]))
76
+ # .filter(_.geom.within(bounds))
77
  .mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
78
  .mutate(bucket = case)
79
  .mutate(row_n=ibis.row_number())
 
269
  vec_file = './pad-stats.parquet'
270
 
271
  df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=-1, verbose=0).to_parquet("pad-stats.parquet")
 
272
 
273
+ # +
274
  columns = '''
275
  area_name,
276
  manager_name,
 
301
  biodiversity_intactness_loss
302
  '''
303
 
304
+ items = columns.split(',')
305
+ # Remove empty strings and whitespace
306
+ items = [item.strip() for item in items if item.strip()]
307
+ items
308
+ # -
309
+
310
  import ibis
311
+ from ibis import _
312
+ df = ibis.read_parquet("pad-stats.parquet").select(items)
313
+ df.group_by(_.manager_group).aggregate(n = _.manager_group.count()).to_pandas()
314
 
315
  # +
 
 
316
  ## create pad.duckdb
317
  from sqlalchemy import create_engine
318
  from sqlalchemy import text
 
324
 
325
  # pad_stats = ibis.read_parquet("pad-stats.parquet")
326
  # pad_stats.head(20).to_pandas()
327
+ # -
328
+
329
+ import pandas as pd
330
+ db_uri = "duckdb:///pad.duckdb"
331
+ engine = create_engine(db_uri)
332
+ con = engine.connect()
333
+ pd.DataFrame(con.execute("select * from pad limit 1").fetchall())