cboettig committed on
Commit
b8cea97
·
1 Parent(s): 2e4f09d
Files changed (1) hide show
  1. preprocess.py +59 -56
preprocess.py CHANGED
@@ -1,63 +1,72 @@
1
- # +
2
  import ibis
3
  import ibis.selectors as s
4
  from ibis import _
5
- con = ibis.duckdb.connect()
6
- con.load_extension("spatial")
7
-
8
- threads = 2
9
 
10
  # +
 
11
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
12
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
13
- # "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb"
14
 
15
- pad = ibis.read_parquet(parquet)
16
- # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
17
- con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
18
- pad = con.table("pad")
19
 
20
  # or read the fgb version, much slower
21
  # pad = con.read_geo(fgb)
22
-
23
-
24
- # +
25
- # pad.filter(_.Category == "Easement").select("EHoldTyp", "Mang_Type", "Unit_Nm").distinct().head(100).to_pandas()
26
- # pad.filter(_.Category == "Easement").select("EsmtHldr", "Mang_Name", "Unit_Nm").distinct().sample(.1).to_pandas()
27
- #pad.select("Comments").distinct().head(100).to_pandas()
28
  # -
29
 
30
- import fiona
 
 
31
  meta = fiona.open(fgb)
32
  crs = meta.crs
33
 
34
  # +
35
  ## optional getting bounds
36
-
37
- import rioxarray
38
- from shapely.geometry import box
39
-
40
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
41
 
42
- # fiona is not built with parquet support. ideally duckdb's st_read_meta would do this.
43
-
44
- nrow = len(meta)
45
-
46
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
47
  r = rioxarray.open_rasterio(cog)
48
  bounds = box(*r.rio.transform_bounds(crs))
49
 
50
  # +
51
  # Now we can do all the usual SQL queries to subset the data. Note the `geom.within()` spatial filter!
52
- focal_columns = ["bucket", "FeatClass", "Mang_Name",
53
  "Mang_Type", "Des_Tp", "Pub_Access",
54
  "GAP_Sts", "IUCN_Cat", "Unit_Nm",
55
  "State_Nm", "EsmtHldr", "Date_Est",
56
  "SHAPE_Area", "geom"]
57
- public = ["DIST", "LOC", "FED", "STAT", "JNT"]
 
 
 
 
 
 
 
 
 
 
 
58
 
 
59
  # Add our custom bucket categories:
60
  # really could be done separately.
 
 
 
 
 
 
61
  case = (
62
  ibis.case()
63
  .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public conservation")
@@ -68,35 +77,29 @@ case = (
68
  .when( (_.Mang_Type == "TRIB"), "tribal")
69
  .end()
70
  )
71
-
72
- pad_parquet = (
73
  pad
 
74
  .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
75
  (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
76
  )
77
- # .filter(_.Mang_Type.notin(["UNK", "TERR"]))
78
- # .filter(_.geom.within(bounds))
79
- .mutate(GAP_Sts = _.GAP_Sts) # do not cast to integer!
80
  .mutate(bucket = case)
81
- .mutate(row_n=ibis.row_number())
82
- .select(focal_columns)
83
- .rename(geometry="geom")
84
  )
85
 
86
- #pad_parquet.to_parquet("pad-processed.parquet")
87
  # -
88
 
89
- agency_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
90
- agency_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
91
- desig_type = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
92
- public_access = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
93
- state_name = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
94
- iucn = con.read_parquet("/home/rstudio/huggingface/datasets/pad-us-3/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
95
 
96
- pad_processed = (pad_parquet
97
  .rename(manager_name_id = "Mang_Name",
98
  manager_type_id = "Mang_Type",
99
- manager_group="bucket",
100
  designation_type_id = "Des_Tp",
101
  public_access_id = "Pub_Access",
102
  category = "FeatClass",
@@ -114,18 +117,14 @@ pad_processed = (pad_parquet
114
  .left_join(state_name, "state")
115
  .left_join(iucn, "iucn_code")
116
  .select(~s.contains("_right"))
117
- # .select(~s.contains("_id"))
118
- )
119
- # pad_processed.to_parquet("pad-processed.parquet")
120
-
121
- # +
122
  # if we keep the original geoparquet WKB 'geometry' column, to_pandas() (or execute) gives us only a normal pandas DataFrame, and geopandas doesn't see the metadata.
123
  # if we replace the geometry with the duckdb-native 'geometry' type, to_pandas() gives us a geopandas GeoDataFrame! But requires reading into RAM.
124
- import geopandas as gpd
 
 
 
125
 
126
- gdf = pad_processed.to_pandas()
127
- gdf = gdf.set_crs(crs)
128
- gdf.to_parquet("pad-processed.parquet")
129
 
130
  # +
131
  import rasterio
@@ -141,7 +140,8 @@ def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, t
141
  raster_profile = src.profile
142
  gdf = gpd.read_parquet(vec_file).to_crs(raster_profile['crs'])
143
 
144
- gdf["row_n"] = gdf.index + 1
 
145
 
146
  # lambda fn to zonal_stats a slice:
147
  def get_stats(geom_slice, tif_file, stats):
@@ -275,10 +275,13 @@ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_spec
275
  # +
276
  columns = '''
277
  area_name,
278
- manager_name,
279
- manager_type,
 
 
280
  manager_group,
281
  designation_type,
 
282
  public_access,
283
  category,
284
  iucn_code,
 
 
1
  import ibis
2
  import ibis.selectors as s
3
  from ibis import _
4
+ import fiona
5
+ import geopandas as gpd
6
+ import rioxarray
7
+ from shapely.geometry import box
8
 
9
  # +
10
+
11
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
12
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
13
+ # gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
14
 
15
+ con = ibis.duckdb.connect()
16
+ con.load_extension("spatial")
17
+ threads = 24
 
18
 
19
  # or read the fgb version, much slower
20
  # pad = con.read_geo(fgb)
21
+ # pad = con.read_parquet(parquet)
22
+ # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
23
+ con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
24
+ pad = con.table("pad")
 
 
25
  # -
26
 
27
+
28
+ # Get the CRS
29
+ # fiona is not built with parquet support, must read this from fgb. ideally duckdb's st_read_meta would do this from the parquet
30
  meta = fiona.open(fgb)
31
  crs = meta.crs
32
 
33
  # +
34
  ## optional getting bounds
 
 
 
 
35
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
36
 
 
 
 
 
37
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
38
  r = rioxarray.open_rasterio(cog)
39
  bounds = box(*r.rio.transform_bounds(crs))
40
 
41
  # +
42
  # Now we can do all the usual SQL queries to subset the data. Note the `geom.within()` spatial filter!
43
+ focal_columns = ["row_n", "FeatClass", "Mang_Name",
44
  "Mang_Type", "Des_Tp", "Pub_Access",
45
  "GAP_Sts", "IUCN_Cat", "Unit_Nm",
46
  "State_Nm", "EsmtHldr", "Date_Est",
47
  "SHAPE_Area", "geom"]
48
+ pad_parquet = (
49
+ pad
50
+ .mutate(row_n=ibis.row_number())
51
+ .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
52
+ (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
53
+ )
54
+ .filter(_.geom.within(bounds))
55
+ .select(focal_columns)
56
+ .rename(geometry="geom")
57
+ )
58
+
59
+ pad_parquet.to_parquet("pad-processed.parquet")
60
 
61
+ # +
62
  # Add our custom bucket categories:
63
  # really could be done separately.
64
+ categorical_columns = ["bucket", "FeatClass", "Mang_Name",
65
+ "Mang_Type", "Des_Tp", "Pub_Access",
66
+ "GAP_Sts", "IUCN_Cat", "Unit_Nm",
67
+ "State_Nm", "EsmtHldr", "Date_Est",
68
+ "row_n"]
69
+ public = ["DIST", "LOC", "FED", "STAT", "JNT"]
70
  case = (
71
  ibis.case()
72
  .when( (_.Mang_Type.isin(public) & _.GAP_Sts.isin(["1","2"])), "public conservation")
 
77
  .when( (_.Mang_Type == "TRIB"), "tribal")
78
  .end()
79
  )
80
+ pad_grouping = (
 
81
  pad
82
+ .mutate(row_n=ibis.row_number())
83
  .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
84
  (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
85
  )
 
 
 
86
  .mutate(bucket = case)
87
+ .select(categorical_columns)
 
 
88
  )
89
 
90
+ pad_grouping.to_parquet("pad-groupings.parquet")
91
  # -
92
 
93
+ agency_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
94
+ agency_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
95
+ desig_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
96
+ public_access = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
97
+ state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
98
+ iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
99
 
100
+ (pad_parquet
101
  .rename(manager_name_id = "Mang_Name",
102
  manager_type_id = "Mang_Type",
 
103
  designation_type_id = "Des_Tp",
104
  public_access_id = "Pub_Access",
105
  category = "FeatClass",
 
117
  .left_join(state_name, "state")
118
  .left_join(iucn, "iucn_code")
119
  .select(~s.contains("_right"))
120
+ # .select(~s.contains("_id"))
 
 
 
 
121
  # if we keep the original geoparquet WKB 'geometry' column, to_pandas() (or execute) gives us only a normal pandas DataFrame, and geopandas doesn't see the metadata.
122
  # if we replace the geometry with the duckdb-native 'geometry' type, to_pandas() gives us a geopandas GeoDataFrame! But requires reading into RAM.
123
+ .to_pandas()
124
+ .set_crs(crs)
125
+ .to_parquet("pad-processed.parquet")
126
+ )
127
 
 
 
 
128
 
129
  # +
130
  import rasterio
 
140
  raster_profile = src.profile
141
  gdf = gpd.read_parquet(vec_file).to_crs(raster_profile['crs'])
142
 
143
+ # row_n is a global id, may refer to excluded polygons
144
+ # gdf["row_id"] = gdf.index + 1
145
 
146
  # lambda fn to zonal_stats a slice:
147
  def get_stats(geom_slice, tif_file, stats):
 
275
  # +
276
  columns = '''
277
  area_name,
278
+ manager_name,
279
+ manager_name_id,
280
+ manager_type,
281
+ manager_type_id,
282
  manager_group,
283
  designation_type,
284
+ designation_type_id,
285
  public_access,
286
  category,
287
  iucn_code,