cboettig committed on
Commit
321154b
·
1 Parent(s): 71736e2

preprocess

Browse files
Files changed (1) hide show
  1. preprocess.py +122 -0
preprocess.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+ import rioxarray
4
+ from shapely.geometry import box
5
+ import fiona
6
+
7
+ # +
8
+ # read crs, ideally we could do this with st_read_meta() in ibis+duckdb
9
fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
# Open the remote FlatGeobuf in a context manager so the dataset handle is
# released as soon as the CRS has been read instead of staying open for the
# rest of the script.
with fiona.open(fgb) as v:
    crs = v.crs

# Raster extent re-expressed in the vector layer's CRS and wrapped as a
# shapely box for use as a spatial filter. (Author's note: both layers already
# share a projection here, so r.rio.bounds() would also work.)
cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
r = rioxarray.open_rasterio(cog)
bounds = box(*r.rio.transform_bounds(crs))
17
+
18
+
19
+ # +
20
+ #import leafmap
21
+ #leafmap.cog_validate(cog)
22
+
23
+ # +
24
con = ibis.duckdb.connect()

# The FlatGeobuf could be read directly with read_geo, but that is slower
# than going through the (Geo)Parquet copy:
# pad = con.read_geo(fgb)
# -

# ibis does not recognise the file as GeoParquet, so fall back to raw SQL:
# the view exposes every column plus `geom`, the WKB `geometry` column run
# through st_geomfromwkb.
parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
create_pad_view = f"CREATE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')"

# Fresh connection for this cell, with the spatial extension loaded before
# the view is created.
con = ibis.duckdb.connect()
con.load_extension("spatial")
con.raw_sql(create_pad_view)
pad = con.table("pad")

pad.columns
39
+
40
+ # +
41
+ # Now we can do all the usual SQL queries to subset the data. Note the `geom.within()` spatial filter!
42
+
43
# Columns kept in the labelled table (`bucket` is derived from `case` below).
focal_columns = [
    "bucket",
    "FeatClass",
    "Mang_Name",
    "Mang_Type",
    "Des_Tp",
    "Pub_Access",
    "GAP_Sts",
    "IUCN_Cat",
    "Unit_Nm",
    "geom",
]
# Mang_Type codes that the bucket logic treats as publicly managed.
public = ["DIST", "LOC", "FED", "STAT", "JNT"]

# Bucket label derived from manager type and GAP status. Branches are tried
# in order; no else_() branch is supplied for rows that match none of them.
publicly_managed = _.Mang_Type.isin(public)
case = (
    ibis.case()
    .when(publicly_managed & (_.GAP_Sts <= 2), "public")
    .when(publicly_managed & (_.GAP_Sts > 2), "mixed")
    .when(_.Mang_Type.isin(["PVT", "NGO"]), "private")
    .when(_.Mang_Type == "TRIB", "tribal")
    .end()
)
55
+
56
# Row-level predicates for the feature classes that are kept.
fee_or_easement = _.FeatClass.isin(["Easement", "Fee"])
tribal_proclamation = (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB")

# Subset, label and number the protected-area polygons.
pad_labeled = (
    pad
    .filter(fee_or_easement | tribal_proclamation)
    .filter(_.Mang_Type.notin(["UNK", "TERR"]))
    # Keep only polygons that lie within the raster's bounding box.
    .filter(_.geom.within(bounds))
    # Cast before adding `bucket`: the bucket expression compares GAP_Sts
    # against integers.
    .mutate(GAP_Sts=_.GAP_Sts.cast("int"))
    .mutate(bucket=case)
    .select(focal_columns)
    .mutate(row_n=ibis.row_number())
)

# -

# Preview: rows whose row_n is below 10.
pad_labeled.filter(_.row_n < 10).to_pandas()
72
+
73
+ # +
74
+ # # %%time
75
+ # testing -- only the lower 48 states!
76
+ # (pad.filter(_.geom.within(bounds)).group_by([_.State_Nm]).aggregate(n = _.count()).to_pandas())
77
+ # -
78
+
79
import geopandas

# Select one batch of polygons by row number. ibis.row_number() is
# zero-indexed, so the batch is the half-open window [start, end); the
# earlier `> start` / `<= end` test silently skipped row 0 and disagreed with
# the `row_n < 10` preview used elsewhere in this script.
start = 0
end = 10000
df = pad_labeled.filter([_.row_n >= start, _.row_n < end]).to_pandas()
#from_wkb(df.geometry)

# Re-wrap the batch as a GeoDataFrame tagged with the CRS read from the
# source vector file.
geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
geo.shape
89
+
90
+ # +
91
+ #geo.geometry.values
92
+ # -
93
+
94
# Open the COG through the /vsicurl/ prefix with masked=True, clip it to the
# batch polygons (from_disk=True), and keep band 1 as a 2-D array.
raster = (
    rioxarray
    .open_rasterio("/vsicurl/" + cog, masked=True)
    .rio.clip(geo.geometry.values, crs, from_disk=True)
    .sel(band=1)
    # drop_vars replaces the deprecated DataArray.drop; it removes the
    # `band` coordinate left behind by the selection above.
    .drop_vars("band")
)
99
+
100
+
101
+ # +
102
+ # https://corteva.github.io/geocube/html/examples/zonal_statistics.html
103
from geocube.api.core import make_geocube
import xarray

# Build a cube holding each polygon's row_n, using `like=raster` so it is
# laid out on the same grid as the clipped raster.
out_grid = make_geocube(vector_data=geo, measurements=["row_n"], like=raster)

# Add the raster itself as a second variable, passing its dims, values,
# attrs and encoding through unchanged.
richness_variable = (raster.dims, raster.values, raster.attrs, raster.encoding)
out_grid["richness"] = richness_variable
113
+ # -
114
+
115
# Group the grid by polygon id (row_n) and take the mean within each group.
# drop_vars replaces the deprecated Dataset.drop for removing `spatial_ref`
# before grouping.
grouped_raster = out_grid.drop_vars("spatial_ref").groupby(out_grid.row_n)
grid_mean = grouped_raster.mean().rename({"richness": "richness_mean"})
zonal_stats = xarray.merge([grid_mean]).to_dataframe()

# Left join on row_n so every polygon in `geo` is kept.
geo = geo.merge(zonal_stats, how="left", on="row_n")

geo.plot(column="richness_mean", legend=True)
122
+