cboettig committed on
Commit
e436758
·
1 Parent(s): abebbc4
Files changed (1) hide show
  1. preprocess.py +107 -30
preprocess.py CHANGED
@@ -1,16 +1,22 @@
1
  import ibis
2
  from ibis import _
3
  import rioxarray
 
4
  from shapely.geometry import box
 
 
5
  import fiona
6
 
 
7
  # +
8
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
9
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
10
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
11
 
12
  # fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
13
- crs = fiona.open(fgb).crs
 
 
14
 
15
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
16
  r = rioxarray.open_rasterio(cog)
@@ -27,7 +33,7 @@ con = ibis.duckdb.connect()
27
 
28
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
29
  con.load_extension("spatial")
30
- con.raw_sql(f"CREATE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
31
  pad = con.table("pad")
32
 
33
  # +
@@ -59,34 +65,15 @@ pad_labeled = (
59
  )
60
 
61
 
62
- # +
63
- # # %%time
64
- # testing -- only the lower 48 states!
65
- # (pad.filter(_.geom.within(bounds)).group_by([_.State_Nm]).aggregate(n = _.count()).to_pandas())
66
  # -
67
 
68
- # We could work in chunks, possibly parallelize this....
69
- start = 0
70
- end = 10000
71
- df = pad_labeled.filter([_.row_n > start, _.row_n <= end]).to_pandas()
72
-
73
- # Or be bold!
74
- df = pad_labeled.to_pandas()
75
-
76
- from geocube.api.core import make_geocube
77
- import xarray
78
- import geopandas
79
- geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
80
-
81
- geo.shape
82
 
83
-
84
- def zonal_stats(cog, geo, crs, row_n = "row_n"):
85
  # https://corteva.github.io/geocube/html/examples/zonal_statistics.html
86
  raster = (rioxarray.
87
  open_rasterio('/vsicurl/'+cog, masked=True).
88
- rio.clip(geo.geometry.values, crs, from_disk=True).
89
- sel(band=1).drop("band")
90
  )
91
  out_grid = make_geocube(
92
  vector_data=geo,
@@ -95,22 +82,112 @@ def zonal_stats(cog, geo, crs, row_n = "row_n"):
95
  )
96
  # merge the two together
97
  out_grid["values"] = (raster.dims, raster.values, raster.attrs, raster.encoding)
98
- grouped_raster = out_grid.drop("spatial_ref").groupby(out_grid.row_n)
99
  # can add other stats
100
- grid_mean = grouped_raster.mean().rename({"values": "mean"})
101
  zonal_stats = xarray.merge([grid_mean]).to_dataframe()
102
  geo = geo.merge(zonal_stats, how="left", on=row_n)
103
  return geo
104
 
105
 
106
- geo = zonal_stats(cog, geo, crs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- geo.to_parquet("pad-mobi.parquet")
109
 
110
- # Now we need to convert to PMTiles:
 
 
 
111
  #
112
  # ```
113
  # ogr2ogr -dsco MAX_SIZE=90000000 -dsco MAX_FEATURES=50000000 -dsco MAXZOOM=10 pad-mobi.pmtiles pad-mobi.parquet
114
  # ```
115
 
116
- geo.plot(column="mean", legend=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import ibis
2
  from ibis import _
3
  import rioxarray
4
+ import xarray
5
  from shapely.geometry import box
6
+ from geocube.api.core import make_geocube
7
+ import geopandas
8
  import fiona
9
 
10
+
11
  # +
12
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
13
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
14
  cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
15
 
16
  # fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
17
+ meta = fiona.open(fgb)
18
+ crs = meta.crs
19
+ nrow = len(meta)
20
 
21
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
22
  r = rioxarray.open_rasterio(cog)
 
33
 
34
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
35
  con.load_extension("spatial")
36
+ con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
37
  pad = con.table("pad")
38
 
39
  # +
 
65
  )
66
 
67
 
 
 
 
 
68
  # -
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ def zonal_stats(cog, geo, band_name = "mean", row_n = "row_n"):
 
72
  # https://corteva.github.io/geocube/html/examples/zonal_statistics.html
73
  raster = (rioxarray.
74
  open_rasterio('/vsicurl/'+cog, masked=True).
75
+ rio.clip(geo.geometry.values, geo.crs, from_disk=True).
76
+ sel(band=1).drop_vars("band")
77
  )
78
  out_grid = make_geocube(
79
  vector_data=geo,
 
82
  )
83
  # merge the two together
84
  out_grid["values"] = (raster.dims, raster.values, raster.attrs, raster.encoding)
85
+ grouped_raster = out_grid.drop_vars("spatial_ref").groupby(out_grid.row_n)
86
  # can add other stats
87
+ grid_mean = grouped_raster.mean().rename({"values": band_name})
88
  zonal_stats = xarray.merge([grid_mean]).to_dataframe()
89
  geo = geo.merge(zonal_stats, how="left", on=row_n)
90
  return geo
91
 
92
 
93
+ total_features = pad_labeled.count().to_pandas()
94
+
95
+
96
+ # +
97
+ # # %%time
98
+ # testing -- only the lower 48 states!
99
+ # (pad.filter(_.geom.within(bounds)).group_by([_.State_Nm]).aggregate(n = _.count()).to_pandas())
100
+
101
+ # +
102
+
103
def piecewise_zonal(cog, tbl, crs, band_name = "mean", row_n = "row_n", dirname = "pad_parquet"):
    """Compute zonal statistics in chunks of rows, writing each chunk to parquet.

    Works through `tbl` n rows at a time so the whole table never has to be
    materialized in memory at once.

    cog: URL of the cloud-optimized GeoTIFF to summarize.
    tbl: ibis table expression carrying a geometry and a `row_n` id column.
    crs: coordinate reference system assigned to each GeoDataFrame chunk.
    band_name: column name for the computed statistic.
    row_n: name of the unique row-id column used for chunking and grouping.
    dirname: output directory; one parquet file is written per chunk.
    """
    total_features = tbl.count().to_pandas()
    n = 10000  # chunk size
    steps = range(0, total_features, n)
    # chunk boundaries: [0, n, 2n, ..., total_features]
    parts = [*steps, total_features]
    for i in range(len(steps)):
        begin = parts[i]
        # Use parts[i+1] itself as the inclusive upper bound. Subtracting 1
        # here would silently drop every row whose id is an exact multiple
        # of the chunk size, as well as the final row of the table.
        end = parts[i + 1]
        # NOTE(review): assumes row_n is 1-based; a row with row_n == 0 would
        # never match `> 0` -- TODO confirm against how row_n is constructed.
        df = tbl.filter([_[row_n] > begin, _[row_n] <= end]).to_pandas()
        geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
        geo = zonal_stats(cog, geo, band_name, row_n)
        geo.to_parquet(f"{dirname}/part_{i}.parquet")
115
+
116
+
117
+
118
+ # -
119
+
120
+ # %%time
121
+ cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
122
+ piecewise_zonal(cog, pad_labeled, crs, "richness")
123
+
124
+ # Or be bold!
125
+ df = pad_labeled.to_pandas()
126
+ geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
127
+
128
+ # +
129
+ import numpy as np
130
+
131
def piecewise_zonal2(cog, geo, band_name = "mean", n = 10000, row_n = "row_n", dirname = "pad_parquet2"):
    """Compute zonal statistics over `geo` in positional slices of `n` rows.

    In-memory variant of the chunked approach: `geo` is already a
    GeoDataFrame, so we slice it by position rather than filtering by id.

    cog: URL of the cloud-optimized GeoTIFF to summarize.
    geo: GeoDataFrame of features carrying a `row_n` id column.
    band_name: column name for the computed statistic.
    n: number of rows per slice.
    row_n: name of the unique row-id column passed through to zonal_stats.
    dirname: output directory; one parquet file is written per slice.
    """
    total = len(geo)
    for i in range(0, total, n):
        # builtin min is enough here; no need to build a list for np.min
        end = min(i + n, total)
        geo_slice = geo.iloc[i:end]
        geo_slice = zonal_stats(cog, geo_slice, band_name, row_n)
        geo_slice.to_parquet(f"{dirname}/part_{i}.parquet")
138
+
139
+
140
+
141
+ # -
142
+
143
+ # %%time
144
+ piecewise_zonal2(cog, geo, "richness") # 6 min
145
+
146
+
147
+ import geopandas
148
+ gdf = geopandas.read_parquet("pad_parquet2")
149
+
150
+ gdf.columns
151
+
152
+ # %%time
153
+ human_impacts_2021 = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
154
+ gdf = piecewise_zonal2(human_impacts_2021, gdf, "human_impacts_2021")
155
+
156
 
 
157
 
158
+
159
+
160
+
161
+ # Lastly we need to convert to PMTiles:
162
  #
163
  # ```
164
  # ogr2ogr -dsco MAX_SIZE=90000000 -dsco MAX_FEATURES=50000000 -dsco MAXZOOM=10 pad-mobi.pmtiles pad-mobi.parquet
165
  # ```
166
 
167
+ geo.plot(column="richness", legend=True)
168
+
169
+ import geopandas
170
+ gdf = geopandas.read_parquet("pad-mobi.parquet")
171
+ cog = "https://data.source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif"
172
+
173
+
174
+ human_impacts_2021 = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
175
+ geo = zonal_stats(human_impacts_2021, geo, "human_impacts_2021")
176
+
177
+
178
+ geo.to_parquet("pad-extended.parquet")
179
+
180
+ # +
181
+ # %%time
182
+ geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif", geo, "deforest_carbon")
183
+ geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif", geo, "fii")
184
+ geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif", geo, "bii")
185
+ geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif", geo, "crop_expansion")
186
+ geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif", geo, "crop_reduction")
187
+
188
+ geo = zonal_stats("https://data.source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif", geo, "irrecoverable_c_total_2018")
189
+ geo = zonal_stats("https://data.source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif", geo, "manageable_c_total_2018")
190
+
191
+ cog = "https://data.source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif"
192
+
193
+ geo.to_parquet("pad-extended.parquet")