parts

preprocess.py  CHANGED  (+107 -30)
@@ -1,16 +1,22 @@
 import ibis
 from ibis import _
 import rioxarray
+import xarray
 from shapely.geometry import box
+from geocube.api.core import make_geocube
+import geopandas
 import fiona
 
+
 # +
 fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
 parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
 
 # fiona not built with parquet support. ideally duckdb's st_read_meta would do this.
-
+meta = fiona.open(fgb)
+crs = meta.crs
+nrow = len(meta)
 
 # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
 r = rioxarray.open_rasterio(cog)
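The metadata step above leans on fiona only because the comment's preferred route isn't wired up here. A minimal sketch of that route, assuming DuckDB's spatial extension and its st_read_meta table function (output columns vary by duckdb-spatial version, so treat any field access as illustrative):

import duckdb

db = duckdb.connect()
db.install_extension("spatial")  # no-op if already installed
db.load_extension("spatial")
# remote files may need the '/vsicurl/' prefix, as used with rioxarray later in this file
db.sql(f"SELECT * FROM st_read_meta('{fgb}')").show()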
@@ -27,7 +33,7 @@ con = ibis.duckdb.connect()
 
 # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
 con.load_extension("spatial")
-con.raw_sql(f"CREATE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
+con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
 pad = con.table("pad")
 
 # +
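The escape hatch is needed because read_parquet surfaces the GeoParquet geometry column as raw WKB bytes; DuckDB reports it as a BLOB until st_geomfromwkb casts it to GEOMETRY. A quick way to confirm (a sketch; how raw_sql results are fetched depends on the ibis version):

# show the uncast schema -- `geometry` appears as BLOB here
con.raw_sql(f"DESCRIBE SELECT * FROM read_parquet('{parquet}')").fetchall()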
@@ -59,34 +65,15 @@ pad_labeled = (
 )
 
 
-# +
-# # %%time
-# testing -- only the lower 48 states!
-# (pad.filter(_.geom.within(bounds)).group_by([_.State_Nm]).aggregate(n = _.count()).to_pandas())
 # -
 
-# We could work in chunks, possibly parallelize this....
-start = 0
-end = 10000
-df = pad_labeled.filter([_.row_n > start, _.row_n <= end]).to_pandas()
-
-# Or be bold!
-df = pad_labeled.to_pandas()
-
-from geocube.api.core import make_geocube
-import xarray
-import geopandas
-geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
-
-geo.shape
 
-
-def zonal_stats(cog, geo, crs, row_n = "row_n"):
+def zonal_stats(cog, geo, band_name = "mean", row_n = "row_n"):
     # https://corteva.github.io/geocube/html/examples/zonal_statistics.html
     raster = (rioxarray.
         open_rasterio('/vsicurl/'+cog, masked=True).
-        rio.clip(geo.geometry.values, crs, from_disk=True).
-        sel(band=1).
+        rio.clip(geo.geometry.values, geo.crs, from_disk=True).
+        sel(band=1).drop_vars("band")
     )
     out_grid = make_geocube(
         vector_data=geo,
@@ -95,22 +82,112 @@ def zonal_stats(cog, geo, crs, row_n = "row_n"):
     )
     # merge the two together
     out_grid["values"] = (raster.dims, raster.values, raster.attrs, raster.encoding)
-    grouped_raster = out_grid.
+    grouped_raster = out_grid.drop_vars("spatial_ref").groupby(out_grid.row_n)
     # can add other stats
-    grid_mean = grouped_raster.mean().rename({"values":
+    grid_mean = grouped_raster.mean().rename({"values": band_name})
     zonal_stats = xarray.merge([grid_mean]).to_dataframe()
     geo = geo.merge(zonal_stats, how="left", on=row_n)
     return geo
 
 
-
+total_features = pad_labeled.count().to_pandas()
+
+
+# +
+# # %%time
+# testing -- only the lower 48 states!
+# (pad.filter(_.geom.within(bounds)).group_by([_.State_Nm]).aggregate(n = _.count()).to_pandas())
+
+# +
+
+def piecewise_zonal(cog, tbl, crs, band_name = "mean", row_n = "row_n", dirname = "pad_parquet"):
+    total_features = tbl.count().to_pandas()
+    n = 10000
+    steps = range(0, total_features, 10000)
+    parts = [*[i for i in steps], total_features]
+    for i in range(0, len(steps)):
+        begin = parts[i]
+        end = parts[i+1] - 1
+        df = tbl.filter([_[row_n] > begin, _[row_n] <= end]).to_pandas()
+        geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
+        geo = zonal_stats(cog, geo, band_name, row_n)
+        geo.to_parquet(f"{dirname}/part_{i}.parquet")
+
+
+
+# -
+
+# %%time
+cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
+piecewise_zonal(cog, pad_labeled, crs, "richness")
+
+# Or be bold!
+df = pad_labeled.to_pandas()
+geo = geopandas.GeoDataFrame(df, geometry=df.geometry, crs=crs)
+
+# +
+import numpy as np
+
+def piecewise_zonal2(cog, geo, band_name = "mean", n = 10000, row_n = "row_n", dirname = "pad_parquet2"):
+    total = len(geo)
+    for i in range(0, total, n):
+        end = np.min([i + n, total])
+        geo_slice = geo.iloc[i:end]
+        geo_slice = zonal_stats(cog, geo_slice, band_name, row_n)
+        geo_slice.to_parquet(f"{dirname}/part_{i}.parquet")
+
+
+
+# -
+
+# %%time
+piecewise_zonal2(cog, geo, "richness") # 6 min
+
+
+import geopandas
+gdf = geopandas.read_parquet("pad_parquet2")
+
+gdf.columns
+
+# %%time
+human_impacts_2021 = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
+gdf = piecewise_zonal2(human_impacts_2021, gdf, "human_impacts_2021")
+
 
-geo.to_parquet("pad-mobi.parquet")
 
-
+
+
+
+# Lastly we need to convert to PMTiles:
 #
 # ```
 # ogr2ogr -dsco MAX_SIZE=90000000 -dsco MAX_FEATURES=50000000 -dsco MAXZOOM=10 pad-mobi.pmtiles pad-mobi.parquet
 # ```
 
-geo.plot(column="
+geo.plot(column="richness", legend=True)
+
+import geopandas
+gdf = geopandas.read_parquet("pad-mobi.parquet")
+cog = "https://data.source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif"
+
+
+human_impacts_2021 = "https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
+geo = zonal_stats(human_impacts_2021, geo, "human_impacts_2021")
+
+
+geo.to_parquet("pad-extended.parquet")
+
+# +
+# %%time
+geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif", geo, "deforest_carbon")
+geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif", geo, "fii")
+geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif", geo, "bii")
+geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif", geo, "crop_expansion")
+geo = zonal_stats("https://data.source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif", geo, "crop_reduction")
+
+geo = zonal_stats("https://data.source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif", geo, "irrecoverable_c_total_2018")
+geo = zonal_stats("https://data.source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif", geo, "manageable_c_total_2018")
+
+cog = "https://data.source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif"
+
+geo.to_parquet("pad-extended.parquet")
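One caveat in piecewise_zonal above: with end = parts[i+1] - 1, each filter window is (parts[i], parts[i+1] - 1], so a feature whose row_n lands exactly on a chunk edge (10000, 20000, ...) falls outside every window. A boundary-safe sketch using the raw edges (same tbl, row_n, and parts as in the function above):

# contiguous (begin, end] windows over row_n
for begin, end in zip(parts[:-1], parts[1:]):
    df = tbl.filter([_[row_n] > begin, _[row_n] <= end]).to_pandas()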
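On the "# can add other stats" hook in zonal_stats: grouped_raster is an ordinary xarray groupby, so other reducers can be merged in alongside the mean. A sketch reusing the variable names from the function body (the extra column names are illustrative):

# additional per-polygon statistics from the same groupby
grid_max = grouped_raster.max().rename({"values": f"{band_name}_max"})
grid_std = grouped_raster.std().rename({"values": f"{band_name}_std"})
zonal = xarray.merge([grid_mean, grid_max, grid_std]).to_dataframe()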