cassiebuhler committed on
Commit
0c88eb4
·
1 Parent(s): 81d856c

filling missing values, include all gap codes

Browse files
Files changed (2) hide show
  1. preprocess.ipynb +42 -138
  2. preprocess.py +99 -0
preprocess.ipynb CHANGED
@@ -12,114 +12,80 @@
12
  "cell_type": "code",
13
  "execution_count": null,
14
  "id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
15
- "metadata": {},
 
 
 
 
 
 
16
  "outputs": [],
17
  "source": [
18
  "import ibis\n",
19
  "from ibis import _\n",
20
  "\n",
21
- "conn = ibis.duckdb.connect(\"tmp\", extensions=[\"spatial\"])\n",
22
- "ca_parquet = \"https://data.source.coop/cboettig/ca30x30/ca_areas.parquet\"\n",
23
  "# or use local copy:\n",
24
- "ca_parquet = \"/home/rstudio/source.coop/cboettig/ca30x30/ca_areas.parquet\"\n"
25
  ]
26
  },
27
  {
28
  "cell_type": "code",
29
  "execution_count": null,
30
- "id": "a0cb34b1-8d70-49bf-80c6-244ecc8ddf84",
31
  "metadata": {},
32
  "outputs": [],
33
  "source": [
34
- "buffer = -2\n",
 
35
  "\n",
36
  "tbl = (\n",
37
  " conn.read_parquet(ca_parquet)\n",
38
  " .cast({\"SHAPE\": \"geometry\"})\n",
39
  " .rename(geom = \"SHAPE\")\n",
40
- "# .filter(_.UNIT_NAME == \"Angeles National Forest\")\n",
41
- " .filter(_.reGAP < 3) \n",
42
  ")\n",
43
- "tbl_2023 = tbl.filter(_.Release_Year == 2023).mutate(geom=_.geom.buffer(buffer))\n",
 
 
 
44
  "tbl_2024 = tbl.filter(_.Release_Year == 2024)\n",
45
- "intersects = tbl_2024.anti_join(tbl_2023, _.geom.intersects(tbl_2023.geom))\n",
46
- "\n"
47
  ]
48
  },
49
  {
50
  "cell_type": "code",
51
  "execution_count": null,
52
- "id": "a0b75637-e015-4be4-86e1-c9757ac43d0f",
53
- "metadata": {},
54
- "outputs": [],
55
- "source": [
56
- "## Testing, run only on subset data\n",
57
- "if False:\n",
58
- " gdf = intersects.mutate(geom = _.geom.convert(\"epsg:3310\",\"epsg:4326\")).execute()\n",
59
- " gdf_2023 = tbl_2023.mutate(geom = _.geom.convert(\"epsg:3310\",\"epsg:4326\")).execute()\n",
60
- " gdf_2024 = tbl_2024.mutate(geom = _.geom.convert(\"epsg:3310\",\"epsg:4326\")).execute()\n",
61
- " # gdf = ca2024\n",
62
- " established = {'property': 'established',\n",
63
- " 'type': 'categorical',\n",
64
- " 'stops': [\n",
65
- " [2023, \"#26542C80\"], \n",
66
- " [2024, \"#F3AB3D80\"]]\n",
67
- " }\n",
68
- " inter = {\"fill-color\": \"#F3AB3D\"}\n",
69
- " p2024 = {\"fill-color\": \"#26542C\"}\n",
70
- " p2023 = {\"fill-color\": \"#8B0A1A\"}\n",
71
- " \n",
72
- " m = leafmap.Map(style=\"positron\")\n",
73
- " m.add_gdf(gdf_2024,layer_type=\"fill\", name = \"2024\", paint = p2024)\n",
74
- " m.add_gdf(gdf_2023,layer_type=\"fill\", name = \"2023\", paint = p2023)\n",
75
- " m.add_gdf(gdf,layer_type=\"fill\", name = \"intersects\", paint = inter)\n",
76
- " \n",
77
- " m.add_layer_control()\n",
78
- " m"
79
- ]
80
- },
81
- {
82
- "cell_type": "code",
83
- "execution_count": null,
84
- "id": "275c171a-f82f-4ee8-991c-1e34eb83a33d",
85
  "metadata": {},
86
  "outputs": [],
87
  "source": [
88
  "%%time\n",
89
  "\n",
90
- "new2024 = intersects.select(\"OBJECTID\").mutate(established = 2024)\n",
91
  "\n",
92
  "ca = (conn\n",
93
  " .read_parquet(ca_parquet)\n",
94
  " .cast({\"SHAPE\": \"geometry\"})\n",
95
  " .mutate(area = _.SHAPE.area())\n",
96
- " .filter(_.Release_Year == 2024)\n",
97
- " .filter(_.reGAP < 3)\n",
98
- " .left_join(new2024, \"OBJECTID\")\n",
99
- " .mutate(established=_.established.fill_null(2023))\n",
100
  " .mutate(geom = _.SHAPE.convert(\"epsg:3310\",\"epsg:4326\"))\n",
101
  " .rename(name = \"cpad_PARK_NAME\", access_type = \"cpad_ACCESS_TYP\", manager = \"cpad_MNG_AGENCY\",\n",
102
  " manager_type = \"cpad_MNG_AG_LEV\", id = \"OBJECTID\", type = \"TYPE\")\n",
 
 
 
 
103
  " .select(_.established, _.reGAP, _.name, _.access_type, _.manager, _.manager_type,\n",
104
  " _.Easement, _.Acres, _.id, _.type, _.geom)\n",
105
  " )\n",
106
- "ca2024 = ca.execute()\n",
107
- "\n",
108
  "\n",
109
- "\n",
110
- "ca2024.to_parquet(\"ca2024.parquet\")\n",
111
- "\n"
112
- ]
113
- },
114
- {
115
- "cell_type": "code",
116
- "execution_count": 2,
117
- "id": "8259b450-2152-472c-a58c-50ce0d68d78f",
118
- "metadata": {},
119
- "outputs": [],
120
- "source": [
121
- "ca2024 = conn.read_parquet(\"ca2024.parquet\")\n",
122
- "ca2024.execute().to_file(\"ca2024.geojson\") # tippecanoe can't parse geoparquet :-("
123
  ]
124
  },
125
  {
@@ -144,83 +110,21 @@
144
  " repo_id=\"boettiger-lab/ca-30x30\",\n",
145
  " repo_type=\"dataset\",\n",
146
  " )\n",
147
- "hf_upload(\"ca2024.parquet\")"
148
- ]
149
- },
150
- {
151
- "cell_type": "markdown",
152
- "id": "cebd0ff5-8353-4b84-b9ee-182b74613554",
153
- "metadata": {},
154
- "source": [
155
- "# Testing & visualization\n",
156
- "\n",
157
- "`ca2024.parquet()` now contains all we need. The code below illustrates some quick examples of the kinds of visualizations and summaries we might want to compute with this data. \n"
158
- ]
159
- },
160
- {
161
- "cell_type": "code",
162
- "execution_count": 1,
163
- "id": "55afe07c-8681-4308-bbb9-e460f7380f86",
164
- "metadata": {},
165
- "outputs": [],
166
- "source": [
167
- "import leafmap.maplibregl as leafmap\n",
168
- "import ibis\n",
169
- "from ibis import _\n",
170
- "conn = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
171
- "\n",
172
- "ca2024 = conn.read_parquet(\"ca2024.parquet\")"
173
- ]
174
- },
175
- {
176
- "cell_type": "code",
177
- "execution_count": null,
178
- "id": "6f3df8c1-a603-4dd5-be84-8deaae928d0a",
179
- "metadata": {},
180
- "outputs": [],
181
- "source": [
182
- "# compute some summary tables:\n",
183
- "\n",
184
- "(ca2024\n",
185
- " .filter(_.established == 2024)\n",
186
- " .filter(_.manager_type == \"State\")\n",
187
- " .group_by(_.manager, _.manager_type)\n",
188
- " .agg(area = _.Acres.sum())\n",
189
- " .order_by(_.area.desc())\n",
190
- " .execute()\n",
191
- ")"
192
- ]
193
- },
194
- {
195
- "cell_type": "code",
196
- "execution_count": null,
197
- "id": "c62854f6-1456-4207-8c69-53af17970102",
198
- "metadata": {},
199
- "outputs": [],
200
- "source": [
201
- "\n",
202
- "gdf = ca2024.execute()\n",
203
- "established = {'property': 'established',\n",
204
- " 'type': 'categorical',\n",
205
- " 'stops': [\n",
206
- " [2023, \"#26542C80\"], \n",
207
- " [2024, \"#F3AB3D80\"]]}\n",
208
- "paint = {\"fill-color\": established}\n",
209
- "\n",
210
- "\n",
211
- "m = leafmap.Map(style=\"positron\")\n",
212
- "m.add_gdf(gdf,layer_type=\"fill\", name = \"intersects\", paint = paint)\n",
213
- "\n",
214
- "m.add_layer_control()\n",
215
- "m.to_html(\"ca2024.html\")\n",
216
- "m"
217
  ]
218
  },
219
  {
220
  "cell_type": "code",
221
  "execution_count": null,
222
  "id": "2df80e1d-6b94-4884-b9f5-d9c23d3ea028",
223
- "metadata": {},
 
 
 
 
 
 
224
  "outputs": [],
225
  "source": [
226
  "import subprocess\n",
@@ -249,8 +153,8 @@
249
  " except subprocess.CalledProcessError as e:\n",
250
  " print(f\"Error running Tippecanoe: {e}\")\n",
251
  "\n",
252
- "generate_pmtiles(\"ca2024.geojson\", \"ca2024-tippe.pmtiles\")\n",
253
- "hf_upload(\"ca2024-tippe.pmtiles\")"
254
  ]
255
  }
256
  ],
 
12
  "cell_type": "code",
13
  "execution_count": null,
14
  "id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
15
+ "metadata": {
16
+ "editable": true,
17
+ "slideshow": {
18
+ "slide_type": ""
19
+ },
20
+ "tags": []
21
+ },
22
  "outputs": [],
23
  "source": [
24
  "import ibis\n",
25
  "from ibis import _\n",
26
  "\n",
27
+ "conn = ibis.duckdb.connect(\"tmp3\", extensions=[\"spatial\"])\n",
28
+ "# ca_parquet = \"https://data.source.coop/cboettig/ca30x30/ca_areas.parquet\"\n",
29
  "# or use local copy:\n",
30
+ "ca_parquet = \"ca_areas.parquet\""
31
  ]
32
  },
33
  {
34
  "cell_type": "code",
35
  "execution_count": null,
36
+ "id": "a3d4f189-1563-4868-9f1f-64d67569df27",
37
  "metadata": {},
38
  "outputs": [],
39
  "source": [
40
+ "# negative buffer to account for overlapping boundaries. \n",
41
+ "buffer = -30 #30m buffer \n",
42
  "\n",
43
  "tbl = (\n",
44
  " conn.read_parquet(ca_parquet)\n",
45
  " .cast({\"SHAPE\": \"geometry\"})\n",
46
  " .rename(geom = \"SHAPE\")\n",
47
+ " .filter(_.reGAP < 3) # only gap 1 and 2 count towards 30x30\n",
 
48
  ")\n",
49
+ "\n",
50
+ "# polygons with release_year 2024 are a superset of release_year 2023. \n",
51
+ "# use anti_join to isolate the objects that are in release_year 2024 but not release_year 2023 (aka newly established). \n",
52
+ "tbl_2023 = tbl.filter(_.Release_Year == 2023).mutate(geom=_.geom.buffer(buffer)) \n",
53
  "tbl_2024 = tbl.filter(_.Release_Year == 2024)\n",
54
+ "intersects = tbl_2024.anti_join(tbl_2023, _.geom.intersects(tbl_2023.geom))"
 
55
  ]
56
  },
57
  {
58
  "cell_type": "code",
59
  "execution_count": null,
60
+ "id": "a59c976b-3c36-40f9-a15b-cefcd155c647",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  "metadata": {},
62
  "outputs": [],
63
  "source": [
64
  "%%time\n",
65
  "\n",
66
+ "new2024 = intersects.select(\"OBJECTID\").mutate(established = 2024) # saving IDs to join on\n",
67
  "\n",
68
  "ca = (conn\n",
69
  " .read_parquet(ca_parquet)\n",
70
  " .cast({\"SHAPE\": \"geometry\"})\n",
71
  " .mutate(area = _.SHAPE.area())\n",
72
+ " .filter(_.Release_Year == 2024) # having both 2023 and 2024 is redundant since 2024 is the superset.\n",
73
+ " .left_join(new2024, \"OBJECTID\") # newly established 2024 polygons \n",
74
+ " .mutate(established=_.established.fill_null(2023)) \n",
 
75
  " .mutate(geom = _.SHAPE.convert(\"epsg:3310\",\"epsg:4326\"))\n",
76
  " .rename(name = \"cpad_PARK_NAME\", access_type = \"cpad_ACCESS_TYP\", manager = \"cpad_MNG_AGENCY\",\n",
77
  " manager_type = \"cpad_MNG_AG_LEV\", id = \"OBJECTID\", type = \"TYPE\")\n",
78
+ " .mutate(manager = _.manager.substitute({\"\": \"Unknown\"})) \n",
79
+ " .mutate(manager_type = _.manager_type.substitute({\"\": \"Unknown\"}))\n",
80
+ " .mutate(access_type = _.access_type.substitute({\"\": \"Unknown Access\"}))\n",
81
+ " .mutate(name = _.name.substitute({\"\": \"Unknown\"}))\n",
82
  " .select(_.established, _.reGAP, _.name, _.access_type, _.manager, _.manager_type,\n",
83
  " _.Easement, _.Acres, _.id, _.type, _.geom)\n",
84
  " )\n",
 
 
85
  "\n",
86
+ "ca2024 = ca.execute()\n",
87
+ "ca2024.to_parquet(\"ca2024-30m.parquet\")\n",
88
+ "ca2024.to_file(\"ca2024-30m.geojson\") # tippecanoe can't parse geoparquet :-("
 
 
 
 
 
 
 
 
 
 
 
89
  ]
90
  },
91
  {
 
110
  " repo_id=\"boettiger-lab/ca-30x30\",\n",
111
  " repo_type=\"dataset\",\n",
112
  " )\n",
113
+ " \n",
114
+ "hf_upload(\"ca2024-30m.parquet\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  ]
116
  },
117
  {
118
  "cell_type": "code",
119
  "execution_count": null,
120
  "id": "2df80e1d-6b94-4884-b9f5-d9c23d3ea028",
121
+ "metadata": {
122
+ "editable": true,
123
+ "slideshow": {
124
+ "slide_type": ""
125
+ },
126
+ "tags": []
127
+ },
128
  "outputs": [],
129
  "source": [
130
  "import subprocess\n",
 
153
  " except subprocess.CalledProcessError as e:\n",
154
  " print(f\"Error running Tippecanoe: {e}\")\n",
155
  "\n",
156
+ "generate_pmtiles(\"ca2024-30m.geojson\", \"ca2024-30m-tippe.pmtiles\")\n",
157
+ "hf_upload(\"ca2024-30m-tippe.pmtiles\")"
158
  ]
159
  }
160
  ],
preprocess.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+
4
# DuckDB connection (database file "tmp3") with the spatial extension loaded.
conn = ibis.duckdb.connect("tmp3", extensions=["spatial"])

# Input data. Remote copy:
#   https://data.source.coop/cboettig/ca30x30/ca_areas.parquet
# A local copy is used here:
ca_parquet = "ca_areas.parquet"

# Negative buffer (30m) applied to the 2023 polygons to account for
# overlapping boundaries.
buffer = -30

# Only GAP 1 and GAP 2 count towards 30x30, so the year-over-year comparison
# is restricted to those rows.
tbl = (
    conn.read_parquet(ca_parquet)
    .cast({"SHAPE": "geometry"})
    .rename(geom="SHAPE")
    .filter(_.reGAP < 3)
)

# Polygons with Release_Year 2024 are a superset of Release_Year 2023.
# The anti-join keeps the 2024 polygons that do not intersect any (shrunken)
# 2023 polygon, i.e. the newly established ones.
tbl_2023 = (
    tbl
    .filter(_.Release_Year == 2023)
    .mutate(geom=_.geom.buffer(buffer))
)
tbl_2024 = tbl.filter(_.Release_Year == 2024)
intersects = tbl_2024.anti_join(
    tbl_2023,
    _.geom.intersects(tbl_2023.geom),
)

# IDs of the newly established polygons, tagged with their year, to join on.
new2024 = intersects.select("OBJECTID").mutate(established=2024)

ca = (
    conn.read_parquet(ca_parquet)
    .cast({"SHAPE": "geometry"})
    .mutate(area=_.SHAPE.area())
    # Keeping both 2023 and 2024 is redundant since 2024 is the superset.
    .filter(_.Release_Year == 2024)
    # Rows matched here are the newly established 2024 polygons; every
    # unmatched row is labelled 2023.
    .left_join(new2024, "OBJECTID")
    .mutate(established=_.established.fill_null(2023))
    # Reproject from EPSG:3310 to EPSG:4326.
    .mutate(geom=_.SHAPE.convert("epsg:3310", "epsg:4326"))
    .rename(
        name="cpad_PARK_NAME",
        access_type="cpad_ACCESS_TYP",
        manager="cpad_MNG_AGENCY",
        manager_type="cpad_MNG_AG_LEV",
        id="OBJECTID",
        type="TYPE",
    )
    # Replace empty-string labels with an explicit placeholder.
    .mutate(
        manager=_.manager.substitute({"": "Unknown"}),
        manager_type=_.manager_type.substitute({"": "Unknown"}),
        access_type=_.access_type.substitute({"": "Unknown Access"}),
        name=_.name.substitute({"": "Unknown"}),
    )
    .select(
        _.established,
        _.reGAP,
        _.name,
        _.access_type,
        _.manager,
        _.manager_type,
        _.Easement,
        _.Acres,
        _.id,
        _.type,
        _.geom,
    )
)

# Materialize once, then write both output formats.
ca2024 = ca.execute()
ca2024.to_parquet("ca2024-30m.parquet")
ca2024.to_file("ca2024-30m.geojson")  # tippecanoe can't parse geoparquet :-(
51
+
52
+
53
+ ## Upload to Huggingface
54
+ # https://huggingface.co/datasets/boettiger-lab/ca-30x30/
55
+
56
+ from huggingface_hub import HfApi, login
57
+ import streamlit as st
58
+ login(st.secrets["HF_TOKEN"])
59
+ api = HfApi()
60
+
61
def hf_upload(file):
    """Upload a local file to the ``boettiger-lab/ca-30x30`` dataset repo.

    Uses the module-level ``api`` (an ``HfApi`` instance) created after login.

    Args:
        file: Path of the local file to upload. The same string is used as
            the destination path inside the repository.
    """
    # The upload result was previously bound to an unused local; the call is
    # made purely for its side effect.
    api.upload_file(
        path_or_fileobj=file,
        path_in_repo=file,
        repo_id="boettiger-lab/ca-30x30",
        repo_type="dataset",
    )
68
+ hf_upload("ca2024-30m.parquet")
69
+
70
+
71
+
72
+ import subprocess
73
+ import os
74
+
75
def generate_pmtiles(input_file, output_file, max_zoom=12):
    """Build a PMTiles archive from a vector file by running Tippecanoe.

    Args:
        input_file: Path of the file handed to Tippecanoe as input
            (a GeoJSON file in this script).
        output_file: Path passed to Tippecanoe's ``-o`` option.
        max_zoom: Value passed to Tippecanoe's ``-z`` option.

    Raises:
        RuntimeError: If no ``tippecanoe`` executable is found on PATH.

    A non-zero Tippecanoe exit status is reported with ``print`` and is not
    re-raised, so callers must not assume ``output_file`` exists afterwards.
    """
    # Local import keeps this function self-contained; shutil is stdlib.
    import shutil

    # Look the executable up in-process instead of spawning the external
    # `which` program, which is not available on every platform and writes
    # to stderr on some.
    if shutil.which("tippecanoe") is None:
        raise RuntimeError("Tippecanoe is not installed or not in PATH")

    # Argument list (no shell), so file names need no quoting.
    command = [
        "tippecanoe",
        "-o", output_file,
        "-z", str(max_zoom),
        "--drop-densest-as-needed",
        "--extend-zooms-if-still-dropping",
        "--force",
        input_file,
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running Tippecanoe: {e}")
    else:
        print(f"Successfully generated PMTiles file: {output_file}")
97
+
98
+ generate_pmtiles("ca2024-30m.geojson", "ca2024-30m-tippe.pmtiles")
99
+ hf_upload("ca2024-30m-tippe.pmtiles")