cboettig committed on
Commit
dd06071
·
1 Parent(s): fb4c9f4

refactor app.py into sections

Browse files
Files changed (5) hide show
  1. app/app.py +20 -596
  2. app/footer.md +22 -0
  3. app/system_prompt.txt +140 -0
  4. app/utils.py +271 -0
  5. app/variables.py +154 -0
app/app.py CHANGED
@@ -13,75 +13,30 @@ from shapely import wkb
13
  import sqlalchemy
14
  import pathlib
15
  from typing import Optional
 
16
 
17
- # urls for main layer
18
- ca_pmtiles = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cpad-stats.pmtiles"
19
- ca_parquet = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cpad-stats.parquet"
20
- #ca_parquet = "cpad-stats.parquet" #local copy is faster
21
-
22
- ca_area_acres = 1.014e8 #acres
23
- style_choice = "GAP Status Code"
24
-
25
-
26
 
27
 
28
- ## Create the engine
29
- cwd = pathlib.Path.cwd()
30
- connect_args = {'preload_extensions':['spatial']}
31
- eng = sqlalchemy.create_engine(f"duckdb:///{cwd}/duck.db",connect_args = connect_args)
32
 
33
  # Create the duckdb connection directly from the sqlalchemy engine instead.
34
- # Not as elegant as `ibis.duckdb.connect()` but shares connection with sqlalchemy.
35
- con = ibis.duckdb.from_connection(eng.raw_connection())
 
 
 
 
36
 
37
  ## Create the table from remote parquet only if it doesn't already exist on disk
 
 
38
  current_tables = con.list_tables()
39
  if "mydata" not in set(current_tables):
40
  tbl = con.read_parquet(ca_parquet)
41
  con.create_table("mydata", tbl)
42
-
43
  ca = con.table("mydata")
44
 
45
- # urls for additional data layers
46
- url_sr = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/species-richness-ca/{z}/{x}/{y}.png"
47
- url_rsr = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/range-size-rarity/{z}/{x}/{y}.png"
48
- url_irr_carbon = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/ca_irrecoverable_c_2018_cog.tif"
49
- url_man_carbon = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/ca_manageable_c_2018_cog.tif"
50
- url_svi = "https://data.source.coop/cboettig/social-vulnerability/svi2020_us_county.pmtiles"
51
- url_justice40 = "https://data.source.coop/cboettig/justice40/disadvantaged-communities.pmtiles"
52
- url_loss_carbon = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/deforest-carbon-ca/{z}/{x}/{y}.png"
53
- url_hi = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/ca_human_impact_cog.tif"
54
- url_calfire = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cal_fire_2022.pmtiles"
55
- url_rxburn = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cal_rxburn_2022.pmtiles"
56
-
57
- # colors for plotting
58
- private_access_color = "#DE881E" # orange
59
- public_access_color = "#3388ff" # blue
60
- tribal_color = "#BF40BF" # purple
61
- mixed_color = "#005a00" # green
62
- year2023_color = "#26542C" # green
63
- year2024_color = "#F3AB3D" # orange
64
- federal_color = "#529642" # green
65
- state_color = "#A1B03D" # light green
66
- local_color = "#365591" # blue
67
- special_color = "#0096FF" # blue
68
- private_color = "#7A3F1A" # brown
69
- joint_color = "#DAB0AE" # light pink
70
- county_color = "#DE3163" # magenta
71
- city_color = "#ADD8E6" #light blue
72
- hoa_color = "#A89BBC" # purple
73
- nonprofit_color = "#D77031" #orange
74
- justice40_color = "#00008B" #dark blue
75
- svi_color = "#1bc7c3" #cyan
76
- white = "#FFFFFF"
77
-
78
-
79
- # gap codes 3 and 4 are off by default.
80
- default_gap = {
81
- 3: False,
82
- 4: False,
83
- }
84
-
85
 
86
  for key in [
87
  'richness', 'rsr', 'irrecoverable_carbon', 'manageable_carbon',
@@ -92,332 +47,7 @@ for key in [
92
  ]:
93
  if key not in st.session_state:
94
  st.session_state[key] = False
95
-
96
-
97
-
98
- from functools import reduce
99
-
100
- def get_summary(ca, combined_filter, column, colors=None): #summary stats, based on filtered data
101
- df = ca.filter(combined_filter)
102
- df = (df
103
- .group_by(*column) # unpack the list for grouping
104
- .aggregate(percent_protected=100 * _.acres.sum() / ca_area_acres,
105
- mean_richness = (_.richness * _.acres).sum() / _.acres.sum(),
106
- mean_rsr = (_.rsr * _.acres).sum() / _.acres.sum(),
107
- mean_irrecoverable_carbon = (_.irrecoverable_carbon * _.acres).sum() / _.acres.sum(),
108
- mean_manageable_carbon = (_.manageable_carbon * _.acres).sum() / _.acres.sum(),
109
- mean_percent_fire_10yr = (_.percent_fire_10yr *_.acres).sum()/_.acres.sum(),
110
- mean_percent_rxburn_10yr = (_.percent_rxburn_10yr *_.acres).sum()/_.acres.sum(),
111
- mean_percent_disadvantaged = (_.percent_disadvantaged * _.acres).sum() / _.acres.sum(),
112
- mean_svi = (_.svi * _.acres).sum() / _.acres.sum(),
113
- mean_svi_socioeconomic_status = (_.svi_socioeconomic_status * _.acres).sum() / _.acres.sum(),
114
- mean_svi_household_char = (_.svi_household_char * _.acres).sum() / _.acres.sum(),
115
- mean_svi_racial_ethnic_minority = (_.svi_racial_ethnic_minority * _.acres).sum() / _.acres.sum(),
116
- mean_svi_housing_transit = (_.svi_housing_transit * _.acres).sum() / _.acres.sum(),
117
- mean_carbon_lost = (_.deforest_carbon * _.acres).sum() / _.acres.sum(),
118
- mean_human_impact = (_.human_impact * _.acres).sum() / _.acres.sum(),
119
- )
120
- .mutate(percent_protected=_.percent_protected.round(1))
121
- )
122
- if colors is not None and not colors.empty: #only the df will have colors, df_tab doesn't since we are printing it.
123
- df = df.inner_join(colors, column)
124
- df = df.cast({col: "string" for col in column})
125
- df = df.to_pandas()
126
- return df
127
-
128
-
129
- def summary_table(column, colors, filter_cols, filter_vals,colorby_vals): # get df for charts + df_tab for printed table
130
- filters = []
131
- if filter_cols and filter_vals: #if a filter is selected, add to list of filters
132
- for filter_col, filter_val in zip(filter_cols, filter_vals):
133
- if len(filter_val) > 1:
134
- filters.append(getattr(_, filter_col).isin(filter_val))
135
- else:
136
- filters.append(getattr(_, filter_col) == filter_val[0])
137
- if column not in filter_cols: #show color_by column in table by adding it as a filter (if it's not already a filter)
138
- filter_cols.append(column)
139
- filters.append(getattr(_, column).isin(colorby_vals[column]))
140
- combined_filter = reduce(lambda x, y: x & y, filters) #combining all the filters into ibis filter expression
141
- df = get_summary(ca, combined_filter, [column], colors) # df used for charts
142
- df_tab = get_summary(ca, combined_filter, filter_cols, colors = None) #df used for printed table
143
- return df, df_tab
144
-
145
-
146
-
147
- def area_plot(df, column): #percent protected pie chart
148
- base = alt.Chart(df).encode(
149
- alt.Theta("percent_protected:Q").stack(True),
150
- )
151
- pie = ( base
152
- .mark_arc(innerRadius= 40, outerRadius=100)
153
- .encode(alt.Color("color:N").scale(None).legend(None),
154
- tooltip=['percent_protected', column])
155
- )
156
- text = ( base
157
- .mark_text(radius=80, size=14, color="white")
158
- .encode(text = column + ":N")
159
- )
160
- plot = pie # pie + text
161
- return plot.properties(width="container", height=290)
162
-
163
-
164
- def bar_chart(df, x, y, title): #display summary stats for color_by column
165
-
166
- #axis label angles / chart size
167
- if x == "manager_type": #labels are too long, making vertical
168
- angle = 270
169
- height = 373
170
- else: #other labels are horizontal
171
- angle = 0
172
- height = 310
173
-
174
- # order of bars
175
- if x == "established": # order labels in chronological order, not alphabetic.
176
- sort = '-x'
177
- elif x == "access_type": #order based on levels of openness
178
- sort=['Open', 'Restricted', 'No Public', "Unknown"]
179
- elif x == "manager_type":
180
- sort = ["Federal","Tribal","State","Special District", "County", "City", "HOA","Joint","Non Profit","Private","Unknown"]
181
- else:
182
- sort = 'x'
183
-
184
- x_title = next(key for key, value in select_column.items() if value == x)
185
- chart = alt.Chart(df).mark_bar().transform_calculate(
186
- access_label=f"replace(datum.{x}, ' Access', '')" #omit access from access_type labels so it fits in frame
187
- ).encode(
188
- x=alt.X("access_label:N",
189
- axis=alt.Axis(labelAngle=angle, title=x_title),
190
- sort=sort),
191
- y=alt.Y(y, axis=alt.Axis()),
192
- color=alt.Color('color').scale(None)
193
- ).properties(width="container", height=height, title = title
194
- )
195
- # sizing for poster
196
- # ).configure_title(
197
- # fontSize=40
198
- # ).configure_axis(
199
- # labelFontSize=24,
200
- # titleFontSize=34
201
- # )
202
- return chart
203
-
204
-
205
-
206
- def getButtons(style_options, style_choice, default_gap=None): #finding the buttons selected to use as filters
207
- column = style_options[style_choice]['property']
208
- opts = [style[0] for style in style_options[style_choice]['stops']]
209
- default_gap = default_gap or {}
210
- buttons = {
211
- name: st.checkbox(f"{name}", value=default_gap.get(name, True), key=column + str(name))
212
- for name in opts
213
- }
214
- filter_choice = [key for key, value in buttons.items() if value] # return only selected
215
- d = {}
216
- d[column] = filter_choice
217
- return d
218
-
219
-
220
-
221
- def getColorVals(style_options, style_choice):
222
- #df_tab only includes filters selected, we need to manually add "color_by" column (if it's not already a filter).
223
- column = style_options[style_choice]['property']
224
- opts = [style[0] for style in style_options[style_choice]['stops']]
225
- d = {}
226
- d[column] = opts
227
- return d
228
 
229
- manager = {
230
- 'property': 'manager_type',
231
- 'type': 'categorical',
232
- 'stops': [
233
- ['Federal', federal_color],
234
- ['State', state_color],
235
- ['Non Profit', nonprofit_color],
236
- ['Special District', special_color],
237
- ['Unknown', "#bbbbbb"],
238
- ['County', county_color],
239
- ['City', city_color],
240
- ['Joint', joint_color],
241
- ['Tribal', tribal_color],
242
- ['Private', private_color],
243
- ['HOA', hoa_color]
244
- ]
245
- }
246
-
247
- easement = {
248
- 'property': 'easement',
249
- 'type': 'categorical',
250
- 'stops': [
251
- ['True', private_access_color],
252
- ['False', public_access_color]
253
- ]
254
- }
255
-
256
- year = {
257
- 'property': 'established',
258
- 'type': 'categorical',
259
- 'stops': [
260
- ['pre-2024', year2023_color],
261
- ['2024', year2024_color]
262
- ]
263
- }
264
-
265
- access = {
266
- 'property': 'access_type',
267
- 'type': 'categorical',
268
- 'stops': [
269
- ['Open Access', public_access_color],
270
- ['No Public Access', private_access_color],
271
- ['Unknown Access', "#bbbbbb"],
272
- ['Restricted Access', tribal_color]
273
- ]
274
- }
275
-
276
- gap = {
277
- 'property': 'reGAP',
278
- 'type': 'categorical',
279
- 'stops': [
280
- [1, "#26633d"],
281
- [2, "#879647"],
282
- [3, "#EE4B2B"],
283
- [4, "#BF40BF"]
284
- ]
285
- }
286
-
287
- style_options = {
288
- "Year": year,
289
- "GAP Status Code": gap,
290
- "Manager Type": manager,
291
- "Easement": easement,
292
- "Access Type": access,
293
- }
294
-
295
- justice40_fill = {
296
- 'property': 'Disadvan',
297
- 'type': 'categorical',
298
- 'stops': [
299
- [0, white],
300
- [1, justice40_color]
301
- ]
302
- }
303
-
304
- justice40_style = {
305
- "version": 8,
306
- "sources": {
307
- "source1": {
308
- "type": "vector",
309
- "url": "pmtiles://" + url_justice40,
310
- "attribution": "Justice40"
311
- }
312
- },
313
- "layers": [
314
- {
315
- "id": "layer1",
316
- "source": "source1",
317
- "source-layer": "DisadvantagedCommunitiesCEJST",
318
- "filter": ["match", ["get", "StateName"], "California", True, False],
319
- "type": "fill",
320
- "paint": {
321
- "fill-color": justice40_fill,
322
- }
323
- }
324
- ]
325
- }
326
-
327
- def fire_style(layer):
328
- return {"version": 8,
329
- "sources": {
330
- "source1": {
331
- "type": "vector",
332
- "url": "pmtiles://" + url_calfire,
333
- "attribution": "CAL FIRE"
334
- }
335
- },
336
- "layers": [
337
- {
338
- "id": "fire",
339
- "source": "source1",
340
- "source-layer": layer,
341
- "type": "fill",
342
- "paint": {
343
- "fill-color": "#D22B2B",
344
- }
345
- }
346
- ]
347
- }
348
- def rx_style(layer):
349
- return{
350
- "version": 8,
351
- "sources": {
352
- "source2": {
353
- "type": "vector",
354
- "url": "pmtiles://" + url_rxburn,
355
- "attribution": "CAL FIRE"
356
- }
357
- },
358
- "layers": [
359
- {
360
- "id": "fire",
361
- "source": "source2",
362
- "source-layer": layer,
363
- # "filter": [">=", ["get", "YEAR_"], year],
364
- "type": "fill",
365
- "paint": {
366
- "fill-color": "#702963",
367
- }
368
- }
369
- ]
370
- }
371
-
372
- def get_sv_style(column):
373
- return {
374
- "layers": [
375
- {
376
- "id": "SVI",
377
- "source": column, #need different "source" for multiple pmtiles layers w/ same file
378
- "source-layer": "SVI2020_US_county",
379
- "filter": ["match", ["get", "STATE"], "California", True, False],
380
- "type": "fill",
381
- "paint": {
382
- "fill-color": [
383
- "interpolate", ["linear"], ["get", column],
384
- 0, white,
385
- 1, svi_color
386
- ]
387
- }
388
- }
389
- ]
390
- }
391
-
392
-
393
- def get_pmtiles_style(paint, alpha, filter_cols, filter_vals):
394
- filters = []
395
- for col, val in zip(filter_cols, filter_vals):
396
- filters.append(["match", ["get", col], val, True, False])
397
- combined_filters = ["all"] + filters
398
- style = {
399
- "version": 8,
400
- "sources": {
401
- "ca": {
402
- "type": "vector",
403
- "url": "pmtiles://" + ca_pmtiles,
404
- }
405
- },
406
- "layers": [
407
- {
408
- "id": "ca30x30",
409
- "source": "ca",
410
- "source-layer": "layer",
411
- "type": "fill",
412
- "filter": combined_filters,
413
- "paint": {
414
- "fill-color": paint,
415
- "fill-opacity": alpha
416
- }
417
- }
418
- ]
419
- }
420
- return style
421
 
422
  st.set_page_config(layout="wide", page_title="CA Protected Areas Explorer", page_icon=":globe:")
423
 
@@ -509,38 +139,9 @@ m = leafmap.Map(style="positron")
509
  #############
510
 
511
 
512
- def get_pmtiles_style_llm(paint, ids):
513
- combined_filters = ["all", ["match", ["get", "id"], ids, True, False]]
514
- style = {
515
- "version": 8,
516
- "sources": {
517
- "ca": {
518
- "type": "vector",
519
- "url": "pmtiles://" + ca_pmtiles,
520
- }
521
- },
522
- "layers": [
523
- {
524
- "id": "ca30x30",
525
- "source": "ca",
526
- "source-layer": "layer",
527
- "type": "fill",
528
- "filter": combined_filters,
529
- "paint": {
530
- "fill-color": paint,
531
- "fill-opacity": 1,
532
- # "fill-extrusion-height": 1000
533
- }
534
- }
535
- ]
536
- }
537
- return style
538
 
539
- ##### Chatbot stuff
540
 
541
- # langchain can also talk to this connection and see the table:
542
- from langchain_community.utilities import SQLDatabase
543
- db = SQLDatabase(eng, view_support=True)
544
 
545
 
546
  from pydantic import BaseModel, Field
@@ -549,148 +150,8 @@ class SQLResponse(BaseModel):
549
  sql_query: str = Field(description="The SQL query generated by the assistant.")
550
  explanation: str = Field(description="A detailed explanation of how the SQL query answers the input question.")
551
 
552
-
553
- from langchain.chains import create_sql_query_chain
554
- template = '''You are an expert in SQL and an assistant for mapping and analyzing California land data. Given an input question, create a syntactically correct {dialect} query to run, and then provide an explanation of how you answered the input question.
555
-
556
- For example:
557
- {{
558
- "sql_query": "SELECT * FROM my_table WHERE condition = 'value';",
559
- "explanation": "This query retrieves all rows from my_table where the condition column equals 'value'."
560
- }}
561
-
562
- Ensure the response contains only this JSON object, with no additional text, formatting, or commentary.
563
-
564
- # Important Details
565
-
566
- - For map-related queries (e.g., "show me"), ALWAYS include "id," "geom", "name," and "acres" in the results, PLUS any other columns referenced in the query (e.g., in conditions, calculations, or subqueries). This output structure is MANDATORY for all map-related queries.
567
- - ONLY use LIMIT in your SQL queries if the user specifies a quantity (e.g., 'show me 5'). Otherwise, return all matching data without a limit.
568
- - Wrap each column name in double quotes (") to denote them as delimited identifiers.
569
- - Pay attention to use only the column names you can see in the tables below. DO NOT query for columns that do not exist.
570
- - If the query mentions "biodiversity" without specifying a column, default to using "richness" (species richness). Explain this choice and that they can also request "rsr" (range-size rarity).
571
- - If the query mentions carbon without specifying a column, use "irrecoverable carbon". Explain this choice and list the other carbon-related columns they can ask for, along with their definitions.
572
- - If the query asks about the manager, use the "manager" column. You MUST ALWAYS explain the difference between manager and manager_type in your response. Clarify that "manager" refers to the name of the managing entity (e.g., an agency), while "manager_type" specifies the type of jurisdiction (e.g., Federal, State, Non Profit). Also, let the user know they can include "manager_type" in their query if they want to refine their results.
573
- - If the user's query is unclear, DO NOT make assumptions. Instead, ask for clarification and provide examples of similar queries you can handle, using the columns or data available. You MUST ONLY deliver accurate results.
574
- - If you are mapping the data, explicitly state that the data is being visualized on a map. ALWAYS include a statement encouraging the user to examine the queried data below the map, as some areas may be too small at the current zoom level.
575
- - Users may not be familiar with this data, so your explanation should be short, clear, and easily understandable. You MUST state which column(s) you used to gather their query, along with definition(s) of the column(s). Do NOT explain SQL commands.
576
- - If the prompt is unrelated to the California dataset, provide examples of relevant queries that you can answer.
577
-
578
- # Example Questions and How to Approach Them
579
-
580
- ## Example:
581
- example_user: "Show me all non-profit land."
582
- example_assistant: {{"sql_query":
583
- SELECT id, geom, name, acres
584
- FROM mydata
585
- WHERE "manager_type" = "Non Profit";
586
- "explanation":"I selected all data where `manager_type` is 'Non Profit'."
587
- }}
588
-
589
- ## Example:
590
- example_user: "Which gap code has been impacted the most by fire?"
591
- example_assistant: {{"sql_query":
592
- SELECT "reGAP", SUM("percent_fire_10yr") AS temp
593
- FROM mydata
594
- GROUP BY "reGAP"
595
- ORDER BY temp DESC
596
- LIMIT 1;
597
- "explanation":"I used the `percent_fire_10yr` column, which shows the percentage of each area burned over the past 10 years (2013–2022), summing it for each GAP code to find the one with the highest total fire impact."
598
- }}
599
-
600
- ## Example:
601
- example_user: "Who manages the land with the worst biodiversity and highest SVI?"
602
- example_assistant: {{"sql_query":
603
- SELECT manager,richness, svi
604
- FROM mydata
605
- GROUP BY "manager"
606
- ORDER BY richness ASC, svi DESC
607
- LIMIT 1;
608
- "explanation": "I identified the land manager with the worst biodiversity and highest Social Vulnerability Index (SVI) by analyzing the columns: `richness`, which measures species richness, and `svi`, which represents social vulnerability based on factors like socioeconomic status, household characteristics, racial & ethnic minority status, and housing & transportation.
609
-
610
- I sorted the data by richness in ascending order (worst biodiversity first) and svi in descending order (highest vulnerability). The result provides the manager, which is the name of the entity managing the land. Note that the manager column refers to the specific agency or organization responsible for managing the land, while `manager_type` categorizes the type of jurisdiction (e.g., Federal, State, Non Profit)."
611
- }}
612
-
613
-
614
- ## Example:
615
- example_user: "Show me the biggest protected area"
616
- example_assistant: {{"sql_query":
617
- SELECT "id", "geom", "name", "acres", "manager", "manager_type", "acres"
618
- FROM mydata
619
- ORDER BY "acres" DESC
620
- LIMIT 1;
621
- "explanation": "I identified the biggest protected area by sorting the data in descending order based on the `acres` column, which represents the size of each area."
622
-
623
- ## Example:
624
- example_user: "Show me the 50 most biodiverse areas found in disadvantaged communities."
625
- example_assistant: {{"sql_query":
626
- SELECT "id", "geom", "name", "acres", "richness", "percent_disadvantaged" FROM mydata
627
- WHERE "percent_disadvantaged" > 0
628
- ORDER BY "richness" DESC
629
- LIMIT 50;
630
- "explanation": "I used the `richness` column to measure biodiversity and the `percent_disadvantaged` column to identify areas located in disadvantaged communities. The `percent_disadvantaged` value is derived from the Justice40 initiative, which identifies communities burdened by systemic inequities and vulnerabilities across multiple domains, including climate resilience, energy access, health disparities, housing affordability, pollution exposure, transportation infrastructure, water quality, and workforce opportunities.
631
-
632
- The results are sorted in descending order by biodiversity richness (highest biodiversity first), and only areas with a `percent_disadvantaged` value greater than 0 (indicating some portion of the area overlaps with a disadvantaged community) are included."
633
- }}
634
-
635
-
636
- ## Example:
637
- example_user: "Show me federally managed gap 3 lands that are in the top 5% of biodiversity richness and have experienced forest fire over at least 50% of their area"
638
- sql_query:
639
- WITH temp_tab AS (
640
- SELECT PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "richness") AS temp
641
- FROM mydata
642
- )
643
- SELECT "id", "geom", "name", "acres","richness", "reGAP"
644
- FROM mydata
645
- WHERE "reGAP" = 3
646
- AND "percent_fire_10yr" >= 0.5
647
- and "manager_type" = "Federal"
648
- AND "richness" > (SELECT temp FROM temp_tab);
649
-
650
-
651
- ## Example:
652
- example_user: "What is the total acreage of areas designated as easements?"
653
- sql_query:
654
- SELECT SUM("acres") AS total_acres
655
- FROM mydata
656
- WHERE "easement" = "True";
657
-
658
-
659
- # Detailed Explanation of the Columns in the California Dataset
660
- - "established": The time range which the land was acquired, either "2024" or "pre-2024".
661
- - "reGAP": The GAP status code; corresponds to the level of protection the area has. There are 4 gap codes and are defined as the following.
662
- Status 1: Permanently protected to maintain a natural state, allowing natural disturbances or mimicking them through management.
663
- Status 2: Permanently protected but may allow some uses or management practices that degrade natural communities or suppress natural disturbances.
664
- Status 3: Permanently protected from major land cover conversion but allows some extractive uses (e.g., logging, mining) and protects federally listed species.
665
- Status 4: No protection mandates; land may be converted to unnatural habitat types or its management intent is unknown.
666
-
667
- - "name": The name of a protected area. The user may use a shortened name and/or not capitalize it. For example, "redwoods" may refer to "Redwood National Park", or "klamath" refers to "Klamath National Forest". Another example, "san diego wildlife refuge" could refer to multiple areas, so you would use "WHERE LOWER("name") LIKE '%san diego%' AND LOWER("name") LIKE '%wildlife%' AND LOWER("name") LIKE '%refuge%';" in your SQL query, to ensure that it is case-insensitive and matches any record that includes our phrases, because we don't want to overlook a match. If the name isn't capitalized, you MUST ensure the search is case-insensitive by converting "name" to lowercase.
668
- The names of the largest parks are {names}.
669
- - "access_type": Level of access to the land: "Unknown Access","Restricted Access","No Public Access" and "Open Access".
670
- - "manager": The name of land manager for the area. Also referred to as the agency name. These are the manager names: {managers}. Users might use acronyms or could omit "United States" in the agency name, make sure to use the name used in the table. Some examples: "BLM" or "Bureau of Land Management" refers to the "United States Bureau of Land Management" or "CDFW" is "California Department of Fish and Wildlife". Similar to the "name" field, you can search for managers using "LIKE" in the SQL query.
671
- - "manager_type": The jurisdiction of the land manager: "Federal","State","Non Profit","Special District","Unknown","County","City","Joint","Tribal","Private","HOA". If the user says "non-profit", do not use a hyphen in your query.
672
- - "easement": Boolean value; whether or not the land is an easement.
673
- - "acres": Land acreage; measures the size of the area.
674
- - "id": unique id for each area. This is necessary for displaying queried results on a map.
675
- - "type": Physical type of area, either "Land" or "Water".
676
- - "richness": Species richness; higher values indicate better biodiversity.
677
- - "rsr": Range-size rarity; higher values indicate better rarity metrics.
678
- - "svi": Social Vulnerability Index based on 4 themes: socioeconomic status, household characteristics, racial & ethnic minority status, and housing & transportation. Higher values indicate greater vulnerability.
679
- - Themes:
680
- - "svi_socioeconomic_status": Poverty, unemployment, housing cost burden, education, and health insurance.
681
- - "svi_household_char": Age, disability, single-parent households, and language proficiency.
682
- - "svi_racial_ethnic_minority": Race and ethnicity variables.
683
- - "svi_housing_transit": Housing type, crowding, vehicles, and group quarters.
684
- - "percent_disadvantaged": Justice40-defined disadvantaged communities overburdened by climate, energy, health, housing, pollution, transportation, water, and workforce factors. Higher values indicate more disadvantage. Range is between 0 and 1.
685
- - "deforest_carbon": Carbon emissions due to deforestation.
686
- - "human_impact": A score representing the human footprint: cumulative anthropogenic impacts such as land cover change, population density, and infrastructure.
687
- - "percent_fire_10yr": The percentage of the area burned by fires from (2013-2022). Range is between 0 and 1.
688
- - "percent_rxburn_10yr": The percentage of the area affected by prescribed burns from (2013-2022). Range is between 0 and 1.
689
-
690
- Only use the following tables:
691
- {table_info}.
692
-
693
- Question: {input}'''
694
 
695
  from langchain_openai import ChatOpenAI
696
  # os.environ["OPENAI_API_KEY"] = st.secrets["LITELLM_KEY"]
@@ -759,15 +220,13 @@ def run_sql(query,color_choice):
759
 
760
 
761
 
762
- def summary_table_sql(column, colors, ids): # get df for charts + df_tab for printed table
763
  filters = [_.id.isin(ids)]
764
  combined_filter = reduce(lambda x, y: x & y, filters) #combining all the filters into ibis filter expression
765
  df = get_summary(ca, combined_filter, [column], colors) # df used for charts
766
  return df
767
 
768
 
769
-
770
-
771
  chatbot_toggles = {key: False for key in [
772
  'richness', 'rsr', 'irrecoverable_carbon', 'manageable_carbon',
773
  'percent_fire_10yr', 'percent_rxburn_10yr', 'percent_disadvantaged',
@@ -959,13 +418,6 @@ if 'out' not in locals():
959
  m.add_pmtiles(ca_pmtiles, style=style, name="CA", opacity=alpha, tooltip=True, fit_bounds = True)
960
 
961
 
962
- select_column = {
963
- "Year": "established",
964
- "GAP Status Code": "reGAP",
965
- "Manager Type": "manager_type",
966
- "Easement": "easement",
967
- "Access Type": "access_type",
968
- }
969
 
970
  column = select_column[color_choice]
971
 
@@ -986,9 +438,9 @@ colors = (
986
  # get summary tables used for charts + printed table
987
  # df - charts; df_tab - printed table (omits colors)
988
  if 'out' not in locals():
989
- df,df_tab = summary_table(column, colors, filter_cols, filter_vals, colorby_vals)
990
  else:
991
- df = summary_table_sql(column, colors, ids)
992
 
993
  total_percent = df.percent_protected.sum().round(2)
994
 
@@ -1086,11 +538,6 @@ with main:
1086
 
1087
 
1088
 
1089
- #########
1090
-
1091
-
1092
- footer = st.container()
1093
-
1094
 
1095
 
1096
  st.caption("***The label 'established' is inferred from the California Protected Areas Database, which may introduce artifacts. For details on our methodology, please refer to our code: https://github.com/boettiger-lab/ca-30x30.")
@@ -1101,31 +548,8 @@ st.caption("***Under California’s 30x30 framework, only GAP codes 1 and 2 are
1101
 
1102
  st.divider()
1103
 
 
 
 
1104
 
1105
 
1106
- '''
1107
- ## Credits
1108
- Authors: Cassie Buhler & Carl Boettiger, UC Berkeley
1109
- License: BSD-2-clause
1110
-
1111
- Data: https://huggingface.co/datasets/boettiger-lab/ca-30x30
1112
-
1113
- ### Data sources
1114
- - CA Nature Terrestrial 30x30 Conserved Areas map layer by CA Nature. Data: https://www.californianature.ca.gov/datasets/CAnature::30x30-conserved-areas-terrestrial-2024/about. License: Public Domain
1115
-
1116
- - Imperiled Species Richness and Range-Size-Rarity from NatureServe (2022). Data: https://beta.source.coop/repositories/cboettig/mobi. License CC-BY-NC-ND
1117
-
1118
- - Irrecoverable Carbon from Conservation International, reprocessed to COG on https://beta.source.coop/cboettig/carbon, citation: https://doi.org/10.1038/s41893-021-00803-6, License: CC-BY-NC
1119
-
1120
- - Fire polygons by CAL FIRE (2022), reprocessed to PMTiles on https://beta.source.coop/cboettig/fire/. License: Public Domain
1121
-
1122
- - Climate and Economic Justice Screening Tool, US Council on Environmental Quality, Justice40. Description: https://screeningtool.geoplatform.gov/en/methodology#3/33.47/-97.5. Data: https://beta.source.coop/repositories/cboettig/justice40/description/, License: Public Domain
1123
-
1124
- - CDC 2020 Social Vulnerability Index by US Census Tract. Description: https://www.atsdr.cdc.gov/place-health/php/svi/index.html. Data: https://source.coop/repositories/cboettig/social-vulnerability/description. License: Public Domain
1125
-
1126
- - Carbon-loss by Vizzuality, on https://beta.source.coop/repositories/vizzuality/lg-land-carbon-data. Citation: https://doi.org/10.1101/2023.11.01.565036, License: CC-BY
1127
-
1128
- - Human Footprint by Vizzuality, on https://beta.source.coop/repositories/vizzuality/hfp-100. Citation: https://doi.org/10.3389/frsen.2023.1130896, License: Public Domain
1129
-
1130
- '''
1131
-
 
13
  import sqlalchemy
14
  import pathlib
15
  from typing import Optional
16
+ from functools import reduce
17
 
18
+ from variables import *
19
+ from utils import *
 
 
 
 
 
 
 
20
 
21
 
 
 
 
 
22
 
23
  # Create the duckdb connection directly from the sqlalchemy engine instead.
24
+ # Not as elegant as `ibis.duckdb.connect()` but shares connection with sqlalchemy.
25
+ ## Create the engine
26
+ #cwd = pathlib.Path.cwd()
27
+ #connect_args = {'preload_extensions':['spatial']}
28
+ #eng = sqlalchemy.create_engine(f"duckdb:///{cwd}/duck.db",connect_args = connect_args)
29
+ #con = ibis.duckdb.from_connection(eng.raw_connection())
30
 
31
  ## Create the table from remote parquet only if it doesn't already exist on disk
32
+
33
+ con = ibis.duckdb.connect(extensions=["spatial"])
34
  current_tables = con.list_tables()
35
  if "mydata" not in set(current_tables):
36
  tbl = con.read_parquet(ca_parquet)
37
  con.create_table("mydata", tbl)
 
38
  ca = con.table("mydata")
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  for key in [
42
  'richness', 'rsr', 'irrecoverable_carbon', 'manageable_carbon',
 
47
  ]:
48
  if key not in st.session_state:
49
  st.session_state[key] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  st.set_page_config(layout="wide", page_title="CA Protected Areas Explorer", page_icon=":globe:")
53
 
 
139
  #############
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
 
143
 
144
+ ##### Chatbot stuff
 
 
145
 
146
 
147
  from pydantic import BaseModel, Field
 
150
  sql_query: str = Field(description="The SQL query generated by the assistant.")
151
  explanation: str = Field(description="A detailed explanation of how the SQL query answers the input question.")
152
 
153
+ with open('system_prompt.txt', 'r') as file:
154
+ template = file.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  from langchain_openai import ChatOpenAI
157
  # os.environ["OPENAI_API_KEY"] = st.secrets["LITELLM_KEY"]
 
220
 
221
 
222
 
223
def summary_table_sql(ca, column, colors, ids):
    """Return the chart summary for the rows whose ``id`` is listed in ``ids``.

    Args:
        ca: ibis table passed straight through to ``get_summary``.
        column: name of the single column to group by.
        colors: colour lookup forwarded to ``get_summary``.
        ids: collection of ``id`` values that selects the rows to summarise.

    Returns:
        Whatever ``get_summary`` returns for that selection (the dataframe
        used for the charts). No printed-table dataframe is produced here.
    """
    # Only one condition is needed, so it is used directly as the filter.
    id_filter = _.id.isin(ids)
    return get_summary(ca, id_filter, [column], colors)
228
 
229
 
 
 
230
  chatbot_toggles = {key: False for key in [
231
  'richness', 'rsr', 'irrecoverable_carbon', 'manageable_carbon',
232
  'percent_fire_10yr', 'percent_rxburn_10yr', 'percent_disadvantaged',
 
418
  m.add_pmtiles(ca_pmtiles, style=style, name="CA", opacity=alpha, tooltip=True, fit_bounds = True)
419
 
420
 
 
 
 
 
 
 
 
421
 
422
  column = select_column[color_choice]
423
 
 
438
  # get summary tables used for charts + printed table
439
  # df - charts; df_tab - printed table (omits colors)
440
  if 'out' not in locals():
441
+ df,df_tab = summary_table(ca, column, colors, filter_cols, filter_vals, colorby_vals)
442
  else:
443
+ df = summary_table_sql(ca, column, colors, ids)
444
 
445
  total_percent = df.percent_protected.sum().round(2)
446
 
 
538
 
539
 
540
 
 
 
 
 
 
541
 
542
 
543
  st.caption("***The label 'established' is inferred from the California Protected Areas Database, which may introduce artifacts. For details on our methodology, please refer to our code: https://github.com/boettiger-lab/ca-30x30.")
 
548
 
549
  st.divider()
550
 
551
+ with open('footer.md', 'r') as file:
552
+ footer = file.read()
553
+ st.markdown(footer)
554
 
555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/footer.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Credits
2
+ Authors: Cassie Buhler & Carl Boettiger, UC Berkeley
3
+ License: BSD-2-clause
4
+
5
+ Data: https://huggingface.co/datasets/boettiger-lab/ca-30x30
6
+
7
+ ### Data sources
8
+ - CA Nature Terrestrial 30x30 Conserved Areas map layer by CA Nature. Data: https://www.californianature.ca.gov/datasets/CAnature::30x30-conserved-areas-terrestrial-2024/about. License: Public Domain
9
+
10
+ - Imperiled Species Richness and Range-Size-Rarity from NatureServe (2022). Data: https://beta.source.coop/repositories/cboettig/mobi. License: CC-BY-NC-ND
11
+
12
+ - Irrecoverable Carbon from Conservation International, reprocessed to COG on https://beta.source.coop/cboettig/carbon, citation: https://doi.org/10.1038/s41893-021-00803-6, License: CC-BY-NC
13
+
14
+ - Fire polygons by CAL FIRE (2022), reprocessed to PMTiles on https://beta.source.coop/cboettig/fire/. License: Public Domain
15
+
16
+ - Climate and Economic Justice Screening Tool, US Council on Environmental Quality, Justice40. Description: https://screeningtool.geoplatform.gov/en/methodology#3/33.47/-97.5. Data: https://beta.source.coop/repositories/cboettig/justice40/description/, License: Public Domain
17
+
18
+ - CDC 2020 Social Vulnerability Index by US Census Tract. Description: https://www.atsdr.cdc.gov/place-health/php/svi/index.html. Data: https://source.coop/repositories/cboettig/social-vulnerability/description. License: Public Domain
19
+
20
+ - Carbon-loss by Vizzuality, on https://beta.source.coop/repositories/vizzuality/lg-land-carbon-data. Citation: https://doi.org/10.1101/2023.11.01.565036, License: CC-BY
21
+
22
+ - Human Footprint by Vizzuality, on https://beta.source.coop/repositories/vizzuality/hfp-100. Citation: https://doi.org/10.3389/frsen.2023.1130896, License: Public Domain
app/system_prompt.txt ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert in SQL and an assistant for mapping and analyzing California land data. Given an input question, create a syntactically correct {dialect} query to run, and then provide an explanation of how you answered the input question.
2
+
3
+ For example:
4
+ {{
5
+ "sql_query": "SELECT * FROM my_table WHERE condition = 'value';",
6
+ "explanation": "This query retrieves all rows from my_table where the condition column equals 'value'."
7
+ }}
8
+
9
+ Ensure the response contains only this JSON object, with no additional text, formatting, or commentary.
10
+
11
+ # Important Details
12
+
13
+ - For map-related queries (e.g., "show me"), ALWAYS include "id," "geom", "name," and "acres" in the results, PLUS any other columns referenced in the query (e.g., in conditions, calculations, or subqueries). This output structure is MANDATORY for all map-related queries.
14
+ - ONLY use LIMIT in your SQL queries if the user specifies a quantity (e.g., 'show me 5'). Otherwise, return all matching data without a limit.
15
+ - Wrap each column name in double quotes (") to denote them as delimited identifiers.
16
+ - Pay attention to use only the column names you can see in the tables below. DO NOT query for columns that do not exist.
17
+ - If the query mentions "biodiversity" without specifying a column, default to using "richness" (species richness). Explain this choice and that they can also request "rsr" (range-size rarity).
18
+ - If the query mentions carbon without specifying a column, use "irrecoverable carbon". Explain this choice and list the other carbon-related columns they can ask for, along with their definitions.
19
+ - If the query asks about the manager, use the "manager" column. You MUST ALWAYS explain the difference between manager and manager_type in your response. Clarify that "manager" refers to the name of the managing entity (e.g., an agency), while "manager_type" specifies the type of jurisdiction (e.g., Federal, State, Non Profit). Also, let the user know they can include "manager_type" in their query if they want to refine their results.
20
+ - If the user's query is unclear, DO NOT make assumptions. Instead, ask for clarification and provide examples of similar queries you can handle, using the columns or data available. You MUST ONLY deliver accurate results.
21
+ - If you are mapping the data, explicitly state that the data is being visualized on a map. ALWAYS include a statement encouraging the user to examine the queried data below the map, as some areas may be too small at the current zoom level.
22
+ - Users may not be familiar with this data, so your explanation should be short, clear, and easily understandable. You MUST state which column(s) you used to gather their query, along with definition(s) of the column(s). Do NOT explain SQL commands.
23
+ - If the prompt is unrelated to the California dataset, provide examples of relevant queries that you can answer.
24
+
25
+ # Example Questions and How to Approach Them
26
+
27
+ ## Example:
28
+ example_user: "Show me all non-profit land."
29
+ example_assistant: {{"sql_query":
30
+ SELECT id, geom, name, acres
31
+ FROM mydata
32
+ WHERE "manager_type" = 'Non Profit';
33
+ "explanation":"I selected all data where `manager_type` is 'Non Profit'."
34
+ }}
35
+
36
+ ## Example:
37
+ example_user: "Which gap code has been impacted the most by fire?"
38
+ example_assistant: {{"sql_query":
39
+ SELECT "reGAP", SUM("percent_fire_10yr") AS temp
40
+ FROM mydata
41
+ GROUP BY "reGAP"
42
+ ORDER BY temp DESC
43
+ LIMIT 1;
44
+ "explanation":"I used the `percent_fire_10yr` column, which shows the percentage of each area burned over the past 10 years (2013–2022), summing it for each GAP code to find the one with the highest total fire impact."
45
+ }}
46
+
47
+ ## Example:
48
+ example_user: "Who manages the land with the worst biodiversity and highest SVI?"
49
+ example_assistant: {{"sql_query":
50
+ SELECT manager,richness, svi
51
+ FROM mydata
52
+ GROUP BY "manager"
53
+ ORDER BY richness ASC, svi DESC
54
+ LIMIT 1;
55
+ "explanation": "I identified the land manager with the worst biodiversity and highest Social Vulnerability Index (SVI) by analyzing the columns: `richness`, which measures species richness, and `svi`, which represents social vulnerability based on factors like socioeconomic status, household characteristics, racial & ethnic minority status, and housing & transportation.
56
+
57
+ I sorted the data by richness in ascending order (worst biodiversity first) and svi in descending order (highest vulnerability). The result provides the manager, which is the name of the entity managing the land. Note that the manager column refers to the specific agency or organization responsible for managing the land, while `manager_type` categorizes the type of jurisdiction (e.g., Federal, State, Non Profit)."
58
+ }}
59
+
60
+
61
+ ## Example:
62
+ example_user: "Show me the biggest protected area"
63
+ example_assistant: {{"sql_query":
64
+ SELECT "id", "geom", "name", "acres", "manager", "manager_type"
65
+ FROM mydata
66
+ ORDER BY "acres" DESC
67
+ LIMIT 1;
68
+ "explanation": "I identified the biggest protected area by sorting the data in descending order based on the `acres` column, which represents the size of each area."
+ }}
69
+
70
+ ## Example:
71
+ example_user: "Show me the 50 most biodiverse areas found in disadvantaged communities."
72
+ example_assistant: {{"sql_query":
73
+ SELECT "id", "geom", "name", "acres", "richness", "percent_disadvantaged" FROM mydata
74
+ WHERE "percent_disadvantaged" > 0
75
+ ORDER BY "richness" DESC
76
+ LIMIT 50;
77
+ "explanation": "I used the `richness` column to measure biodiversity and the `percent_disadvantaged` column to identify areas located in disadvantaged communities. The `percent_disadvantaged` value is derived from the Justice40 initiative, which identifies communities burdened by systemic inequities and vulnerabilities across multiple domains, including climate resilience, energy access, health disparities, housing affordability, pollution exposure, transportation infrastructure, water quality, and workforce opportunities.
78
+
79
+ The results are sorted in descending order by biodiversity richness (highest biodiversity first), and only areas with a `percent_disadvantaged` value greater than 0 (indicating some portion of the area overlaps with a disadvantaged community) are included."
80
+ }}
81
+
82
+
83
+ ## Example:
84
+ example_user: "Show me federally managed gap 3 lands that are in the top 5% of biodiversity richness and have experienced forest fire over at least 50% of their area"
85
+ sql_query:
86
+ WITH temp_tab AS (
87
+ SELECT PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "richness") AS temp
88
+ FROM mydata
89
+ )
90
+ SELECT "id", "geom", "name", "acres","richness", "reGAP"
91
+ FROM mydata
92
+ WHERE "reGAP" = 3
93
+ AND "percent_fire_10yr" >= 0.5
94
+ AND "manager_type" = 'Federal'
95
+ AND "richness" > (SELECT temp FROM temp_tab);
96
+
97
+
98
+ ## Example:
99
+ example_user: "What is the total acreage of areas designated as easements?"
100
+ sql_query:
101
+ SELECT SUM("acres") AS total_acres
102
+ FROM mydata
103
+ WHERE "easement" = 'True';
104
+
105
+
106
+ # Detailed Explanation of the Columns in the California Dataset
107
+ - "established": The time range which the land was acquired, either "2024" or "pre-2024".
108
+ - "reGAP": The GAP status code; corresponds to the level of protection the area has. There are 4 gap codes and are defined as the following.
109
+ Status 1: Permanently protected to maintain a natural state, allowing natural disturbances or mimicking them through management.
110
+ Status 2: Permanently protected but may allow some uses or management practices that degrade natural communities or suppress natural disturbances.
111
+ Status 3: Permanently protected from major land cover conversion but allows some extractive uses (e.g., logging, mining) and protects federally listed species.
112
+ Status 4: No protection mandates; land may be converted to unnatural habitat types or its management intent is unknown.
113
+
114
+ - "name": The name of a protected area. The user may use a shortened name and/or not capitalize it. For example, "redwoods" may refer to "Redwood National Park", or "klamath" refers to "Klamath National Forest". Another example, "san diego wildlife refuge" could refer to multiple areas, so you would use "WHERE LOWER("name") LIKE '%san diego%' AND LOWER("name") LIKE '%wildlife%' AND LOWER("name") LIKE '%refuge%';" in your SQL query, to ensure that it is case-insensitive and matches any record that includes our phrases, because we don't want to overlook a match. If the name isn't capitalized, you MUST ensure the search is case-insensitive by converting "name" to lowercase.
115
+ The names of the largest parks are {names}.
116
+ - "access_type": Level of access to the land: "Unknown Access","Restricted Access","No Public Access" and "Open Access".
117
+ - "manager": The name of land manager for the area. Also referred to as the agency name. These are the manager names: {managers}. Users might use acronyms or could omit "United States" in the agency name, make sure to use the name used in the table. Some examples: "BLM" or "Bureau of Land Management" refers to the "United States Bureau of Land Management" or "CDFW" is "California Department of Fish and Wildlife". Similar to the "name" field, you can search for managers using "LIKE" in the SQL query.
118
+ - "manager_type": The jurisdiction of the land manager: "Federal","State","Non Profit","Special District","Unknown","County","City","Joint","Tribal","Private","HOA". If the user says "non-profit", do not use a hyphen in your query.
119
+ - "easement": Boolean value; whether or not the land is an easement.
120
+ - "acres": Land acreage; measures the size of the area.
121
+ - "id": unique id for each area. This is necessary for displaying queried results on a map.
122
+ - "type": Physical type of area, either "Land" or "Water".
123
+ - "richness": Species richness; higher values indicate better biodiversity.
124
+ - "rsr": Range-size rarity; higher values indicate better rarity metrics.
125
+ - "svi": Social Vulnerability Index based on 4 themes: socioeconomic status, household characteristics, racial & ethnic minority status, and housing & transportation. Higher values indicate greater vulnerability.
126
+ - Themes:
127
+ - "svi_socioeconomic_status": Poverty, unemployment, housing cost burden, education, and health insurance.
128
+ - "svi_household_char": Age, disability, single-parent households, and language proficiency.
129
+ - "svi_racial_ethnic_minority": Race and ethnicity variables.
130
+ - "svi_housing_transit": Housing type, crowding, vehicles, and group quarters.
131
+ - "percent_disadvantaged": Justice40-defined disadvantaged communities overburdened by climate, energy, health, housing, pollution, transportation, water, and workforce factors. Higher values indicate more disadvantage. Range is between 0 and 1.
132
+ - "deforest_carbon": Carbon emissions due to deforestation.
133
+ - "human_impact": A score representing the human footprint: cumulative anthropogenic impacts such as land cover change, population density, and infrastructure.
134
+ - "percent_fire_10yr": The percentage of the area burned by fires from (2013-2022). Range is between 0 and 1.
135
+ - "percent_rxburn_10yr": The percentage of the area affected by prescribed burns from (2013-2022). Range is between 0 and 1.
136
+
137
+ Only use the following tables:
138
+ {table_info}.
139
+
140
+ Question: {input}
app/utils.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import streamlit.components.v1 as components
3
+ import base64
4
+ import leafmap.maplibregl as leafmap
5
+ import altair as alt
6
+ import ibis
7
+ from ibis import _
8
+ import ibis.selectors as s
9
+ import os
10
+ import pandas as pd
11
+ import geopandas as gpd
12
+ from shapely import wkb
13
+ import sqlalchemy
14
+ import pathlib
15
+ from typing import Optional
16
+ from functools import reduce
17
+
18
+ from variables import *
19
+
20
def get_summary(ca, combined_filter, column, colors=None):
    """Aggregate the filtered table into per-group summary statistics.

    Args:
        ca: ibis table holding one row per area.
        combined_filter: ibis filter expression applied before grouping.
        column: list of column names to group by (unpacked into ``group_by``).
        colors: optional lookup joined onto the result on ``column``. It is
            only joined when it is not ``None`` and its ``.empty`` attribute is
            false (presumably a pandas DataFrame; confirm against the caller).

    Returns:
        A pandas DataFrame with the grouping columns cast to string,
        ``percent_protected`` (group acres as a percentage of
        ``ca_area_acres``, rounded to 1 decimal) and one acre-weighted mean
        per metric listed below.
    """
    # output column -> source column; every entry is an acre-weighted mean
    weighted_means = {
        "mean_richness": "richness",
        "mean_rsr": "rsr",
        "mean_irrecoverable_carbon": "irrecoverable_carbon",
        "mean_manageable_carbon": "manageable_carbon",
        "mean_percent_fire_10yr": "percent_fire_10yr",
        "mean_percent_rxburn_10yr": "percent_rxburn_10yr",
        "mean_percent_disadvantaged": "percent_disadvantaged",
        "mean_svi": "svi",
        "mean_svi_socioeconomic_status": "svi_socioeconomic_status",
        "mean_svi_household_char": "svi_household_char",
        "mean_svi_racial_ethnic_minority": "svi_racial_ethnic_minority",
        "mean_svi_housing_transit": "svi_housing_transit",
        "mean_carbon_lost": "deforest_carbon",
        "mean_human_impact": "human_impact",
    }

    group_acres = _.acres.sum()
    metrics = {"percent_protected": 100 * group_acres / ca_area_acres}
    for out_name, source_col in weighted_means.items():
        metrics[out_name] = (getattr(_, source_col) * _.acres).sum() / group_acres

    summary = (
        ca.filter(combined_filter)
        .group_by(*column)
        .aggregate(**metrics)
        .mutate(percent_protected=_.percent_protected.round(1))
    )

    # Only the chart dataframe carries colours; the printed table passes None.
    if colors is not None and not colors.empty:
        summary = summary.inner_join(colors, column)

    summary = summary.cast({name: "string" for name in column})
    return summary.to_pandas()
47
+
48
+
49
def summary_table(ca, column, colors, filter_cols, filter_vals, colorby_vals):
    """Build the chart summary and the printed-table summary.

    Args:
        ca: ibis table forwarded to ``get_summary``.
        column: name of the colour-by column.
        colors: colour lookup used for the chart dataframe only.
        filter_cols: names of the columns being filtered on (may be empty or
            ``None``). The caller's list is NOT modified.
        filter_vals: for each entry of ``filter_cols``, the list of accepted
            values.
        colorby_vals: mapping from ``column`` to all of its possible values;
            used when ``column`` is not already one of the filters.

    Returns:
        ``(df, df_tab)``: ``df`` is grouped by ``column`` alone and includes
        colours (for charts); ``df_tab`` is grouped by every filter column
        plus ``column`` and has no colours (for the printed table).
    """
    # Work on a private copy: the original appended to the caller's list,
    # a hidden side effect, and failed outright when filter_cols was None.
    group_cols = list(filter_cols or [])

    filters = []
    if group_cols and filter_vals:  # a filter is selected
        for filter_col, filter_val in zip(group_cols, filter_vals):
            if len(filter_val) > 1:
                filters.append(getattr(_, filter_col).isin(filter_val))
            else:
                filters.append(getattr(_, filter_col) == filter_val[0])

    # Make sure the colour-by column shows up in the printed table by adding
    # it as a (non-restrictive) filter when it is not already one.
    if column not in group_cols:
        group_cols.append(column)
        filters.append(getattr(_, column).isin(colorby_vals[column]))

    # AND all conditions together into a single ibis filter expression.
    combined_filter = reduce(lambda x, y: x & y, filters)

    df = get_summary(ca, combined_filter, [column], colors)  # charts
    df_tab = get_summary(ca, combined_filter, group_cols, colors=None)  # printed table
    return df, df_tab
64
+
65
+
66
+
67
def area_plot(df, column):
    """Draw the percent-protected donut chart.

    Args:
        df: dataframe with ``percent_protected`` and ``color`` columns plus
            the column named by ``column``.
        column: name of the category column shown in the tooltip.

    Returns:
        An Altair chart sized to its container, 290 px high.
    """
    base = alt.Chart(df).encode(
        alt.Theta("percent_protected:Q").stack(True),
    )
    # scale(None): the values in `color` are used as-is rather than mapped
    # through a colour scale; the legend is switched off.
    pie = base.mark_arc(innerRadius=40, outerRadius=100).encode(
        alt.Color("color:N").scale(None).legend(None),
        tooltip=['percent_protected', column],
    )
    # Only the arcs are drawn. The text-label layer that used to be built
    # here was never added to the returned chart, so it has been removed.
    return pie.properties(width="container", height=290)
82
+
83
+
84
def bar_chart(df, x, y, title):
    """Bar chart of summary statistic ``y`` for each value of column ``x``.

    Args:
        df: dataframe containing column ``x``, the field referenced by ``y``
            and a ``color`` column.
        x: name of the category column; must be a value of ``select_column``.
        y: Altair shorthand for the y encoding.
        title: chart title.

    Returns:
        An Altair bar chart sized to its container.

    Raises:
        StopIteration: if no entry of ``select_column`` maps to ``x``.
    """
    # manager_type labels are long: draw them vertically on a taller chart.
    angle, height = (270, 373) if x == "manager_type" else (0, 310)

    # Bar order per column; anything not listed falls back to sorting by x.
    bar_order = {
        "established": '-x',  # descending x instead of the default order
        "access_type": ['Open', 'Restricted', 'No Public', "Unknown"],  # by openness
        "manager_type": ["Federal","Tribal","State","Special District", "County", "City", "HOA","Joint","Non Profit","Private","Unknown"],
    }
    sort = bar_order.get(x, 'x')

    # Axis title is the display label whose column name is x.
    x_title = next(label for label, col in select_column.items() if col == x)

    # Strip ' Access' from the category text so access_type labels fit.
    bars = alt.Chart(df).mark_bar().transform_calculate(
        access_label=f"replace(datum.{x}, ' Access', '')"
    )
    x_enc = alt.X("access_label:N",
                  axis=alt.Axis(labelAngle=angle, title=x_title),
                  sort=sort)
    y_enc = alt.Y(y, axis=alt.Axis())
    color_enc = alt.Color('color').scale(None)

    chart = bars.encode(x=x_enc, y=y_enc, color=color_enc).properties(
        width="container", height=height, title=title
    )
    return chart
123
+
124
+
125
+
126
def getButtons(style_options, style_choice, default_gap=None):
    """Render one checkbox per style stop and report which ones are ticked.

    Args:
        style_options: mapping of style name to a dict with ``'property'``
            (column name) and ``'stops'`` (sequence whose items start with
            the option value).
        style_choice: key into ``style_options``.
        default_gap: optional mapping of option value to its initial checked
            state; options missing from it start checked.

    Returns:
        ``{column: [selected option values]}``.
    """
    entry = style_options[style_choice]
    column = entry['property']
    names = [stop[0] for stop in entry['stops']]
    initial = default_gap or {}

    states = {}
    for name in names:
        # The widget key combines column and option so it is unique per checkbox.
        states[name] = st.checkbox(f"{name}", value=initial.get(name, True), key=column + str(name))

    selected = [name for name, checked in states.items() if checked]
    return {column: selected}
138
+
139
+
140
+
141
def getColorVals(style_options, style_choice):
    """Return every possible value of the colour-by column.

    The printed table only includes the selected filters, so the colour-by
    column has to be added separately when it is not already a filter; this
    supplies its full list of values.

    Args:
        style_options: mapping of style name to a dict with ``'property'``
            (column name) and ``'stops'`` (sequence whose items start with
            the option value).
        style_choice: key into ``style_options``.

    Returns:
        ``{column: [first element of each stop]}``.
    """
    entry = style_options[style_choice]
    return {entry['property']: [stop[0] for stop in entry['stops']]}
148
+
149
+
150
+
151
def fire_style(layer):
    """Return the map style dict for the fire PMTiles layer.

    Args:
        layer: name of the source layer inside the PMTiles archive.

    Returns:
        A version-8 style with one vector source (``url_calfire``) and one
        fill layer coloured ``#D22B2B``.
    """
    source = {
        "type": "vector",
        "url": "pmtiles://" + url_calfire,
        "attribution": "CAL FIRE",
    }
    fill_layer = {
        "id": "fire",
        "source": "source1",
        "source-layer": layer,
        "type": "fill",
        "paint": {
            "fill-color": "#D22B2B",
        },
    }
    return {
        "version": 8,
        "sources": {"source1": source},
        "layers": [fill_layer],
    }
172
def rx_style(layer):
    """Return the map style dict for the prescribed-burn PMTiles layer.

    Args:
        layer: name of the source layer inside the PMTiles archive.

    Returns:
        A version-8 style with one vector source (``url_rxburn``) and one
        fill layer coloured ``#702963``.
    """
    source = {
        "type": "vector",
        "url": "pmtiles://" + url_rxburn,
        "attribution": "CAL FIRE",
    }
    fill_layer = {
        "id": "fire",
        "source": "source2",
        "source-layer": layer,
        # "filter": [">=", ["get", "YEAR_"], year],
        "type": "fill",
        "paint": {
            "fill-color": "#702963",
        },
    }
    return {
        "version": 8,
        "sources": {"source2": source},
        "layers": [fill_layer],
    }
195
+
196
def get_sv_style(column):
    """Return the layer style for one Social Vulnerability Index column.

    Args:
        column: SVI attribute to colour by. It is also used as the layer's
            ``source`` name, because several layers drawn from the same
            PMTiles file each need a different source.

    Returns:
        A dict with a single fill layer, restricted to features whose
        ``STATE`` is ``"California"``, coloured by linear interpolation from
        ``white`` (value 0) to ``svi_color`` (value 1).
    """
    color_ramp = [
        "interpolate", ["linear"], ["get", column],
        0, white,
        1, svi_color,
    ]
    svi_layer = {
        "id": "SVI",
        "source": column,
        "source-layer": "SVI2020_US_county",
        "filter": ["match", ["get", "STATE"], "California", True, False],
        "type": "fill",
        "paint": {"fill-color": color_ramp},
    }
    return {"layers": [svi_layer]}
215
+
216
+
217
def get_pmtiles_style(paint, alpha, filter_cols, filter_vals):
    """Return the map style for the main layer with the chosen filters applied.

    Args:
        paint: value used for the layer's ``fill-color``.
        alpha: value used for the layer's ``fill-opacity``.
        filter_cols: feature properties to filter on.
        filter_vals: for each property, the accepted value(s); paired with
            ``filter_cols`` by position.

    Returns:
        A version-8 style with one vector source (``ca_pmtiles``) and one fill
        layer that shows only features matching every filter.
    """
    # One "match" expression per (property, accepted values) pair.
    matchers = [
        ["match", ["get", col], val, True, False]
        for col, val in zip(filter_cols, filter_vals)
    ]
    fill_layer = {
        "id": "ca30x30",
        "source": "ca",
        "source-layer": "layer",
        "type": "fill",
        "filter": ["all", *matchers],
        "paint": {
            "fill-color": paint,
            "fill-opacity": alpha,
        },
    }
    return {
        "version": 8,
        "sources": {
            "ca": {
                "type": "vector",
                "url": "pmtiles://" + ca_pmtiles,
            }
        },
        "layers": [fill_layer],
    }
245
+
246
def get_pmtiles_style_llm(paint, ids):
    """Return the map style showing only the features whose ``id`` is in ``ids``.

    Args:
        paint: value used for the layer's ``fill-color``.
        ids: accepted ``id`` value(s) placed in the "match" expression.

    Returns:
        A version-8 style with one vector source (``ca_pmtiles``) and one
        fully opaque fill layer filtered on ``id``.
    """
    id_match = ["match", ["get", "id"], ids, True, False]
    fill_layer = {
        "id": "ca30x30",
        "source": "ca",
        "source-layer": "layer",
        "type": "fill",
        "filter": ["all", id_match],
        "paint": {
            "fill-color": paint,
            "fill-opacity": 1,
            # "fill-extrusion-height": 1000
        },
    }
    return {
        "version": 8,
        "sources": {
            "ca": {
                "type": "vector",
                "url": "pmtiles://" + ca_pmtiles,
            }
        },
        "layers": [fill_layer],
    }
app/variables.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # urls for main layer
2
+ ca_pmtiles = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cpad-stats.pmtiles"
3
+ ca_parquet = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cpad-stats.parquet"
4
+
5
+ ca_area_acres = 1.014e8 #acres
6
+ style_choice = "GAP Status Code"
7
+
8
+
9
+ # urls for additional data layers
10
+ url_sr = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/species-richness-ca/{z}/{x}/{y}.png"
11
+ url_rsr = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/range-size-rarity/{z}/{x}/{y}.png"
12
+ url_irr_carbon = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/ca_irrecoverable_c_2018_cog.tif"
13
+ url_man_carbon = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/ca_manageable_c_2018_cog.tif"
14
+ url_svi = "https://data.source.coop/cboettig/social-vulnerability/svi2020_us_county.pmtiles"
15
+ url_justice40 = "https://data.source.coop/cboettig/justice40/disadvantaged-communities.pmtiles"
16
+ url_loss_carbon = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/deforest-carbon-ca/{z}/{x}/{y}.png"
17
+ url_hi = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/ca_human_impact_cog.tif"
18
+ url_calfire = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cal_fire_2022.pmtiles"
19
+ url_rxburn = "https://huggingface.co/datasets/boettiger-lab/ca-30x30/resolve/main/cal_rxburn_2022.pmtiles"
20
+
21
+ # colors for plotting
22
+ private_access_color = "#DE881E" # orange
23
+ public_access_color = "#3388ff" # blue
24
+ tribal_color = "#BF40BF" # purple
25
+ mixed_color = "#005a00" # green
26
+ year2023_color = "#26542C" # green
27
+ year2024_color = "#F3AB3D" # orange
28
+ federal_color = "#529642" # green
29
+ state_color = "#A1B03D" # light green
30
+ local_color = "#365591" # blue
31
+ special_color = "#0096FF" # blue
32
+ private_color = "#7A3F1A" # brown
33
+ joint_color = "#DAB0AE" # light pink
34
+ county_color = "#DE3163" # magenta
35
+ city_color = "#ADD8E6" #light blue
36
+ hoa_color = "#A89BBC" # purple
37
+ nonprofit_color = "#D77031" #orange
38
+ justice40_color = "#00008B" #purple
39
+ svi_color = "#1bc7c3" #cyan
40
+ white = "#FFFFFF"
41
+
42
+ # gap codes 3 and 4 are off by default.
43
+ default_gap = {
44
+ 3: False,
45
+ 4: False,
46
+ }
47
+
48
+ # Maplibre styles. (should these be functions?)
49
+ manager = {
50
+ 'property': 'manager_type',
51
+ 'type': 'categorical',
52
+ 'stops': [
53
+ ['Federal', federal_color],
54
+ ['State', state_color],
55
+ ['Non Profit', nonprofit_color],
56
+ ['Special District', special_color],
57
+ ['Unknown', "#bbbbbb"],
58
+ ['County', county_color],
59
+ ['City', city_color],
60
+ ['Joint', joint_color],
61
+ ['Tribal', tribal_color],
62
+ ['Private', private_color],
63
+ ['HOA', hoa_color]
64
+ ]
65
+ }
66
+
67
+ easement = {
68
+ 'property': 'easement',
69
+ 'type': 'categorical',
70
+ 'stops': [
71
+ ['True', private_access_color],
72
+ ['False', public_access_color]
73
+ ]
74
+ }
75
+
76
+ year = {
77
+ 'property': 'established',
78
+ 'type': 'categorical',
79
+ 'stops': [
80
+ ['pre-2024', year2023_color],
81
+ ['2024', year2024_color]
82
+ ]
83
+ }
84
+
85
+ access = {
86
+ 'property': 'access_type',
87
+ 'type': 'categorical',
88
+ 'stops': [
89
+ ['Open Access', public_access_color],
90
+ ['No Public Access', private_access_color],
91
+ ['Unknown Access', "#bbbbbb"],
92
+ ['Restricted Access', tribal_color]
93
+ ]
94
+ }
95
+
96
+ gap = {
97
+ 'property': 'reGAP',
98
+ 'type': 'categorical',
99
+ 'stops': [
100
+ [1, "#26633d"],
101
+ [2, "#879647"],
102
+ [3, "#EE4B2B"],
103
+ [4, "#BF40BF"]
104
+ ]
105
+ }
106
+
107
+ style_options = {
108
+ "Year": year,
109
+ "GAP Status Code": gap,
110
+ "Manager Type": manager,
111
+ "Easement": easement,
112
+ "Access Type": access,
113
+ }
114
+
115
+ justice40_fill = {
116
+ 'property': 'Disadvan',
117
+ 'type': 'categorical',
118
+ 'stops': [
119
+ [0, white],
120
+ [1, justice40_color]
121
+ ]
122
+ }
123
+
124
+ justice40_style = {
125
+ "version": 8,
126
+ "sources": {
127
+ "source1": {
128
+ "type": "vector",
129
+ "url": "pmtiles://" + url_justice40,
130
+ "attribution": "Justice40"
131
+ }
132
+ },
133
+ "layers": [
134
+ {
135
+ "id": "layer1",
136
+ "source": "source1",
137
+ "source-layer": "DisadvantagedCommunitiesCEJST",
138
+ "filter": ["match", ["get", "StateName"], "California", True, False],
139
+ "type": "fill",
140
+ "paint": {
141
+ "fill-color": justice40_fill,
142
+ }
143
+ }
144
+ ]
145
+ }
146
+
147
+ select_column = {
148
+ "Year": "established",
149
+ "GAP Status Code": "reGAP",
150
+ "Manager Type": "manager_type",
151
+ "Easement": "easement",
152
+ "Access Type": "access_type",
153
+ }
154
+