File size: 8,851 Bytes
daec3df
 
 
 
 
 
 
 
 
 
 
 
58f1141
 
daec3df
 
 
 
 
19d03d3
 
daec3df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58f1141
 
 
 
 
 
 
 
 
 
 
 
 
 
daec3df
 
 
 
58f1141
 
 
 
daec3df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# This example does not use a langchain agent.
# The langchain sql chain has knowledge of the database, but doesn't interact with it beyond initialization.
# The output of the sql chain is parsed separately and passed to `duckdb.sql()` by streamlit

import os
# Set WEBSOCKET_TIMEOUT_MS to "300000" in this process's environment before
# streamlit is imported below. The author's own note says this has no effect.
os.environ["WEBSOCKET_TIMEOUT_MS"] = "300000" # no effect

import streamlit as st
import geopandas as gpd
import pandas as pd
from shapely import wkb

# Page-level settings (browser title, icon, wide layout), then the visible page heading.
st.set_page_config(page_title="Explore US Protected Areas", page_icon="🦜", layout="wide")
st.title("Explore US Protected Areas")

## Database connection, reading directly from remote parquet file
from sqlalchemy import create_engine
from langchain.sql_database import SQLDatabase
# SQLAlchemy URI pointing at the DuckDB database file `my.duckdb`.
db_uri = "duckdb:///my.duckdb"
# URLs of the two remote parquet files. NOTE: `stats` is rebound further down
# in this file to the ibis table read from this URL.
stats = "https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/pad-stats.parquet"
groups = "https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/pad-groupings.parquet"
engine = create_engine(db_uri) #connect_args={'read_only': True})
con = engine.connect()
# Install and load DuckDB's `spatial` extension on this connection.
con.execute("install spatial; load spatial;")
# con.execute(f"create or replace view stats as select * from read_parquet('{stats}');").fetchall()
# Copy the remote groupings parquet into a table named `groups`, replacing any existing one.
con.execute(f"create or replace table groups as select * from read_parquet('{groups}');").fetchall()

# LangChain wrapper around the engine; it is handed to create_sql_query_chain later in the script.
db = SQLDatabase(engine, view_support=True)


@st.cache_data
def query_database(response):
    """Execute a SQL statement and return a short preview of its result.

    The statement is run on the module-level connection ``con`` and all rows
    are fetched. The call is wrapped in ``st.cache_data``.

    Parameters
    ----------
    response
        SQL text passed straight to ``con.execute``.

    Returns
    -------
    pandas.DataFrame
        A frame built from the fetched rows, cut down to at most 25 rows.
    """
    fetched_rows = con.execute(response).fetchall()
    preview = pd.DataFrame(fetched_rows)
    return preview.head(25)
     
import ibis
from ibis import _
import ibis.selectors as s
import altair as alt
# ibis connection opened on the `my.duckdb` file.
ibis_con = ibis.duckdb.connect("my.duckdb")
# Rebinds `stats`: from here on it is the ibis table read from the parquet
# location it previously held, not the location itself.
stats = ibis_con.read_parquet(stats)
# Denominator for `percent_protected` in summary_table; per its name this is
# the area of the lower 48 US states in square metres.
us_lower_48_area_m2 = 7.8e+12




def summary_table(stats, query, column):
    """Summarise protected-area statistics for each value of ``column``.

    ``query`` is evaluated through ``ibis_con.table("groups").sql(...)`` after
    every ``;`` has been removed from it. The result is inner-joined to
    ``stats`` on ``row_n``, columns whose name contains ``_right`` are dropped,
    and ``area_square_meters`` is renamed to ``area`` before grouping.

    Parameters
    ----------
    stats
        ibis table providing ``row_n``, ``area_square_meters`` and the metric
        columns aggregated below.
    query
        SQL text selecting rows from the ``groups`` table.
    column
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``percent_protected`` (rounded to 1 decimal),
        ``hectares``, ``n`` and the area-weighted means ``richness``, ``rsr``,
        ``carbon_lost``, ``crop_expansion`` and ``human_impact``.
    """
    statement = query.replace(";", "")
    labelled = ibis_con.table("groups").sql(statement)

    joined = stats.inner_join(labelled, "row_n")
    without_duplicates = joined.select(~s.contains("_right"))
    with_area = without_duplicates.rename(area="area_square_meters")

    def area_weighted_mean(values):
        # Mean of `values` weighted by each row's area.
        return (values * _.area).sum() / _.area.sum()

    # Insertion order of this mapping fixes the column order of the result.
    metrics = {
        "percent_protected": 100 * _.area.sum() / us_lower_48_area_m2,
        "hectares": _.area.sum() / 10000,
        "n": _.area.count(),
        "richness": area_weighted_mean(_.richness),
        "rsr": area_weighted_mean(_.rsr),
        "carbon_lost": area_weighted_mean(_.deforest_carbon),
        "crop_expansion": area_weighted_mean(_.crop_expansion),
        "human_impact": area_weighted_mean(_.human_impact),
    }

    summary = with_area.group_by(_[column]).aggregate(**metrics)
    rounded = summary.mutate(percent_protected=_.percent_protected.round(1))
    return rounded.to_pandas()

def area_plot(df, column):
    """Draw a ring chart of ``percent_protected`` per group and show it in Streamlit.

    Parameters
    ----------
    df
        Data holding a ``percent_protected`` column and the grouping column.
    column
        Name of the grouping column; used as a nominal colour channel with
        the legend switched off.

    Returns
    -------
    Whatever ``st.altair_chart`` returns for the layered chart.
    """
    chart = alt.Chart(df)
    angle = alt.Theta("percent_protected:Q").stack(True)
    fill = alt.Color(column+":N").legend(None)
    base = chart.encode(angle, fill)

    # Ring itself, plus the percentage labels placed outside its outer radius.
    ring = base.mark_arc(innerRadius=40, outerRadius=80)
    labels = base.mark_text(radius=120, size=20).encode(text="percent_protected:Q")

    return st.altair_chart(ring + labels)

def bar_chart(df, x, y):
    """Build an Altair bar chart of ``y`` against ``x``.

    Bars are coloured by ``x`` with the legend switched off. The chart is
    sized to its container's width with a fixed height of 300.

    Parameters
    ----------
    df
        Data to plot.
    x
        Field used for the x axis and for the bar colour.
    y
        Field used for the y axis.

    Returns
    -------
    The configured Altair chart; the caller is responsible for rendering it.
    """
    bars = alt.Chart(df).mark_bar()
    fill = alt.Color(x).legend(None)
    encoded = bars.encode(x=x, y=y, color=fill)
    return encoded.properties(width="container", height=300)


## ChatGPT Connection
from langchain_openai import ChatOpenAI 
from langchain_community.llms import Ollama
# from langchain_community.llms import ChatOllama

# Models offered in the sidebar. Every entry is constructed when the script
# runs; the OpenAI key is read from Streamlit secrets.
models = {"chatgpt3.5": ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])}

# Additional model definitions. They are constructed here as well, but nothing
# in the visible part of this file refers to `other_models`.
other_models = {
          "chatgpt4": ChatOpenAI(model="gpt-4", temperature=0, api_key=st.secrets["OPENAI_API_KEY"]),
          "duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
          "command-r-plus": Ollama(model="command-r-plus", temperature=0),
          "mixtral:8x22b":  Ollama(model="mixtral:8x22b", temperature=0),
          "wizardlm2:8x22b":  Ollama(model="wizardlm2:8x22b", temperature=0),
          "sqlcoder": Ollama(model="sqlcoder", temperature=0),
          "zephyr": Ollama(model="zephyr", temperature=0),
          "gemma:7b": Ollama(model="gemma:7b", temperature=0),
          "codegemma": Ollama(model="codegemma", temperature=0),
          "llama2": Ollama(model="llama2", temperature=0),
         }

with st.sidebar:
    # The selected value is used as a key into `models` on the next line.
    choice = st.radio("Select an LLM:", models)
    llm = models[choice]
    # Name of the column the summaries are grouped by; defaults to "labels".
    column = st.text_input("grouping column", "labels")


## A SQL Chain
from langchain.chains import create_sql_query_chain
# Chain built from the selected LLM and the database wrapper; its output is
# used below as the SQL text passed to summary_table.
chain = create_sql_query_chain(llm, db)


main = st.container()

## Does not preserve history
with main:

    # Bare string expression holding the page's introductory markdown
    # (presumably displayed through Streamlit's handling of bare expressions).
    '''
    The US [recently announced](https://www.conservation.gov/pages/america-the-beautiful-initiative) the first-ever national goal to conserve at least 30 percent of our lands and waters by the year 2030.
    But which 30%? 

    Protected areas span a range of "GAP" areas [indicating the degree of protection](https://www.protectedlands.net/uses-of-pad-us/#conservation-of-biodiversity-2).  Protected areas include not only owned or "fee"-based parcels such as National Parks and Monuments,
    but also "easements" (see [feature classes](https://www.protectedlands.net/pad-us-technical-how-tos/#feature-classes-in-pad-us-2))

    - GAP 1: Managed for biodiversity with natural disturbance events allowed (for example, Wilderness, Research Natural Areas, some National Parks, some State or NGO Nature Preserves)
    - GAP 2: Managed for biodiversity with management that may interfere with natural processes (for example, suppress wildfire or flood)
    - GAP 3: Permanent protection, but the land is subject to multiple uses (forestry, farming, intensive recreation, etc.
    - GAP 4: No known institutional mandates to prevent conversion of natural habitat types

    Use the chat tool below to specify your own groupings of the data and see how they compare.

    ##### Try these example queries:

    - gap 1, 2, 3 are labelled 'conserved lands' and gap 4 is labeled 'other'
    - exclude gap 4, include only Federal manager types, labelled by manager_name
    - label gap 1, 2 as "permanently protected", label gap 3 as "additional conserved area", and gap 4 as other
    - label gap 1, 2 areas in category ="Easements" as "protected easements", gap 1,2 category="Fee" as "protected areas", gap 3 easements as "mixed use easements", gap 3 Fee as "mixed use lands". exclude gap 4.






    '''
    
    # Fixed instructions wrapped around the user's text before it is sent to the chain.
    prefix = "construct a select query that creates a column called 'labels' that only contains rows that meet the following criteria:"
    suffix = ". Do not use LIMIT. Always return all columns. Do not try to select specific columns."

    st.markdown("Specify how data should be labelled, as in the examples above:")
    chatbox = st.container()
    with chatbox:           
        # Everything below runs only when the chat input yields a truthy prompt.
        if prompt := st.chat_input(key="chain"):
            st.chat_message("user").write(prompt)
            with st.chat_message("assistant"):
                # Show the chain's output, then use that same text as the query for the summary.
                response = chain.invoke({"question": prefix + prompt + suffix})
                st.write(response)
                df = summary_table(stats, response, column)
                
                # First row of three columns: total coverage + area plot, richness, range-size rarity.
                with st.container():
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        total_percent = df.percent_protected.sum().round(1)
                        f"{total_percent}% Continental US Covered"
                        area_plot(df, column)

                    with col2:
                        "Species Richness"
                        st.altair_chart(bar_chart(df, column, "richness"), use_container_width=True)

                    with col3:
                        "Range-Size Rarity"
                        st.altair_chart(bar_chart(df, column, "rsr"), use_container_width=True)

                # Second row of three columns: carbon lost, crop expansion, human impact.
                with st.container():
                    col1b, col2b, col3b = st.columns(3)
                    with col1b:
                        "Carbon Lost ('02-'22)"
                        st.altair_chart(bar_chart(df, column, "carbon_lost"), use_container_width=True)

                    with col2b:
                        "Crop expansion"
                        st.altair_chart(bar_chart(df, column, "crop_expansion"), use_container_width=True)

                    with col3b:
                        "Human Impact"
                        st.altair_chart(bar_chart(df, column, "human_impact"), use_container_width=True)
                        

                    # Still inside the second container: a divider, then the full summary table.
                    st.divider()
                    st.dataframe(df)





                
                
#st.divider()
#with st.container():
#    st.text("Database schema (top 3 rows)")
#    tbl = tbl = query_database("select * from groups limit 3")
#    st.dataframe(tbl)


# Page footer: a divider followed by a bare string with author and data-source credits.
st.divider()

'''
Experimental prototype.  

- Author: [Carl Boettiger](https://carlboettiger.info)
- For data sources and processing, see: https://beta.source.coop/repositories/cboettig/pad-us-3/description/


'''