# This example does not use a langchain agent.
# The langchain sql chain has knowledge of the database, but doesn't interact with it beyond initialization.
# The output of the sql chain is parsed separately and passed to duckdb (via ibis) by streamlit.
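# Flow: user question -> create_sql_query_chain(llm, db) -> SQL string
#       -> executed against duckdb via ibis (`con.sql()`) -> pandas -> table and/or map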

import os
os.environ["WEBSOCKET_TIMEOUT_MS"] = "300000" # no effect

import streamlit as st
import geopandas as gpd
from shapely import wkb
import leafmap.foliumap as leafmap

# Helper plotting functions
import pydeck as pdk
def deck_map(gdf):
    st.write(
        pdk.Deck(
            map_style="mapbox://styles/mapbox/light-v9",
            initial_view_state={
                "latitude": 35,
                "longitude": -100,
                "zoom": 3,
                "pitch": 50,
            },
            layers=[
                pdk.Layer(
                    "GeoJsonLayer",
                    gdf,
                    pickable=True,
                    stroked=True,
                    filled=True,
                    extruded=True,
                    elevation_scale=10,
                    get_fill_color=[2, 200, 100],
                    get_line_color=[0,0,0],
                    line_width_min_pixels=0,
                ),
            ],
        )
    )

def leaf_map(gdf):
    m = leafmap.Map(center=[35, -100], zoom=4, layers_control=True)
    m.add_gdf(gdf)
    return m.to_streamlit()


# Cache query results; results are capped at 25 rows, since the maps below will not
# display more than 25 polygons.
@st.cache_data
def query_database(response):
    return con.sql(response).to_pandas().head(25)

# The geometry column comes back as WKB; convert it to shapely geometries so the
# result can be mapped as a GeoDataFrame.
@st.cache_data
def get_geom(tbl):
    tbl['geometry'] = tbl['geometry'].apply(wkb.loads)
    gdf = gpd.GeoDataFrame(tbl, geometry='geometry')
    return gdf


## Database connection
from sqlalchemy import create_engine
from langchain.sql_database import SQLDatabase
db_uri = "duckdb:///pad.duckdb"
engine = create_engine(db_uri, connect_args={'read_only': True})
db = SQLDatabase(engine, view_support=True)

import ibis
con = ibis.connect("duckdb://pad.duckdb", read_only=True)
con.load_extension("spatial")
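# Note: two handles to the same database. The SQLAlchemy/SQLDatabase object above is
# used only by langchain to expose the schema to the LLM; the ibis connection executes
# the generated SQL and loads the duckdb spatial extension needed for geometry functions.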

## ChatGPT Connection
from langchain_openai import ChatOpenAI 

# Requires ollama server running locally
from langchain_community.llms import Ollama

## should we use ChatOllama instead?
# from langchain_community.chat_models import ChatOllama

models = {"chatgpt3.5": ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])}

other_models = {
          "chatgpt4": ChatOpenAI(model="gpt-4", temperature=0, api_key=st.secrets["OPENAI_API_KEY"]),
          "duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
          "command-r-plus": Ollama(model="command-r-plus", temperature=0),
          "mixtral:8x22b":  Ollama(model="mixtral:8x22b", temperature=0),
          "wizardlm2:8x22b":  Ollama(model="wizardlm2:8x22b", temperature=0),
          "sqlcoder": Ollama(model="sqlcoder", temperature=0),
          "zephyr": Ollama(model="zephyr", temperature=0),
          "gemma:7b": Ollama(model="gemma:7b", temperature=0),
          "codegemma": Ollama(model="codegemma", temperature=0),
          "llama2": Ollama(model="llama2", temperature=0),
         }
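# Only the `models` dict is offered in the sidebar below; `other_models` is kept here
# for reference (the Ollama entries require a local ollama server, as noted above).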


st.set_page_config(page_title="Protected Areas Database Chat", page_icon="🦜", layout="wide")
st.title("Protected Areas Database Chat")

map_tool = {"leafmap": leaf_map,
            "deckgl": deck_map
           }

with st.sidebar:
    choice = st.radio("Select an LLM:", models)
    llm = models[choice]
    map_choice = st.radio("Select mapping tool", map_tool)
    mapper = map_tool[map_choice]
## A SQL Chain
from langchain.chains import create_sql_query_chain
chain = create_sql_query_chain(llm, db)
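# The chain only generates SQL from a natural-language question; it does not execute it.
# For example (illustrative only -- the exact SQL depends on the model and the schema it sees):
#   chain.invoke({"question": "How many rows are in the pad table?"})
#   # -> "SELECT COUNT(*) FROM pad"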


main = st.container()

## Does not preserve history
with main:

    '''
    The Protected Areas Database of the United States (PAD-US) is the official national inventory of
    America’s parks and other protected lands, and is published by the USGS Gap Analysis Project,
    [https://doi.org/10.5066/P9Q9LQ4B](https://doi.org/10.5066/P9Q9LQ4B).  

    This interactive tool allows users to explore the dataset, as well as a range of biodiversity
    and climate indicators associated with each protected area. These indicators are integrated into 
    a single table format shown below.  The chatbot assistant can turn natural language queries into
    SQL queries based on the table schema.

    See our [Protected Areas Explorer](https://huggingface.co/spaces/boettiger-lab/pad-us) for a companion non-chat-based tool.

    ##### Example Queries returning summary tables

    - What is the percent area in each gap code as a fraction of the total protected area?
    - The manager_type column indicates whether a manager is federal, state, local, private, or NGO.  
      The manager_name column indicates the responsible agency (National Park Service, Bureau of Land Management,
      etc.) in the case of federal manager types.  Which of the federal managers manage the most land in
      gap_code 1 or 2, as a fraction of the total area?

    When queries refer to specific managed areas, the chatbot can show those areas on an interactive map.
    Due to software limitations, these maps will show no more than 25 polygons, even if more areas match the
    requested search. The chatbot sometimes requires help identifying the right columns.  In order to create
    a map, the SQL query must also return the geometry column.  Consider the following examples:

    ##### Example queries returning maps + tables

    - Show me all the national monuments (designation_type) in Utah. Include the geometry column
    - Show examples of Bureau of Land Management areas (manager_name) with the highest species richness. Include the geometry column.
    - Which site has the overall highest range-size-rarity? Include the geometry column, manager_name, and IUCN category.

    '''
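    # Illustrative sketch of the kind of SQL the chain should return for the Utah example
    # above (column names other than designation_type, manager_name, and geometry are
    # hypothetical; the real schema is shown at the bottom of the page):
    #   SELECT designation_type, manager_name, geometry
    #   FROM pad
    #   WHERE designation_type = 'National Monument' AND state = 'UT';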

    st.markdown("## 🦜 Chatbot:")
    chatbox = st.container()
    with chatbox:           
        if prompt := st.chat_input(key="chain"):
            st.chat_message("user").write(prompt)
            with st.chat_message("assistant"):
                response = chain.invoke({"question": prompt})
                st.write(response)
                tbl = query_database(response)
                if 'geometry' in tbl:
                    gdf = get_geom(tbl)
                    mapper(gdf)
                    n = len(gdf)
                    st.write(f"matching features: {n}")
                st.dataframe(tbl)


st.divider()

with st.container():
    st.text("Database schema (top 3 rows)")
    tbl = query_database("select * from pad limit 3")
    st.dataframe(tbl)


st.divider()

'''
Experimental prototype.  

- Author: [Carl Boettiger](https://carlboettiger.info)
- For data sources and processing, see: https://beta.source.coop/repositories/cboettig/pad-us-3/description/


'''

# duckdb_sql fails but chatgpt3.5 succeeds with a query like:
# use the st_area and st_GeomFromWKB functions to compute the area of the Shape column in the fee table, and then use that to compute the total area under each GAP_Sts category

# For most queries, duckdb_sql does much better than alternative open models though

# Federal agencies are identified as 'FED' in the Mang_Type column in the 'combined' data table. The Mang_Name column indicates the different agencies. Which federal agencies manage the greatest area of GAP_Sts 1 or 2 land?

# Federal agencies are identified as 'FED' in the Mang_Type column in the table named "fee". The Mang_Name column indicates the different agencies. List which managers manage the largest total areas that are identified as GAP_Sts '1' or '2'?