# --- Hugging Face Spaces page chrome (scrape artifact, not app source) ---
# Space status: Sleeping · File size: 7,172 Bytes · revision f03c8f7
# This example does not use a langchain agent,
# The langchain sql chain has knowledge of the database, but doesn't interact with it beyond initialization.
# The output of the sql chain is parsed separately and passed to `duckdb.sql()` by streamlit
import os

# NOTE(review): set before streamlit import, but comment says it has no effect — candidate for removal.
os.environ["WEBSOCKET_TIMEOUT_MS"] = "300000" # no effect
import streamlit as st
import geopandas as gpd
import pandas as pd
from shapely import wkb

# Must be the first Streamlit command executed in the script.
st.set_page_config(page_title="Protected Areas Database Chat", page_icon="🦜", layout="wide")
st.title("Protected Areas Database Chat")

## Database connection, reading directly from remote parquet file
from sqlalchemy import create_engine
from langchain.sql_database import SQLDatabase

db_uri = "duckdb:///:memory:"
parquet = "https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/pad-stats.parquet"
engine = create_engine(db_uri) #connect_args={'read_only': True})
con = engine.connect()
# DuckDB spatial extension is required to read the geometry column.
con.execute("install spatial; load spatial;")
# Expose the remote parquet file as a view named `pad`; fetchall() forces execution of the DDL.
h = con.execute(f"create or replace view pad as select * from read_parquet('{parquet}');")
h.fetchall()
# view_support=True so langchain's SQLDatabase includes the `pad` view in the schema it shows the LLM.
db = SQLDatabase(engine, view_support=True)
@st.cache_data
def query_database(response):
    """Execute a SQL string against the shared DuckDB connection.

    Parameters
    ----------
    response : str
        A SQL query (typically generated by the LLM chain).

    Returns
    -------
    pandas.DataFrame
        At most the first 25 result rows, with the query's column names.
    """
    # con.sql(response).to_pandas().head(25) # uses ibis connection
    # instead, use direct sqlAlchemy connection
    result = con.execute(response)
    # BUG FIX: the original built pd.DataFrame(fetchall()) without column
    # names, yielding integer column labels — which broke downstream checks
    # like `'geometry' in tbl`. Use the cursor's keys() to label columns.
    return pd.DataFrame(result.fetchall(), columns=result.keys()).head(25)

# Warm the cache and fail fast at startup if the `pad` view is not queryable.
query_database("select * from pad limit 1")
@st.cache_data
def get_geom(tbl):
    """Convert the WKB `geometry` column of `tbl` into a GeoDataFrame.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Must contain a 'geometry' column holding WKB-encoded geometries.

    Returns
    -------
    geopandas.GeoDataFrame
        A new frame with decoded shapely geometries; the input is not modified.
    """
    # BUG FIX: the original mutated `tbl` in place, corrupting the DataFrame
    # cached by st.cache_data (decoded on first run, re-decoding fails later).
    tbl = tbl.copy()
    tbl['geometry'] = tbl['geometry'].apply(wkb.loads)
    gdf = gpd.GeoDataFrame(tbl, geometry='geometry')
    return gdf
# Helper plotting functions
import pydeck as pdk
def deck_map(gdf):
    """Render `gdf` as an extruded GeoJson layer on a pydeck map in Streamlit."""
    # Initial camera: roughly centered on the continental US.
    view_state = {
        "latitude": 35,
        "longitude": -100,
        "zoom": 3,
        "pitch": 50,
    }
    polygons = pdk.Layer(
        "GeoJsonLayer",
        gdf,
        pickable=True,
        stroked=True,
        filled=True,
        extruded=True,
        elevation_scale=10,
        get_fill_color=[2, 200, 100],
        get_line_color=[0, 0, 0],
        line_width_min_pixels=0,
    )
    deck = pdk.Deck(
        map_style="mapbox://styles/mapbox/light-v9",
        initial_view_state=view_state,
        layers=[polygons],
    )
    st.write(deck)
import leafmap.foliumap as leafmap
def leaf_map(gdf):
    """Show `gdf` on a folium-backed leafmap map embedded in Streamlit."""
    folium_map = leafmap.Map(center=[35, -100], zoom=4, layers_control=True)
    folium_map.add_gdf(gdf)
    return folium_map.to_streamlit()
## ChatGPT Connection
from langchain_openai import ChatOpenAI
# Requires ollama server running locally
from langchain_community.llms import Ollama
# # should we use ChatOllama instead?
# from langchain_community.llms import ChatOllama

# Models offered in the sidebar. Only gpt-3.5 is enabled by default;
# the API key is read from Streamlit secrets.
models = {"chatgpt3.5": ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])}
# Additional candidate models, currently not offered in the UI.
# NOTE(review): these are constructed eagerly even though unused — move an
# entry into `models` to enable it.
other_models = {
    "chatgpt4": ChatOpenAI(model="gpt-4", temperature=0, api_key=st.secrets["OPENAI_API_KEY"]),
    "duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
    "command-r-plus": Ollama(model="command-r-plus", temperature=0),
    "mixtral:8x22b": Ollama(model="mixtral:8x22b", temperature=0),
    "wizardlm2:8x22b": Ollama(model="wizardlm2:8x22b", temperature=0),
    "sqlcoder": Ollama(model="sqlcoder", temperature=0),
    "zephyr": Ollama(model="zephyr", temperature=0),
    "gemma:7b": Ollama(model="gemma:7b", temperature=0),
    "codegemma": Ollama(model="codegemma", temperature=0),
    "llama2": Ollama(model="llama2", temperature=0),
}
# Mapping backends selectable in the sidebar (name -> plotting function).
map_tool = {"leafmap": leaf_map,
            "deckgl": deck_map
}

with st.sidebar:
    # st.radio over a dict iterates its keys as the option labels.
    choice = st.radio("Select an LLM:", models)
    llm = models[choice]
    map_choice = st.radio("Select mapping tool", map_tool)
    mapper = map_tool[map_choice]
## A SQL Chain
from langchain.chains import create_sql_query_chain

# The chain only generates SQL text; execution happens in query_database().
chain = create_sql_query_chain(llm, db)

main = st.container()

## Does not preserve history
with main:
    # Streamlit "magic": a bare string literal is rendered as markdown.
    # Content kept at column 0 so leading spaces don't become a markdown code block.
    # FIX: corrected user-facing typos ("Do to" -> "Due to", "Conisder" ->
    # "Consider") and restored blank lines so headings and lists render.
    '''
The Protected Areas Database of the United States (PAD-US) is the official national inventory of
America’s parks and other protected lands, and is published by the USGS Gap Analysis Project,
[https://doi.org/10.5066/P9Q9LQ4B.](https://doi.org/10.5066/P9Q9LQ4B).

This interactive tool allows users to explore the dataset, as well as a range of biodiversity
and climate indicators associated with each protected area. These indicators are integrated into
a single table format shown below. The chatbot assistant can turn natural language queries into
SQL queries based on the table schema.

See our [Protected Areas Explorer](https://huggingface.co/spaces/boettiger-lab/pad-us) for a companion non-chat-based tool.

##### Example Queries returning summary tables

- What is the percent area in each gap code as a fraction of the total protected area?
- The manager_type column indicates whether a manager is federal, state, local, private, or NGO.
  the manager_name column indicates the responsible agency (National Park Service, Bureau of Land Management,
  etc) in the case of federal manager types. Which of the federal managers manage the most land in
  gap_code 1 or 2, as a fraction of the total area?

When queries refer to specific managed areas, the chatbot can show those areas on an interactive map.
Due to software limitations, these maps will show no more than 25 polygons, even if more areas match the
requested search. The chatbot sometimes requires help identifying the right columns. In order to create
a map, the SQL query must also return the geometry column. Consider the following examples:

##### Example queries returning maps + tables

- Show me all the national monuments (designation_type) in Utah. Include the geometry column
- Show examples of Bureau of Land Management (manager_name) with the highest species richness? Include the geometry column
- Which site has the overall highest range-size-rarity? Include the geometry column, manager_name, and IUCN category.
'''
st.markdown("## 🦜 Chatbot:")
chatbox = st.container()
with chatbox:
    if prompt := st.chat_input(key="chain"):
        st.chat_message("user").write(prompt)
        with st.chat_message("assistant"):
            # The chain turns the natural-language question into a SQL string.
            response = chain.invoke({"question": prompt})
            st.write(response)
            # Execute the generated SQL directly against the database.
            tbl = query_database(response)
            if 'geometry' in tbl:
                # Query returned geometries: decode and render on the chosen map.
                gdf = get_geom(tbl)
                mapper(gdf)
                st.write(f"matching features: {len(gdf)}")
            st.dataframe(tbl)

st.divider()
with st.container():
    st.text("Database schema (top 3 rows)")
    # BUG FIX: original had a duplicated assignment (`tbl = tbl = ...`);
    # also use a distinct name so the chat result above is not shadowed.
    schema_preview = query_database("select * from pad limit 3")
    st.dataframe(schema_preview)
st.divider()
# Footer, rendered via Streamlit "magic" markdown.
'''
Experimental prototype.
- Author: [Carl Boettiger](https://carlboettiger.info)
- For data sources and processing, see: https://beta.source.coop/repositories/cboettig/pad-us-3/description/
'''
|