cboettig committed on
Commit
f03c8f7
·
1 Parent(s): 8c995f3

working examples with sqlalchemy2.0 using original design

Browse files
Files changed (1) hide show
  1. graphs.py +198 -0
graphs.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This example does not use a langchain agent,
2
+ # The langchain sql chain has knowledge of the database, but doesn't interact with it beyond initialization.
3
+ # The output of the sql chain is parsed separately and passed to `duckdb.sql()` by streamlit
4
+
5
+ import os
6
+ os.environ["WEBSOCKET_TIMEOUT_MS"] = "300000" # no effect
7
+
8
+ import streamlit as st
9
+ import geopandas as gpd
10
+ import pandas as pd
11
+ from shapely import wkb
12
+
13
+
14
+ st.set_page_config(page_title="Protected Areas Database Chat", page_icon="🦜", layout="wide")
15
+ st.title("Protected Areas Database Chat")
16
+
17
+ ## Database connection, reading directly from remote parquet file
18
+ from sqlalchemy import create_engine
19
+ from langchain.sql_database import SQLDatabase
20
+ db_uri = "duckdb:///:memory:"
21
+ parquet = "https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/pad-stats.parquet"
22
+ engine = create_engine(db_uri) #connect_args={'read_only': True})
23
+ con = engine.connect()
24
+ con.execute("install spatial; load spatial;")
25
+ h = con.execute(f"create or replace view pad as select * from read_parquet('{parquet}');")
26
+ h.fetchall()
27
+ db = SQLDatabase(engine, view_support=True)
28
+
29
+
30
+ @st.cache_data
31
+ def query_database(response):
32
+ # con.sql(response).to_pandas().head(25) # uses ibis connection
33
+ # instead, use direct sqlAlchemy connection
34
+ z = con.execute(response).fetchall()
35
+ return pd.DataFrame(z).head(25)
36
+
37
+
38
+ query_database("select * from pad limit 1")
39
+
40
+
41
+ @st.cache_data
42
+ def get_geom(tbl):
43
+ tbl['geometry'] = tbl['geometry'].apply(wkb.loads)
44
+ gdf = gpd.GeoDataFrame(tbl, geometry='geometry')
45
+ return gdf
46
+
47
+
48
+
49
+ # Helper plotting functions
50
+ import pydeck as pdk
51
+ def deck_map(gdf):
52
+ st.write(
53
+ pdk.Deck(
54
+ map_style="mapbox://styles/mapbox/light-v9",
55
+ initial_view_state={
56
+ "latitude": 35,
57
+ "longitude": -100,
58
+ "zoom": 3,
59
+ "pitch": 50,
60
+ },
61
+ layers=[
62
+ pdk.Layer(
63
+ "GeoJsonLayer",
64
+ gdf,
65
+ pickable=True,
66
+ stroked=True,
67
+ filled=True,
68
+ extruded=True,
69
+ elevation_scale=10,
70
+ get_fill_color=[2, 200, 100],
71
+ get_line_color=[0,0,0],
72
+ line_width_min_pixels=0,
73
+ ),
74
+ ],
75
+ )
76
+ )
77
+
78
+ import leafmap.foliumap as leafmap
79
+ def leaf_map(gdf):
80
+ m = leafmap.Map(center=[35, -100], zoom=4, layers_control=True)
81
+ m.add_gdf(gdf)
82
+ return m.to_streamlit()
83
+
84
+
85
+
86
+ ## ChatGPT Connection
87
+ from langchain_openai import ChatOpenAI
88
+ # Requires ollama server running locally
89
+ from langchain_community.llms import Ollama
90
+
91
+ # # should we use ChatOllama instead?
92
+ # from langchain_community.llms import ChatOllama
93
+
94
+ models = {"chatgpt3.5": ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])}
95
+
96
+ other_models = {
97
+ "chatgpt4": ChatOpenAI(model="gpt-4", temperature=0, api_key=st.secrets["OPENAI_API_KEY"]),
98
+ "duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
99
+ "command-r-plus": Ollama(model="command-r-plus", temperature=0),
100
+ "mixtral:8x22b": Ollama(model="mixtral:8x22b", temperature=0),
101
+ "wizardlm2:8x22b": Ollama(model="wizardlm2:8x22b", temperature=0),
102
+ "sqlcoder": Ollama(model="sqlcoder", temperature=0),
103
+ "zephyr": Ollama(model="zephyr", temperature=0),
104
+ "gemma:7b": Ollama(model="gemma:7b", temperature=0),
105
+ "codegemma": Ollama(model="codegemma", temperature=0),
106
+ "llama2": Ollama(model="llama2", temperature=0),
107
+ }
108
+
109
+
110
+
111
+
112
+ map_tool = {"leafmap": leaf_map,
113
+ "deckgl": deck_map
114
+ }
115
+
116
+ with st.sidebar:
117
+ choice = st.radio("Select an LLM:", models)
118
+ llm = models[choice]
119
+ map_choice = st.radio("Select mapping tool", map_tool)
120
+ mapper = map_tool[map_choice]
121
+ ## A SQL Chain
122
+ from langchain.chains import create_sql_query_chain
123
+ chain = create_sql_query_chain(llm, db)
124
+
125
+
126
+ main = st.container()
127
+
128
+ ## Does not preserve history
129
+ with main:
130
+
131
+ '''
132
+ The Protected Areas Database of the United States (PAD-US) is the official national inventory of
133
+ America’s parks and other protected lands, and is published by the USGS Gap Analysis Project,
134
+ [https://doi.org/10.5066/P9Q9LQ4B.](https://doi.org/10.5066/P9Q9LQ4B).
135
+
136
+ This interactive tool allows users to explore the dataset, as well as a range of biodiversity
137
+ and climate indicators associated with each protected area. These indicators are integrated into
138
+ a single table format shown below. The chatbot assistant can turn natural language queries into
139
+ SQL queries based on the table schema.
140
+
141
+ See our [Protected Areas Explorer](https://huggingface.co/spaces/boettiger-lab/pad-us) for a companion non-chat-based tool.
142
+
143
+ ##### Example Queries returning summary tables
144
+
145
+ - What is the percent area in each gap code as a fraction of the total protected area?
146
+ - The manager_type column indicates whether a manager is federal, state, local, private, or NGO.
147
+ the manager_name column indicates the responsible agency (National Park Service, Bureau of Land Management,
148
+ etc) in the case of federal manager types. Which of the federal managers manage the most land in
149
+ gap_code 1 or 2, as a fraction of the total area?
150
+
151
+ When queries refer to specific managed areas, the chatbot can show those areas on an interactive map.
152
+ Do to software limitations, these maps will show no more than 25 polygons, even if more areas match the
153
+ requested search. The chatbot sometimes requires help identifying the right columns. In order to create
154
+ a map, the SQL query must also return the geometry column. Conisder the following examples:
155
+
156
+ ##### Example queries returning maps + tables
157
+
158
+ - Show me all the national monuments (designation_type) in Utah. Include the geometry column
159
+ - Show examples of Bureau of Land Management (manager_name) with the highest species richness? Include the geometry column
160
+ - Which site has the overall highest range-size-rarity? Include the geometry column, manager_name, and IUCN category.
161
+
162
+ '''
163
+
164
+ st.markdown("## 🦜 Chatbot:")
165
+ chatbox = st.container()
166
+ with chatbox:
167
+ if prompt := st.chat_input(key="chain"):
168
+ st.chat_message("user").write(prompt)
169
+ with st.chat_message("assistant"):
170
+ response = chain.invoke({"question": prompt})
171
+ st.write(response)
172
+ tbl = query_database(response)
173
+ if 'geometry' in tbl:
174
+ gdf = get_geom(tbl)
175
+ mapper(gdf)
176
+ n = len(gdf)
177
+ st.write(f"matching features: {n}")
178
+ st.dataframe(tbl)
179
+
180
+
181
+ st.divider()
182
+
183
+ with st.container():
184
+ st.text("Database schema (top 3 rows)")
185
+ tbl = tbl = query_database("select * from pad limit 3")
186
+ st.dataframe(tbl)
187
+
188
+
189
+ st.divider()
190
+
191
+ '''
192
+ Experimental prototype.
193
+
194
+ - Author: [Carl Boettiger](https://carlboettiger.info)
195
+ - For data sources and processing, see: https://beta.source.coop/repositories/cboettig/pad-us-3/description/
196
+
197
+
198
+ '''