mtyrrell committed
Commit d49d09a · 1 Parent(s): cd9150d
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitignore ADDED
@@ -0,0 +1,5 @@
+ # .gitignore
+
+ .env
+ __pycache__/
+ *.pyc
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
  colorTo: blue
  sdk: streamlit
  python_version: 3.10.11
- sdk_version: 1.21.0
+ sdk_version: 1.38.0
  app_file: app.py
  pinned: false
  ---
app.py CHANGED
@@ -1,289 +1,142 @@
 
  import streamlit as st
  import os
- import pkg_resources
-
- # # Using this wacky hack to get around the massively ridicolous managed env loading order
- # def is_installed(package_name, version):
- #     try:
- #         pkg = pkg_resources.get_distribution(package_name)
- #         return pkg.version == version
- #     except pkg_resources.DistributionNotFound:
- #         return False
-
- # @st.cache_resource
- # def install_packages():
- #     install_commands = []
-
- #     if not is_installed("spaces", "0.12.0"):
- #         install_commands.append("pip install spaces==0.12.0")
-
- #     if not is_installed("pydantic", "1.8.2"):
- #         install_commands.append("pip install pydantic==1.8.2")
-
- #     if install_commands:
- #         os.system(" && ".join(install_commands))
-
- # # install packages if necessary
- # # install_packages()
-
-
  import re
  import json
- from dotenv import load_dotenv
  import numpy as np
  import pandas as pd
  import getpass
- import os
  from dotenv import load_dotenv, find_dotenv
- from pinecone import Pinecone, ServerlessSpec
- from langchain_pinecone import PineconeVectorStore
- from langchain_huggingface import HuggingFaceEmbeddings
- # from langchain_core.output_parsers import StrOutputParser
- # from langchain_core.runnables import RunnablePassthrough
- # from langchain_openai import ChatOpenAI
  from langchain.docstore.document import Document
  from openai import OpenAI
 
- client = OpenAI(
-     organization='org-x0YBcOjkdPyf6ExxWCkmFHAj',
-     project='proj_40oH22n9XudeKL2rgka1IQ5B',
-     api_key='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'
- )
-
- pinecone_api_key = os.environ.get("PINECONE_API_KEY")
-
- @st.cache_resource
- def initialize_embeddings(model_name: str = "all-mpnet-base-v2"):
-     embeddings = HuggingFaceEmbeddings(model_name=model_name)
-     return embeddings
-
-
- @st.cache_resource
- def initialize_vector_store(pinecone_api_key: str, index_name: str):
-     # Initialize Pinecone
-     pc = Pinecone(api_key=pinecone_api_key)
-
-     # Access the index
-     index = pc.Index(index_name)
-
-     # Use the cached embeddings
-     embeddings = initialize_embeddings()
-
-     # Create the vector store
-     vector_store = PineconeVectorStore(index=index, embedding=embeddings, text_key='content')
-
-     return vector_store, embeddings
-
- # Unpack the tuple into both vector_store and embeddings
- vector_store, embeddings = initialize_vector_store(pinecone_api_key, index_name="cpv-full-southern-africa-test")
-
-
-
-
- def get_docs(query, country = [], vulnerability_cat = []):
-
-     if not country:
-         country = "All Countries"
-     if not vulnerability_cat:
-         if country == "All Countries":
-             filters = None
-         else:
-             filters = {'country': {'$in': country}}
-     else:
-         if country == "All Countries":
-             filters = {'vulnerability_cat': {'$in': vulnerability_cat}}
-         else:
-             filters = {'country': {'$in': country},'vulnerability_cat': {'$in': vulnerability_cat}}
-
-
-     docs = vector_store.similarity_search_by_vector_with_score(
-         embeddings.embed_query(query),
-         k=20,
-         filter=filters,
-     )
 
-     # Break out the key fields and convert to pandas for filtering
-     docs_dict = [{**x[0].metadata,"score":x[1],"content":x[0].page_content} for x in docs]
-     df_docs = pd.DataFrame(docs_dict)
-     # Get ourselves an index setup from which to base the source reference number from (in the prompt and matching afterwards)
-     df_docs = df_docs.reset_index()
-     df_docs['ref_id'] = df_docs.index + 1 # start the index at 1
-     # Convert back to Document format
-     ls_dict = []
-     # Iterate over df and add relevant fields to the dict object
-     for index, row in df_docs.iterrows():
-         # Create a Document object for each row
-         doc = Document(
-             page_content = row['content'],
-             metadata={'country': row['country'],'document': row['document'], 'page': row['page'], 'file_name': row['file_name'], 'ref_id': row['ref_id'], 'vulnerability_cat': row['vulnerability_cat'], 'score': row['score']}
-         )
-         # Append the Document object to the documents list
-         ls_dict.append(doc)
 
-     return ls_dict
 
 
- prompt_template="Answer the given question using the following documents. \
- Formulate your answer in the style of an academic report. \
- Provide example quotes and citations using extracted text from the documents. \
- Use facts and numbers from the documents in your answer. \
- ALWAYS include references for information used from documents at the end of each applicable sentence using the format: '[ref. #]', where '[ref. #]' is included in the text provided at the start of each document (demarcated by the pattern '- &&& [ref. #] document_name &&&:')'. \
- Do not include page numbers in the references. \
- If no relevant information to answer the question is present in the documents, just say you don't have enough information to answer."
 
 
- # Create a list of options for the dropdown
- # model_options = ['chatGPT','Llama2']
 
- # Create a list of options for the dropdown
- country_options = ['All Countries','Angola','Botswana','Lesotho','Kenya','Malawi','Mozambique','Namibia','Rwanda','South Africa','Zambia','Zimbabwe']
 
- # Create a list of options for the dropdown
- vulnerability_options = ['All Categories','Agricultural communities', 'Children', 'Coastal communities', 'Ethnic, racial or other minorities', 'Fishery communities', 'Informal sector workers', 'Members of indigenous and local communities', 'Migrants and displaced persons', 'Older persons', 'Persons living in poverty', 'Persons with disabilities', 'Persons with pre-existing health conditions', 'Residents of drought-prone regions', 'Rural populations', 'Sexual minorities (LGBTQI+)', 'Urban populations', 'Women and other genders','Other']
 
- # List of examples
  examples = [
      "-",
      "What specific initiatives are presented in the context to address the needs of groups such as women and children to the effects climate change?",
      "In addition to gender, children, and youth, is there any mention of other groups facing disproportional impacts from climate change due to their geographic location, socio-economic status, age, gender, health, and occupation?"
  ]
 
-
- def get_refs(docs, res):
-     '''
-     Parse response for engineered reference ids (refer to prompt template)
-     Extract documents using reference ids
-     '''
-     res = res.lower() # Convert to lowercase for matching
-     # This pattern should be returned by gpt3.5
-     # pattern = r'ref\. (\d+)\]\.'
-     pattern = r'ref\. (\d+)'
-     ref_ids = [int(match) for match in re.findall(pattern, res)]
-     # extract
-     result_str = "" # Initialize an empty string to store the result
-     for i in range(len(docs)):
-         ref_id = docs[i].metadata['ref_id']
-         if ref_id in ref_ids:
-             if docs[i].metadata['document'] == "Supplementary":
-                 result_str += "**Ref. " + str(ref_id) + " [" + docs[i].metadata['country'] + " " + docs[i].metadata['document'] + ':' + docs[i].metadata['file_name'] + ' p' + str(docs[i].metadata['page']) + '; vulnerabilities: ' + docs[i].metadata['vulnerability_cat'] + "]:** " + "*'" + docs[i].page_content + "'*<br> <br>" # Add <br> for a line break
-             else:
-                 result_str += "**Ref. " + str(ref_id) + " [" + docs[i].metadata['country'] + " " + docs[i].metadata['document'] + ' p' + str(docs[i].metadata['page']) + '; vulnerabilities: ' + docs[i].metadata['vulnerability_cat'] + "]:** " + "*'" + docs[i].page_content + "'*<br> <br>" # Add <br> for a line break
-
-     return result_str
-
- # define a special function for putting the prompt together (as we can't use haystack)
- def get_prompt(docs, input_query):
-     base_prompt=prompt_template
-     # Add the metadata data for references
-     context = ' - '.join(['&&& [ref. '+str(d.metadata['ref_id'])+'] '+d.metadata['document']+' &&&: '+d.page_content for d in docs])
-     prompt = base_prompt+"; Context: "+context+"; Question: "+input_query+"; Answer:"
-     return(prompt)
-
- def run_query(query, country, model_sel):
-     # first call the retriever function using selected filters
-     docs = get_docs(query, country=country,vulnerability_cat=vulnerabilities_cat)
-     # model selector (not currently being used)
-     if model_sel == "chatGPT":
-         # instantiate ChatCompletion as a generator object (stream is set to True)
-         # response = openai.ChatCompletion.create(model="gpt-4o-mini-2024-07-18", messages=[{"role": "user", "content": get_prompt(docs, query)}], stream=True)
-
-
-         stream = client.chat.completions.create(
-             model="gpt-4o-mini-2024-07-18",
-             messages=[{"role": "user", "content": get_prompt(docs, query)}],
-             stream=True,
-         )
-         # iterate through the streamed output
-         report = []
-
-         for chunk in stream:
-             if chunk.choices[0].delta.content is not None:
-                 # print(chunk.choices[0].delta.content, end="")
-                 report.append(chunk.choices[0].delta.content)
-                 result = "".join(report).strip()
-                 res_box.success(result) # output to response text box
-
-         references = get_refs(docs, result) # extract references from the generated text
-     # Llama2 selection (was running on HF)
-     # else:
-     #     res = client.text_generation(get_prompt(docs, query=input_query), max_new_tokens=4000, temperature=0.01, model=model)
-     #     output = res
-     #     references = get_refs(docs, res)
-
-     st.markdown("----")
-     st.markdown('**REFERENCES:**')
-     st.markdown('References are based on text automatically extracted from climate policy documents. These extracts may contain non-legible characters or disjointed text as an artifact of the extraction procedure')
-     st.markdown(references, unsafe_allow_html=True)
-
-
- #___________________________________________________________________________________________________________
-
- # Sidebar (filters)
  with st.sidebar:
-     country = st.sidebar.multiselect('Filter by country:', country_options)
-     vulnerabilities_cat = st.sidebar.multiselect('Filter by vulnerabilities category:', vulnerability_options)
      with st.expander("ℹ️ - About filters", expanded=False):
          st.markdown(
              """
-             * *These selections will filter the data matched against your query*
-             * *For a comparative analysis of multiple countries or vulnerability categories, select the items you require or select **'All Countries'** or **'All Categories'***
-             * *Be careful in using the vulnerabilities category filter, as many of the categories are not well represented in the documents. Therefore, this will severly limit the data available for analysis*
              """
          )
 
- # Main window title
- with st.container():
-     st.markdown("<h2 style='text-align: center;'> Climate Policy Documents: Vulnerabilities Analysis Q&A </h2>", unsafe_allow_html=True)
-     st.write(' ')
 
- # Main window instructions
  with st.expander("ℹ️ - About this app", expanded=False):
      st.write(
          """
-         This tool seeks to provide an interface for quering national climate policy documents (NDCs, LTS etc.). The current version is powered by chatGPT (3.5). The document store is limited to 10 Southern African countries (Angola, Botswana, Eswatini, Lesotho, Malawi, Mozambique, Namibia, South Africa, Zambia, Zimbabwe), as well as Kenya and Rwanda. The intended use case is to allow users to interact with the documents and obtain valuable insights on various vulnerable groups affected by climate change.
 
          **DISCLAIMER:** *This prototype tool based on LLMs (Language Models) is provided "as is" for experimental and exploratory purposes only, and should not be used for critical or production applications. Users are advised that the tool may contain errors, bugs, or limitations and should be used with caution and awareness of potential risks, and the developers make no warranties or guarantees regarding its performance, reliability, or suitability for any specific purpose.*
-         """)
-     # Display the text passages as radio buttons
      selected_example = st.radio("Example questions", examples)
      st.write(
-         """
-         You can request comparative analyses between countries by filtering by country, and using more advanced prompts. For example:
 
-         *Provide a comparative analysis between Angola and Kenya with regard to specific initiatives presented in the context to address the needs of groups such women and children to the effects climate change.*
-
-         Make sure your filters match the countries you have specified for the analysis!
-         """)
-
 
- # Dropdown selectbox: model (currently not used)
- # model_sel = st.selectbox('Select an LLM:', model_options)
  model_sel = "chatGPT"
 
- #----Model Select logic-------
  if model_sel == "chatGPT":
      model_name = "gpt-3.5-turbo"
 
- # else:
- #     model = "meta-llama/Llama-2-70b-chat-hf"
- #     # Instantiate the inference client
- #     client = InferenceClient()
-
- # get prompt from user or example prompt
- if selected_example == "-": #hyphen used as a work around (st won't allow null selection)
      text = st.text_area('Enter your question in the text box below using natural language or select an example from above:')
  else:
      text = st.text_area('Enter your question in the text box below using natural language or select an example from above:', value=selected_example)
 
  if st.button('Submit'):
      st.markdown("----")
      st.markdown('**RESPONSE:**')
      res_box = st.empty()
-     run_query(text, country=country, model_sel=model_sel)
-
-
-
-
-
-
+ # app.py
 
  import streamlit as st
  import os
  import re
  import json
  import numpy as np
  import pandas as pd
  import getpass
  from dotenv import load_dotenv, find_dotenv
  from langchain.docstore.document import Document
  from openai import OpenAI
+ import utils # Import the utils module
 
+ from dotenv import load_dotenv
+ import os
 
+ os.environ.pop("OPENAI_API_KEY", None)
+ load_dotenv()
+ # openai_api_key = os.getenv("OPENAI_API_KEY")
 
 
+ # Initialize OpenAI client
+ client = utils.get_openai_client()
 
+ # Get Pinecone API key from environment variables
+ pinecone_api_key = os.getenv("PINECONE_API_KEY")
 
+ # Initialize vector store and embeddings
+ vector_store, embeddings = utils.initialize_vector_store(
+     pinecone_api_key, index_name="cpv-full-southern-africa-test"
+ )
 
+ # Prompt template
+ prompt_template = (
+     "Answer the given question using the following documents. "
+     "Formulate your answer in the style of an academic report. "
+     "Provide example quotes and citations using extracted text from the documents. "
+     "Use facts and numbers from the documents in your answer. "
+     "ALWAYS include references for information used from documents at the end of each applicable sentence "
+     "using the format: '[ref. #]', where '[ref. #]' is included in the text provided at the start of each document "
+     "(demarcated by the pattern '- &&& [ref. #] document_name &&&:')'. "
+     "Do not include page numbers in the references. "
+     "If no relevant information to answer the question is present in the documents, just say you don't have enough information to answer."
+ )
 
+ # Dropdown options
+ country_options = [
+     'All Countries', 'Angola', 'Botswana', 'Lesotho', 'Kenya', 'Malawi',
+     'Mozambique', 'Namibia', 'Rwanda', 'South Africa', 'Zambia', 'Zimbabwe'
+ ]
 
+ vulnerability_options = [
+     'All Categories', 'Agricultural communities', 'Children', 'Coastal communities',
+     'Ethnic, racial or other minorities', 'Fishery communities', 'Informal sector workers',
+     'Members of indigenous and local communities', 'Migrants and displaced persons',
+     'Older persons', 'Persons living in poverty', 'Persons with disabilities',
+     'Persons with pre-existing health conditions', 'Residents of drought-prone regions',
+     'Rural populations', 'Sexual minorities (LGBTQI+)', 'Urban populations',
+     'Women and other genders', 'Other'
+ ]
 
  examples = [
      "-",
      "What specific initiatives are presented in the context to address the needs of groups such as women and children to the effects climate change?",
      "In addition to gender, children, and youth, is there any mention of other groups facing disproportional impacts from climate change due to their geographic location, socio-economic status, age, gender, health, and occupation?"
  ]
 
+ # Sidebar Filters
  with st.sidebar:
+     country = st.multiselect('Filter by country:', country_options)
+     vulnerabilities_cat = st.multiselect('Filter by vulnerabilities category:', vulnerability_options)
      with st.expander("ℹ️ - About filters", expanded=False):
          st.markdown(
              """
+             * *These selections will filter the data matched against your query.*
+             * *For a comparative analysis of multiple countries or vulnerability categories, select the items you require or select **'All Countries'** or **'All Categories'***.
+             * *Be careful in using the vulnerabilities category filter, as many of the categories are not well represented in the documents. Therefore, this will severely limit the data available for analysis.*
              """
          )
 
+ # Main Window Title
+ st.markdown("<h2 style='text-align: center;'> Climate Policy Documents: Vulnerabilities Analysis Q&A </h2>", unsafe_allow_html=True)
+ st.write(' ')
 
+ # Main Window Instructions
  with st.expander("ℹ️ - About this app", expanded=False):
      st.write(
          """
+         This tool seeks to provide an interface for querying national climate policy documents (NDCs, LTS etc.). The current version is powered by chatGPT (3.5). The document store is limited to 10 Southern African countries (Angola, Botswana, Eswatini, Lesotho, Malawi, Mozambique, Namibia, South Africa, Zambia, Zimbabwe), as well as Kenya and Rwanda. The intended use case is to allow users to interact with the documents and obtain valuable insights on various vulnerable groups affected by climate change.
 
          **DISCLAIMER:** *This prototype tool based on LLMs (Language Models) is provided "as is" for experimental and exploratory purposes only, and should not be used for critical or production applications. Users are advised that the tool may contain errors, bugs, or limitations and should be used with caution and awareness of potential risks, and the developers make no warranties or guarantees regarding its performance, reliability, or suitability for any specific purpose.*
+         """
+     )
      selected_example = st.radio("Example questions", examples)
      st.write(
+         """
+         You can request comparative analyses between countries by filtering by country, and using more advanced prompts. For example:
+
+         *Provide a comparative analysis between Angola and Kenya with regard to specific initiatives presented in the context to address the needs of groups such women and children to the effects climate change.*
 
+         Make sure your filters match the countries you have specified for the analysis!
+         """
+     )
 
+ # Model Selection (Currently fixed to chatGPT)
  model_sel = "chatGPT"
 
+ # Prompt Logic
  if model_sel == "chatGPT":
      model_name = "gpt-3.5-turbo"
 
+ # Input Text Area
+ if selected_example == "-":
      text = st.text_area('Enter your question in the text box below using natural language or select an example from above:')
  else:
      text = st.text_area('Enter your question in the text box below using natural language or select an example from above:', value=selected_example)
 
+ # Submit Button
  if st.button('Submit'):
      st.markdown("----")
      st.markdown('**RESPONSE:**')
      res_box = st.empty()
+
+     # Fetch documents based on user input and filters
+     docs = utils.get_docs(vector_store, embeddings, text, country=country, vulnerability_cat=vulnerabilities_cat)
+
+     # Construct the prompt
+     prompt = utils.get_prompt(prompt_template, docs, text)
+
+     # Run the query and get references
+     references = utils.run_query(client, prompt, docs, res_box)
+
+     # Display references
+     st.markdown("----")
+     st.markdown('**REFERENCES:**')
+     st.markdown(
+         'References are based on text automatically extracted from climate policy documents. '
+         'These extracts may contain non-legible characters or disjointed text as an artifact of the extraction procedure',
+         unsafe_allow_html=True
+     )
+     st.markdown(references, unsafe_allow_html=True)
cpv_full.db DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:30511f3207929443a0ba546dd10de94c4d9f8d73dfe6cec34e0bcf2de8367862
- size 12734464
cpv_full.faiss DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b1711d36c1fbb1425b9b257ce1b8a7dd5f40159e02f95a8170a006478f627b3a
- size 25543725
cpv_full.json DELETED
@@ -1 +0,0 @@
- {"faiss_index_factory_str": "Flat", "sql_url": "sqlite:///cpv_full.db"}
cpv_full_southern_africa_kenya.db DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a31dd7681c0ec71efd469fe89329d1bb2138e8fb8e7019741dfca484242f6990
- size 18247680
cpv_full_southern_africa_kenya.faiss DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:53130abde470d7a3a63b543dbb4b1ae44eea230662ee7d484123e345a2ec50f9
- size 26173485
cpv_full_southern_africa_kenya.json DELETED
@@ -1 +0,0 @@
- {"faiss_index_factory_str": "Flat", "sql_url": "sqlite:///cpv_full_southern_africa_kenya.db"}
env DELETED
@@ -1,5 +0,0 @@
- OPENAI_API_KEY="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"
- HF_API_KEY="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"
- PINECONE_API_KEY="c3f5717c-f43a-46d0-893e-02b44dbcf13b"
- USER1_HASH="$2b$12$hZbOi6zKmQQWvvpcllds9uAB3ili66N0aQyPzuDctl7IkNhl226oG"
- USER2_HASH="$2b$12$kWnArbA.2QTkpMv2yvE2J.7UJw0Fgc/3FH1k5JRqhjg.cvytriGt2"
requirements.txt CHANGED
@@ -1,3 +1,8 @@
- farm-haystack[colab, inference, pinecone]
- python-dotenv==1.0.0
- openai==0.27.8
+ streamlit==1.38.0
+ numpy==1.26.4
+ pandas==2.2.3
+ python-dotenv==1.0.1
+ pinecone-client==5.0.1
+ langchain-pinecone==0.2.0
+ langchain-huggingface==0.1.0
+ openai==1.50.2
utils.py ADDED
@@ -0,0 +1,129 @@
+ # utils.py
+
+ import streamlit as st
+ import os
+ import re
+ import pandas as pd
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain.docstore.document import Document
+ from dotenv import load_dotenv
+ from pinecone import Pinecone
+ from openai import OpenAI
+
+ # Load environment variables
+ load_dotenv()
+
+ # Initialize OpenAI client
+ def get_openai_client():
+     return OpenAI(
+         organization=os.getenv('OPENAI_ORG_ID'),
+         project=os.getenv('OPENAI_PROJECT_ID'),
+         api_key=os.getenv('OPENAI_API_KEY')
+     )
+
+ # Initialize embeddings
+ @st.cache_resource
+ def initialize_embeddings(model_name: str = "all-mpnet-base-v2"):
+     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+     return embeddings
+
+ # Initialize vector store
+ @st.cache_resource
+ def initialize_vector_store(pinecone_api_key: str, index_name: str):
+     pc = Pinecone(api_key=pinecone_api_key)
+     index = pc.Index(index_name)
+     embeddings = initialize_embeddings()
+     vector_store = PineconeVectorStore(index=index, embedding=embeddings, text_key='content')
+     return vector_store, embeddings
+
+ # Fetch documents based on query and filters
+ def get_docs(vector_store, embeddings, query, country=[], vulnerability_cat=[]):
+     if not country:
+         country = "All Countries"
+     if not vulnerability_cat:
+         filters = None if country == "All Countries" else {'country': {'$in': country}}
+     else:
+         if country == "All Countries":
+             filters = {'vulnerability_cat': {'$in': vulnerability_cat}}
+         else:
+             filters = {
+                 'country': {'$in': country},
+                 'vulnerability_cat': {'$in': vulnerability_cat}
+             }
+
+     docs = vector_store.similarity_search_by_vector_with_score(
+         embeddings.embed_query(query),
+         k=20,
+         filter=filters,
+     )
+
+     docs_dict = [{**x[0].metadata, "score": x[1], "content": x[0].page_content} for x in docs]
+     df_docs = pd.DataFrame(docs_dict).reset_index()
+     df_docs['ref_id'] = df_docs.index + 1
+
+     ls_dict = [
+         Document(
+             page_content=row['content'],
+             metadata={
+                 'country': row['country'],
+                 'document': row['document'],
+                 'page': row['page'],
+                 'file_name': row['file_name'],
+                 'ref_id': row['ref_id'],
+                 'vulnerability_cat': row['vulnerability_cat'],
+                 'score': row['score']
+             }
+         )
+         for _, row in df_docs.iterrows()
+     ]
+
+     return ls_dict
+
+ # Extract references from the response
+ def get_refs(docs, res):
+     res = res.lower()
+     pattern = r'ref\. (\d+)'
+     ref_ids = [int(match) for match in re.findall(pattern, res)]
+     result_str = ""
+     for doc in docs:
+         ref_id = doc.metadata['ref_id']
+         if ref_id in ref_ids:
+             metadata = doc.metadata
+             if metadata['document'] == "Supplementary":
+                 result_str += (
+                     f"**Ref. {ref_id} [{metadata['country']} {metadata['document']}: {metadata['file_name']} p{metadata['page']}; "
+                     f"vulnerabilities: {metadata['vulnerability_cat']}]:** *'{doc.page_content}'*<br><br>"
+                 )
+             else:
+                 result_str += (
+                     f"**Ref. {ref_id} [{metadata['country']} {metadata['document']} p{metadata['page']}; "
+                     f"vulnerabilities: {metadata['vulnerability_cat']}]:** *'{doc.page_content}'*<br><br>"
+                 )
+     return result_str
+
+ # Construct the prompt for the model
+ def get_prompt(prompt_template, docs, input_query):
+     context = ' - '.join([
+         f"&&& [ref. {d.metadata['ref_id']}] {d.metadata['document']} &&&: {d.page_content}"
+         for d in docs
+     ])
+     prompt = f"{prompt_template}; Context: {context}; Question: {input_query}; Answer:"
+     return prompt
+
+ # Execute the query and generate the response
+ def run_query(client, prompt, docs, res_box):
+     stream = client.chat.completions.create(
+         model="gpt-4o-mini-2024-07-18",
+         messages=[{"role": "user", "content": prompt}],
+         stream=True,
+     )
+     report = []
+     for chunk in stream:
+         if chunk.choices[0].delta.content is not None:
+             report.append(chunk.choices[0].delta.content)
+             result = "".join(report).strip()
+             res_box.success(result)
+
+     references = get_refs(docs, result)
+     return references
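
Note on configuration: with the hard-coded OpenAI credentials removed from app.py and the checked-in `env` file deleted, the code now reads all secrets from environment variables loaded by `load_dotenv()` (see `utils.get_openai_client()` and the `PINECONE_API_KEY` lookup in app.py). A local `.env` file, which the new `.gitignore` keeps out of version control, would look roughly like the sketch below; the variable names are the ones referenced in this commit, and the values are placeholders rather than real credentials.

# .env (sketch with placeholder values; substitute your own keys)
OPENAI_API_KEY="sk-proj-..."
OPENAI_ORG_ID="org-..."
OPENAI_PROJECT_ID="proj_..."
PINECONE_API_KEY="your-pinecone-api-key"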