ruisp committed on
Commit
c657ec0
·
1 Parent(s): 3d38978

Upload 4 files

Browse files

Adding the app and related scripts

Files changed (3) hide show
  1. filterminutes.py +71 -0
  2. prompts.py +23 -0
  3. public_app.py +83 -0
filterminutes.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ log = logging.getLogger('filter methods')
4
+ logging.basicConfig(level=logging.INFO)
5
+
6
+
7
+ def filter_docs_by_meta(docs, filter_dict):
8
+ """
9
+ Filter documents by multiple parameters
10
+ Parameters:
11
+ docs : List[langchain.schema.Document]
12
+ filter_dict : Dict[str, Any]
13
+
14
+ Returns: List of filtered documents
15
+
16
+ Examples:
17
+ docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1')
18
+ langchain.schema.Document(metadata={'a': 1, 'b': 3}, text='text2')]
19
+ filter_dict = {'a': 1}
20
+ filter_docs_by_meta(docs, filter_dict)
21
+ [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1')]
22
+
23
+ docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1')
24
+ langchain.schema.Document(metadata={'a': 1, 'b': 3}, text='text2')]
25
+ filter_dict = {'a': 1, 'b': 2}
26
+ filter_docs_by_meta(docs, filter_dict)
27
+ [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1')]
28
+
29
+ """
30
+ filtered_docs = []
31
+ for doc in docs:
32
+ append = True
33
+ for key, value in filter_dict.items():
34
+ if doc.metadata[key] != value:
35
+ append = False
36
+ break
37
+ if append:
38
+ filtered_docs.append(doc)
39
+ return filtered_docs
40
+
41
+
42
+ def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50):
43
+ """
44
+ Expand search with filter until reaching at least a pre-determined number of documents.
45
+ ----------
46
+ Parameters
47
+ vector_store : langchain.vectorstores.FAISS
48
+ The FAISS vector store.
49
+ query : str
50
+ The query to search for.
51
+ filter_dict : Dict[str, Any]
52
+ The parameters to filer for
53
+ target_k : int
54
+ The minimum number of documents desired after filtering
55
+ init_k : int
56
+ The top-k documents to extract for the initial search.
57
+ step : int
58
+ The size of the step when enlarging the search.
59
+
60
+ Returns: List of at least target_k Documents for post-processing
61
+
62
+ """
63
+ context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
64
+ while len(context) < target_k:
65
+ log.info(f'Context contains {len(context)} documents')
66
+ log.info(f'Expanding search with k={init_k}')
67
+ init_k += step
68
+ context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
69
+ log.info(f'Done. Context contains {len(context)} Documents matching the filtering criteria')
70
+ return context
71
+
prompts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# ---Define the two prompts---

# Few-shot prompt that asks the model to pull date elements out of the user's
# question and emit them as JSON (keys: 'year', 'month', 'day', 'page').
# NOTE: the original Example b mapped "1 November 1968" to (year: 2022,
# month: 10) — a wrong few-shot example, fixed to (year: 1968, month: 11).
PROMPT_EXTRACT_DATE = """Extract the date elements from the question at the end of the two examples in numeric format.
If there are no date elements found output False. If there is mention of both year and month, use the datetime string format %Y-%m.

Example 1) The meeting took place in October 2022 ->> 2022-10. Then put them in a dictionary like so:
Example a) The meeting took place in October 2022 ->> (year: 2022, month: 10)
Example b) During 1 November 1968 ->> (year: 1968, month: 11, day: 1). Use json format. You are allowed to use the keys 'year', 'month', 'day', and 'page'.

{question}
"""

# QA prompt giving the model a research-analyst persona; the retrieved
# documents are injected into {context} and the user's query into {question}.
PROMPT_FED_ANALYST = """You are a research analyst at a federal reserve bank and you are trying to answer questions
or provide answers to queries about meetings of the Federal Open Market Committee. Use the following pieces of
context to answer the question at the end, giving special attention to economic, cultural, financial, or political
developments. If you don't have all the elements to answer the query, say it explicitly. Finally, if you are not
provided with date elements, warn the user that the output is likely to be wrong due to the time sensitivity of questions
related to economic matters.

{context}

Question: {question}
"""
public_app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from langchain import PromptTemplate, LLMChain
4
+ from langchain.chains.question_answering import load_qa_chain
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.chat_models import ChatOpenAI
8
+ import gradio as gr
9
+ import json
10
+
11
+ from prompts import PROMPT_EXTRACT_DATE, PROMPT_FED_ANALYST
12
+ from filterminutes import search_with_filter
13
+
14
# --------------------------Load the sentence transformer and the vector store--------------------------#
# Query-embedding model; runs on CPU with raw (non-normalized) embeddings.
# NOTE(review): normalize_embeddings=False presumably matches how the FAISS
# index was built — confirm against the indexing script.
model_name = 'sentence-transformers/all-mpnet-base-v2'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
# Local FAISS index of the FOMC meeting minutes, loaded once at import time.
vs = FAISS.load_local("MINUTES_FOMC_HISTORY", embeddings)

# --------------------------Import the prompts------------------#
# Templates from prompts.py: one extracts date elements from the query,
# the other answers as a Fed research analyst over the retrieved context.
PROMPT_DATE = PromptTemplate.from_template(PROMPT_EXTRACT_DATE)
PROMPT_ANALYST = PromptTemplate.from_template(PROMPT_FED_ANALYST)
24
+
25
+
26
+ # --------------------------define the qa chain for answering queries--------------------------#
27
# --------------------------define the qa chain for answering queries--------------------------#
def load_chains(open_ai_key):
    """
    Build the two per-request LLM chains from the user-supplied API key.

    Parameters
    ----------
    open_ai_key : str
        OpenAI API key entered by the user in the UI.

    Returns
    -------
    tuple (date_extractor, fed_chain)
        date_extractor : LLMChain that extracts date elements from the query.
        fed_chain : 'stuff' QA chain that answers from the filtered context.
    """
    # temperature=0 on both chains so extraction and answers are deterministic.
    date_extractor = LLMChain(llm=ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', openai_api_key=open_ai_key),
                              prompt=PROMPT_DATE)
    fed_chain = load_qa_chain(llm=ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, openai_api_key=open_ai_key),
                              chain_type='stuff', prompt=PROMPT_ANALYST)
    return date_extractor, fed_chain
33
+
34
+
35
def get_chain(query, api_key):
    """
    Detects the date, computes similarity, and answers the query using
    only documents corresponding to the date requested.

    The query is first passed to the date extractor to extract the date
    and then to the qa chain to answer the query.

    Parameters
    ----------
    query : str
        Query to be answered.
    api_key : str
        OpenAI API key.

    Returns
    -------
    str
        Answer to the query.
    """
    date_extractor, fed_chain = load_chains(api_key)
    logging.info('Extracting the date in numeric format..')
    date_response = date_extractor.run(query)

    filter_date = None
    if date_response != 'False':
        # The model is prompted to emit JSON but may return malformed text;
        # previously json.loads would raise and crash the whole request.
        # Fall back to the unfiltered path instead.
        try:
            filter_date = json.loads(date_response)
        except ValueError:
            logging.warning('Could not parse date response %r; skipping the date filter.', date_response)

    if filter_date:
        logging.info(f'Date parameters retrieved: {filter_date}')
        logging.info('Running the qa with filtered context..')
        filtered_context = search_with_filter(vs, query, init_k=200, step=300, target_k=7, filter_dict=filter_date)

        logging.info(20 * '-' + 'Metadata for the documents to be used' + 20 * '-')
        for doc in filtered_context:
            logging.info(doc.metadata)
    else:
        logging.info('No date elements found. Running the qa without filtering can output incorrect results.')
        filtered_context = vs.similarity_search(query, k=7)
    # Cap the context at 7 documents to keep the 'stuff' chain within limits.
    return fed_chain({'input_documents': filtered_context[:7], 'question': query})['output_text']
68
+
69
+
70
if __name__ == '__main__':
    # Gradio UI: the user supplies a query plus their own OpenAI API key,
    # and the answer produced by get_chain is rendered in a single textbox.
    app = gr.Interface(fn=get_chain,
                       inputs=[gr.Textbox(lines=2, placeholder="Enter your query", label='Your query'),
                               gr.Textbox(lines=1, placeholder="Your OpenAI API key here", label='OpenAI Key')],
                       description='Query the public database in FRED from 1936-2023',
                       outputs=gr.Textbox(lines=1, label='Answer'),
                       title='Chat with the FOMC meeting minutes',
                       examples=[['What was the economic outlook from the staff presented in the meeting '
                                  'of April 2009 with respect to labour market developments and industrial production?'],
                                 ['Who were the voting members present in the meeting on March 2010?'],
                                 ['How important was the pandemic of Covid-19 in the discussions during 2020?'],
                                 ['What was the impact of the oil crisis for the economic outlook during 1973?']],
                       )
    app.launch()