MonPol / filterminutes.py
ruisp's picture
Increased max number of meta-search iterations to 500
b218606
raw
history blame
3.12 kB
import logging
import gradio as gr
import numpy as np
# Module-level logger used by the functions in this file.
log = logging.getLogger('filter methods')
# Configure the root logger at import time so INFO-level messages are emitted.
# NOTE: per the standard logging docs, basicConfig() does nothing when the root
# logger already has handlers, so an application that configured logging earlier
# keeps its own setup.
logging.basicConfig(level=logging.INFO)
def filter_docs_by_meta(docs, filter_dict):
    """
    Filter documents by multiple metadata parameters.

    A document is kept only when, for every key in ``filter_dict``, its
    ``metadata`` holds that key with a value that does not compare unequal
    (``!=``) to the requested one. A document whose metadata lacks one of the
    keys is treated as a non-match and dropped instead of raising ``KeyError``.
    An empty ``filter_dict`` keeps every document.

    Parameters:
        docs : List[langchain.schema.Document]
            Documents to filter; only their ``metadata`` mapping is read.
        filter_dict : Dict[str, Any]
            Metadata key/value pairs that must all match.

    Returns: List of filtered documents, in their original order.

    Examples:
        docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1'),
                langchain.schema.Document(metadata={'a': 1, 'b': 3}, page_content='text2')]

        filter_dict = {'a': 1}
        filter_docs_by_meta(docs, filter_dict)
        [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1'),
         langchain.schema.Document(metadata={'a': 1, 'b': 3}, page_content='text2')]

        filter_dict = {'a': 1, 'b': 2}
        filter_docs_by_meta(docs, filter_dict)
        [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1')]
    """
    criteria = list(filter_dict.items())

    def _rejects(metadata):
        # A criterion fails when the key is absent or its value differs.
        return any(key not in metadata or metadata[key] != value
                   for key, value in criteria)

    return [doc for doc in docs if not _rejects(doc.metadata)]
def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50):
    """
    Expand search with filter until reaching at least a pre-determined number of documents.

    The store is queried for the top ``init_k`` documents, which are then
    filtered with ``filter_docs_by_meta``. While fewer than ``target_k``
    documents survive the filter, the search is repeated with ``k`` enlarged by
    ``step``. Expansion stops when the target is reached, when the store hands
    back fewer documents than were requested, or when ``k`` would reach
    ``CUT_THE_LOOP_N * init_k``.
    ----------
    Parameters
        vector_store : langchain.vectorstores.FAISS
            The FAISS vector store.
        query : str
            The query to search for.
        filter_dict : Dict[str, Any]
            The parameters to filter for
        target_k : int
            The minimum number of documents desired after filtering
        init_k : int
            The top-k documents to extract for the initial search.
        step : int
            The size of the step when enlarging the search.

    Returns: List of filtered Documents for post-processing. It holds at least
        target_k Documents when the search succeeds; otherwise it holds whatever
        matched in the widest search performed (possibly fewer than target_k).
    """
    hits = vector_store.similarity_search(query, k=init_k)
    context = filter_docs_by_meta(hits, filter_dict)
    len_docs_begin = len(context)
    if len_docs_begin >= target_k:
        log.info(f'Initial search contains {len_docs_begin} Documents. Expansion not required. ')
        return context

    # Upper bound on k, expressed as a multiple of the initial search size.
    CUT_THE_LOOP_N = 1000
    searched_k = init_k
    # Start one step past init_k: the k=init_k search was already done above.
    for top_k_docs in range(init_k + step, CUT_THE_LOOP_N * init_k, step):
        # Fewer raw hits than requested is taken to mean the store has nothing
        # more to return, so a larger k cannot surface new matches.
        # NOTE(review): assumes similarity_search only returns fewer than k
        # documents when the index is exhausted - confirm for non-FAISS stores.
        if len(hits) < searched_k:
            log.info(f'Vector store returned only {len(hits)} documents for k={searched_k}.'
                     f' Stopping expansion.')
            break
        log.info(f'Context contains {len(context)} documents')
        log.info(f'Expanding search with k={top_k_docs}')
        hits = vector_store.similarity_search(query, k=top_k_docs)
        searched_k = top_k_docs
        context = filter_docs_by_meta(hits, filter_dict)
        # ">=" rather than "==": one expansion adds up to `step` candidates, so
        # the filtered count can jump straight past target_k.
        if len(context) >= target_k:
            log.info(f'Success. Context contains {len(context)} Documents matching the filtering criteria')
            return context

    log.info(f'Failed to reach target number of documents after expanding the search to k={searched_k},'
             f' context contains {len(context)} Documents matching the filtering criteria')
    return context