import logging
log = logging.getLogger('filter methods')
logging.basicConfig(level=logging.INFO)
def filter_docs_by_meta(docs, filter_dict):
    """
    Filter documents by multiple metadata parameters.

    Parameters:
    docs : List[langchain.schema.Document]
    filter_dict : Dict[str, Any]

    Returns: List of documents whose metadata matches every key/value pair in filter_dict

    Examples:
    docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1'),
            langchain.schema.Document(metadata={'a': 1, 'b': 3}, page_content='text2')]

    filter_dict = {'a': 1}
    filter_docs_by_meta(docs, filter_dict)
    [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1'),
     langchain.schema.Document(metadata={'a': 1, 'b': 3}, page_content='text2')]

    filter_dict = {'a': 1, 'b': 2}
    filter_docs_by_meta(docs, filter_dict)
    [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1')]
    """
    filtered_docs = []
    for doc in docs:
        # Keep the document only if every key/value pair in filter_dict matches its metadata.
        append = True
        for key, value in filter_dict.items():
            if doc.metadata[key] != value:
                append = False
                break
        if append:
            filtered_docs.append(doc)
    return filtered_docs
def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50):
    """
    Expand the search with a filter until at least a pre-determined number of documents is reached.

    Parameters
    ----------
    vector_store : langchain.vectorstores.FAISS
        The FAISS vector store.
    query : str
        The query to search for.
    filter_dict : Dict[str, Any]
        The metadata parameters to filter for.
    target_k : int
        The minimum number of documents desired after filtering.
    init_k : int
        The number of top-k documents to retrieve in the initial search.
    step : int
        The amount by which k grows each time the search is enlarged.

    Returns: List of at least target_k Documents for post-processing
    """
    context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
    # Keep widening the raw search until enough documents survive the metadata filter.
    while len(context) < target_k:
        log.info(f'Context contains {len(context)} documents')
        init_k += step
        log.info(f'Expanding search with k={init_k}')
        context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
    log.info(f'Done. Context contains {len(context)} Documents matching the filtering criteria')
    return context
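

# --- Illustrative usage sketch (not part of the original module) ---
# Shows how search_with_filter might be called against a pre-built FAISS store.
# Assumes an index was previously saved locally with a HuggingFace embedding
# model and that the indexed documents carry a 'source' metadata field; the
# index path, query, and filter values below are hypothetical and should be
# adapted to your own setup.
if __name__ == '__main__':
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings()  # hypothetical embedding model choice
    store = FAISS.load_local('faiss_index', embeddings)  # hypothetical saved index path

    results = search_with_filter(
        store,
        query='How do I configure logging?',  # hypothetical query
        filter_dict={'source': 'handbook.pdf'},  # hypothetical metadata filter
        target_k=5,
    )
    for doc in results:
        print(doc.metadata, doc.page_content[:80])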