# Spaces: anakin87/fact-checking-rocks (Running)
# File size: 2,613 Bytes

import shutil
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import Pipeline

import streamlit as st

from app_utils.entailment_checker import EntailmentChecker

from app_utils.config import STATEMENTS_PATH, INDEX_DIR, RETRIEVER_MODEL, RETRIEVER_MODEL_FORMAT, NLI_MODEL

# cached to make index and models load only at start
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """
    Build the fact-checking pipeline.

    Loads the FAISS document store from INDEX_DIR, creates the embedding
    retriever and the entailment checker, and chains them in a Pipeline
    (Query -> retriever -> ec), which is returned.
    """
    # The SQL database file is copied into the current working directory
    # before the document store is loaded.
    db_file = f'{INDEX_DIR}/faiss_document_store.db'
    shutil.copy(db_file, '.')

    index_file = f'{INDEX_DIR}/my_faiss_index.faiss'
    config_file = f'{INDEX_DIR}/my_faiss_index.json'
    store = FAISSDocumentStore(faiss_index_path=index_file,
                               faiss_config_path=config_file)
    print(f'Index size: {store.get_document_count()}')

    embedding_retriever = EmbeddingRetriever(
        document_store=store,
        embedding_model=RETRIEVER_MODEL,
        model_format=RETRIEVER_MODEL_FORMAT,
    )

    # The entailment checker is explicitly kept on CPU.
    checker = EntailmentChecker(model_name_or_path=NLI_MODEL, use_gpu=False)

    # Each node consumes the output of the node registered just before it.
    pipeline = Pipeline()
    node_specs = (
        (embedding_retriever, "retriever", ["Query"]),
        (checker, "ec", ["retriever"]),
    )
    for node, node_name, node_inputs in node_specs:
        pipeline.add_node(component=node, name=node_name, inputs=node_inputs)
    return pipeline

# Module-level pipeline shared by query(); start_haystack() is cached by
# Streamlit, so the index and models are loaded only once.
pipe = start_haystack()
# The pipeline is deliberately not passed as a parameter to the following
# function, because a pipeline argument is difficult to cache.
@st.cache(persist=True, allow_output_mutation=True)
def query(statement: str, retriever_top_k: int = 5):
    """Run the pipeline on a statement and aggregate the entailment scores.

    The statement is sent through the module-level ``pipe``. For every
    returned document, the per-class probabilities found in
    ``doc.meta['entailment_info']`` are weighted by the document's
    ``score``; the weighted averages are stored, rounded to two decimals,
    under ``results['agg_entailment_info']``.

    Args:
        statement: The statement to verify.
        retriever_top_k: Value passed as ``top_k`` to the "retriever" node.

    Returns:
        The pipeline results dict, extended with the key
        ``'agg_entailment_info'`` holding ``'contradiction'``, ``'neutral'``
        and ``'entailment'`` values. If no documents are returned, or the
        document scores sum to zero, all three values are ``0.0``.
    """
    params = {"retriever": {"top_k": retriever_top_k}}
    results = pipe.run(statement, params=params)

    scores, agg_con, agg_neu, agg_ent = 0, 0, 0, 0
    for doc in results['documents']:
        scores += doc.score
        ent_info = doc.meta['entailment_info']
        agg_con += ent_info['contradiction'] * doc.score
        agg_neu += ent_info['neutral'] * doc.score
        agg_ent += ent_info['entailment'] * doc.score

    if scores:
        agg_entailment_info = {
            'contradiction': round(agg_con / scores, 2),
            'neutral': round(agg_neu / scores, 2),
            'entailment': round(agg_ent / scores, 2),
        }
    else:
        # Without any score weight there is no evidence to average; report
        # zeros instead of failing with ZeroDivisionError.
        agg_entailment_info = {
            'contradiction': 0.0,
            'neutral': 0.0,
            'entailment': 0.0,
        }
    results['agg_entailment_info'] = agg_entailment_info

    return results

@st.cache()
def load_statements():
    """Read the statements stored in the file at STATEMENTS_PATH.

    Lines that start with '#' are skipped; every other line is returned
    with its surrounding whitespace stripped, in file order.
    """
    kept = []
    with open(STATEMENTS_PATH) as statements_file:
        for raw_line in statements_file:
            # The '#' check is done on the raw, unstripped line.
            if raw_line.startswith('#'):
                continue
            kept.append(raw_line.strip())
    return kept