|
import streamlit as st |
|
import os |
|
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs |
|
from haystack.schema import Answer |
|
from haystack.document_stores import InMemoryDocumentStore |
|
from haystack.pipelines import ExtractiveQAPipeline |
|
from haystack.nodes import FARMReader, TfidfRetriever |
|
import logging |
|
from markdown import markdown |
|
from annotated_text import annotation |
|
from PIL import Image |
|
|
|
# Set TOKENIZERS_PARALLELISM to "false" before any model is loaded below.
# NOTE(review): presumably this silences the tokenizers fork/parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
|
|
|
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Build the extractive question-answering pipeline used by the app.

    Creates an in-memory document store, fills it through
    ``load_and_write_data``, and wires a TF-IDF retriever and a FARM reader
    into an ``ExtractiveQAPipeline``.

    The ``st.cache`` decorator is configured to skip hashing of
    ``SwigPyObject`` values and to allow the returned object to be mutated.

    Returns:
        The assembled ``ExtractiveQAPipeline``.
    """
    store = InMemoryDocumentStore()
    load_and_write_data(store)

    # The retriever is created only after the store has been populated.
    tfidf = TfidfRetriever(document_store=store)

    return ExtractiveQAPipeline(
        FARMReader(
            model_name_or_path="deepset/roberta-base-squad2-distilled",
            use_gpu=True,
        ),
        tfidf,
    )
|
|
|
|
|
def load_and_write_data(document_store, doc_dir='./amazon_help_docs'):
    """Convert the files under ``doc_dir`` to documents and index them.

    Args:
        document_store: Store that receives the converted documents through
            its ``write_documents`` method.
        doc_dir: Directory handed to ``convert_files_to_docs``. Defaults to
            ``'./amazon_help_docs'``, the location used when no directory is
            given.
    """
    docs = convert_files_to_docs(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    document_store.write_documents(docs)
|
|
|
|
|
# Module-level pipeline used by ask_question; start_haystack is wrapped in st.cache.
pipeline = start_haystack()
|
|
|
|
|
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in the session state unless ``key`` is already set.

    Args:
        key: Session-state key to initialise.
        value: Value written only when ``key`` is missing.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
|
|
|
|
|
# Seed the session with a starting question and an empty result slot.
set_state_if_absent(key="question", value="What is amazon music?")
set_state_if_absent(key="results", value=None)
|
|
|
|
|
def reset_results(*args):
    """Clear the stored results.

    Registered as the ``on_change`` callback of the question input; any
    positional arguments are accepted and ignored.
    """
    st.session_state["results"] = None
|
|
|
|
|
|
|
|
|
# Banner image rendered at the top of the page.
image = Image.open('got-haystack.png')
st.image(image)

# Introductory blurb.
# NOTE(review): the text mentions Game of Thrones while load_and_write_data
# indexes './amazon_help_docs' by default - confirm the intended copy.
st.markdown("""
This QA demo uses a [Haystack Extractive QA Pipeline](https://haystack.deepset.ai/components/ready-made-pipelines#extractiveqapipeline) with
an [InMemoryDocumentStore](https://haystack.deepset.ai/components/document-store) which contains documents about Game of Thrones 👑
Go ahead and ask questions about the marvellous kingdom!
""", unsafe_allow_html=True)

# Question box, pre-filled from the session state; editing it clears the
# previous results through the reset_results callback.
question = st.text_input("", value=st.session_state.question, max_chars=100, on_change=reset_results)
|
|
|
|
|
def ask_question(question):
    """Run ``question`` through the pipeline and flatten the answers into dicts.

    The retriever is asked for 10 candidates and the reader for 5 answers.

    Args:
        question: Query string handed to ``pipeline.run``.

    Returns:
        A list with one dict per entry of ``prediction["answers"]``. Entries
        that carry answer text hold ``context`` (wrapped in ``...``),
        ``answer``, ``relevance`` (score times 100, rounded to two decimals)
        and ``offset_start_in_doc``. Entries without answer text hold
        ``context`` and ``answer`` set to ``None`` plus ``relevance``.
    """
    run_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    prediction = pipeline.run(query=question, params=run_params)

    hits = []
    for raw in prediction["answers"]:
        fields = raw.to_dict()

        # No answer text: keep only the score so the UI can report uncertainty.
        if not fields["answer"]:
            hits.append(
                {
                    "context": None,
                    "answer": None,
                    "relevance": round(fields["score"] * 100, 2),
                }
            )
            continue

        hits.append(
            {
                "context": "..." + fields["context"] + "...",
                "answer": fields["answer"],
                "relevance": round(fields["score"] * 100, 2),
                "offset_start_in_doc": fields["offsets_in_document"][0]["start"],
            }
        )
    return hits
|
|
|
|
|
# Run the pipeline for the current question and keep the answers in the session.
if question:
    with st.spinner("👑 Performing semantic search on royal scripts..."):
        try:
            # Lazy %-style argument: the message is only formatted if INFO is enabled.
            logging.info("Asked %s", question)
            st.session_state.results = ask_question(question)
        except Exception as e:
            # Top-level UI boundary: record the traceback and tell the user,
            # rather than letting the page silently render nothing.
            logging.exception(e)
            st.error("Something went wrong while searching. Please try again.")
|
|
|
# Render the stored results: highlighted context plus relevance for each
# answer, or an info box for entries that carry no answer text.
if st.session_state.results:
    st.write('## Top Results')
    for result in st.session_state.results:
        if result["answer"]:
            answer, context = result["answer"], result["context"]
            # partition() splits around the first occurrence of the answer. If the
            # answer is not found, `before` is the whole context and `after` is
            # empty, so the highlight is appended instead of being spliced in at
            # a bogus index.
            before, _, after = context.partition(answer)
            highlighted = str(
                annotation(body=answer, label="ANSWER", background="#964448", color='#ffffff')
            )
            st.write(
                markdown(before + highlighted + after),
                unsafe_allow_html=True,
            )
            st.markdown(f"**Relevance:** {result['relevance']}")
        else:
            st.info(
                "🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
            )
|
|
|
|
|
|