Spaces:
Runtime error
Runtime error
File size: 5,560 Bytes
dd124ec 55d03cf dd124ec 55d03cf dd124ec e21ac83 fedecb4 dd124ec 4413f1a 811e4f7 dd124ec 2a53bc3 4bd6367 dd124ec f75d001 ef1d02b a087ad0 f75d001 576942c fedecb4 b1b6b1b f75d001 65ba45d f75d001 dd124ec 121a5b7 a08237b dd124ec a08237b dd124ec 0368090 dd124ec 9aad845 ee50dc7 d4c667e 9aad845 bca1994 544db84 bca1994 544db84 4bd6367 b1b6b1b fe01337 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# set path
import glob, os, sys; sys.path.append('../scripts')
#import helper
import scripts.process as pre
import scripts.clean as clean
#import needed libraries
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from sklearn.feature_extraction import _stop_words
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever; EmbeddingRetriever
import string
from markdown import markdown
from annotated_text import annotation
from tqdm.autonotebook import tqdm
import numpy as np
import tempfile
import logging
logger = logging.getLogger(__name__)
#Haystack Components
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack(documents_processed):
document_store = InMemoryDocumentStore()
document_store.write_documents(documents_processed)
retriever = EmbeddingRetriever(
document_store=document_store,
embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
model_format="sentence_transformers")
document_store.update_embeddings(retriever)
retriever = EmbeddingRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/tinyroberta-squad2", use_gpu=True) #deepset/roberta-base-squad2
pipeline = ExtractiveQAPipeline(reader, retriever)
return pipeline
def ask_question(question,pipeline):
prediction = pipeline.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
results = []
for answer in prediction["answers"]:
answer = answer.to_dict()
if answer["answer"]:
results.append(
{
"context": "..." + answer["context"] + "...",
"answer": answer["answer"],
"relevance": round(answer["score"] * 100, 2),
"offset_start_in_doc": answer["offsets_in_document"][0]["start"],
}
)
else:
results.append(
{
"context": None,
"answer": None,
"relevance": round(answer["score"] * 100, 2),
}
)
return results
def app():
with st.container():
st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>", unsafe_allow_html=True)
st.write(' ')
st.write(' ')
with st.expander("ℹ️ - About this app", expanded=False):
st.write("""
The *Keyword Search* app is an easy-to-use interface built in Streamlit for doing keyword search in policy document - developed by GIZ Data and the Sustainable Development Solution Network.
""")
st.markdown("")
with st.container():
question = st.text_input("Please enter your question here, we will look for the answer in the document.",
value="Which extreme weather is a particular risk?",)
if st.button("Find them."):
file = st.session_state['file']
if file is not None:
with tempfile.NamedTemporaryFile(mode="wb") as temp:
bytes_data = file.getvalue()
temp.write(bytes_data)
file_name = file.name
file_path = temp.name
# load document
documents = pre.load_document(temp.name,file_name)
documents_processed = pre.preprocessing(documents)
pipeline = start_haystack(documents_processed )
with st.spinner("👑 Performing semantic search on"):#+file.name+"..."):
try:
msg = 'Asked ' + question
logging.info(msg)
results = ask_question(question,pipeline)
st.write('## Top Results')
#st.write(results)
for count, result in enumerate(results):
if result["answer"]:
answer, context = result["answer"], result["context"]
start_idx = context.find(answer)
end_idx = start_idx + len(answer)
st.write(
markdown(context[:start_idx] + str(annotation(body=answer, label="ANSWER", background="#964448", color='#ffffff')) + context[end_idx:]),
unsafe_allow_html=True,
)
st.markdown(f"**Relevance:** {result['relevance']}")
else:
st.info(
"🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
)
except Exception as e:
logging.exception(e)
else:
st.info("🤔 No document found, please try to upload it at the sidebar!") |