# set path
import glob, os, sys
from udfPreprocess.search import semantic_search
sys.path.append('../udfPreprocess')

#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean
from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import logging
logger = logging.getLogger(__name__)
import tempfile
import sqlite3
import json
import configparser

# Categories of the example-keyword file whose entries pre-fill the query box;
# any other radio selection leaves the query box empty for free-text input.
_PRESET_CATEGORIES = ('Food', 'Climate', 'Social', 'Nature', 'Implementation')

# Config file holding the score threshold used by the semantic search branch.
_PARAM_CONFIG_PATH = 'udfPreprocess/paramconfig.cfg'


def _read_semantic_threshold():
    """Return THRESHOLD from the [semantic_search] config section as a float."""
    config = configparser.ConfigParser()
    # Context manager so the config file handle is closed after parsing.
    with open(_PARAM_CONFIG_PATH) as config_file:
        config.read_file(config_file)
    return float(config.get('semantic_search', 'THRESHOLD'))


def _show_lexical_hits(keywords, paraList):
    """Run a BM25 search per keyword over paraList and write the hits to the page."""
    logger.info("performing lexical search")
    tokenized_corpus = bm25TokenizeDoc(paraList)
    document_bm25 = BM25Okapi(tokenized_corpus)

    with st.spinner("Performing Exact matching search (Lexical search) for you"):
        st.markdown("##### Top few lexical search (BM25) hits #####")
        for keyword in keywords:
            bm25_hits = lexical_search(keyword, document_bm25)

            # counter numbers only the hits that are actually displayed.
            counter = 0
            for hit in bm25_hits:
                if hit['score'] <= 0.00:
                    continue
                counter += 1
                if counter == 1:
                    # Heading is emitted lazily so keywords without any
                    # positive-score hit get no empty results section.
                    st.markdown("###### Results for keyword: **{}** ######".format(keyword))
                st.write("\t {}: {}\t".format(
                    counter, paraList[hit['corpus_id']].replace("\n", " ")))

            if counter == 0:
                st.write("No results found for '**{}**' ".format(keyword))

            st.markdown("---")


def _show_semantic_hits(queryList, paraList):
    """Run the semantic search for the raw query text and write the hits to the page."""
    logger.info("starting semantic search")
    with st.spinner("Performing Similar/Contextual search"):
        query = "Find {} related issues ?".format(queryList)
        threshold = _read_semantic_threshold()
        semantic_hits = semantic_search(query, paraList)
        st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))

        for queryhit in semantic_hits:
            # The displayed number is the hit's 1-based position in the list
            # returned by semantic_search, not a count of displayed hits.
            for position, hit in enumerate(queryhit, start=1):
                if hit['score'] > threshold:
                    st.write("\t {}: \t {}".format(
                        position, paraList[hit['corpus_id']].replace("\n", " ")))
            st.markdown("---")


def app():
    """Render the keyword-search page of the Streamlit application.

    The sidebar offers example keyword categories (loaded from
    ``sample/keywordexample.json``) and the search mode. When the
    "Find them" button is pressed, the paragraphs stored in
    ``st.session_state['paraList']`` are searched either lexically (BM25,
    one search per comma-separated keyword) or semantically (one search for
    the whole query text), and the hits are written to the page.

    Returns:
        None. All output is rendered through Streamlit.
    """
    with st.container():
        # NOTE(review): only the bare text "Search" is visible here although
        # unsafe_allow_html=True is passed; the surrounding HTML heading
        # markup appears to have been lost - confirm against the original.
        st.markdown("\n\nSearch\n\n", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface \
            built in Streamlit for doing keyword search in \
            policy document - developed by GIZ Data and the \
            Sustainable Development Solution Network.
            """)
        st.markdown("")

    with st.sidebar:
        with open('sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        # Only the known preset categories pre-fill the query; anything else
        # switches the page to free-text input.
        keywordList = keywordexample[genre] if genre in _PRESET_CATEGORIES else None

        searchtype = st.selectbox(
            "Do you want to find exact macthes or similar meaning/context",
            ['Exact Matches', 'Similar context/meaning'])

    with st.container():
        if keywordList is not None:
            queryList = st.text_input(
                "You selcted the {} category we will look for these keywords in document".format(genre),
                value="{}".format(keywordList))
        else:
            queryList = st.text_input(
                "Please enter here your question and we will look \
                for an answer in the document OR enter the keyword you \
                are looking for and we will \
                we will look for similar context \
                in the document.",
                placeholder="Enter keyword here")

        if st.button("Find them"):
            if not queryList.strip():
                # Whitespace-only input is treated the same as no input.
                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
                logger.warning("Terminated as no keyword provided")
            elif 'docs' not in st.session_state:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logger.warning("Terminated as no document provided")
            else:
                paraList = st.session_state['paraList']
                if searchtype == 'Exact Matches':
                    # Drop surrounding blanks and empty entries (e.g. from a
                    # trailing comma) so each search uses a real keyword.
                    keywords = [
                        keyword.strip()
                        for keyword in queryList.split(",")
                        if keyword.strip()
                    ]
                    _show_lexical_hits(keywords, paraList)
                else:
                    _show_semantic_hits(queryList, paraList)