|
|
|
import glob, os, sys; |
|
sys.path.append('../utils') |
|
|
|
|
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
from utils.target_classifier import load_targetClassifier, target_classification |
|
import logging |
|
logger = logging.getLogger(__name__) |
|
from utils.config import get_classifier_params |
|
from io import BytesIO |
|
import xlsxwriter |
|
import plotly.express as px |
|
|
|
|
|
# Identifier of this classifier; used to fetch its parameters and to build
# the session-state key under which the loaded model is stored.
classifier_identifier = 'target'
params = get_classifier_params(classifier_identifier)

# NOTE(review): presumably maps raw model labels to display labels; it is
# not referenced in the visible part of this module - confirm usage.
_lab_dict = {
    'NEGATIVE': 'NO TARGET INFO',
    'TARGET': 'TARGET',
}
|
|
|
@st.cache_data
def to_excel(df):
    """Serialize ``df`` to an in-memory ``.xlsx`` workbook.

    The frame is written without its index to a sheet named ``Sheet1``.
    Column E of every data row receives a drop-down list validation with
    the choices ``No`` / ``Yes`` / ``Discard``.

    Args:
        df: pandas DataFrame to export.

    Returns:
        bytes: the binary content of the generated workbook.
    """
    row_count = len(df)
    buffer = BytesIO()

    # The context manager finalises the workbook on exit. The former
    # ``writer.save()`` call was removed in pandas 2.0.
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')

        # An empty frame has no data rows, so there is nothing to validate
        # (and 'E2:E1' would not be a meaningful range).
        if row_count > 0:
            worksheet = writer.sheets['Sheet1']
            # Row 1 holds the header, so the data occupies rows
            # 2 .. row_count + 1; the range must include the last row.
            worksheet.data_validation(
                'E2:E{}'.format(row_count + 1),
                {'validate': 'list',
                 'source': ['No', 'Yes', 'Discard']},
            )

    return buffer.getvalue()
|
|
|
def app():
    """Run the target classifier on the paragraphs held in session state.

    Reads the DataFrame stored under ``st.session_state.key0``. If that key
    is missing, nothing is classified. Otherwise the classifier named by
    ``params['model_name']`` is loaded and stored in session state under
    ``'<classifier_identifier>_classifier'``, the paragraphs are passed to
    ``target_classification`` with ``params['threshold']``, and the result
    is stored under ``st.session_state.key1``.
    """
    with st.container():
        # Nothing to classify until an upstream step has populated key0.
        if 'key0' not in st.session_state:
            return

        paragraphs = st.session_state.key0

        classifier = load_targetClassifier(classifier_name=params['model_name'])
        st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

        st.session_state.key1 = target_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'],
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def target_display():
    """Render target-classification summary counts and the top results.

    Reads the DataFrame stored under ``st.session_state.key1`` and keeps the
    rows whose ``'Target Label'`` equals ``'TARGET'``. If that key is missing
    nothing is rendered; if no row matches, an info message is shown.
    Otherwise four counts are written in two columns, followed by up to five
    matching rows ordered by descending ``'Relevancy'``.
    """
    if 'key1' not in st.session_state:
        return

    df = st.session_state.key1
    hits = df[df['Target Label'] == 'TARGET']

    if len(hits) == 0:
        # The literal was mojibake ("π€"); restored to the intended emoji.
        st.info("🤔 No Targets found")
        return

    # hits is already filtered on 'Target Label', so every row counts.
    count_target = len(hits)
    count_netzero = int((hits['Netzero Label'] == 'NETZERO').sum())
    count_ghg = int((hits['GHG Label'] == 'LABEL_2').sum())
    # Membership test is kept per-element so it works whether a cell holds
    # a string or a collection of sector names.
    count_economy = sum('Economy-wide' in x for x in hits['Sector Label'])

    c1, c2 = st.columns([1, 1])
    with c1:
        st.write('**Target Paragraphs**: `{}`'.format(count_target))
        st.write('**NetZero Related Paragraphs**: `{}`'.format(count_netzero))
    with c2:
        st.write('**GHG Related Paragraphs**: `{}`'.format(count_ghg))
        st.write('**Economy-wide Related Paragraphs**: `{}`'.format(count_economy))

    st.write("")
    st.markdown("###### Top few Target Classified paragraph/text results ######")

    # Show at most the five most relevant paragraphs.
    top_hits = hits.sort_values(by=['Relevancy'], ascending=False).head(5)
    for rank, (_, row) in enumerate(top_hits.iterrows(), start=1):
        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(
            rank, row['page'], row['Relevancy']))
        st.write("\t Text: \t{}".format(row['text'].replace("\n", " ")))