"""Streamlit page for the vulnerability classifier.

Exposes three callables:

* ``to_excel`` - serialise a results dataframe to an in-memory ``.xlsx`` file
  with drop-down validation on the review columns.
* ``app`` - run the vulnerability classifier on the processed document.
* ``vulnerability_display`` - render summary text, a bar chart and a table of
  the paragraphs that reference vulnerable groups.
"""

# Standard library
import glob, os, sys
import logging
from collections import Counter
from io import BytesIO

# set path (must happen before the ``utils`` imports below)
sys.path.append('../utils')

# import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
import xlsxwriter
import plotly.express as px

from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
from utils.vulnerability_classifier import label_dict
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck

logger = logging.getLogger(__name__)

# Declare all the necessary variables
classifier_identifier = 'vulnerability'
params = get_classifier_params(classifier_identifier)


@st.cache_data
def to_excel(df, sectorlist):
    """Serialise ``df`` to an in-memory Excel workbook with drop-down cells.

    The dataframe is written to ``Sheet1`` without its index. List validation
    is attached to column S (``No`` / ``Yes`` / ``Discard``) and to columns
    T-X (the entries of ``sectorlist`` plus ``Blank``).

    Args:
        df: Dataframe to export.
        sectorlist: List of sector names offered in the T-X drop-downs.

    Returns:
        The raw bytes of the generated ``.xlsx`` file.
    """
    num_rows = len(df)
    # The header occupies Excel row 1, so data live in rows 2 .. num_rows + 1.
    last_row = num_rows + 1
    output = BytesIO()

    review_options = {'validate': 'list',
                      'source': ['No', 'Yes', 'Discard']}
    sector_options = {'validate': 'list',
                      'source': sectorlist + ['Blank']}

    # The context manager finalises the workbook on exit; ``writer.save()``
    # no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        # With no data rows the range would be inverted (e.g. ``S2:S1``),
        # so validation is only attached when there is something to validate.
        if num_rows > 0:
            worksheet.data_validation(
                'S2:S{}'.format(last_row), review_options)
            for column in ('T', 'U', 'V', 'W', 'X'):
                worksheet.data_validation(
                    '{0}2:{0}{1}'.format(column, last_row), sector_options)

    return output.getvalue()


def app():
    """Run the vulnerability classifier on the processed document.

    Does nothing unless ``st.session_state`` contains ``key0``. Otherwise the
    classifier named by ``params['model_name']`` is loaded and stored in the
    session state under ``'vulnerability_classifier'``, the dataframe in
    ``key0`` is classified using ``params['threshold']``, and the result is
    stored in the session state under ``key1``.
    """
    ### Main app code ###
    with st.container():
        # If a document has been processed
        if 'key0' in st.session_state:
            # Run vulnerability classifier
            df = st.session_state.key0
            classifier = load_vulnerabilityClassifier(
                classifier_name=params['model_name'])
            # NOTE(review): the classifier is not passed to the call below;
            # presumably ``vulnerability_classification`` picks it up from
            # the session state - confirm against utils.
            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

            # Get the predictions
            df = vulnerability_classification(haystack_doc=df,
                                              threshold=params['threshold'])

            # Store df in session state with key1
            st.session_state.key1 = df


def vulnerability_display():
    """Render the vulnerability results stored in the session state.

    Shows the raw dataframe, a two-column layout with summary text (left) and
    a bar chart of label frequencies (right), and finally a table restricted
    to the paragraphs whose ``'Vulnerability Label'`` does not contain
    ``'Other'``. Returns without rendering when ``key0`` is not set.
    """
    # Nothing to display until a document has been processed (mirrors app()).
    if 'key0' not in st.session_state:
        return

    # Assign dataframe a name
    # NOTE(review): app() stores the classified frame under ``key1`` but this
    # reads ``key0``; this only works if classification mutates the frame in
    # place - confirm.
    df_vul = st.session_state['key0']
    st.write(df_vul)

    # True for paragraphs that reference a vulnerable group, i.e. whose label
    # entry does not contain 'Other'. ``astype(bool)`` keeps the mask boolean
    # even for an empty frame.
    # assumes each 'Vulnerability Label' cell is a list of label strings
    # - TODO confirm
    is_reference = df_vul['Vulnerability Label'].apply(
        lambda x: 'Other' not in x).astype(bool)

    col1, col2 = st.columns([1, 1])

    with col1:
        # Header
        st.subheader("Explore references to vulnerable groups:")

        # Text
        num_paragraphs = len(df_vul['Vulnerability Label'])
        num_references = is_reference.sum()

        # NOTE(review): the text mentions a pie chart, but a bar chart is
        # drawn in the right-hand column.
        st.markdown(f"""
The document contains a total of {num_paragraphs} paragraphs. We identified {num_references} references to vulnerable groups.

In the pie chart on the right you can see the distribution of the different groups defined. For a more detailed view in the text, see the paragraphs and their respective labels in the table below.""", unsafe_allow_html=True)

    with col2:
        ### Bar chart

        # Create a df that stores all the labels
        df_labels = pd.DataFrame(list(label_dict.items()),
                                 columns=['Label ID', 'Label'])

        # Count how often each label appears in the "Vulnerability Label"
        # column (every entry of every row's label collection counts once).
        group_counts = Counter(
            label
            for labels in df_vul['Vulnerability Label']
            for label in labels
        )

        # Create a new dataframe from group_counts
        df_label_count = pd.DataFrame(list(group_counts.items()),
                                      columns=['Label', 'Count'])

        # Left merge keeps every known label; labels that never occur get a
        # missing Count.
        df_label_count = df_labels.merge(df_label_count, on='Label', how='left')
        # NOTE(review): this writes the literal text, not the dataframe;
        # looks like a debugging leftover.
        st.write("df_label_count")

        # Bar chart
        fig = px.bar(df_label_count,
                     x='Label',
                     y='Count',
                     title='How many references have been found to each group?',
                     labels={'Count': 'Frequency'})

        # Show plot
        st.plotly_chart(fig, use_container_width=True)

    ### Table
    # Only the paragraphs that reference a vulnerable group.
    st.write(df_vul[is_reference])