File size: 8,299 Bytes
6829fd5
 
 
 
 
 
 
 
 
 
 
 
 
 
2e05b67
 
 
 
 
 
6829fd5
 
 
 
2e05b67
6829fd5
 
 
f8734ec
 
6829fd5
 
 
 
 
 
3f5be66
6829fd5
 
 
 
 
 
 
 
 
 
 
 
295a965
6829fd5
 
 
 
 
 
295a965
 
 
 
 
6829fd5
295a965
 
 
6829fd5
 
 
 
 
 
 
 
 
295a965
6829fd5
 
 
 
 
 
 
a295957
6829fd5
 
 
82c8dd7
 
 
6829fd5
3f5be66
82c8dd7
a295957
 
6829fd5
 
 
 
 
 
 
295a965
6829fd5
82c8dd7
6829fd5
 
 
 
 
 
295a965
6829fd5
3f5be66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6829fd5
2e05b67
 
6829fd5
c8a9cbc
2e05b67
 
 
 
 
 
3f5be66
 
6829fd5
2e05b67
6829fd5
2e05b67
 
 
 
3f5be66
 
 
 
 
6829fd5
 
2e05b67
6829fd5
 
3f5be66
6829fd5
 
2e05b67
 
6829fd5
3f5be66
 
 
6829fd5
 
a295957
3f5be66
 
 
 
 
 
 
 
 
 
 
 
6829fd5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import torch
# Startup diagnostic: report whether CUDA is usable and, if so, which device
# is current. Nothing here is fatal -- every failure is printed and the
# script carries on.
try:
    cuda_ready = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_ready}")
    if not cuda_ready:
        print("No CUDA device available - using CPU")
    else:
        # Looking up the device name can fail independently of the
        # availability probe, so it gets its own handler.
        try:
            device_index = torch.cuda.current_device()
            device_name = torch.cuda.get_device_name(device_index)
            print(f"CUDA device: {device_name}")
        except Exception as name_error:
            print(f"Error getting CUDA device name: {name_error}")
except Exception as probe_error:
    print(f"Error checking CUDA availability: {probe_error}")
    print("Continuing with CPU...")


# Configure logging before the remaining imports below run.
# NOTE(review): presumably this ordering lets loggers created while importing
# modules.auth / modules.utils pick up the configuration -- confirm against
# modules.logging_config.
from modules.logging_config import setup_logging
setup_logging()
import logging
logger = logging.getLogger(__name__)  # module-level logger used throughout this script

import streamlit as st
import os
from huggingface_hub import login
from datetime import datetime
from modules.auth import validate_login
from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data

# Local development only: uncomment the two lines below to load environment
# variables from a .env file.
# from dotenv import load_dotenv
# load_dotenv()


# Main app logic
# Sensitivity label -> score threshold passed to process_data for the
# review / reject decision.
_SENS_OPTIONS = {
    "Low": 4,
    "Medium": 5,
    "High": 6,
}

# Session-state keys used by the authenticated app and their initial values.
_STATE_DEFAULTS = {
    'data_processed': False,    # True once process_data has produced 'df'
    'df': None,                 # output of process_data for the current input
    'show_button': True,        # whether "Start Analysis" may be offered
    'processing': False,        # True from "Start Analysis" until download/reset
    'processed_key': None,      # signature of the input that produced 'df'
    'processing_error': None,   # message to display once after a failed run
    'hf_logged_in': False,      # Hugging Face login already done this session
}

_PROCESSING_ERROR_MESSAGE = (
    "Failed to process the file. Please ensure your column names match the template file."
)


def _init_state() -> None:
    """Seed any missing session-state keys with their defaults."""
    for key, default in _STATE_DEFAULTS.items():
        if key not in st.session_state:
            st.session_state[key] = default


def _reset_analysis_state() -> None:
    """Return the analysis workflow to its idle state and drop cached results.

    Used as the download button's ``on_click`` callback, after a failed run,
    and whenever the cached result no longer matches the current inputs.
    """
    st.session_state['show_button'] = True
    st.session_state['processing'] = False
    st.session_state['data_processed'] = False
    st.session_state['df'] = None
    st.session_state['processed_key'] = None


def _ensure_hf_login() -> None:
    """Log in to the Hugging Face Hub once per browser session.

    Streamlit re-executes the script on every widget interaction, so the login
    is guarded by a session flag instead of being repeated on each rerun. A
    missing ``HF_TOKEN`` is reported as a readable error and halts the page.
    """
    if st.session_state['hf_logged_in']:
        return

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        logger.error("HF_TOKEN environment variable is not set")
        st.error("Configuration error: the HF_TOKEN environment variable is not set.")
        st.stop()

    login(token=hf_token, add_to_git_credential=True)
    st.session_state['hf_logged_in'] = True


def _render_sidebar() -> int:
    """Render the sidebar and return the selected sensitivity threshold.

    The sidebar holds the usage instructions, the Excel template download and
    the sensitivity selector (default: "High").
    """
    with st.sidebar:
        with st.expander("ℹ️ - Instructions", expanded=False):
            st.markdown(
                    """
                    1. **Download the Excel Template file (below)**
                    2. **[OPTIONAL]: Select the desired filtering sensitivity level (below)**
                    3. **Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'**
                    4. **Upload the template file in the area to the right (or click browse files)**
                    5. **Click 'Start Analysis'**

                    The tool will start processing the uploaded application data. This can take some time
                    depending on the number of applications and the length of text in each. For example, a file with 1000 applications
                    could be expected to take approximately 5 minutes.

                    ***NOTE (1)** -  you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*

                    """
            )

        # Excel file download
        st.download_button(
            label="Download Excel Template",
            data=create_excel(),
            file_name="upload_template.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Sensitivity level used for review / reject (ref. process_data function).
        # The help text is kept exactly as authored, continuation whitespace included.
        sens_input = st.sidebar.radio(label = 'Select the Sensitivity Level [OPTIONAL]',
                                    help = 'Decreasing the level of sensitivity results in less \
                                    applications filtered out. This also \
                                    reduces the probability of false negatives (FNs). The rate of \
                                    FNs at the lowest setting is approximately 6 percent, and \
                                    approaches 13 percent at the highest setting. \
                                    NOTE: changing this setting does not affect the raw data in the CSV output file (only the labels)',
                                    options = list(_SENS_OPTIONS.keys()),
                                    index = list(_SENS_OPTIONS.keys()).index("High"),
                                    horizontal = False)

    return _SENS_OPTIONS[sens_input]


def _render_about() -> None:
    """Render the collapsible "About this app" section with the pipeline diagram."""
    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
                """
                This tool provides an interface for running an automated preliminary assessment of applications for a call for applications.

                The tool functions by running selected text fields from the application through a series of LLMs fine-tuned for text classification (ref. diagram below).
                The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against 
                human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).
                
                """)
        st.image('images/pipeline.png')


def _input_signature(uploaded_file, sens_level):
    """Identify one (uploaded file, sensitivity) combination.

    Compared against the signature stored with the cached result so that a
    replaced file or a changed sensitivity level is detected.
    """
    return (
        getattr(uploaded_file, 'file_id', None),
        uploaded_file.name,
        getattr(uploaded_file, 'size', None),
        sens_level,
    )


def _run_analysis(uploaded_file, sens_level, request_key) -> None:
    """Process the uploaded file (once) and offer the result as a CSV download.

    The processed frame is cached in session state, so reruns triggered while
    the download button is visible do not repeat the processing. On failure
    the workflow is reset and the error message is stored for the next run,
    because ``st.rerun()`` discards anything rendered in the current run.
    """
    try:
        logger.info("File uploaded: %s", uploaded_file.name)

        if not st.session_state['data_processed']:
            logger.info("Starting data processing")
            st.session_state['df'] = process_data(uploaded_file, sens_level)
            st.session_state['processed_key'] = request_key
            st.session_state['data_processed'] = True
            logger.info("Data processing completed successfully")

        csv_buffer = st.session_state['df'].to_csv(index=False).encode()
        logger.info("CSV buffer created successfully")
    except Exception:
        logger.exception("Error processing file")
        _reset_analysis_state()
        st.session_state['processing_error'] = _PROCESSING_ERROR_MESSAGE
        st.rerun()
        return  # st.rerun() does not return; this only makes the control flow explicit

    current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
    st.download_button(
        label="Download data as CSV",
        data=csv_buffer,
        file_name=f'processed_applications_{current_datetime}.csv',
        mime='text/csv',
        on_click=_reset_analysis_state
    )


def _render_app() -> None:
    """Render the authenticated application: sidebar, upload, analysis, download."""
    _init_state()
    _ensure_hf_login()

    st.title('Application Pre-Filtering Tool')
    sens_level = _render_sidebar()
    _render_about()

    uploaded_file = st.file_uploader("Select a file containing application pre-filtering data (see instructions in the sidebar)")

    # Show a failure recorded by the previous run, then clear it so it is
    # displayed only once.
    error_message = st.session_state['processing_error']
    if error_message:
        st.session_state['processing_error'] = None
        st.error(error_message)

    if uploaded_file is None:
        # The file was removed: drop any analysis tied to it instead of
        # failing on a missing upload.
        if st.session_state['processing']:
            _reset_analysis_state()
        return

    # Discard a cached result that was computed for a different file or
    # sensitivity level, so stale output is never offered for download.
    request_key = _input_signature(uploaded_file, sens_level)
    if st.session_state['data_processed'] and st.session_state['processed_key'] != request_key:
        _reset_analysis_state()

    # Only offer the button when idle; clicking it flips the state and reruns
    # so the button disappears while processing.
    if st.session_state['show_button'] and not st.session_state['processing']:
        if st.button("Start Analysis", key="start_analysis"):
            st.session_state['show_button'] = False
            st.session_state['processing'] = True
            st.rerun()

    if st.session_state['processing']:
        _run_analysis(uploaded_file, sens_level, request_key)


def _render_login() -> None:
    """Render the username/password form and authenticate the session."""
    username = st.text_input("Username")
    password = st.text_input("Password", type="password")
    if st.button("Login"):
        if validate_login(username, password):
            st.session_state['authenticated'] = True
            st.rerun()
        else:
            st.error("Incorrect username or password")


# Main app logic
def main() -> None:
    """Entry point: show the login form until the session is authenticated,
    then show the pre-filtering application."""
    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = False

    if st.session_state['authenticated']:
        _render_app()
    else:
        _render_login()


# Script entry point: build the page by calling main() whenever this file runs.
main()