Spaces:

mtyrrell
/

maf_prefilter_app

Running

App Files Files Community

maf_prefilter_app / app.py

mtyrrell

dotenv

f2efe55 5 months ago

raw

history blame contribute delete

9.14 kB

	import torch
	# Startup diagnostic: print whether torch can see a CUDA device and, if so,
	# its name. Every failure is printed and otherwise ignored so that the rest
	# of the script keeps loading.
	try:
	    print(f"Is CUDA available: {torch.cuda.is_available()}")
	    if not torch.cuda.is_available():
	        print("No CUDA device available - using CPU")
	    else:
	        # The device-name lookup gets its own handler so that a failure here
	        # is reported separately from a failure of the availability check.
	        try:
	            print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
	        except Exception as device_err:
	            print(f"Error getting CUDA device name: {device_err!s}")
	except Exception as probe_err:
	    print(f"Error checking CUDA availability: {probe_err!s}")
	    print("Continuing with CPU...")

	import streamlit as st
	import os
	from huggingface_hub import login
	from datetime import datetime
	from modules.auth import validate_login
	from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
	from modules.logging_config import setup_logging
	# Logging is configured here, at import time, before the module-level logger
	# below is created.
	# NOTE(review): setup_logging lives in modules/logging_config, which is not
	# visible from this file - confirm what handlers/levels it installs.
	setup_logging()
	import logging
	from io import BytesIO

	# Module-level logger named after this module; used by main() for progress
	# and error reporting.
	logger = logging.getLogger(__name__)

	# Local
	# from dotenv import load_dotenv
	# load_dotenv()


	# Main app logic
	def _reset_analysis_state():
	    """Return the analysis workflow to its initial, ready-to-start state.

	    Re-enables the 'Start Analysis' button and clears both the
	    ``processing`` and ``data_processed`` flags, so the next analysis calls
	    ``process_data`` again instead of reusing a previously stored result.
	    """
	    st.session_state['show_button'] = True
	    st.session_state['processing'] = False
	    st.session_state['data_processed'] = False


	def _fail_and_rerun(message):
	    """Queue ``message`` for display, reset the workflow and rerun the script.

	    ``st.rerun()`` restarts the script immediately, so anything rendered
	    just before it is discarded. The message is therefore stored in session
	    state and rendered by ``main()`` on the following run.

	    Args:
	        message: User-facing error text to show after the rerun.
	    """
	    st.session_state['error_message'] = message
	    _reset_analysis_state()
	    st.rerun()


	def main():
	    """Render the Application Pre-Filtering Tool.

	    Unauthenticated sessions only see a login form. Authenticated sessions
	    get the sidebar (instructions, template download, sensitivity level),
	    a file uploader and a 'Start Analysis' button. Once started, the
	    uploaded file is passed to ``process_data`` together with the selected
	    sensitivity level and the resulting DataFrame is offered as an Excel
	    download. Workflow state is kept in ``st.session_state`` under the keys
	    ``authenticated``, ``show_button``, ``processing``, ``data_processed``,
	    ``df`` and ``error_message``.
	    """
	    # Sessions start unauthenticated; the login form below flips this flag.
	    if 'authenticated' not in st.session_state:
	        st.session_state['authenticated'] = False

	    if not st.session_state['authenticated']:
	        username = st.text_input("Username")
	        password = st.text_input("Password", type="password")
	        if st.button("Login"):
	            if validate_login(username, password):
	                st.session_state['authenticated'] = True
	                st.rerun()
	            else:
	                st.error("Incorrect username or password")
	        return

	    # Fail with a readable message instead of a raw KeyError when the
	    # deployment is missing its Hugging Face token.
	    hf_token = os.environ.get("HF_TOKEN")
	    if not hf_token:
	        logger.error("HF_TOKEN environment variable is not set")
	        st.error("Configuration error: the HF_TOKEN environment variable is not set.")
	        st.stop()
	    login(token=hf_token, add_to_git_credential=True)

	    # Initialise every workflow key once per session.
	    session_defaults = {
	        'data_processed': False,
	        'df': None,
	        'show_button': True,
	        'processing': False,
	    }
	    for state_key, default_value in session_defaults.items():
	        if state_key not in st.session_state:
	            st.session_state[state_key] = default_value

	    # Main Streamlit app
	    st.title('Application Pre-Filtering Tool')

	    # Sidebar (filters)
	    with st.sidebar:
	        with st.expander("ℹ️ - Instructions", expanded=False):
	            st.markdown(
	"""
	1. Download the Excel Template file (below)
	2. [OPTIONAL]: Select the desired filtering sensitivity level (below)
	3. Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'
	4. Upload the template file in the area to the right (or click browse files)
	5. Click 'Start Analysis'

	The tool will start processing the uploaded application data. This can take some time
	depending on the number of applications and the length of text in each. For example, a file with 1000 applications
	could be expected to take approximately 5 minutes.

	*NOTE - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*

	"""
	            )
	        # Excel file download
	        st.download_button(
	            label="Download Excel Template",
	            data=create_excel(),
	            file_name="upload_template.xlsx",
	            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	        )

	        # get sensitivity level for use in review / reject (ref. process_data function)
	        sens_options = {
	            "Low": 4,
	            "Medium": 5,
	            "High": 6,
	        }

	        sens_input = st.sidebar.radio(label = 'Select the Sensitivity Level [OPTIONAL]',
	            help = 'Decreasing the level of sensitivity results in less \
	applications filtered out. This also \
	reduces the probability of false negatives (FNs). The rate of \
	FNs at the lowest setting is approximately 6 percent, and \
	approaches 13 percent at the highest setting. \
	NOTE: changing this setting does not affect the raw data in the CSV output file (only the labels)',
	            options = list(sens_options.keys()),
	            index = list(sens_options.keys()).index("High"),
	            horizontal = False)

	        sens_level = sens_options[sens_input]

	    with st.expander("ℹ️ - About this app", expanded=False):
	        st.write(
	"""
	This tool provides an interface for running an automated preliminary assessment of applications for a call for applications.

	The tool functions by running selected text fields from the application through a series of LLMs fine-tuned for text classification (ref. diagram below).
	The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against
	human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).

	""")
	        st.image('images/pipeline.png')

	    uploaded_file = st.file_uploader("Select a file containing application pre-filtering data (see instructions in the sidebar)")

	    # Show (once) any error queued by _fail_and_rerun() during the previous run.
	    if 'error_message' in st.session_state:
	        st.error(st.session_state['error_message'])
	        del st.session_state['error_message']

	    # The upload was removed while an analysis was active: drop back to the
	    # initial state instead of dereferencing a missing file below.
	    if uploaded_file is None and st.session_state['processing']:
	        _reset_analysis_state()

	    # Only show the button if show_button is True and file is uploaded and not processing
	    if uploaded_file is not None and st.session_state['show_button'] and not st.session_state['processing']:
	        if st.button("Start Analysis", key="start_analysis"):
	            st.session_state['show_button'] = False
	            st.session_state['processing'] = True
	            st.rerun()

	    # If we're processing, show the processing logic
	    if st.session_state['processing']:
	        try:
	            logger.info(f"File uploaded: {uploaded_file.name}")

	            # Reruns while results are on screen reuse the stored DataFrame
	            # rather than running the analysis again.
	            if not st.session_state['data_processed']:
	                logger.info("Starting data processing")
	                try:
	                    st.session_state['df'] = process_data(uploaded_file, sens_level)
	                except ValueError as e:
	                    # Handle specific validation errors: the exception text
	                    # is shown to the user as-is.
	                    logger.error(f"Validation error: {str(e)}")
	                    _fail_and_rerun(str(e))
	                except Exception as e:
	                    # Handle other unexpected errors; keep the traceback in the log.
	                    logger.exception(f"Error in process_data: {str(e)}")
	                    _fail_and_rerun("An unexpected error occurred. Please check your input file and try again.")
	                else:
	                    logger.info("Data processing completed successfully")
	                    st.session_state['data_processed'] = True

	            df = st.session_state['df']

	            # Create Excel buffer
	            excel_buffer = BytesIO()
	            df.to_excel(excel_buffer, index=False, engine='openpyxl')
	            excel_buffer.seek(0)

	            current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
	            output_filename = f'processed_applications_{current_datetime}.xlsx'

	            # Downloading the results resets the workflow so a new analysis
	            # can be started.
	            st.download_button(
	                label="Download Analysis Data File",
	                data=excel_buffer,
	                file_name=output_filename,
	                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
	                on_click=_reset_analysis_state
	            )

	        except Exception as e:
	            logger.exception(f"Error processing file: {str(e)}")
	            _fail_and_rerun("Failed to process the file. Please ensure your column names match the template file.")



	# Run the app only when this file is executed as the script entry point, so
	# importing the module (e.g. from a test) does not start rendering the UI.
	if __name__ == "__main__":
	    main()