mtyrrell committed on
Commit
6829fd5
·
0 Parent(s):

init new space

Browse files
Files changed (7) hide show
  1. .gitignore +6 -0
  2. README.md +12 -0
  3. app.py +144 -0
  4. images/pipeline.png +0 -0
  5. modules/auth.py +13 -0
  6. modules/utils.py +216 -0
  7. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ .DS_Store
3
+ *.csv
4
+ *.xlsx
5
+ /testing/
6
+ /modules/__pycache__/
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MAF Prefilter
3
+ emoji: 🦀
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: streamlit
7
+ sdk_version: 1.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
# Start-up diagnostic: report which compute device torch can see.
# A failure while probing is only logged; the app then carries on.
try:
    cuda_ready = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_ready}")
    if not cuda_ready:
        print("No CUDA device available - using CPU")
    else:
        # Looking up the device name can fail independently of the
        # availability check, so it gets its own handler.
        try:
            device_index = torch.cuda.current_device()
            print(f"CUDA device: {torch.cuda.get_device_name(device_index)}")
        except Exception as name_error:
            print(f"Error getting CUDA device name: {str(name_error)}")
except Exception as probe_error:
    print(f"Error checking CUDA availability: {str(probe_error)}")
    print("Continuing with CPU...")
14
+
15
+ import streamlit as st
16
+ import os
17
+ from huggingface_hub import login
18
+ from datetime import datetime
19
+ from modules.auth import validate_login, check_password
20
+ from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
21
+
22
+ # Local
23
+ # from dotenv import load_dotenv
24
+ # load_dotenv()
25
+
26
+
27
+ # Main app logic
28
def main():
    """Render the MAF pre-filtering page and drive one upload-to-download cycle.

    The page consists of a sidebar (instructions, template download,
    sensitivity selector, "about" panel) and a file uploader. An uploaded
    file is passed to ``process_data`` together with the selected
    sensitivity threshold; the resulting DataFrame is cached in
    ``st.session_state`` and offered as a CSV download.

    NOTE(review): the source this was reconstructed from had lost its
    indentation, so the exact nesting of the sidebar widgets is a best
    guess - confirm against the deployed layout.
    """
    # NOTE(review): authentication is force-enabled for testing, so every
    # visitor is treated as logged in and the login form is never shown.
    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = True

    if not st.session_state['authenticated']:
        return

    # Fail with a readable message instead of a KeyError traceback when the
    # token is not configured.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        st.error("The HF_TOKEN environment variable is not set; cannot log in to the Hugging Face Hub.")
        return
    login(token=hf_token, add_to_git_credential=True)

    # Initialize session state variables
    if 'data_processed' not in st.session_state:
        st.session_state['data_processed'] = False
        st.session_state['df'] = None

    # Main Streamlit app
    st.title('MAF Application Pre-Filtering Tool')

    # Sidebar (filters)
    with st.sidebar:
        with st.expander("ℹ️ - Instructions", expanded=False):
            st.markdown(
                """
                1. **Download the Excel Template file (below).**
                2. **[OPTIONAL]: Select the desired filtering sensitivity level (below).**
                3. **Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'.**
                4. **Upload the template file in the area to the right (or click browse files).**

                The tool will immediately start processing the uploaded application data. This can take considerable time
                depending on the number of applications and the length of text in each. For example, a file with 500 applications
                could be expected to take approximately 20 minutes.

                ***NOTE (1)** - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*

                ***NOTE (2)** - as of April 2024 this app running as a **test version**, NOT on a GPU. So the process can take up to 30 minutes for 20 applications.*
                """
            )
        # Excel file download
        st.download_button(
            label="Download Excel Template",
            data=create_excel(),
            file_name="MAF_upload_template.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # get sensitivity level for use in review / reject (ref. process_data function)
        sens_options = {
            "Low": 4,
            "Medium": 5,
            "High": 7,
        }

        sens_input = st.sidebar.radio(label = 'Select the Sensitivity Level [OPTIONAL]',
                                      help = 'Increasing the level of sensitivity results in more \
                                      applications being filtered out. At the same time, this also \
                                      increases the probability of false negatives (FNs). The rate of \
                                      FNs at the lowest setting is approximately 6 percent, and \
                                      approaches 13 percent at the highest setting. ',
                                      options = list(sens_options.keys()),
                                      horizontal = False)

        sens_level = sens_options[sens_input]

        with st.expander("ℹ️ - About this app", expanded=False):
            st.write(
                """
                This tool provides an interface for running an automated preliminary assessment of applications to the MAF call for applications.

                The tool functions by running selected text fields from the application through a series of 8 LLMs fine-tuned for text classification (ref. diagram below).
                The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against
                human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).

                """)
            st.image('images/pipeline.png')

    uploaded_file = st.file_uploader("Select a file containing MAF application pre-filtering data (see instructions in the sidebar)")

    if uploaded_file is None:
        return

    # Identify the upload so that selecting a different file triggers a fresh
    # run instead of re-serving the previous file's cached results.
    # The sensitivity level is deliberately not part of the key: re-running
    # all models just because the radio changed would be very slow, so the
    # level in effect at processing time is the one that applies.
    upload_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get('processed_upload') != upload_key:
        st.session_state['data_processed'] = False

    try:
        if not st.session_state['data_processed']:
            st.session_state['df'] = process_data(uploaded_file, sens_level)
            st.session_state['data_processed'] = True
            st.session_state['processed_upload'] = upload_key

        df = st.session_state['df']

        # Serialise in memory: a fixed on-disk file would be shared by all
        # concurrent sessions and leave an unclosed file handle behind.
        csv_bytes = df.to_csv(index=False).encode('utf-8')
    except Exception as exc:
        st.error("Failed to process the file. Please ensure your column names match the template file.")
        # Surface the real cause so failures unrelated to column names can
        # still be diagnosed.
        st.caption(f"Details: {type(exc).__name__}: {exc}")
        return

    # Timestamp the download name so repeated exports do not overwrite each other
    current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
    output_filename = 'processed_applications_'+current_datetime+'.csv'

    st.download_button(
        label="Download data as CSV",
        data=csv_bytes,
        file_name=output_filename,
        mime='text/csv',
    )
128
+
129
+
130
+ # Comment out or remove the else block containing login form
131
+ # else:
132
+ # username = st.text_input("Username")
133
+ # password = st.text_input("Password", type="password")
134
+ # if st.button("Login"):
135
+ # if validate_login(username, password):
136
+ # st.session_state['authenticated'] = True
137
+ # st.experimental_rerun()
138
+ # else:
139
+ # st.error("Incorrect username or password")
140
+
141
+
142
+ # Run the main function
143
+ main()
144
+
images/pipeline.png ADDED
modules/auth.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import bcrypt
3
+
4
+ # Helper functions
5
def check_password(provided_password, stored_hash):
    """Return the result of checking *provided_password* against *stored_hash*.

    The password is converted with its ``.encode()`` method before being
    handed to ``bcrypt.checkpw``; *stored_hash* is passed through unchanged
    (the caller in this module supplies it already encoded).
    """
    candidate = provided_password.encode()
    return bcrypt.checkpw(candidate, stored_hash)
7
+
8
def validate_login(username, password):
    """Check *password* against the hash stored for *username* in the environment.

    The hash is looked up in the environment variable named
    ``<USERNAME>_HASH`` (username upper-cased, e.g. ``USER1_HASH``).
    Returns ``False`` when that variable is unset or empty; otherwise
    returns whatever ``check_password`` reports.
    """
    env_key = f"{username.upper()}_HASH"
    user_hash = os.environ.get(env_key)
    if not user_hash:
        return False
    return check_password(password, user_hash.encode())
modules/utils.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import time
3
+ import pandas as pd
4
+ from io import BytesIO
5
+ import streamlit as st
6
+ import torch
7
+ from setfit import SetFitModel
8
+ from transformers import pipeline
9
+ from openpyxl import Workbook
10
+ from openpyxl.styles import Font, NamedStyle, PatternFill
11
+ from openpyxl.styles.differential import DifferentialStyle
12
+
13
+
14
+ # Function for creating Upload template file
15
def create_excel():
    """Build the upload-template workbook and return it as ``.xlsx`` bytes.

    The workbook has a single sheet titled "template" whose first row holds
    the expected column headers, rendered bold on a light-blue fill.
    """
    header = ['id','scope','technology','financial','barrier']

    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = "template"
    worksheet.append(header)  # the headers become row 1

    # Only the first row of the selected range is styled.
    first_row = worksheet['A1:E4'][0]
    for cell in first_row:
        cell.fill = PatternFill('solid', fgColor = 'bad8e1')
        cell.font = Font(bold=True)

    # Serialise into memory rather than to disk
    buffer = BytesIO()
    workbook.save(buffer)
    return buffer.getvalue()
32
+
33
+
34
+ # Function to clean text
35
def clean_text(input_text):
    """Normalise a free-text cell before it is passed to the classifiers.

    Steps, in order:
      1. drop every character that is not an ASCII letter, digit,
         whitespace or one of ``. , : ; ! ? ( ) -``;
      2. remove the literal ``x000D`` (what is left of an ``_x000D_``
         marker once step 1 has removed the underscores);
      3. collapse every run of whitespace, newlines included, into a
         single space.

    Args:
        input_text: The cell value. ``None`` and NaN yield ``""``; any other
            non-string value (e.g. a numeric cell) is converted with
            ``str()`` instead of raising ``TypeError``.

    Returns:
        The cleaned string.
    """
    # Missing values: None, or a float NaN (the only float unequal to itself)
    if input_text is None or (isinstance(input_text, float) and input_text != input_text):
        return ""
    if not isinstance(input_text, str):
        input_text = str(input_text)

    cleaned_text = re.sub(r"[^a-zA-Z0-9\s.,:;!?()\-\n]", "", input_text)
    cleaned_text = re.sub(r"x000D", "", cleaned_text)
    # \s+ also matches newlines, so no separate newline-collapsing pass is
    # needed afterwards.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    return cleaned_text
41
+
42
+
43
+ # # Function for extracting classifications for each SECTOR label
44
def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
    """Pick the two best-scoring labels from one classifier output.

    Args:
        output: Expected to be a list of dicts carrying ``'label'`` and
            ``'score'`` keys.
        ordinal_selection: Accepted but not used by this function.
        threshold: Only items whose score is strictly greater than this
            value are considered.

    Returns:
        ``{"SECTOR1": <best label>, "SECTOR2": <runner-up label>}``. A slot
        is ``None`` when there is no qualifying item for it; both are
        ``None`` when *output* is not a list of dicts.
    """
    top_two = [None, None]

    well_formed = isinstance(output, list) and all(isinstance(entry, dict) for entry in output)
    if not well_formed:
        print("Error: Inner data is not formatted correctly. Each item must be a dictionary.")
    else:
        # Keep the items above the threshold, best score first
        ranked = sorted(
            (entry for entry in output if entry.get('score', 0) > threshold),
            key=lambda entry: entry.get('score', 0),
            reverse=True,
        )
        if not ranked:
            print("Warning: Less than two items above the threshold in the current list.")
        for slot, entry in enumerate(ranked[:2]):
            top_two[slot] = entry.get('label')

    return {"SECTOR1": top_two[0], "SECTOR2": top_two[1]}
73
+
74
+ # Function to call model and run inference for varying classification tasks/models
75
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
    """Load one classifier and run it over every row of *df*.

    Args:
        df: DataFrame holding the cleaned text columns.
        model_name: Task identifier. The names in ``model_names_sf`` are
            loaded as SetFit models and read the ``<prefix>_txt`` column
            derived from the name; every other name is loaded as a
            ``transformers`` text-classification pipeline and reads
            ``scope_txt``.
        progress_bar: Object with a ``progress(fraction)`` method, updated
            after each row.
        repo: Model repository name on the Hugging Face Hub.
        profile: Hub user or organisation owning *repo*.
        multilabel: Forwarded to the pipeline as ``return_all_scores``.

    Returns:
        A list with one entry per row: ``0``/``1`` for the SetFit models
        (``0`` when the prediction equals ``'NEGATIVE'``), the label with a
        trailing ``Label`` removed for ``ADAPMIT``, a
        ``{"SECTOR1", "SECTOR2"}`` dict for ``SECTOR``, and the raw label
        for ``LANG``. Any other *model_name* yields an empty list.
    """
    # Prefer CUDA, then Apple MPS, then CPU. MPS is probed through
    # torch.backends.mps.is_available(): the deprecated torch.has_mps only
    # reports build-time support, not whether the device is usable here.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

    model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
    is_setfit = model_name in model_names_sf
    if is_setfit:
        # e.g. 'scope_lab1' -> 'scope_txt'
        col_name = re.sub(r'_(.*)', r'_txt', model_name)
        model = SetFitModel.from_pretrained(profile+"/"+repo)
        model.to(device)
    else:
        col_name = 'scope_txt'
        model = pipeline("text-classification", model=profile+"/"+repo, device=device, return_all_scores=multilabel)

    predictions = []
    total = len(df)
    for i, text in enumerate(df[col_name]):
        prediction = model(text)
        if is_setfit:
            predictions.append(0 if prediction == 'NEGATIVE' else 1)
        elif model_name == 'ADAPMIT':
            predictions.append(re.sub('Label$', '', prediction[0]['label']))
        elif model_name == 'SECTOR':
            predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
        elif model_name == 'LANG':
            predictions.append(prediction[0]['label'])
        # Update progress bar with each iteration
        progress_bar.progress((i + 1) / total)
    return predictions
103
+
104
+
105
+ # Main function to process data
106
def process_data(uploaded_file, sens_level):
    """Run the full pre-filtering pipeline on an uploaded Excel file.

    Steps:
      1. read the workbook and rename the template headers to internal names;
      2. clean the three text columns;
      3. run every classifier (five SetFit models plus ADAPMIT, SECTOR and
         LANG) while reporting progress in the Streamlit page;
      4. derive the leverage figures from the funding columns;
      5. compute ``pred_score`` and the suggested ``pred_action``.

    Args:
        uploaded_file: File-like object accepted by ``pandas.read_excel``.
        sens_level: Rejection threshold; rows whose ``pred_score`` is
            strictly below it are marked ``REJECT``.

    Returns:
        The processed DataFrame.

    Raises:
        KeyError: If any of the ``scope``, ``technology`` or ``financial``
            columns is missing. This is raised before any model is run.
    """
    text_columns = ['scope_txt', 'tech_txt', 'fin_txt']
    funding_columns = ['maf_funding', 'cont_public', 'cont_private', 'cont_other']

    df = pd.read_excel(uploaded_file)

    # Column renaming and initial processing
    df.rename(columns={
        'id': 'id',
        'scope': 'scope_txt',
        'technology': 'tech_txt',
        'financial': 'fin_txt',
        'barrier': 'bar_txt',
        'maf_funding_requested':'maf_funding',
        'contributions_public_sector':'cont_public',
        'contributions_private_sector':'cont_private',
        'contributions_other':'cont_other'}, inplace=True)

    # Fail fast: without these columns no model can run.
    missing_text = [col for col in text_columns if col not in df.columns]
    if missing_text:
        raise KeyError(f"Missing required column(s) after renaming: {missing_text}")

    # Keep the known columns in a fixed order. The funding columns are
    # optional (the downloadable template does not contain them), so reindex
    # creates any that are absent; they end up as 0 further down instead of
    # raising KeyError after all inference has already been done.
    kept_columns = [col for col in ['id'] + text_columns if col in df.columns] + funding_columns
    df = df.reindex(columns=kept_columns)
    df = df.fillna('')
    for col in text_columns:
        # astype(str) guards against numeric cells in a text column
        df[col] = df[col].astype(str).map(clean_text)

    # Define models and predictions
    model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
    model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG']
    total_predictions = len(model_names) * len(df)
    progress_count = 0

    # UI setup for progress tracking
    st.subheader("Overall Progress:")
    patience_text = st.empty()
    patience_text.markdown("*You may want to grab a coffee, this can take a while...*")
    overall_progress = st.progress(0)
    overall_start_time = time.time()
    estimated_time_remaining_text = st.empty()

    # Model processing
    step_count = 0
    total_steps = len(model_names)
    elapsed_time = 0.0
    for model_name in model_names:
        step_count += 1
        model_processing_text = st.empty()
        model_processing_text.markdown(f'**Current Task: Processing with model "{model_name}"**')
        model_progress = st.empty()
        progress_bar = model_progress.progress(0)

        # Load the model and run inference
        if model_name in model_names_sf:
            df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
        elif model_name == 'ADAPMIT':
            df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
        elif model_name == 'SECTOR':
            sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
            df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
            df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
        elif model_name == 'LANG':
            df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')

        model_progress.empty()

        progress_count += len(df)
        if total_predictions:
            overall_progress.progress(progress_count / total_predictions)

        # Calculate and display estimated time remaining. The step shown is
        # the one about to start, capped at the real number of models.
        elapsed_time = time.time() - overall_start_time
        steps_remaining = total_steps - step_count
        next_step = min(step_count + 1, total_steps)
        if step_count > 1:
            estimated_time_remaining = (elapsed_time / step_count) * steps_remaining
            estimated_time_remaining_text.write(f'Estimated Time Remaining: {estimated_time_remaining:.0f} seconds (step {next_step} of {total_steps})')
        else:
            estimated_time_remaining_text.write(f'Calculating time remaining... (step {next_step} of {total_steps})')

        model_processing_text.empty()

    patience_text.empty()
    estimated_time_remaining_text.empty()

    st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')

    # Convert funding columns to numeric; anything non-numeric becomes 0
    for col in funding_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df[funding_columns] = df[funding_columns].fillna(0)

    # Total co-funding ("leverage") and flags derived from it
    df['lev_total'] = df.apply(lambda x: x['cont_public'] + x['cont_private'] + x['cont_other'], axis=1)
    df['lev_gt_maf'] = df.apply(lambda x: 'True' if x['lev_total'] > x['maf_funding'] else 'False', axis=1)
    df['lev_gt_0'] = (df['lev_total'] > 0).astype(int)

    # Calculate leverage as percentage of MAF funding
    df['lev_maf_%'] = df.apply(lambda x: round(x['lev_total']/x['maf_funding']*100,2) if x['maf_funding'] != 0 else 0, axis=1)

    # Create normalized leverage scale (0-1) where 300% leverage = 1
    df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)

    # Score: weighted sum of the binary labels (max 9), rescaled to 0-10
    sector_classes = ['Energy','Transport','Industries']
    df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0'])/9*10,0), axis=1)

    # Reject when the score is below the selected sensitivity threshold, the
    # text is not en-US, the project is classed as Adaptation, or neither
    # predicted sector is one of sector_classes.
    df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] < sens_level or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)

    return df
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ openpyxl
4
+ setfit
5
+ bcrypt
6
+ --extra-index-url https://download.pytorch.org/whl/cu113
7
+ torch