Spaces:
Running
Running
Commit
·
6829fd5
0
Parent(s):
init new space
Browse files- .gitignore +6 -0
- README.md +12 -0
- app.py +144 -0
- images/pipeline.png +0 -0
- modules/auth.py +13 -0
- modules/utils.py +216 -0
- requirements.txt +7 -0
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
.DS_Store
|
3 |
+
*.csv
|
4 |
+
*.xlsx
|
5 |
+
/testing/
|
6 |
+
/modules/__pycache__/
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: MAF Prefilter
|
3 |
+
emoji: 🦀
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: red
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.33.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
# Start-up diagnostic: report which compute device is visible to torch.
# Any failure here is only logged; the app carries on regardless.
try:
    cuda_ready = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_ready}")
    if not cuda_ready:
        print("No CUDA device available - using CPU")
    else:
        # Looking up the device name can fail independently of availability,
        # so it gets its own handler and message.
        try:
            active_index = torch.cuda.current_device()
            print(f"CUDA device: {torch.cuda.get_device_name(active_index)}")
        except Exception as e:
            print(f"Error getting CUDA device name: {str(e)}")
except Exception as e:
    print(f"Error checking CUDA availability: {str(e)}")
    print("Continuing with CPU...")
|
14 |
+
|
15 |
+
import streamlit as st
|
16 |
+
import os
|
17 |
+
from huggingface_hub import login
|
18 |
+
from datetime import datetime
|
19 |
+
from modules.auth import validate_login, check_password
|
20 |
+
from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
|
21 |
+
|
22 |
+
# Local
|
23 |
+
# from dotenv import load_dotenv
|
24 |
+
# load_dotenv()
|
25 |
+
|
26 |
+
|
27 |
+
# Main app logic
|
28 |
+
def main():
    """Render the MAF pre-filtering page and drive upload -> processing -> download.

    The page consists of a sidebar (instructions, template download,
    sensitivity selector), an "about" expander and a file uploader. When a
    file is uploaded it is passed to ``process_data`` together with the
    selected sensitivity threshold, and the resulting DataFrame is offered as
    a CSV download. Results are cached in ``st.session_state`` so that
    Streamlit reruns (e.g. clicking the download button) do not repeat the
    model inference.
    """
    # NOTE(review): authentication is force-enabled here ("temporarily ... for
    # testing") and the login form is disabled, so the app is open to anyone
    # who can reach it. Restore the login flow before production use.
    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = True

    if not st.session_state['authenticated']:
        return

    # A missing token used to surface as a raw KeyError traceback.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        st.error("The HF_TOKEN environment variable is not set; cannot authenticate with the Hugging Face Hub.")
        return
    login(token=hf_token, add_to_git_credential=True)

    # Initialize session state variables
    if 'data_processed' not in st.session_state:
        st.session_state['data_processed'] = False
        st.session_state['df'] = None

    # Main Streamlit app
    st.title('MAF Application Pre-Filtering Tool')

    # Sidebar (filters)
    with st.sidebar:
        with st.expander("ℹ️ - Instructions", expanded=False):
            st.markdown(
                """
                1. **Download the Excel Template file (below).**
                2. **[OPTIONAL]: Select the desired filtering sensitivity level (below).**
                3. **Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'.**
                4. **Upload the template file in the area to the right (or click browse files).**

                The tool will immediately start processing the uploaded application data. This can take considerable time
                depending on the number of applications and the length of text in each. For example, a file with 500 applications
                could be expected to take approximately 20 minutes.

                ***NOTE (1)** - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*

                ***NOTE (2)** - as of April 2024 this app running as a **test version**, NOT on a GPU. So the process can take up to 30 minutes for 20 applications.*
                """
            )
        # Excel file download
        st.download_button(
            label="Download Excel Template",
            data=create_excel(),
            file_name="MAF_upload_template.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    # get sensitivity level for use in review / reject (ref. process_data function)
    sens_options = {
        "Low": 4,
        "Medium": 5,
        "High": 7,
    }

    sens_input = st.sidebar.radio(
        label='Select the Sensitivity Level [OPTIONAL]',
        help=(
            'Increasing the level of sensitivity results in more '
            'applications being filtered out. At the same time, this also '
            'increases the probability of false negatives (FNs). The rate of '
            'FNs at the lowest setting is approximately 6 percent, and '
            'approaches 13 percent at the highest setting. '
        ),
        options=list(sens_options.keys()),
        horizontal=False,
    )

    sens_level = sens_options[sens_input]

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            This tool provides an interface for running an automated preliminary assessment of applications to the MAF call for applications.

            The tool functions by running selected text fields from the application through a series of 8 LLMs fine-tuned for text classification (ref. diagram below).
            The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against
            human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).

            """)
        st.image('images/pipeline.png')

    uploaded_file = st.file_uploader("Select a file containing MAF application pre-filtering data (see instructions in the sidebar)")

    if uploaded_file is None:
        return

    # Identify what the cached result was computed from, so that a different
    # file or a different sensitivity level triggers reprocessing instead of
    # silently serving the previous result.
    upload_key = (uploaded_file.name, uploaded_file.size, sens_level)

    try:
        is_stale = st.session_state.get('processed_key') != upload_key
        if not st.session_state['data_processed'] or is_stale:
            st.session_state['data_processed'] = False
            st.session_state['df'] = process_data(uploaded_file, sens_level)
            st.session_state['processed_key'] = upload_key
            st.session_state['data_processed'] = True

        df = st.session_state['df']

        # Timestamp the suggested download name
        current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
        output_filename = 'processed_applications_' + current_datetime + '.csv'

        # Serialize in memory: a fixed on-disk file is shared between
        # concurrent sessions and left an open file handle behind.
        csv_bytes = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download data as CSV",
            data=csv_bytes,
            file_name=output_filename,
            mime='text/csv',
        )

    except Exception as exc:
        # Keep the friendly message for users, but record the real cause in
        # the server log instead of discarding it.
        print(f"Error processing uploaded file: {exc!r}")
        st.error("Failed to process the file. Please ensure your column names match the template file.")
|
128 |
+
|
129 |
+
|
130 |
+
# Comment out or remove the else block containing login form
|
131 |
+
# else:
|
132 |
+
# username = st.text_input("Username")
|
133 |
+
# password = st.text_input("Password", type="password")
|
134 |
+
# if st.button("Login"):
|
135 |
+
# if validate_login(username, password):
|
136 |
+
# st.session_state['authenticated'] = True
|
137 |
+
# st.experimental_rerun()
|
138 |
+
# else:
|
139 |
+
# st.error("Incorrect username or password")
|
140 |
+
|
141 |
+
|
142 |
+
# Run the main function
|
143 |
+
main()
|
144 |
+
|
images/pipeline.png
ADDED
![]() |
modules/auth.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import bcrypt
|
3 |
+
|
4 |
+
# Helper functions
|
5 |
+
def check_password(provided_password, stored_hash):
    """Return True if ``provided_password`` matches the bcrypt ``stored_hash``.

    Args:
        provided_password: Plain-text password as ``str``.
        stored_hash: bcrypt hash as ``bytes`` (``str`` is also accepted and
            encoded before comparison).

    Returns:
        bool: True on a match; False on a mismatch or when bcrypt rejects the
        inputs (e.g. a stored value that is not a valid bcrypt hash).
    """
    if isinstance(stored_hash, str):
        stored_hash = stored_hash.encode()
    try:
        return bcrypt.checkpw(provided_password.encode(), stored_hash)
    except ValueError:
        # bcrypt raises ValueError for a malformed hash ("Invalid salt");
        # treat a misconfigured hash as a failed login, not a crash.
        return False
|
7 |
+
|
8 |
+
def validate_login(username, password):
    """Check a username/password pair against a bcrypt hash in the environment.

    The hash is read from the environment variable ``<USERNAME>_HASH``
    (username upper-cased), e.g. ``USER1_HASH`` for user ``user1``.

    Args:
        username: Login name entered by the user.
        password: Plain-text password entered by the user.

    Returns:
        bool: True only if a hash is configured for the user and the password
        matches it; False for missing/empty credentials, unknown users, or a
        stored value that is not a valid bcrypt hash.
    """
    # Empty or non-string input would otherwise crash on .upper()/.encode()
    # or look up the bare "_HASH" variable.
    if not isinstance(username, str) or not isinstance(password, str):
        return False
    if not username:
        return False

    # Retrieve user's hashed password from environment variables
    user_hash = os.getenv(username.upper() + '_HASH')  # Assumes an env var like 'USER1_HASH'
    if not user_hash:
        return False

    try:
        return check_password(password, user_hash.encode())
    except ValueError:
        # A malformed stored hash must deny access rather than raise.
        return False
|
modules/utils.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import time
|
3 |
+
import pandas as pd
|
4 |
+
from io import BytesIO
|
5 |
+
import streamlit as st
|
6 |
+
import torch
|
7 |
+
from setfit import SetFitModel
|
8 |
+
from transformers import pipeline
|
9 |
+
from openpyxl import Workbook
|
10 |
+
from openpyxl.styles import Font, NamedStyle, PatternFill
|
11 |
+
from openpyxl.styles.differential import DifferentialStyle
|
12 |
+
|
13 |
+
|
14 |
+
# Function for creating Upload template file
|
15 |
+
def create_excel():
    """Build the upload template workbook and return it as XLSX bytes.

    The workbook has a single sheet named "template" containing only a styled
    header row. The headers are the column names that ``process_data``
    renames on upload, including the funding/contribution columns it uses for
    the leverage calculation.

    Returns:
        bytes: The serialized .xlsx file, suitable for a download button.
    """
    # Create a workbook and select the active worksheet
    wb = Workbook()
    sheet = wb.active
    sheet.title = "template"
    columns = [
        'id',
        'scope',
        'technology',
        'financial',
        'barrier',
        # Funding columns were missing from the template although the
        # processing step reads them.
        'maf_funding_requested',
        'contributions_public_sector',
        'contributions_private_sector',
        'contributions_other',
    ]
    sheet.append(columns)  # Appending columns to the first row

    # Style every header cell; row access follows the number of columns
    # instead of a hard-coded cell range.
    for cell in sheet[1]:
        cell.fill = PatternFill('solid', fgColor='bad8e1')
        cell.font = Font(bold=True)

    # Save to a BytesIO object
    output = BytesIO()
    wb.save(output)
    return output.getvalue()
|
32 |
+
|
33 |
+
|
34 |
+
# Function to clean text
|
35 |
+
def clean_text(input_text):
    """Normalize a free-text cell before it is passed to the classifiers.

    Steps, in order:
      1. Drop every character that is not an ASCII letter, digit, whitespace
         or one of ``. , : ; ! ? ( ) -``.
      2. Remove the literal ``x000D`` (what remains of ``_x000D_`` once the
         underscores have been stripped in step 1).
      3. Collapse every run of whitespace, newlines included, to one space.

    Args:
        input_text: Cell value. Non-string values are converted with ``str``;
            ``None`` and NaN are treated as empty text.

    Returns:
        str: The cleaned text (leading/trailing single spaces are kept).
    """
    if input_text is None:
        return ""
    if not isinstance(input_text, str):
        # NaN is the only float that is not equal to itself.
        if isinstance(input_text, float) and input_text != input_text:
            return ""
        # Numeric cells from the spreadsheet used to raise TypeError in re.sub.
        input_text = str(input_text)

    cleaned_text = re.sub(r"[^a-zA-Z0-9\s.,:;!?()\-\n]", "", input_text)
    cleaned_text = re.sub(r"x000D", "", cleaned_text)
    # This also replaces newlines, so no separate newline pass is needed.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    return cleaned_text
|
41 |
+
|
42 |
+
|
43 |
+
# # Function for extracting classifications for each SECTOR label
|
44 |
+
def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
    """Pick the two best-scoring labels from one multi-label prediction.

    Args:
        output: List of dicts, each read via its ``'label'`` and ``'score'``
            keys (a missing score counts as 0).
        ordinal_selection: Accepted for interface compatibility; not used.
        threshold: Only items with a score strictly above this value are kept.

    Returns:
        dict: ``{"SECTOR1": best_label, "SECTOR2": second_best_label}``; a
        slot is ``None`` when there is no qualifying item for it, and both
        are ``None`` when ``output`` is not a list of dicts.
    """
    well_formed = isinstance(output, list) and all(isinstance(entry, dict) for entry in output)
    if not well_formed:
        print("Error: Inner data is not formatted correctly. Each item must be a dictionary.")
        return {"SECTOR1": None, "SECTOR2": None}

    # Keep the confident items and rank them best-first.
    ranked = sorted(
        (entry for entry in output if entry.get('score', 0) > threshold),
        key=lambda entry: entry.get('score', 0),
        reverse=True,
    )
    if not ranked:
        print("Warning: Less than two items above the threshold in the current list.")

    # Take up to two labels and pad the remainder with None.
    top_labels = [entry.get('label') for entry in ranked[:2]]
    top_labels.extend([None] * (2 - len(top_labels)))

    return {"SECTOR1": top_labels[0], "SECTOR2": top_labels[1]}
|
73 |
+
|
74 |
+
# Function to call model and run inference for varying classification tasks/models
|
75 |
+
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
    """Load one classifier and run it over every row of ``df``.

    Args:
        df: DataFrame holding the cleaned text columns. SetFit models read the
            column derived from their name (``scope_lab1`` -> ``scope_txt``);
            all other models read ``scope_txt``.
        model_name: One of the SetFit names listed below, or ``'ADAPMIT'``,
            ``'SECTOR'`` or ``'LANG'``.
        progress_bar: Object with a ``progress(fraction)`` method, updated
            after each row.
        repo: Model repository name.
        profile: Owner of the repository; the model is loaded from
            ``profile/repo``.
        multilabel: Forwarded to the pipeline as ``return_all_scores``.

    Returns:
        list: One entry per row - 0/1 for SetFit models, a label string for
        ``ADAPMIT``/``LANG``, or a ``SECTOR1``/``SECTOR2`` dict for ``SECTOR``.

    Raises:
        ValueError: If ``model_name`` is not a recognized model.
    """
    # model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
    model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
    pipeline_names = ('ADAPMIT', 'SECTOR', 'LANG')

    is_setfit = model_name in model_names_sf
    if not is_setfit and model_name not in pipeline_names:
        # Previously an unknown name loaded a model and returned an empty
        # list, which only failed later when assigned to a DataFrame column.
        raise ValueError(f"Unknown model name: {model_name!r}")

    # Prefer CUDA, then Apple MPS, then CPU. `torch.has_mps` is deprecated and
    # only reports build support; ask the backend whether it is usable.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

    model_id = profile + "/" + repo
    if is_setfit:
        col_name = re.sub(r'_(.*)', r'_txt', model_name)
        model = SetFitModel.from_pretrained(model_id)
        model.to(device)
    else:
        col_name = 'scope_txt'
        model = pipeline("text-classification", model=model_id, device=device, return_all_scores=multilabel)

    predictions = []
    total = len(df)
    for i, text in enumerate(df[col_name]):
        prediction = model(text)
        if is_setfit:
            # Anything other than the 'NEGATIVE' label counts as positive.
            predictions.append(0 if prediction == 'NEGATIVE' else 1)
        elif model_name == 'ADAPMIT':
            # Strip the trailing "Label" suffix from the class name.
            predictions.append(re.sub('Label$', '', prediction[0]['label']))
        elif model_name == 'SECTOR':
            predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
        elif model_name == 'LANG':
            predictions.append(prediction[0]['label'])
        # Update progress bar with each iteration
        progress_bar.progress((i + 1) / total)
    return predictions
|
103 |
+
|
104 |
+
|
105 |
+
# Main function to process data
|
106 |
+
def process_data(uploaded_file, sens_level):
    """Run the full pre-filtering pipeline on an uploaded Excel file.

    The sheet's columns are renamed to internal names, the text columns are
    cleaned, each classifier is run via ``predict_category`` (with Streamlit
    progress widgets), leverage figures are derived from the funding columns,
    and a score plus a suggested action are computed per application.

    Args:
        uploaded_file: Excel file (path or file-like) readable by
            ``pandas.read_excel``. Must contain the ``scope``, ``technology``
            and ``financial`` columns; funding columns are optional and
            count as 0 when absent or non-numeric.
        sens_level: Rejection threshold; applications whose ``pred_score``
            (0-10) is below this value are marked ``REJECT``.

    Returns:
        pandas.DataFrame: Input columns plus one column per model,
        ``SECTOR1``/``SECTOR2``, the leverage columns, ``pred_score`` and
        ``pred_action`` (``'REJECT'`` or ``'REVIEW'``).

    Raises:
        ValueError: If a required text column is missing or the sheet has no
            rows. Raised before any model is loaded.
    """
    df = pd.read_excel(uploaded_file)

    # Column renaming and initial processing
    df.rename(columns={
        'id': 'id',
        'scope': 'scope_txt',
        'technology': 'tech_txt',
        'financial': 'fin_txt',
        'barrier': 'bar_txt',
        'maf_funding_requested': 'maf_funding',
        'contributions_public_sector': 'cont_public',
        'contributions_private_sector': 'cont_private',
        'contributions_other': 'cont_other'}, inplace=True)

    text_cols = ['scope_txt', 'tech_txt', 'fin_txt']
    funding_cols = ['maf_funding', 'cont_public', 'cont_private', 'cont_other']

    # Fail fast: a missing column used to surface only as a KeyError, in the
    # funding case after all model inference had already been run.
    missing_cols = [col for col in text_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required column(s): {', '.join(missing_cols)}")

    df = df.filter(['id'] + text_cols + funding_cols)
    if df.empty:
        raise ValueError("The uploaded file contains no application rows.")

    # Clean the text columns; astype(str) guards against numeric cells.
    for col in text_cols:
        df[col] = df[col].fillna('').astype(str).map(clean_text)

    # Define models and predictions
    model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
    model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG']
    total_steps = len(model_names)

    # UI setup for progress tracking
    st.subheader("Overall Progress:")
    patience_text = st.empty()
    patience_text.markdown("*You may want to grab a coffee, this can take a while...*")
    overall_progress = st.progress(0)
    overall_start_time = time.time()
    estimated_time_remaining_text = st.empty()
    elapsed_time = 0.0

    # Model processing
    for step_count, model_name in enumerate(model_names, start=1):
        model_processing_text = st.empty()
        model_processing_text.markdown(f'**Current Task: Processing with model "{model_name}"**')
        model_progress = st.empty()
        progress_bar = model_progress.progress(0)

        # Load the model and run inference
        if model_name in model_names_sf:
            df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
        elif model_name == 'ADAPMIT':
            df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
        elif model_name == 'SECTOR':
            sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
            df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
            df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
        elif model_name == 'LANG':
            df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')

        model_progress.empty()

        # Every model handles the same number of rows, so the overall
        # progress is simply the fraction of models completed.
        overall_progress.progress(step_count / total_steps)

        # Calculate and display estimated time remaining
        elapsed_time = time.time() - overall_start_time
        steps_remaining = total_steps - step_count
        # The step about to start, capped at the real number of steps (the
        # message used to hard-code "of 9" although there are 8 models).
        next_step = min(step_count + 1, total_steps)
        if step_count > 1:
            estimated_time_remaining = (elapsed_time / step_count) * steps_remaining
            estimated_time_remaining_text.write(f'Estimated Time Remaining: {estimated_time_remaining:.0f} seconds (step {next_step} of {total_steps})')
        else:
            estimated_time_remaining_text.write(f'Calculating time remaining... (step {next_step} of {total_steps})')

        model_processing_text.empty()

    patience_text.empty()
    estimated_time_remaining_text.empty()

    st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')

    # Convert funding columns to numeric; absent columns and non-numeric
    # values both count as 0.
    for col in funding_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            df[col] = 0.0
    df[funding_cols] = df[funding_cols].fillna(0)

    df['lev_total'] = df['cont_public'] + df['cont_private'] + df['cont_other']

    # Kept as the strings 'True'/'False' to preserve the output format.
    df['lev_gt_maf'] = df.apply(lambda x: 'True' if x['lev_total'] > x['maf_funding'] else 'False', axis=1)

    df['lev_gt_0'] = (df['lev_total'] > 0).astype(int)

    # Calculate leverage as percentage of MAF funding (0 when no MAF funding)
    df['lev_maf_%'] = df.apply(lambda x: round(x['lev_total'] / x['maf_funding'] * 100, 2) if x['maf_funding'] != 0 else 0, axis=1)

    # Create normalized leverage scale (0-1) where 300% leverage = 1
    df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x / 300, 1) if x > 0 else 0)

    # Further data processing and actions
    sector_classes = ['Energy', 'Transport', 'Industries']

    # Weighted sum of the binary indicators (maximum 9), rescaled to 0-10.
    df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['lev_gt_0'])/9*10, 0), axis=1)

    def _suggest_action(row):
        # Reject on a low score, non-English text, an adaptation project, or
        # when neither predicted sector is one of the target sectors.
        in_target_sector = (row['SECTOR1'] in sector_classes) or (row['SECTOR2'] in sector_classes)
        reject = (
            row['pred_score'] < sens_level  # was hard-coded to 4, ignoring the selector
            or row['LANG'] != 'en-US'
            or row['ADAPMIT'] == 'Adaptation'
            or not in_target_sector
        )
        return 'REJECT' if reject else 'REVIEW'

    df['pred_action'] = df.apply(_suggest_action, axis=1)

    return df
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pandas
|
3 |
+
openpyxl
|
4 |
+
setfit
|
5 |
+
bcrypt
|
6 |
+
--extra-index-url https://download.pytorch.org/whl/cu113
|
7 |
+
torch
|