Spaces:
Running
Running
word length logic; improved lang classifier
Browse files
- app.py +24 -13
- modules/org_count.py +4 -2
- modules/utils.py +12 -13
app.py
CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
# Local
|
29 |
-
from dotenv import load_dotenv
|
30 |
-
load_dotenv()
|
31 |
|
32 |
|
33 |
# Main app logic
|
34 |
def main():
|
35 |
# Temporarily set authentication to True for testing
|
36 |
if 'authenticated' not in st.session_state:
|
37 |
-
st.session_state['authenticated'] =
|
38 |
|
39 |
if st.session_state['authenticated']:
|
40 |
# Remove login success message for testing
|
@@ -136,9 +136,20 @@ def main():
|
|
136 |
st.session_state['df'] = process_data(uploaded_file, sens_level)
|
137 |
logger.info("Data processing completed successfully")
|
138 |
st.session_state['data_processed'] = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
except Exception as e:
|
|
|
140 |
logger.error(f"Error in process_data: {str(e)}")
|
141 |
-
|
|
|
|
|
|
|
142 |
|
143 |
df = st.session_state['df']
|
144 |
|
@@ -172,15 +183,15 @@ def main():
|
|
172 |
|
173 |
|
174 |
# Comment out for testing
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
|
185 |
|
186 |
|
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
# Local
|
29 |
+
# from dotenv import load_dotenv
|
30 |
+
# load_dotenv()
|
31 |
|
32 |
|
33 |
# Main app logic
|
34 |
def main():
|
35 |
# Temporarily set authentication to True for testing
|
36 |
if 'authenticated' not in st.session_state:
|
37 |
+
st.session_state['authenticated'] = False
|
38 |
|
39 |
if st.session_state['authenticated']:
|
40 |
# Remove login success message for testing
|
|
|
136 |
st.session_state['df'] = process_data(uploaded_file, sens_level)
|
137 |
logger.info("Data processing completed successfully")
|
138 |
st.session_state['data_processed'] = True
|
139 |
+
except ValueError as e:
|
140 |
+
# Handle specific validation errors
|
141 |
+
logger.error(f"Validation error: {str(e)}")
|
142 |
+
st.error(str(e))
|
143 |
+
st.session_state['show_button'] = True
|
144 |
+
st.session_state['processing'] = False
|
145 |
+
st.rerun()
|
146 |
except Exception as e:
|
147 |
+
# Handle other unexpected errors
|
148 |
logger.error(f"Error in process_data: {str(e)}")
|
149 |
+
st.error("An unexpected error occurred. Please check your input file and try again.")
|
150 |
+
st.session_state['show_button'] = True
|
151 |
+
st.session_state['processing'] = False
|
152 |
+
st.rerun()
|
153 |
|
154 |
df = st.session_state['df']
|
155 |
|
|
|
183 |
|
184 |
|
185 |
# Comment out for testing
|
186 |
+
else:
|
187 |
+
username = st.text_input("Username")
|
188 |
+
password = st.text_input("Password", type="password")
|
189 |
+
if st.button("Login"):
|
190 |
+
if validate_login(username, password):
|
191 |
+
st.session_state['authenticated'] = True
|
192 |
+
st.rerun()
|
193 |
+
else:
|
194 |
+
st.error("Incorrect username or password")
|
195 |
|
196 |
|
197 |
|
modules/org_count.py
CHANGED
@@ -17,6 +17,9 @@ def standardize_organization_names(df):
|
|
17 |
"""
|
18 |
# Make a copy to avoid modifying the original DataFrame
|
19 |
df = df.copy()
|
|
|
|
|
|
|
20 |
|
21 |
# Return DataFrame as-is if 'organization' column is not present
|
22 |
if 'organization' not in df.columns:
|
@@ -55,7 +58,7 @@ def standardize_organization_names(df):
|
|
55 |
'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
|
56 |
'Sumy City Council': ['sumy city council'],
|
57 |
'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
|
58 |
-
'UN-Habitat': ['united nations human settlement','un-habitat'],
|
59 |
'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
|
60 |
'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
|
61 |
'United Nations Development Programme (UNDP)': ['united nations development program'],
|
@@ -81,7 +84,6 @@ def standardize_organization_names(df):
|
|
81 |
'Development Initiative for Community Impact (DICI)': ['DICI'],
|
82 |
'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
|
83 |
'Global Green Growth Institute (GGGI)': ['GGGI'],
|
84 |
-
'UN-Habitat': ['UN-Habitat'],
|
85 |
'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
|
86 |
'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
|
87 |
'United Nations Development Programme (UNDP)': ['UNDP'],
|
|
|
17 |
"""
|
18 |
# Make a copy to avoid modifying the original DataFrame
|
19 |
df = df.copy()
|
20 |
+
|
21 |
+
# Sort DataFrame by 'id' column in ascending order
|
22 |
+
df = df.sort_values('id', ascending=True)
|
23 |
|
24 |
# Return DataFrame as-is if 'organization' column is not present
|
25 |
if 'organization' not in df.columns:
|
|
|
58 |
'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
|
59 |
'Sumy City Council': ['sumy city council'],
|
60 |
'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
|
61 |
+
'United Nations Human Settlement Programme (UN-Habitat)': ['united nations human settlement','un-habitat'],
|
62 |
'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
|
63 |
'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
|
64 |
'United Nations Development Programme (UNDP)': ['united nations development program'],
|
|
|
84 |
'Development Initiative for Community Impact (DICI)': ['DICI'],
|
85 |
'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
|
86 |
'Global Green Growth Institute (GGGI)': ['GGGI'],
|
|
|
87 |
'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
|
88 |
'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
|
89 |
'United Nations Development Programme (UNDP)': ['UNDP'],
|
modules/utils.py
CHANGED
@@ -143,12 +143,14 @@ def process_data(uploaded_file, sens_level):
|
|
143 |
# Read the Excel file
|
144 |
try:
|
145 |
df = pd.read_excel(uploaded_file)
|
146 |
-
df = standardize_organization_names(df)
|
147 |
logger.info("Data import successful")
|
|
|
|
|
148 |
except Exception as e:
|
149 |
-
|
|
|
150 |
st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
|
151 |
-
|
152 |
|
153 |
# Validate required columns
|
154 |
missing_columns = [col for col in required_columns.keys() if col not in df.columns]
|
@@ -156,7 +158,7 @@ def process_data(uploaded_file, sens_level):
|
|
156 |
error_msg = f"Missing required columns: {', '.join(missing_columns)}"
|
157 |
logger.error(error_msg)
|
158 |
st.error(error_msg)
|
159 |
-
|
160 |
|
161 |
# Rename required columns while preserving all others
|
162 |
df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
|
@@ -250,23 +252,20 @@ def process_data(uploaded_file, sens_level):
|
|
250 |
# Create normalized leverage scale (0-1) where 300% leverage = 1
|
251 |
df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
|
252 |
|
253 |
-
#
|
254 |
-
df['scope_words_lt_10'] = df['scope_txt'].str.split().str.len() < 10
|
255 |
-
df['fin_words_lt_10'] = df['fin_txt'].str.split().str.len() < 10
|
256 |
-
df['tech_words_lt_10'] = df['tech_txt'].str.split().str.len() < 10
|
257 |
-
|
258 |
df['word_length_check'] = df.apply(lambda x:
|
259 |
-
True if x['scope_txt'].
|
260 |
-
x['fin_txt'].
|
261 |
-
x['tech_txt'].
|
262 |
else False, axis=1)
|
|
|
263 |
# Predict score
|
264 |
sector_classes = ['Energy','Transport','Industries']
|
265 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
266 |
# labelling logic
|
267 |
df['pred_action'] = df.apply(lambda x:
|
268 |
'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
|
269 |
-
x['LANG'] != 'en
|
270 |
x['ADAPMIT'] == 'Adaptation' or
|
271 |
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
|
272 |
x['word_length_check'] == True)
|
|
|
143 |
# Read the Excel file
|
144 |
try:
|
145 |
df = pd.read_excel(uploaded_file)
|
|
|
146 |
logger.info("Data import successful")
|
147 |
+
df = standardize_organization_names(df)
|
148 |
+
|
149 |
except Exception as e:
|
150 |
+
error_msg = f"Failed to read Excel file: {str(e)}"
|
151 |
+
logger.error(error_msg)
|
152 |
st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
|
153 |
+
raise ValueError(error_msg)
|
154 |
|
155 |
# Validate required columns
|
156 |
missing_columns = [col for col in required_columns.keys() if col not in df.columns]
|
|
|
158 |
error_msg = f"Missing required columns: {', '.join(missing_columns)}"
|
159 |
logger.error(error_msg)
|
160 |
st.error(error_msg)
|
161 |
+
raise ValueError(error_msg)
|
162 |
|
163 |
# Rename required columns while preserving all others
|
164 |
df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
|
|
|
252 |
# Create normalized leverage scale (0-1) where 300% leverage = 1
|
253 |
df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
|
254 |
|
255 |
+
# Flag rows where all three text fields (scope, fin, tech) have fewer than 10 words
|
|
|
|
|
|
|
|
|
256 |
df['word_length_check'] = df.apply(lambda x:
|
257 |
+
True if len(x['scope_txt'].split()) < 10 and
|
258 |
+
len(x['fin_txt'].split()) < 10 and
|
259 |
+
len(x['tech_txt'].split()) < 10
|
260 |
else False, axis=1)
|
261 |
+
|
262 |
# Predict score
|
263 |
sector_classes = ['Energy','Transport','Industries']
|
264 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
265 |
# labelling logic
|
266 |
df['pred_action'] = df.apply(lambda x:
|
267 |
'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
|
268 |
+
x['LANG'] != 'en' or
|
269 |
x['ADAPMIT'] == 'Adaptation' or
|
270 |
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
|
271 |
x['word_length_check'] == True)
|