mtyrrell committed on
Commit
c572984
·
1 Parent(s): 88e08d0

word length logic; improved lang classifier

Browse files
Files changed (3) hide show
  1. app.py +24 -13
  2. modules/org_count.py +4 -2
  3. modules/utils.py +12 -13
app.py CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
- from dotenv import load_dotenv
30
- load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
- st.session_state['authenticated'] = True
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
@@ -136,9 +136,20 @@ def main():
136
  st.session_state['df'] = process_data(uploaded_file, sens_level)
137
  logger.info("Data processing completed successfully")
138
  st.session_state['data_processed'] = True
 
 
 
 
 
 
 
139
  except Exception as e:
 
140
  logger.error(f"Error in process_data: {str(e)}")
141
- raise
 
 
 
142
 
143
  df = st.session_state['df']
144
 
@@ -172,15 +183,15 @@ def main():
172
 
173
 
174
  # Comment out for testing
175
- # else:
176
- # username = st.text_input("Username")
177
- # password = st.text_input("Password", type="password")
178
- # if st.button("Login"):
179
- # if validate_login(username, password):
180
- # st.session_state['authenticated'] = True
181
- # st.rerun()
182
- # else:
183
- # st.error("Incorrect username or password")
184
 
185
 
186
 
 
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
+ # from dotenv import load_dotenv
30
+ # load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
+ st.session_state['authenticated'] = False
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
 
136
  st.session_state['df'] = process_data(uploaded_file, sens_level)
137
  logger.info("Data processing completed successfully")
138
  st.session_state['data_processed'] = True
139
+ except ValueError as e:
140
+ # Handle specific validation errors
141
+ logger.error(f"Validation error: {str(e)}")
142
+ st.error(str(e))
143
+ st.session_state['show_button'] = True
144
+ st.session_state['processing'] = False
145
+ st.rerun()
146
  except Exception as e:
147
+ # Handle other unexpected errors
148
  logger.error(f"Error in process_data: {str(e)}")
149
+ st.error("An unexpected error occurred. Please check your input file and try again.")
150
+ st.session_state['show_button'] = True
151
+ st.session_state['processing'] = False
152
+ st.rerun()
153
 
154
  df = st.session_state['df']
155
 
 
183
 
184
 
185
  # Comment out for testing
186
+ else:
187
+ username = st.text_input("Username")
188
+ password = st.text_input("Password", type="password")
189
+ if st.button("Login"):
190
+ if validate_login(username, password):
191
+ st.session_state['authenticated'] = True
192
+ st.rerun()
193
+ else:
194
+ st.error("Incorrect username or password")
195
 
196
 
197
 
modules/org_count.py CHANGED
@@ -17,6 +17,9 @@ def standardize_organization_names(df):
17
  """
18
  # Make a copy to avoid modifying the original DataFrame
19
  df = df.copy()
 
 
 
20
 
21
  # Return DataFrame as-is if 'organization' column is not present
22
  if 'organization' not in df.columns:
@@ -55,7 +58,7 @@ def standardize_organization_names(df):
55
  'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
56
  'Sumy City Council': ['sumy city council'],
57
  'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
58
- 'UN-Habitat': ['united nations human settlement','un-habitat'],
59
  'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
60
  'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
61
  'United Nations Development Programme (UNDP)': ['united nations development program'],
@@ -81,7 +84,6 @@ def standardize_organization_names(df):
81
  'Development Initiative for Community Impact (DICI)': ['DICI'],
82
  'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
83
  'Global Green Growth Institute (GGGI)': ['GGGI'],
84
- 'UN-Habitat': ['UN-Habitat'],
85
  'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
86
  'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
87
  'United Nations Development Programme (UNDP)': ['UNDP'],
 
17
  """
18
  # Make a copy to avoid modifying the original DataFrame
19
  df = df.copy()
20
+
21
+ # Sort DataFrame by 'id' column in ascending order
22
+ df = df.sort_values('id', ascending=True)
23
 
24
  # Return DataFrame as-is if 'organization' column is not present
25
  if 'organization' not in df.columns:
 
58
  'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
59
  'Sumy City Council': ['sumy city council'],
60
  'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
61
+ 'United Nations Human Settlement Programme (UN-Habitat)': ['united nations human settlement','un-habitat'],
62
  'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
63
  'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
64
  'United Nations Development Programme (UNDP)': ['united nations development program'],
 
84
  'Development Initiative for Community Impact (DICI)': ['DICI'],
85
  'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
86
  'Global Green Growth Institute (GGGI)': ['GGGI'],
 
87
  'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
88
  'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
89
  'United Nations Development Programme (UNDP)': ['UNDP'],
modules/utils.py CHANGED
@@ -143,12 +143,14 @@ def process_data(uploaded_file, sens_level):
143
  # Read the Excel file
144
  try:
145
  df = pd.read_excel(uploaded_file)
146
- df = standardize_organization_names(df)
147
  logger.info("Data import successful")
 
 
148
  except Exception as e:
149
- logger.error(f"Failed to read Excel file: {str(e)}")
 
150
  st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
151
- return None
152
 
153
  # Validate required columns
154
  missing_columns = [col for col in required_columns.keys() if col not in df.columns]
@@ -156,7 +158,7 @@ def process_data(uploaded_file, sens_level):
156
  error_msg = f"Missing required columns: {', '.join(missing_columns)}"
157
  logger.error(error_msg)
158
  st.error(error_msg)
159
- return None
160
 
161
  # Rename required columns while preserving all others
162
  df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
@@ -250,23 +252,20 @@ def process_data(uploaded_file, sens_level):
250
  # Create normalized leverage scale (0-1) where 300% leverage = 1
251
  df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
252
 
253
- # Check if text fields have minimum required words
254
- df['scope_words_lt_10'] = df['scope_txt'].str.split().str.len() < 10
255
- df['fin_words_lt_10'] = df['fin_txt'].str.split().str.len() < 10
256
- df['tech_words_lt_10'] = df['tech_txt'].str.split().str.len() < 10
257
-
258
  df['word_length_check'] = df.apply(lambda x:
259
- True if x['scope_txt'].str.split().str.len() < 10 and
260
- x['fin_txt'].str.split().str.len() < 10 and
261
- x['tech_txt'].str.split().str.len() < 10
262
  else False, axis=1)
 
263
  # Predict score
264
  sector_classes = ['Energy','Transport','Industries']
265
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
266
  # labelling logic
267
  df['pred_action'] = df.apply(lambda x:
268
  'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
269
- x['LANG'] != 'en-US' or
270
  x['ADAPMIT'] == 'Adaptation' or
271
  not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
272
  x['word_length_check'] == True)
 
143
  # Read the Excel file
144
  try:
145
  df = pd.read_excel(uploaded_file)
 
146
  logger.info("Data import successful")
147
+ df = standardize_organization_names(df)
148
+
149
  except Exception as e:
150
+ error_msg = f"Failed to read Excel file: {str(e)}"
151
+ logger.error(error_msg)
152
  st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
153
+ raise ValueError(error_msg)
154
 
155
  # Validate required columns
156
  missing_columns = [col for col in required_columns.keys() if col not in df.columns]
 
158
  error_msg = f"Missing required columns: {', '.join(missing_columns)}"
159
  logger.error(error_msg)
160
  st.error(error_msg)
161
+ raise ValueError(error_msg)
162
 
163
  # Rename required columns while preserving all others
164
  df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
 
252
  # Create normalized leverage scale (0-1) where 300% leverage = 1
253
  df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
254
 
255
+ # Test if all text fields don't have minimum required words
 
 
 
 
256
  df['word_length_check'] = df.apply(lambda x:
257
+ True if len(x['scope_txt'].split()) < 10 and
258
+ len(x['fin_txt'].split()) < 10 and
259
+ len(x['tech_txt'].split()) < 10
260
  else False, axis=1)
261
+
262
  # Predict score
263
  sector_classes = ['Energy','Transport','Industries']
264
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
265
  # labelling logic
266
  df['pred_action'] = df.apply(lambda x:
267
  'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
268
+ x['LANG'] != 'en' or
269
  x['ADAPMIT'] == 'Adaptation' or
270
  not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
271
  x['word_length_check'] == True)