mtyrrell committed on
Commit
88e08d0
·
1 Parent(s): 8f49843

lang classifier, text word count

Browse files
Files changed (2) hide show
  1. app.py +12 -12
  2. modules/utils.py +14 -2
app.py CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
- # from dotenv import load_dotenv
30
- # load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
- st.session_state['authenticated'] = False
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
@@ -172,15 +172,15 @@ def main():
172
 
173
 
174
  # Comment out for testing
175
- else:
176
- username = st.text_input("Username")
177
- password = st.text_input("Password", type="password")
178
- if st.button("Login"):
179
- if validate_login(username, password):
180
- st.session_state['authenticated'] = True
181
- st.rerun()
182
- else:
183
- st.error("Incorrect username or password")
184
 
185
 
186
 
 
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
+ from dotenv import load_dotenv
30
+ load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
+ st.session_state['authenticated'] = True
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
 
172
 
173
 
174
  # Comment out for testing
175
+ # else:
176
+ # username = st.text_input("Username")
177
+ # password = st.text_input("Password", type="password")
178
+ # if st.button("Login"):
179
+ # if validate_login(username, password):
180
+ # st.session_state['authenticated'] = True
181
+ # st.rerun()
182
+ # else:
183
+ # st.error("Incorrect username or password")
184
 
185
 
186
 
modules/utils.py CHANGED
@@ -200,7 +200,8 @@ def process_data(uploaded_file, sens_level):
200
  df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
201
  df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
202
  elif model_name == 'LANG':
203
- df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
 
204
 
205
  logger.info(f"Completed: {model_name}")
206
  model_progress.empty()
@@ -249,6 +250,16 @@ def process_data(uploaded_file, sens_level):
249
  # Create normalized leverage scale (0-1) where 300% leverage = 1
250
  df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
251
 
 
 
 
 
 
 
 
 
 
 
252
  # Predict score
253
  sector_classes = ['Energy','Transport','Industries']
254
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
@@ -257,7 +268,8 @@ def process_data(uploaded_file, sens_level):
257
  'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
258
  x['LANG'] != 'en-US' or
259
  x['ADAPMIT'] == 'Adaptation' or
260
- not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
 
261
  else 'REJECT' if x['pred_score'] <= sens_level
262
  else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
263
  else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
 
200
  df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
201
  df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
202
  elif model_name == 'LANG':
203
+ # df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
204
+ df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
205
 
206
  logger.info(f"Completed: {model_name}")
207
  model_progress.empty()
 
250
  # Create normalized leverage scale (0-1) where 300% leverage = 1
251
  df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
252
 
253
+ # Check if text fields have minimum required words
254
+ df['scope_words_lt_10'] = df['scope_txt'].str.split().str.len() < 10
255
+ df['fin_words_lt_10'] = df['fin_txt'].str.split().str.len() < 10
256
+ df['tech_words_lt_10'] = df['tech_txt'].str.split().str.len() < 10
257
+
258
+ df['word_length_check'] = df.apply(lambda x:
259
+ True if x['scope_txt'].str.split().str.len() < 10 and
260
+ x['fin_txt'].str.split().str.len() < 10 and
261
+ x['tech_txt'].str.split().str.len() < 10
262
+ else False, axis=1)
263
  # Predict score
264
  sector_classes = ['Energy','Transport','Industries']
265
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
 
268
  'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
269
  x['LANG'] != 'en-US' or
270
  x['ADAPMIT'] == 'Adaptation' or
271
+ not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
272
+ x['word_length_check'] == True)
273
  else 'REJECT' if x['pred_score'] <= sens_level
274
  else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
275
  else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2