Spaces:
Running
Running
lang classifier, text word count
Browse files
- app.py +12 -12
- modules/utils.py +14 -2
app.py
CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
# Local
|
29 |
-
|
30 |
-
|
31 |
|
32 |
|
33 |
# Main app logic
|
34 |
def main():
|
35 |
# Temporarily set authentication to True for testing
|
36 |
if 'authenticated' not in st.session_state:
|
37 |
-
st.session_state['authenticated'] =
|
38 |
|
39 |
if st.session_state['authenticated']:
|
40 |
# Remove login success message for testing
|
@@ -172,15 +172,15 @@ def main():
|
|
172 |
|
173 |
|
174 |
# Comment out for testing
|
175 |
-
else:
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
|
185 |
|
186 |
|
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
# Local
|
29 |
+
from dotenv import load_dotenv
|
30 |
+
load_dotenv()
|
31 |
|
32 |
|
33 |
# Main app logic
|
34 |
def main():
|
35 |
# Temporarily set authentication to True for testing
|
36 |
if 'authenticated' not in st.session_state:
|
37 |
+
st.session_state['authenticated'] = True
|
38 |
|
39 |
if st.session_state['authenticated']:
|
40 |
# Remove login success message for testing
|
|
|
172 |
|
173 |
|
174 |
# Comment out for testing
|
175 |
+
# else:
|
176 |
+
# username = st.text_input("Username")
|
177 |
+
# password = st.text_input("Password", type="password")
|
178 |
+
# if st.button("Login"):
|
179 |
+
# if validate_login(username, password):
|
180 |
+
# st.session_state['authenticated'] = True
|
181 |
+
# st.rerun()
|
182 |
+
# else:
|
183 |
+
# st.error("Incorrect username or password")
|
184 |
|
185 |
|
186 |
|
modules/utils.py
CHANGED
@@ -200,7 +200,8 @@ def process_data(uploaded_file, sens_level):
|
|
200 |
df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
|
201 |
df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
|
202 |
elif model_name == 'LANG':
|
203 |
-
df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
|
|
|
204 |
|
205 |
logger.info(f"Completed: {model_name}")
|
206 |
model_progress.empty()
|
@@ -249,6 +250,16 @@ def process_data(uploaded_file, sens_level):
|
|
249 |
# Create normalized leverage scale (0-1) where 300% leverage = 1
|
250 |
df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
|
251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
# Predict score
|
253 |
sector_classes = ['Energy','Transport','Industries']
|
254 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
@@ -257,7 +268,8 @@ def process_data(uploaded_file, sens_level):
|
|
257 |
'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
|
258 |
x['LANG'] != 'en-US' or
|
259 |
x['ADAPMIT'] == 'Adaptation' or
|
260 |
-
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes)
|
|
|
261 |
else 'REJECT' if x['pred_score'] <= sens_level
|
262 |
else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
|
263 |
else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
|
|
|
200 |
df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
|
201 |
df['SECTOR2'] = [item['SECTOR2'] for item in sectors_dict]
|
202 |
elif model_name == 'LANG':
|
203 |
+
# df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
|
204 |
+
df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
|
205 |
|
206 |
logger.info(f"Completed: {model_name}")
|
207 |
model_progress.empty()
|
|
|
250 |
# Create normalized leverage scale (0-1) where 300% leverage = 1
|
251 |
df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
|
252 |
|
253 |
+
# Check if text fields have minimum required words
|
254 |
+
df['scope_words_lt_10'] = df['scope_txt'].str.split().str.len() < 10
|
255 |
+
df['fin_words_lt_10'] = df['fin_txt'].str.split().str.len() < 10
|
256 |
+
df['tech_words_lt_10'] = df['tech_txt'].str.split().str.len() < 10
|
257 |
+
|
258 |
+
# Flag rows where all three text fields are under 10 words.
# A row-wise apply() hands the lambda scalar cell values, which have no
# `.str` accessor, so the original expression raised AttributeError.
# Combine the per-field boolean columns computed just above instead.
# NOTE(review): the original joined the three checks with `and` (all fields
# short); confirm whether `or` (any field short) was the intended rule.
df['word_length_check'] = (
    df['scope_words_lt_10']
    & df['fin_words_lt_10']
    & df['tech_words_lt_10']
)
|
263 |
# Predict score
|
264 |
sector_classes = ['Energy','Transport','Industries']
|
265 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
|
|
268 |
'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
|
269 |
x['LANG'] != 'en-US' or
|
270 |
x['ADAPMIT'] == 'Adaptation' or
|
271 |
+
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
|
272 |
+
x['word_length_check'] == True)
|
273 |
else 'REJECT' if x['pred_score'] <= sens_level
|
274 |
else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
|
275 |
else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
|