mtyrrell committed on
Commit
bf83e2c
·
1 Parent(s): ab78519

organization count

Browse files
Files changed (3) hide show
  1. app.py +6 -7
  2. modules/org_count.py +9 -1
  3. modules/utils.py +33 -11
app.py CHANGED
@@ -12,18 +12,17 @@ except Exception as e:
12
  print(f"Error checking CUDA availability: {str(e)}")
13
  print("Continuing with CPU...")
14
 
15
-
16
- from modules.logging_config import setup_logging
17
- setup_logging()
18
- import logging
19
- logger = logging.getLogger(__name__)
20
-
21
  import streamlit as st
22
  import os
23
  from huggingface_hub import login
24
  from datetime import datetime
25
  from modules.auth import validate_login
26
  from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
 
 
 
 
 
27
 
28
  # Local
29
  # from dotenv import load_dotenv
@@ -64,7 +63,7 @@ def main():
64
  depending on the number of applications and the length of text in each. For example, a file with 1000 applications
65
  could be expected to take approximately 5 minutes.
66
 
67
- ***NOTE (1)** - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
68
 
69
  """
70
  )
 
12
  print(f"Error checking CUDA availability: {str(e)}")
13
  print("Continuing with CPU...")
14
 
 
 
 
 
 
 
15
  import streamlit as st
16
  import os
17
  from huggingface_hub import login
18
  from datetime import datetime
19
  from modules.auth import validate_login
20
  from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
21
+ from modules.logging_config import setup_logging
22
+ setup_logging()
23
+ import logging
24
+ logger = logging.getLogger(__name__)
25
+
26
 
27
  # Local
28
  # from dotenv import load_dotenv
 
63
  depending on the number of applications and the length of text in each. For example, a file with 1000 applications
64
  could be expected to take approximately 5 minutes.
65
 
66
+ ***NOTE** - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
67
 
68
  """
69
  )
modules/org_count.py CHANGED
@@ -1,5 +1,8 @@
1
  import pandas as pd
2
  from thefuzz import fuzz
 
 
 
3
 
4
 
5
  def standardize_organization_names(df):
@@ -14,7 +17,7 @@ def standardize_organization_names(df):
14
  """
15
  # Make a copy to avoid modifying the original DataFrame
16
  df = df.copy()
17
-
18
  # Dictionary of organization variations and their standardized names
19
  org_variations = {
20
  'Adventist Development Relief Agency': ['adventist development'],
@@ -116,6 +119,11 @@ def standardize_organization_names(df):
116
 
117
  # Add concept count
118
  df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
 
 
 
 
 
119
 
120
  return df
121
 
 
1
  import pandas as pd
2
  from thefuzz import fuzz
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
 
7
 
8
  def standardize_organization_names(df):
 
17
  """
18
  # Make a copy to avoid modifying the original DataFrame
19
  df = df.copy()
20
+ logger.info(f"Checking org names")
21
  # Dictionary of organization variations and their standardized names
22
  org_variations = {
23
  'Adventist Development Relief Agency': ['adventist development'],
 
119
 
120
  # Add concept count
121
  df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
122
+
123
+ # Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
124
+ cols = ['id', 'organization', 'org_renamed', 'concept_count']
125
+ other_cols = [col for col in df.columns if col not in cols]
126
+ df = df[cols + other_cols]
127
 
128
  return df
129
 
modules/utils.py CHANGED
@@ -9,6 +9,7 @@ from transformers import pipeline
9
  from openpyxl import Workbook
10
  from openpyxl.styles import Font, NamedStyle, PatternFill
11
  from openpyxl.styles.differential import DifferentialStyle
 
12
  import logging
13
 
14
  logger = logging.getLogger(__name__)
@@ -127,20 +128,40 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
127
 
128
  # Main function to process data
129
  def process_data(uploaded_file, sens_level):
130
- df = pd.read_excel(uploaded_file)
131
- logger.info(f"Data import successful")
132
- # Rename columns
133
- df.rename(columns={
134
  'id': 'id',
135
  'scope': 'scope_txt',
136
  'technology': 'tech_txt',
137
  'financial': 'fin_txt',
138
- 'maf_funding_requested':'maf_funding',
139
- 'contributions_public_sector':'cont_public',
140
- 'contributions_private_sector':'cont_private',
141
- 'contributions_other':'cont_other'}, inplace=True)
142
- # clean the text fields
143
- df = df.filter(['id', 'scope_txt', 'tech_txt', 'fin_txt','maf_funding','cont_public','cont_private','cont_other'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  df.fillna('', inplace=True)
145
  df[['scope_txt', 'tech_txt', 'fin_txt']] = df[['scope_txt', 'tech_txt', 'fin_txt']].applymap(clean_text)
146
 
@@ -234,7 +255,8 @@ def process_data(uploaded_file, sens_level):
234
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
235
  # df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
236
  df['pred_action'] = df.apply(lambda x:
237
- 'INELIGIBLE' if (x['LANG'] != 'en-US' or
 
238
  x['ADAPMIT'] == 'Adaptation' or
239
  not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
240
  else 'REJECT' if x['pred_score'] <= sens_level
 
9
  from openpyxl import Workbook
10
  from openpyxl.styles import Font, NamedStyle, PatternFill
11
  from openpyxl.styles.differential import DifferentialStyle
12
+ from modules.org_count import standardize_organization_names
13
  import logging
14
 
15
  logger = logging.getLogger(__name__)
 
128
 
129
  # Main function to process data
130
  def process_data(uploaded_file, sens_level):
131
+ # Define required columns and their mappings
132
+ required_columns = {
 
 
133
  'id': 'id',
134
  'scope': 'scope_txt',
135
  'technology': 'tech_txt',
136
  'financial': 'fin_txt',
137
+ 'maf_funding_requested': 'maf_funding',
138
+ 'contributions_public_sector': 'cont_public',
139
+ 'contributions_private_sector': 'cont_private',
140
+ 'contributions_other': 'cont_other'
141
+ }
142
+
143
+ # Read the Excel file
144
+ try:
145
+ df = pd.read_excel(uploaded_file)
146
+ df = standardize_organization_names(df)
147
+ logger.info("Data import successful")
148
+ except Exception as e:
149
+ logger.error(f"Failed to read Excel file: {str(e)}")
150
+ st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
151
+ return None
152
+
153
+ # Validate required columns
154
+ missing_columns = [col for col in required_columns.keys() if col not in df.columns]
155
+ if missing_columns:
156
+ error_msg = f"Missing required columns: {', '.join(missing_columns)}"
157
+ logger.error(error_msg)
158
+ st.error(error_msg)
159
+ return None
160
+
161
+ # Rename required columns while preserving all others
162
+ df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
163
+
164
+ # Clean and process text fields
165
  df.fillna('', inplace=True)
166
  df[['scope_txt', 'tech_txt', 'fin_txt']] = df[['scope_txt', 'tech_txt', 'fin_txt']].applymap(clean_text)
167
 
 
255
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
256
  # df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
257
  df['pred_action'] = df.apply(lambda x:
258
+ 'INELIGIBLE' if (x['concept_count'] > 6 or
259
+ x['LANG'] != 'en-US' or
260
  x['ADAPMIT'] == 'Adaptation' or
261
  not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
262
  else 'REJECT' if x['pred_score'] <= sens_level