Spaces:
Running
Running
organization count
Browse files
- app.py +6 -7
- modules/org_count.py +9 -1
- modules/utils.py +33 -11
app.py
CHANGED
@@ -12,18 +12,17 @@ except Exception as e:
|
|
12 |
print(f"Error checking CUDA availability: {str(e)}")
|
13 |
print("Continuing with CPU...")
|
14 |
|
15 |
-
|
16 |
-
from modules.logging_config import setup_logging
|
17 |
-
setup_logging()
|
18 |
-
import logging
|
19 |
-
logger = logging.getLogger(__name__)
|
20 |
-
|
21 |
import streamlit as st
|
22 |
import os
|
23 |
from huggingface_hub import login
|
24 |
from datetime import datetime
|
25 |
from modules.auth import validate_login
|
26 |
from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Local
|
29 |
# from dotenv import load_dotenv
|
@@ -64,7 +63,7 @@ def main():
|
|
64 |
depending on the number of applications and the length of text in each. For example, a file with 1000 applications
|
65 |
could be expected to take approximately 5 minutes.
|
66 |
|
67 |
-
***NOTE
|
68 |
|
69 |
"""
|
70 |
)
|
|
|
12 |
print(f"Error checking CUDA availability: {str(e)}")
|
13 |
print("Continuing with CPU...")
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
import streamlit as st
|
16 |
import os
|
17 |
from huggingface_hub import login
|
18 |
from datetime import datetime
|
19 |
from modules.auth import validate_login
|
20 |
from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
|
21 |
+
from modules.logging_config import setup_logging
|
22 |
+
setup_logging()
|
23 |
+
import logging
|
24 |
+
logger = logging.getLogger(__name__)
|
25 |
+
|
26 |
|
27 |
# Local
|
28 |
# from dotenv import load_dotenv
|
|
|
63 |
depending on the number of applications and the length of text in each. For example, a file with 1000 applications
|
64 |
could be expected to take approximately 5 minutes.
|
65 |
|
66 |
+
***NOTE** - you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
|
67 |
|
68 |
"""
|
69 |
)
|
modules/org_count.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
import pandas as pd
|
2 |
from thefuzz import fuzz
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
def standardize_organization_names(df):
|
@@ -14,7 +17,7 @@ def standardize_organization_names(df):
|
|
14 |
"""
|
15 |
# Make a copy to avoid modifying the original DataFrame
|
16 |
df = df.copy()
|
17 |
-
|
18 |
# Dictionary of organization variations and their standardized names
|
19 |
org_variations = {
|
20 |
'Adventist Development Relief Agency': ['adventist development'],
|
@@ -116,6 +119,11 @@ def standardize_organization_names(df):
|
|
116 |
|
117 |
# Add concept count
|
118 |
df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
return df
|
121 |
|
|
|
1 |
import pandas as pd
|
2 |
from thefuzz import fuzz
|
3 |
+
import logging
|
4 |
+
|
5 |
+
logger = logging.getLogger(__name__)
|
6 |
|
7 |
|
8 |
def standardize_organization_names(df):
|
|
|
17 |
"""
|
18 |
# Make a copy to avoid modifying the original DataFrame
|
19 |
df = df.copy()
|
20 |
+
logger.info(f"Checking org names")
|
21 |
# Dictionary of organization variations and their standardized names
|
22 |
org_variations = {
|
23 |
'Adventist Development Relief Agency': ['adventist development'],
|
|
|
119 |
|
120 |
# Add concept count
|
121 |
df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
|
122 |
+
|
123 |
+
# Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
|
124 |
+
cols = ['id', 'organization', 'org_renamed', 'concept_count']
|
125 |
+
other_cols = [col for col in df.columns if col not in cols]
|
126 |
+
df = df[cols + other_cols]
|
127 |
|
128 |
return df
|
129 |
|
modules/utils.py
CHANGED
@@ -9,6 +9,7 @@ from transformers import pipeline
|
|
9 |
from openpyxl import Workbook
|
10 |
from openpyxl.styles import Font, NamedStyle, PatternFill
|
11 |
from openpyxl.styles.differential import DifferentialStyle
|
|
|
12 |
import logging
|
13 |
|
14 |
logger = logging.getLogger(__name__)
|
@@ -127,20 +128,40 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
127 |
|
128 |
# Main function to process data
|
129 |
def process_data(uploaded_file, sens_level):
|
130 |
-
|
131 |
-
|
132 |
-
# Rename columns
|
133 |
-
df.rename(columns={
|
134 |
'id': 'id',
|
135 |
'scope': 'scope_txt',
|
136 |
'technology': 'tech_txt',
|
137 |
'financial': 'fin_txt',
|
138 |
-
'maf_funding_requested':'maf_funding',
|
139 |
-
'contributions_public_sector':'cont_public',
|
140 |
-
'contributions_private_sector':'cont_private',
|
141 |
-
'contributions_other':'cont_other'
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
df.fillna('', inplace=True)
|
145 |
df[['scope_txt', 'tech_txt', 'fin_txt']] = df[['scope_txt', 'tech_txt', 'fin_txt']].applymap(clean_text)
|
146 |
|
@@ -234,7 +255,8 @@ def process_data(uploaded_file, sens_level):
|
|
234 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
235 |
# df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
|
236 |
df['pred_action'] = df.apply(lambda x:
|
237 |
-
'INELIGIBLE' if (x['
|
|
|
238 |
x['ADAPMIT'] == 'Adaptation' or
|
239 |
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
|
240 |
else 'REJECT' if x['pred_score'] <= sens_level
|
|
|
9 |
from openpyxl import Workbook
|
10 |
from openpyxl.styles import Font, NamedStyle, PatternFill
|
11 |
from openpyxl.styles.differential import DifferentialStyle
|
12 |
+
from modules.org_count import standardize_organization_names
|
13 |
import logging
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
|
|
128 |
|
129 |
# Main function to process data
|
130 |
def process_data(uploaded_file, sens_level):
|
131 |
+
# Define required columns and their mappings
|
132 |
+
required_columns = {
|
|
|
|
|
133 |
'id': 'id',
|
134 |
'scope': 'scope_txt',
|
135 |
'technology': 'tech_txt',
|
136 |
'financial': 'fin_txt',
|
137 |
+
'maf_funding_requested': 'maf_funding',
|
138 |
+
'contributions_public_sector': 'cont_public',
|
139 |
+
'contributions_private_sector': 'cont_private',
|
140 |
+
'contributions_other': 'cont_other'
|
141 |
+
}
|
142 |
+
|
143 |
+
# Read the Excel file
|
144 |
+
try:
|
145 |
+
df = pd.read_excel(uploaded_file)
|
146 |
+
df = standardize_organization_names(df)
|
147 |
+
logger.info("Data import successful")
|
148 |
+
except Exception as e:
|
149 |
+
logger.error(f"Failed to read Excel file: {str(e)}")
|
150 |
+
st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
|
151 |
+
return None
|
152 |
+
|
153 |
+
# Validate required columns
|
154 |
+
missing_columns = [col for col in required_columns.keys() if col not in df.columns]
|
155 |
+
if missing_columns:
|
156 |
+
error_msg = f"Missing required columns: {', '.join(missing_columns)}"
|
157 |
+
logger.error(error_msg)
|
158 |
+
st.error(error_msg)
|
159 |
+
return None
|
160 |
+
|
161 |
+
# Rename required columns while preserving all others
|
162 |
+
df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
|
163 |
+
|
164 |
+
# Clean and process text fields
|
165 |
df.fillna('', inplace=True)
|
166 |
df[['scope_txt', 'tech_txt', 'fin_txt']] = df[['scope_txt', 'tech_txt', 'fin_txt']].applymap(clean_text)
|
167 |
|
|
|
255 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
256 |
# df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
|
257 |
df['pred_action'] = df.apply(lambda x:
|
258 |
+
'INELIGIBLE' if (x['concept_count'] > 6 or
|
259 |
+
x['LANG'] != 'en-US' or
|
260 |
x['ADAPMIT'] == 'Adaptation' or
|
261 |
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
|
262 |
else 'REJECT' if x['pred_score'] <= sens_level
|