import os from typing import Dict from datasets import Dataset import torch from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.model_selection import train_test_split from sklearn.cluster import KMeans from torch.nn import CrossEntropyLoss import pickle os.environ['OMP_NUM_THREADS'] = '7' class WeightedTrainer(Trainer): def compute_loss(self, model, inputs, return_outputs: bool = False, num_items_in_batch: int = None): """ Custom loss computation with sample weights """ labels = inputs.get("labels") weights = inputs.get("weight") # Forward pass outputs = model(**{k: v for k, v in inputs.items() if k not in ["weight", "labels"]}) logits = outputs.get("logits") # Add labels back to outputs outputs["labels"] = labels # Compute weighted loss if weights is not None: weights = weights.to(logits.device) loss_fct = CrossEntropyLoss(reduction='none') loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) # Adjust weights if num_items_in_batch is provided if num_items_in_batch: weights = weights[:num_items_in_batch] loss = (loss * weights.view(-1)).mean() else: loss_fct = CrossEntropyLoss(label_smoothing=0.1) loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) outputs["loss"] = loss return (loss, outputs) if return_outputs else loss def create_feature_vector(df): """Create numerical feature vector for clustering with sample size weighting, handling missing/unseen labels.""" # Initialize LabelEncoders le_gender = LabelEncoder() le_race = LabelEncoder() le_risk = LabelEncoder() # Fit and transform while handling missing values gender_encoded = le_gender.fit(df['Gender'].unique()).transform(df['Gender'].fillna('Unknown')) race_encoded = le_race.fit(df['RaceEthnicity'].unique()).transform(df['RaceEthnicity'].fillna('Unknown')) risk_encoded = le_risk.fit(df['RiskFactor'].unique()).transform(df['RiskFactor'].fillna('Unknown')) # Create age groups numerical representation with a default for missing values age_map = { '12-17 years': 0, '18-39 years': 1, '40-64 years': 2, '65-79 years': 3, '80 years and older': 4 # Include all possible labels, even if missing } # Use `.get()` with a default value for missing/unseen age groups age_encoded = df['Age'].map(lambda x: age_map.get(x, -1)) # Combine features features = np.column_stack([ age_encoded, gender_encoded, race_encoded, risk_encoded, df['Sample_Size'].values # Add sample size as a feature ]) # Scale features scaler = StandardScaler() features_scaled = scaler.fit_transform(features) return features_scaled, scaler def weighted_kmeans(X, sample_weights, n_clusters, max_iter=300, random_state=42): """Custom K-means implementation that considers sample weights""" n_samples = X.shape[0] # Initialize centroids randomly from the weighted distribution rng = np.random.RandomState(random_state) weighted_indices = rng.choice(n_samples, size=n_clusters, p=sample_weights / sample_weights.sum()) centroids = X[weighted_indices] for _ in range(max_iter): # Assign points to nearest centroid distances = np.sqrt(((X[:, np.newaxis] - centroids) ** 2).sum(axis=2)) labels = np.argmin(distances, axis=1) # Update centroids using weighted means new_centroids = np.zeros_like(centroids) for k in range(n_clusters): mask = labels == k if mask.any(): weights_k = sample_weights[mask] new_centroids[k] = np.average(X[mask], axis=0, weights=weights_k) # Check for convergence if np.allclose(centroids, new_centroids): break centroids = new_centroids return labels, centroids def prepare_data(file_path='data/Vision_Survey_Cleaned.csv'): """Load and prepare the vision health dataset with sample-size-aware clustering.""" print("\nLoading and preparing data...") df = pd.read_csv(file_path) # Filter data vision_cat = ['Best-corrected visual acuity'] df = df[df['Question'].isin(vision_cat)].copy() df = df[df["RiskFactor"] != "All participants"] df = df[df["RiskFactorResponse"] != "Total"] # Reset index after filtering df = df.reset_index(drop=True) # Create feature vectors for clustering features_scaled, scaler = create_feature_vector(df) # Normalize sample sizes for weights sample_weights = df['Sample_Size'].values sample_weights = sample_weights / sample_weights.sum() # Apply weighted clustering n_clusters = min(5, len(df)) clusters, centroids = weighted_kmeans( features_scaled, sample_weights, n_clusters=n_clusters ) # Add clusters as a column df['cluster'] = clusters # Calculate cluster importance based on total sample size in each cluster cluster_total_samples = df.groupby('cluster')['Sample_Size'].sum() cluster_weights = cluster_total_samples / cluster_total_samples.sum() # Enhanced feature engineering with clustering information df['doc'] = df.apply( lambda x: f""" Patient Demographics: - Age Category: {x['Age']} - Gender: {x['Gender']} - Race/Ethnicity: {x['RaceEthnicity']} Risk Factors: - {x['RiskFactor']}: {x['RiskFactorResponse']} Additional Information: - Sample Size: {x['Sample_Size']} - Cluster Profile: {x['cluster']} (Weight: {cluster_weights.get(x['cluster'], 0):.3f}) """.strip(), axis=1 ) # Encode labels le = LabelEncoder() df['labels'] = le.fit_transform(df['Response'].astype(str)) # Combine sample size weights with cluster importance df['weight'] = df.apply( lambda x: (x['Sample_Size'] / df['Sample_Size'].sum()) * cluster_weights.get(x['cluster'], 0), axis=1 ) # Create train and test splits with stratification train_df, test_df = train_test_split( df, test_size=0.2, stratify=df['labels'], random_state=42 ) # Convert to dict format train_data = { 'doc': train_df['doc'].tolist(), 'labels': train_df['labels'].tolist(), 'weight': train_df['weight'].tolist() } test_data = { 'doc': test_df['doc'].tolist(), 'labels': test_df['labels'].tolist(), 'weight': test_df['weight'].tolist() } # Convert to datasets train_dataset = Dataset.from_dict(train_data) test_dataset = Dataset.from_dict(test_data) dataset_dict = { 'train': train_dataset, 'test': test_dataset } # Print detailed dataset statistics print("\nDataset Summary:") print(f"Training samples: {len(train_dataset)}") print(f"Test samples: {len(test_dataset)}") print("\nCluster Distribution:") for i in range(n_clusters): cluster_mask = df['cluster'] == i cluster_samples = df[cluster_mask]['Sample_Size'].sum() print(f"\nCluster {i} (Total samples: {cluster_samples:,}, Weight: {cluster_weights.get(i, 0):.3f}):") print("Most common characteristics:") for col in ['Age', 'Gender', 'RaceEthnicity', 'RiskFactor']: values = df[col][cluster_mask].value_counts().head(3) samples = df[cluster_mask].groupby(col)['Sample_Size'].sum().sort_values(ascending=False).head(3) print(f"{col}:") for val, count in values.items(): sample_count = samples.get(val, 0) # Use .get() for safety print(f" - {val}: {count} groups ({sample_count:,} individuals)") print("\nLabel Distribution:") for label, idx in zip(le.classes_, range(len(le.classes_))): count = (df['labels'] == idx).sum() total_size = df[df['labels'] == idx]['Sample_Size'].sum() print(f"{label}: {count} groups, {total_size:,} individuals") return dataset_dict, le def main(): # Setup output_dir = "models/vision-classifier" os.makedirs(output_dir, exist_ok=True) # Load the dataset dataset_dict, label_encoder = prepare_data() # Initialize the tokenizer tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # Define tokenization function within main to have access to tokenizer def tokenize_function(examples): """Tokenize the input texts and maintain the correct column names""" tokenized = tokenizer( examples["doc"], truncation=True, padding='max_length', max_length=128, return_tensors=None ) # Keep the additional columns tokenized['labels'] = examples['labels'] tokenized['weight'] = examples['weight'] return tokenized # Tokenize the datasets tokenized_datasets = {} for split, dataset in dataset_dict.items(): tokenized_datasets[split] = dataset.map( tokenize_function, batched=True, remove_columns=['doc'] ) # Print sample to verify print("\nSample tokenized data:", tokenized_datasets["train"][0]) # Initialize the model model = AutoModelForSequenceClassification.from_pretrained( "distilbert-base-uncased", num_labels=len(label_encoder.classes_), id2label={i: label for i, label in enumerate(label_encoder.classes_)}, label2id={label: i for i, label in enumerate(label_encoder.classes_)}, ) # Data collator data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Check device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"\nTraining on device: {device}") # Move model to device model.to(device) # Set up training arguments training_args = TrainingArguments( output_dir=output_dir, learning_rate=3e-5, per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=7, weight_decay=0.01, eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, remove_unused_columns=False, push_to_hub=True, ) # Create the Trainer trainer = WeightedTrainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"], data_collator=data_collator, ) # Train the model print("\nStarting training...") trainer.train() # Save the model print("\nSaving model...") trainer.save_model(output_dir=os.path.join(output_dir, "model")) # Save the tokenizer tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer")) # Save the label encoder label_encoder_path = os.path.join(output_dir, "label_encoder.pkl") with open(label_encoder_path, 'wb') as f: pickle.dump(label_encoder, f) return trainer, model, tokenizer, label_encoder def evaluate_model(model, eval_dataset, tokenizer, label_encoder, device) -> Dict: """ Evaluate model performance using multiple metrics """ model.eval() all_predictions = [] all_labels = [] # Process each example in evaluation dataset for item in eval_dataset: # Tokenize input inputs = tokenizer( item['doc'], truncation=True, padding=True, return_tensors="pt" ) inputs = {k: v.to(device) for k, v in inputs.items()} # Get predictions with torch.no_grad(): outputs = model(**inputs) predictions = torch.argmax(outputs.logits, dim=1) all_predictions.extend(predictions.cpu().numpy()) all_labels.append(item['labels']) # Calculate metrics accuracy = accuracy_score(all_labels, all_predictions) precision, recall, f1, support = precision_recall_fscore_support( all_labels, all_predictions, average='weighted' ) # Calculate per-class metrics per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support( all_labels, all_predictions, average=None ) # Create confusion matrix conf_matrix = confusion_matrix(all_labels, all_predictions) # Combine metrics metrics = { 'accuracy': accuracy, 'weighted_precision': precision, 'weighted_recall': recall, 'weighted_f1': f1, 'confusion_matrix': conf_matrix, 'per_class_metrics': { label: { 'precision': p, 'recall': r, 'f1': f } for label, p, r, f in zip( label_encoder.classes_, per_class_precision, per_class_recall, per_class_f1 ) } } return metrics def print_evaluation_report(metrics: Dict, label_encoder): """ Print formatted evaluation report """ print("\n" + "=" * 50) print("MODEL EVALUATION REPORT") print("=" * 50) print("\nOverall Metrics:") print(f"Accuracy: {metrics['accuracy']:.4f}") print(f"Weighted Precision: {metrics['weighted_precision']:.4f}") print(f"Weighted Recall: {metrics['weighted_recall']:.4f}") print(f"Weighted F1-Score: {metrics['weighted_f1']:.4f}") print("\nPer-Class Metrics:") print("-" * 50) print(f"{'Class':<30} {'Precision':>10} {'Recall':>10} {'F1-Score':>10}") print("-" * 50) for label, class_metrics in metrics['per_class_metrics'].items(): print( f"{label:<30} {class_metrics['precision']:>10.4f} {class_metrics['recall']:>10.4f} {class_metrics['f1']:>10.4f}") print("\nConfusion Matrix:") print("-" * 50) conf_matrix = metrics['confusion_matrix'] print(conf_matrix) if __name__ == "__main__": output_dir = "models/vision-classifier" model_path = os.path.join(output_dir, "model") tokenizer_path = os.path.join(output_dir, "tokenizer") if os.path.exists(model_path): print("\nLoading pre-trained model...") try: tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) model = AutoModelForSequenceClassification.from_pretrained(model_path) label_encoder_path = os.path.join(output_dir, "label_encoder.pkl") if os.path.exists(label_encoder_path): with open(label_encoder_path, 'rb') as f: label_encoder = pickle.load(f) else: print("Warning: Label encoder not found. Running full training...") trainer, model, tokenizer, label_encoder = main() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) print(f"Model loaded successfully and moved to {device}") # Load test dataset for evaluation dataset_dict, _ = prepare_data() # Run evaluation print("\nEvaluating model performance...") eval_metrics = evaluate_model( model, dataset_dict['test'], tokenizer, label_encoder, device ) # Print evaluation report print_evaluation_report(eval_metrics, label_encoder) except Exception as e: print(f"Error loading model: {e}") print("Running full training instead...") trainer, model, tokenizer, label_encoder = main() else: print("\nNo pre-trained model found. Running training...") trainer, model, tokenizer, label_encoder = main() def predict_vision_status(text, model, tokenizer, label_encoder): """Make prediction using the loaded/trained model""" inputs = tokenizer( text, truncation=True, padding=True, return_tensors="pt" ) device = next(model.parameters()).device inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): outputs = model(**inputs) # Apply softmax to get probabilities probabilities = torch.nn.functional.softmax(outputs.logits, dim=1) # Convert to numpy array probabilities = probabilities.cpu().numpy()[0] # Create list of (label, probability) tuples predictions = [] for idx, prob in enumerate(probabilities): label = label_encoder.inverse_transform([idx])[0] predictions.append((label, float(prob))) # Sort by probability in descending order predictions.sort(key=lambda x: x[1], reverse=True) return predictions example_text = "Age: 40-64 years, Gender: Female, Race: White, non-Hispanic, Diabetes: No" predictions = predict_vision_status(example_text, model, tokenizer, label_encoder) print(f"\nPredictions for: {example_text}") print("\nLabel Confidence Scores:") print("-" * 50) for label, confidence in predictions: print(f"{label:<30} {confidence:.2%}")