Spaces:

HUBioDataLab
/

ProtHGT

Running

App Files Community

Erva Ulusoy committed on Feb 2

Commit

8aa6c67

1 Parent(s): 673a3cf

updated data load function

Browse files

Files changed (2) hide show

requirements.txt +2 -1
run_prothgt_app.py +45 -26

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 pandas
 torch_geometric
-torch

 pandas
 torch_geometric
+torch
+gdown

run_prothgt_app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import pandas as pd
 import yaml
 import os
 from datasets import load_dataset
 class ProtHGT(torch.nn.Module):
     def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
@@ -44,25 +45,8 @@ class ProtHGT(torch.nn.Module):
         return self.mlp(z).view(-1), x_dict
-def _load_data(protein_ids, go_category=None):
-    # heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.pt")
-    heterodata = torch.load('data/prothgt-kg.pt')
-    print('Loading data...')
-    # Remove unnecessary edge types in one go
-    edge_types_to_remove = [
-        ('Protein', 'protein_function', 'GO_term_F'),
-        ('Protein', 'protein_function', 'GO_term_P'),
-        ('Protein', 'protein_function', 'GO_term_C'),
-        ('GO_term_F', 'rev_protein_function', 'Protein'),
-        ('GO_term_P', 'rev_protein_function', 'Protein'),
-        ('GO_term_C', 'rev_protein_function', 'Protein')
-    ]
-    for edge_type in edge_types_to_remove:
-        if edge_type in heterodata.edge_index_dict:
-            del heterodata.edge_index_dict[edge_type]
     # Get protein indices for all input proteins
     protein_indices = [heterodata['Protein']['id_mapping'][pid] for pid in protein_ids]
@@ -136,20 +120,53 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
     if isinstance(protein_ids, str):
         protein_ids = [protein_ids]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     for go_cat, model_config_path, model_path in zip(go_category, model_config_paths, model_paths):
         print(f'Generating predictions for {go_cat}...')
-        # Load data
-        heterodata = _load_data(protein_ids, go_cat)
-        # Load model configuration
         with open(model_config_path, 'r') as file:
             model_config = yaml.safe_load(file)
         # Initialize model with configuration
         model = ProtHGT(
-            heterodata,
             hidden_channels=model_config['hidden_channels'][0],
             num_heads=model_config['num_heads'],
             num_layers=model_config['num_layers'],
@@ -162,16 +179,18 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
         print(f'Loaded model weights from {model_path}')
         # Generate predictions
-        predictions = _generate_predictions(heterodata, model, go_cat)
-        prediction_df = _create_prediction_df(predictions, heterodata, protein_ids, go_cat)
         all_predictions.append(prediction_df)
         # Clean up memory
-        del heterodata
         del model
         del predictions
         torch.cuda.empty_cache()  # Clear CUDA cache if using GPU
     # Combine all predictions
     final_df = pd.concat(all_predictions, ignore_index=True)

 import yaml
 import os
 from datasets import load_dataset
+import gdown
 class ProtHGT(torch.nn.Module):
     def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
         return self.mlp(z).view(-1), x_dict
+def _load_data(heterodata, protein_ids, go_category=None):
+    """Process the loaded heterodata for specific proteins and GO categories."""
     # Get protein indices for all input proteins
     protein_indices = [heterodata['Protein']['id_mapping'][pid] for pid in protein_ids]
     if isinstance(protein_ids, str):
         protein_ids = [protein_ids]
+    # Load dataset once
+    # heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
+    print('Loading data...')
+    file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
+    output = "data/prothgt-kg.pt"
+    url = f"https://drive.google.com/uc?id={file_id}"
+    print(f"Downloading file from {url}...")
+    try:
+        gdown.download(url, output, quiet=False)
+        print(f"File downloaded to {output}")
+    except Exception as e:
+        print(f"Error downloading file: {e}")
+        raise
+    heterodata = torch.load(output)
+    print(heterodata.edge_types)
+    # Remove unnecessary edge types
+    edge_types_to_remove = [
+        ('Protein', 'protein_function', 'GO_term_F'),
+        ('Protein', 'protein_function', 'GO_term_P'),
+        ('Protein', 'protein_function', 'GO_term_C'),
+        ('GO_term_F', 'rev_protein_function', 'Protein'),
+        ('GO_term_P', 'rev_protein_function', 'Protein'),
+        ('GO_term_C', 'rev_protein_function', 'Protein')
+    ]
+    for edge_type in edge_types_to_remove:
+        if edge_type in heterodata.edge_index_dict:
+            del heterodata.edge_index_dict[edge_type]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     for go_cat, model_config_path, model_path in zip(go_category, model_config_paths, model_paths):
         print(f'Generating predictions for {go_cat}...')
+        # Process data for current GO category
+        processed_data = _load_data(heterodata, protein_ids, go_cat)
+        # Load model config
         with open(model_config_path, 'r') as file:
             model_config = yaml.safe_load(file)
         # Initialize model with configuration
         model = ProtHGT(
+            processed_data,
             hidden_channels=model_config['hidden_channels'][0],
             num_heads=model_config['num_heads'],
             num_layers=model_config['num_layers'],
         print(f'Loaded model weights from {model_path}')
         # Generate predictions
+        predictions = _generate_predictions(processed_data, model, go_cat)
+        prediction_df = _create_prediction_df(predictions, processed_data, protein_ids, go_cat)
         all_predictions.append(prediction_df)
         # Clean up memory
+        del processed_data
         del model
         del predictions
         torch.cuda.empty_cache()  # Clear CUDA cache if using GPU
+    del heterodata
     # Combine all predictions
     final_df = pd.concat(all_predictions, ignore_index=True)