Gla-AI4BioMed-Lab committed on
Commit
c54bfd4
·
verified ·
1 Parent(s): 8760168

Delete .ipynb_checkpoints

Browse files
.ipynb_checkpoints/app-checkpoint.py DELETED
@@ -1,64 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- import gradio as gr
3
- import pandas as pd
4
- import os
5
- import subprocess
6
-
7
def predict_top_100_genes(disease_id):
    """Return the top 100 predicted genes for a disease ID.

    If ``data/downstream/<disease_id>_top100.csv`` already exists it is used
    as a cache. Otherwise a model input CSV is built from
    ``data/pretrain/disgenet_latest.csv`` and ``model.sh`` is run to produce
    the output CSV.

    Args:
        disease_id: Disease identifier typed by the user (the UI describes it
            as a UMLS CUI, e.g. ``C0002395``). Surrounding whitespace is
            ignored.

    Returns:
        A DataFrame with columns ``Gene`` and ``Score`` (at most 100 rows),
        or an error string when the ID is empty, contains a path separator,
        or is not present in the ``diseaseId`` column of the database.

    Raises:
        subprocess.CalledProcessError: If ``model.sh`` exits with a non-zero
            status.
    """
    # Text boxes frequently deliver stray whitespace (or None when cleared).
    disease_id = (disease_id or "").strip()
    not_found = (
        f"Error: Disease ID '{disease_id}' not found in the database. "
        "Please check the ID and try again."
    )

    # The ID is interpolated into file names below, so refuse anything that
    # could point outside data/downstream.
    if not disease_id or "/" in disease_id or "\\" in disease_id:
        return not_found

    input_csv_path = 'data/downstream/{}_disease.csv'.format(disease_id)
    output_csv_path = 'data/downstream/{}_top100.csv'.format(disease_id)

    # Only build the input and run the model when no cached output exists.
    if not os.path.exists(output_csv_path):
        df = pd.read_csv('data/pretrain/disgenet_latest.csv')
        # .copy() so the 'score' column is added to an independent frame
        # rather than to a slice of the frame that was read.
        df = df[df['proteinSeq'].notna()].copy()

        disease_rows = df[df['diseaseId'] == disease_id]
        if disease_rows.empty:
            return not_found

        desired_diseaseDes = disease_rows['diseaseDes'].iloc[0]
        related_proteins = df.loc[
            df['diseaseDes'] == desired_diseaseDes, 'proteinSeq'
        ].unique()
        # 1 for proteins already linked to this disease description, else 0.
        df['score'] = df['proteinSeq'].isin(related_proteins).astype(int)

        new_df = pd.DataFrame({
            'diseaseId': disease_id,
            'diseaseDes': desired_diseaseDes,
            'geneSymbol': df['geneSymbol'],
            'proteinSeq': df['proteinSeq'],
            'score': df['score'],
        }).drop_duplicates().reset_index(drop=True)

        os.makedirs(os.path.dirname(input_csv_path), exist_ok=True)
        new_df.to_csv(input_csv_path, index=False)

        # Pass absolute paths: the script may change its working directory
        # before using them, and the output is read back from this process's
        # working directory below.
        subprocess.run(
            [
                'bash',
                'model.sh',
                os.path.abspath(input_csv_path),
                os.path.abspath(output_csv_path),
            ],
            check=True,
        )

    # Read the freshly produced (or cached) model output.
    output_df = pd.read_csv(output_csv_path)
    result_df = (
        output_df[['geneSymbol', 'Prediction_score']]
        .rename(columns={'geneSymbol': 'Gene', 'Prediction_score': 'Score'})
        .head(100)
    )

    return result_df
45
-
46
# Markdown help text rendered beneath the page title.
_description_text = "".join([
    "This AI model predicts the top 100 genes associated with a given disease based on 16,733 genes.",
    " To get started, you need a Disease ID (UMLS CUI), which can be obtained from the DisGeNET database. ",
    "\n\n**Steps to Obtain a Disease ID from DisGeNET:**\n",
    "1. Visit the DisGeNET website: [https://www.disgenet.org/search](https://www.disgenet.org/search).\n",
    "2. Use the search bar to enter your disease of interest. For instance, if you're interested in 'Alzheimer's Disease', type 'Alzheimer's Disease' into the search bar.\n",
    "3. From the search results, identify the disease you're researching. The Disease ID (UMLS CUI) is listed alongside each disease name, e.g. C0002395.\n",
    "4. Enter the Disease ID into the input box below and submit.\n\n",
    "The DisGeNET database contains all known gene-disease associations and associated evidence. In addition, it is able to find the corresponding diseases based on a gene.\n",
    "\n**The model will take about 18 minutes to inference a new disease.**\n",
])

# Single text input for the disease ID, single table output for the genes.
_disease_id_box = gr.Textbox(
    lines=1,
    placeholder="Enter Disease ID Here...",
    label="Disease ID",
)
_genes_table = gr.Dataframe(label="Predicted Top 100 Related Genes")

iface = gr.Interface(
    fn=predict_top_100_genes,
    inputs=_disease_id_box,
    outputs=_genes_table,
    title="Gene Disease Association Prediction",
    description=_description_text,
)

# Start the web UI and request a public share link.
iface.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/model-checkpoint.sh DELETED
@@ -1,24 +0,0 @@
1
#!/bin/bash
#
# Run finetune.py once per saved model checkpoint.
#
# Usage: model.sh <input_csv_path> <output_csv_path>
#
# NOTE(review): the script changes into ../src/finetune/ (relative to the
# caller's working directory) before running, so relative CSV paths are
# resolved from there - pass absolute paths to avoid surprises.

# Stop at the first failing command so the exit status reflects any failure,
# not just the last loop iteration.
set -euo pipefail

input_csv_path="${1:?usage: model.sh <input_csv_path> <output_csv_path>}"
output_csv_path="${2:?usage: model.sh <input_csv_path> <output_csv_path>}"
max_depth=6
device='cuda:0'
model_path_list=(
    "../../save_model_ckp/gda_infoNCE_2024_GPU3090"
)

# num_leaves is derived from max_depth as 2^(max_depth-1); it does not change
# between checkpoints, so compute it once.
num_leaves=$((2 ** (max_depth - 1)))

# Checkpoint paths above are relative to this directory.
cd ../src/finetune/ || exit 1

# Quote every expansion so paths containing spaces are passed intact.
for save_model_path in "${model_path_list[@]}"; do
    python finetune.py \
        --input_csv_path "$input_csv_path" \
        --output_csv_path "$output_csv_path" \
        --save_model_path "$save_model_path" \
        --device "$device" \
        --batch_size 128 \
        --step "300" \
        --use_pooled \
        --num_leaves "$num_leaves" \
        --max_depth "$max_depth"
done
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/requirements-checkpoint.txt DELETED
@@ -1,7 +0,0 @@
1
- lightgbm
2
- pytorch-metric-learning
3
- torch
4
- transformers
5
- PyTDC
6
- gradio
7
- numpy