Spaces:
Running
Running
Delete .ipynb_checkpoints
Browse files
.ipynb_checkpoints/app-checkpoint.py
DELETED
@@ -1,64 +0,0 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
import gradio as gr
|
3 |
-
import pandas as pd
|
4 |
-
import os
|
5 |
-
import subprocess
|
6 |
-
|
7 |
-
def predict_top_100_genes(disease_id):
    """Return the top 100 predicted genes for a disease ID.

    If ``data/downstream/<disease_id>_top100.csv`` already exists it is
    reused. Otherwise an input CSV is built from
    ``data/pretrain/disgenet_latest.csv`` and ``model.sh`` is run through
    ``bash`` with the input and output CSV paths as arguments.

    Args:
        disease_id: Disease identifier typed by the user, e.g. ``C0002395``.
            Surrounding whitespace is ignored.

    Returns:
        A DataFrame with columns ``Gene`` and ``Score`` holding the first
        100 rows of the output CSV, or an ``"Error: ..."`` string when the
        ID is malformed or is not present in the database.

    Raises:
        subprocess.CalledProcessError: if ``model.sh`` exits non-zero.
    """
    # NOTE(review): the error paths return a plain string while the UI output
    # component is a gr.Dataframe - confirm that renders as intended.
    disease_id = '' if disease_id is None else str(disease_id).strip()

    # The ID comes from a free-text box and is interpolated into file paths
    # and handed to a shell script, so refuse anything that could act as a
    # path component ('/', '\\', '..') or as several shell words.
    if not disease_id or not all(ch.isalnum() or ch in '_-' for ch in disease_id):
        return f"Error: Disease ID '{disease_id}' is not a valid identifier. Please check the ID and try again."

    # Initialize paths
    input_csv_path = 'data/downstream/{}_disease.csv'.format(disease_id)
    output_csv_path = 'data/downstream/{}_top100.csv'.format(disease_id)

    # Only build the model input and run the model when no cached result exists.
    if not os.path.exists(output_csv_path):
        df = pd.read_csv('data/pretrain/disgenet_latest.csv')
        df = df[df['proteinSeq'].notna()]

        # Check if the disease_id is present in the dataframe
        if disease_id not in df['diseaseId'].values:
            return f"Error: Disease ID '{disease_id}' not found in the database. Please check the ID and try again."

        desired_diseaseDes = df[df['diseaseId'] == disease_id]['diseaseDes'].iloc[0]
        related_proteins = df[df['diseaseDes'] == desired_diseaseDes]['proteinSeq'].unique()

        # 1 where the protein is linked to this disease description, else 0.
        # Kept as a separate Series so the filtered frame is never mutated
        # (avoids pandas' chained-assignment warning).
        score = df['proteinSeq'].isin(related_proteins).astype(int)

        new_df = pd.DataFrame({
            'diseaseId': disease_id,
            'diseaseDes': desired_diseaseDes,
            'geneSymbol': df['geneSymbol'],
            'proteinSeq': df['proteinSeq'],
            'score': score,
        }).drop_duplicates().reset_index(drop=True)

        os.makedirs(os.path.dirname(input_csv_path), exist_ok=True)
        new_df.to_csv(input_csv_path, index=False)

        # Call the model script only if the output CSV does not exist
        script_path = 'model.sh'
        subprocess.run(['bash', script_path, input_csv_path, output_csv_path], check=True)

    # Read the freshly written or previously cached model output.
    output_df = pd.read_csv(output_csv_path)
    # Keep only the two columns shown to the user, under display names.
    result_df = output_df[['geneSymbol', 'Prediction_score']].rename(
        columns={'geneSymbol': 'Gene', 'Prediction_score': 'Score'}
    ).head(100)

    return result_df
|
45 |
-
|
46 |
-
# Single-line text box where the user types the disease identifier.
disease_id_box = gr.Textbox(
    label="Disease ID",
    placeholder="Enter Disease ID Here...",
    lines=1,
)

# Table that displays the DataFrame returned by predict_top_100_genes.
top_genes_table = gr.Dataframe(label="Predicted Top 100 Related Genes")

# Markdown help text shown under the title, assembled from its fragments.
description_text = "".join([
    "This AI model predicts the top 100 genes associated with a given disease based on 16,733 genes.",
    " To get started, you need a Disease ID (UMLS CUI), which can be obtained from the DisGeNET database. ",
    "\n\n**Steps to Obtain a Disease ID from DisGeNET:**\n",
    "1. Visit the DisGeNET website: [https://www.disgenet.org/search](https://www.disgenet.org/search).\n",
    "2. Use the search bar to enter your disease of interest. For instance, if you're interested in 'Alzheimer's Disease', type 'Alzheimer's Disease' into the search bar.\n",
    "3. From the search results, identify the disease you're researching. The Disease ID (UMLS CUI) is listed alongside each disease name, e.g. C0002395.\n",
    "4. Enter the Disease ID into the input box below and submit.\n\n",
    "The DisGeNET database contains all known gene-disease associations and associated evidence. In addition, it is able to find the corresponding diseases based on a gene.\n",
    "\n**The model will take about 18 minutes to inference a new disease.**\n",
])

# Wire the prediction function to the input box and the output table.
iface = gr.Interface(
    fn=predict_top_100_genes,
    inputs=disease_id_box,
    outputs=top_genes_table,
    title="Gene Disease Association Prediction",
    description=description_text,
)

# Start the web app; share=True asks Gradio for a public share link.
iface.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.ipynb_checkpoints/model-checkpoint.sh
DELETED
@@ -1,24 +0,0 @@
|
|
1 |
-
#!/bin/bash
# Run finetune.py once for every checkpoint in model_path_list.
#
# Usage: model.sh <input_csv_path> <output_csv_path>
#   $1  CSV passed to finetune.py as --input_csv_path
#   $2  CSV passed to finetune.py as --output_csv_path
#
# Exits non-zero if the arguments are missing, if the finetune directory
# cannot be entered, or as soon as one finetune.py run fails.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <input_csv_path> <output_csv_path>" >&2
    exit 2
fi

input_csv_path="$1"
output_csv_path="$2"
max_depth=6
device='cuda:0'
model_path_list=(
    "../../save_model_ckp/gda_infoNCE_2024_GPU3090"
)

# 2^(max_depth - 1); does not depend on the checkpoint, so compute it once.
num_leaves=$((2 ** (max_depth - 1)))

# NOTE(review): after this cd, relative CSV paths given on the command line
# resolve against ../src/finetune/ rather than the caller's directory -
# confirm that callers pass paths that are valid from there.
# Stop here on failure so finetune.py is never run from the wrong directory.
cd ../src/finetune/ || exit

# Quote every expansion so paths containing spaces or glob characters are
# passed to finetune.py as single, literal arguments.
for save_model_path in "${model_path_list[@]}"; do
    python finetune.py \
        --input_csv_path "$input_csv_path" \
        --output_csv_path "$output_csv_path" \
        --save_model_path "$save_model_path" \
        --device "$device" \
        --batch_size 128 \
        --step "300" \
        --use_pooled \
        --num_leaves "$num_leaves" \
        --max_depth "$max_depth" || exit
done
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.ipynb_checkpoints/requirements-checkpoint.txt
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
lightgbm
|
2 |
-
pytorch-metric-learning
|
3 |
-
torch
|
4 |
-
transformers
|
5 |
-
PyTDC
|
6 |
-
gradio
|
7 |
-
numpy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|