DeepSEQreen_fast_build

Running on CPU Upgrade

App Files Files Community

libokj committed on Dec 25, 2023

Commit

b084d6f

1 Parent(s): c072dc8

Upload app.py

Browse files

Files changed (1) hide show

app.py +107 -64

app.py CHANGED Viewed

@@ -60,7 +60,7 @@ SESSION.mount('https://', ADAPTER)
 UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
-CUSTOM_DATASET_MAX_LEN = 10_000
 CSS = """
 .help-tip {
@@ -353,7 +353,7 @@ TASK_MAP = {
 PRESET_MAP = {
     'DeepDTA': 'deep_dta',
-    'DeepConvDTI-ECFP4': 'deep_conv_dti',
     'GraphDTA': 'graph_dta',
     'MGraphDTA': 'm_graph_dta',
     'HyperAttentionDTI': 'hyper_attention_dti',
@@ -403,12 +403,12 @@ def validate_columns(df, mandatory_cols):
 def process_target_fasta(sequence):
-    # lines = sequence.strip().split("\n")
-    # if lines[0].startswith(">"):
-    #     lines = lines[1:]
-    # return ''.join(lines).split(">")[0]
-    record = SeqIO.parse(io.StringIO(sequence), "fasta")[0]
-    return str(record.seq)
 def send_email(receiver, msg):
@@ -749,7 +749,7 @@ def process_drug_library_upload(library_upload):
     else:
         raise gr.Error('Currently only CSV and SDF files are supported as compound libraries.')
     validate_columns(screen_df, ['X1'])
-    return library_upload
 def target_library_from_fasta(fasta_path):
@@ -783,7 +783,7 @@ theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
     code_background_fill='white',
 )
-with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
     run_state = gr.State(value=False)
     screen_flag = gr.State(value=False)
     identify_flag = gr.State(value=False)
@@ -802,18 +802,17 @@ To predict interactions/binding affinities of a single target against a library
                     with gr.Row():
                         with gr.Column():
                             HelpTip(
-                                "Target amino acid sequence in the FASTA format. Alternatively, you may use a "
-                                "UniProt ID/accession to query UniProt database for the sequence of your "
-                                "target of interest. If the input FASTA contains multiple entities, "
-                                "only the first one will be used."
                             )
                             with gr.Row():
                                 target_input_type = gr.Dropdown(
-                                    label='Target Input Type',
                                     choices=['Sequence', 'UniProt ID', 'Gene symbol'],
                                     info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
                                     value='Sequence',
-                                    scale=3, interactive=True
                                 )
                                 target_id = gr.Textbox(show_label=False, visible=False,
                                                        interactive=True, scale=4,
@@ -823,19 +822,9 @@ To predict interactions/binding affinities of a single target against a library
                                     interactive=True, scale=4,
                                     info='Query a sequence on UniProt with a gene symbol.')
                                 target_organism = gr.Textbox(
-                                    info='Organism common name or scientific name (default: Human).',
-                                    placeholder='Human', show_label=False,
                                     visible=False, interactive=True, scale=4, )
-                        with gr.Column():
-                            HelpTip(
-                                "Identify the protein family by conducting sequence alignment. "
-                                "You may select General if you find the alignment score unsatisfactory."
-                            )
-                            drug_screen_target_family = gr.Dropdown(
-                                choices=list(TARGET_FAMILY_MAP.keys()),
-                                value='General',
-                                label='Select Input Protein Family (Optional)', interactive=True)
-                            # with gr.Column(scale=1, min_width=24):
                     with gr.Row():
                         with gr.Column():
@@ -844,14 +833,39 @@ To predict interactions/binding affinities of a single target against a library
                                                                 size='lg')
                             target_query_btn = gr.Button(value='Query the sequence', variant='primary',
                                                          visible=False)
-                        target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
                     target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
                     example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
                     with gr.Row():
                         with gr.Column():
-                            drug_library = gr.Dropdown(label='Select a Compound Library',
                                                        choices=list(DRUG_LIBRARY_MAP.keys()))
                             with gr.Row():
                                 gr.File(label='Example SDF compound library',
@@ -861,24 +875,33 @@ To predict interactions/binding affinities of a single target against a library
                             drug_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             drug_library_upload = gr.File(label='Custom compound library file', visible=False)
-                            drug_screen_task = gr.Dropdown(list(TASK_MAP.keys()), label='Select a Prediction Task',
                                                            value='Compound-protein interaction')
                         with gr.Column():
-                            HelpTip("We recommend the appropriate model for your use case based on model performance "
-                                    "in drug-target interaction or binding affinity prediction. "
-                                    "The models were benchmarked on different target families "
-                                    "and real-world data scenarios.")
-                            drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
                             screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
-                    # drug_screen_email = gr.Textbox(
-                    #     label='Email (optional)',
-                    #     info="Your email will be used to send you notifications when your job finishes."
-                    # )
                     with gr.Row(visible=True):
                         # drug_screen_clr_btn = gr.ClearButton(size='lg')
-                        drug_screen_btn = gr.Button(value='SCREEN', variant='primary', size='lg')
                     # TODO Modify the pd df directly with df['X2'] = target
             screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
@@ -914,29 +937,38 @@ Example CSV target library:
                     with gr.Row():
                         with gr.Column():
                             HelpTip(
-                                """Compound molecule in the SMILES format. You may input the SMILES string directly,
-                                upload an SMI file, or upload an SDF file to convert to SMILES. Alternatively,
-                                you may search on databases like NCBI PubChem, ChEMBL, and DrugBank for the SMILES
-                                representing your drug of interest.
-                                """
                             )
                             compound_type = gr.Dropdown(
-                                label='Compound Input Type',
                                 choices=['SMILES', 'SDF'],
-                                info='Enter (paste) an SMILES string or upload an SMI file.',
                                 value='SMILES',
                                 interactive=True)
                             compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
-                        target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
-                                                                    label='Target Protein Family')
                     compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
                     example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
                     with gr.Row():
                         with gr.Column():
-                            target_library = gr.Dropdown(label='Select a Target Library',
                                                          choices=list(TARGET_LIBRARY_MAP.keys()))
                             with gr.Row():
                                 gr.File(label='Example FASTA target library',
@@ -946,22 +978,30 @@ Example CSV target library:
                             target_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             target_library_upload = gr.File(label='Custom target library file', visible=False)
-                            target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Select a Prediction Task',
                                                                value='Compound-protein interaction')
                         with gr.Column():
-                            HelpTip("We recommend the appropriate model for your use case based on model performance "
-                                    "in drug-target interaction or binding affinity prediction. "
-                                    "The models were benchmarked on different target families "
-                                    "and real-world data scenarios.")
-                            target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
                             identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
-                    # with gr.Row():
-                    #     target_identify_email = gr.Textbox(
-                    #         label='Email (optional)',
-                    #         info="Your email will be used to send you notifications when your job finishes."
-                    #     )
                     with gr.Row(visible=True):
                         # target_identify_clr_btn = gr.ClearButton(size='lg')
@@ -1327,6 +1367,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
                     screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
                 else:
                     screen_df = process_drug_library_upload(library_upload)
                     # Reject custom libraries that reach the configured size limit.
                     # NOTE(review): `>=` also rejects a library with exactly
                     # CUSTOM_DATASET_MAX_LEN records although the message says "more ... than";
                     # confirm which bound is intended.
                     if len(screen_df) >= CUSTOM_DATASET_MAX_LEN:
                         # The limit must be interpolated; without braces the constant's
                         # name was shown to the user instead of its value.
                         raise gr.Error('The uploaded compound library has more records '
                                        f'than the allowed maximum ({CUSTOM_DATASET_MAX_LEN}).')
@@ -1564,3 +1605,5 @@ if __name__ == "__main__":
     demo.launch(
         show_api=False,
     )

 UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
+CUSTOM_DATASET_MAX_LEN = 10000
 CSS = """
 .help-tip {
 PRESET_MAP = {
     'DeepDTA': 'deep_dta',
+    'DeepConvDTI': 'deep_conv_dti',
     'GraphDTA': 'graph_dta',
     'MGraphDTA': 'm_graph_dta',
     'HyperAttentionDTI': 'hyper_attention_dti',
 def process_target_fasta(sequence):
     """Extract the residues of the first record from FASTA-formatted text.

     A leading ``>`` header line, if present, is discarded. The remaining
     lines are joined and everything from the next ``>`` onwards (any
     further record) is cut off. All whitespace inside the lines, including
     carriage returns left by Windows line endings, is removed so that it
     cannot end up in the returned sequence.

     Args:
         sequence: FASTA text, with or without a header line.

     Returns:
         The first record's residues as a single string; ``''`` when the
         input contains no sequence lines.
     """
     # str.split() without arguments drops every whitespace character of a
     # line, and splitlines() understands "\n", "\r\n" and "\r" alike.
     lines = [''.join(line.split()) for line in sequence.strip().splitlines()]
     if lines and lines[0].startswith(">"):
         lines = lines[1:]
     # A later ">" marks the header of a second record; keep only what precedes it.
     return ''.join(lines).split(">")[0]
 def send_email(receiver, msg):
     else:
         raise gr.Error('Currently only CSV and SDF files are supported as compound libraries.')
     validate_columns(screen_df, ['X1'])
+    return screen_df
 def target_library_from_fasta(fasta_path):
     code_background_fill='white',
 )
+with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS) as demo:
     run_state = gr.State(value=False)
     screen_flag = gr.State(value=False)
     identify_flag = gr.State(value=False)
                     with gr.Row():
                         with gr.Column():
                             HelpTip(
+                                "Enter (paste) a amino acid sequence below manually or upload a FASTA file."
+                                "If multiple entities are in the FASTA, only the first will be used."
+                                "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for the sequence."
                             )
                             with gr.Row():
                                 target_input_type = gr.Dropdown(
+                                    label='Step 1. Select Target Input Type and Input',
                                     choices=['Sequence', 'UniProt ID', 'Gene symbol'],
                                     info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
                                     value='Sequence',
+                                    scale=4, interactive=True
                                 )
                                 target_id = gr.Textbox(show_label=False, visible=False,
                                                        interactive=True, scale=4,
                                     interactive=True, scale=4,
                                     info='Query a sequence on UniProt with a gene symbol.')
                                 target_organism = gr.Textbox(
+                                    info='Organism scientific name (default: Homo sapiens).',
+                                    placeholder='Homo sapiens', show_label=False,
                                     visible=False, interactive=True, scale=4, )
                     with gr.Row():
                         with gr.Column():
                                                                 size='lg')
                             target_query_btn = gr.Button(value='Query the sequence', variant='primary',
                                                          visible=False)
                     target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
+                    # with gr.Row():
+                    #     with gr.Column():
                     example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
+                        # with gr.Column():
+                        #     gr.File(label='Example FASTA file',
+                        #             value='data/examples/MAPK14.fasta', interactive=False)
                     with gr.Row():
                         with gr.Column():
+                            HelpTip(
+                                "Click Auto-detect to identify the protein family using sequence alignment. "
+                                "This optional step allows applying a family-specific model instead of a all-family model (general)."
+                                "Manually select general if the alignment results are unsatisfactory."
+                            )
+                            drug_screen_target_family = gr.Dropdown(
+                                choices=list(TARGET_FAMILY_MAP.keys()),
+                                value='General',
+                                label='Step 2. Select Input Protein Family (Optional)', interactive=True)
+                            # with gr.Column(scale=1, min_width=24):
+                    with gr.Row():
+                        with gr.Column():
+                            target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
+                    with gr.Row():
+                        with gr.Column():
+                            HelpTip(
+                                "Select a preset compound library (e.g., DrugBank)."
+                                "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, or use an SDF file."
+                            )
+                            drug_library = gr.Dropdown(label='Step 3. Select or Upload a Compound Library',
                                                        choices=list(DRUG_LIBRARY_MAP.keys()))
                             with gr.Row():
                                 gr.File(label='Example SDF compound library',
                             drug_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             drug_library_upload = gr.File(label='Custom compound library file', visible=False)
+                    with gr.Row():
+                        with gr.Column():
+                            HelpTip(
+                                "Interaction prediction provides you binding probability score between the target of interest and each compound in the library,"
+                                "while affinity prediction directly estimates their binding strength measured using IC50."
+                            )
+                            drug_screen_task = gr.Dropdown(list(TASK_MAP.keys()), label='Step 4. Select a Prediction Task',
                                                            value='Compound-protein interaction')
+                    with gr.Row():
                         with gr.Column():
+                            HelpTip("Select your preferred model, or click Recommend for the best-performing model based on the selected task, family, and whether the target was trained."
+                                    "Please refer to documentation for detailed benchamrk results."
+                            )
+                            drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Step 5. Select a Preset Model')
                             screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
+                    with gr.Row():
+                        with gr.Column():
+                            drug_screen_email = gr.Textbox(
+                                label='Step 6. Email (Optional)',
+                                info="If an email is provided, a notification email will be sent to you when your job is completed."
+                            )
                     with gr.Row(visible=True):
+                        with gr.Column():
                         # drug_screen_clr_btn = gr.ClearButton(size='lg')
+                            drug_screen_btn = gr.Button(value='SCREEN', variant='primary', size='lg')
                     # TODO Modify the pd df directly with df['X2'] = target
             screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
                     with gr.Row():
                         with gr.Column():
                             HelpTip(
+                                "Enter (paste) a compound SMILES below manually or upload a SDF file."
+                                "If multiple entities are in the SDF, only the first will be used."
+                                "SMILES can be obtained by searching for the compound of interest in databases such as NCBI, PubChem and and ChEMBL."
                             )
                             compound_type = gr.Dropdown(
+                                label='Step 1. Select Compound Input Type and Input',
                                 choices=['SMILES', 'SDF'],
+                                info='Enter (paste) an SMILES string or upload an SDF file.',
                                 value='SMILES',
                                 interactive=True)
                             compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
                     compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
                     example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
                     with gr.Row():
                         with gr.Column():
+                            HelpTip(
+                                "By default, models trained on all protein families (general) will be applied."
+                                "If the proteins in the target library of interest all belong to the same protein family, manually selecting the family is supported."
+                            )
+                            target_identify_target_family = gr.Dropdown(choices=list(TARGET_FAMILY_MAP.keys()),
+                                                                        value='General',
+                                                                        label='Step 2. Select Target Protein Family (Optional)')
+                    with gr.Row():
+                        with gr.Column():
+                            HelpTip(
+                                "Select a preset target library (e.g., ChEMBL33_human_proteins)."
+                                "Alternatively, upload a CSV file with a column named X2 containing tareget protein sequences, or use an FASTA file."
+                            )
+                            target_library = gr.Dropdown(label='Step 3. Select or Upload a Target Library',
                                                          choices=list(TARGET_LIBRARY_MAP.keys()))
                             with gr.Row():
                                 gr.File(label='Example FASTA target library',
                             target_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             target_library_upload = gr.File(label='Custom target library file', visible=False)
+                    with gr.Row():
+                        with gr.Column():
+                            HelpTip(
+                                "Interaction prediction provides you binding probability score between the target of interest and each compound in the library,"
+                                "while affinity prediction directly estimates their binding strength measured using IC50."
+                            )
+                            target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Step 4. Select a Prediction Task',
                                                                value='Compound-protein interaction')
+                    with gr.Row():
                         with gr.Column():
+                            HelpTip("Select your preferred model, or click Recommend for the best-performing model based on the selected task, family, and whether the compound was trained."
+                                    "Please refer to documentation for detailed benchamrk results."
+                                    )
+                            target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Step 5. Select a Preset Model')
                             identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
+                    with gr.Row():
+                        with gr.Column():
+                            target_identify_email = gr.Textbox(
+                                label='Step 6. Email (Optional)',
+                                info="If an email is provided, a notification email will be sent to you when your job is completed."
+                            )
                     with gr.Row(visible=True):
                         # target_identify_clr_btn = gr.ClearButton(size='lg')
                     screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
                 else:
                     screen_df = process_drug_library_upload(library_upload)
+                    print(screen_df.shape)
                     # Reject custom libraries that reach the configured size limit.
                     # NOTE(review): `>=` also rejects a library with exactly
                     # CUSTOM_DATASET_MAX_LEN records although the message says "more ... than";
                     # confirm which bound is intended.
                     if len(screen_df) >= CUSTOM_DATASET_MAX_LEN:
                         # The limit must be interpolated; without braces the constant's
                         # name was shown to the user instead of its value.
                         raise gr.Error('The uploaded compound library has more records '
                                        f'than the allowed maximum ({CUSTOM_DATASET_MAX_LEN}).')
     demo.launch(
         show_api=False,
     )
+#%%