libokj committed on
Commit
b084d6f
·
1 Parent(s): c072dc8

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -64
app.py CHANGED
@@ -60,7 +60,7 @@ SESSION.mount('https://', ADAPTER)
60
 
61
  UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
62
 
63
- CUSTOM_DATASET_MAX_LEN = 10_000
64
 
65
  CSS = """
66
  .help-tip {
@@ -353,7 +353,7 @@ TASK_MAP = {
353
 
354
  PRESET_MAP = {
355
  'DeepDTA': 'deep_dta',
356
- 'DeepConvDTI-ECFP4': 'deep_conv_dti',
357
  'GraphDTA': 'graph_dta',
358
  'MGraphDTA': 'm_graph_dta',
359
  'HyperAttentionDTI': 'hyper_attention_dti',
@@ -403,12 +403,12 @@ def validate_columns(df, mandatory_cols):
403
 
404
 
405
  def process_target_fasta(sequence):
406
- # lines = sequence.strip().split("\n")
407
- # if lines[0].startswith(">"):
408
- # lines = lines[1:]
409
- # return ''.join(lines).split(">")[0]
410
- record = SeqIO.parse(io.StringIO(sequence), "fasta")[0]
411
- return str(record.seq)
412
 
413
 
414
  def send_email(receiver, msg):
@@ -749,7 +749,7 @@ def process_drug_library_upload(library_upload):
749
  else:
750
  raise gr.Error('Currently only CSV and SDF files are supported as compound libraries.')
751
  validate_columns(screen_df, ['X1'])
752
- return library_upload
753
 
754
 
755
  def target_library_from_fasta(fasta_path):
@@ -783,7 +783,7 @@ theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
783
  code_background_fill='white',
784
  )
785
 
786
- with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
787
  run_state = gr.State(value=False)
788
  screen_flag = gr.State(value=False)
789
  identify_flag = gr.State(value=False)
@@ -802,18 +802,17 @@ To predict interactions/binding affinities of a single target against a library
802
  with gr.Row():
803
  with gr.Column():
804
  HelpTip(
805
- "Target amino acid sequence in the FASTA format. Alternatively, you may use a "
806
- "UniProt ID/accession to query UniProt database for the sequence of your "
807
- "target of interest. If the input FASTA contains multiple entities, "
808
- "only the first one will be used."
809
  )
810
  with gr.Row():
811
  target_input_type = gr.Dropdown(
812
- label='Target Input Type',
813
  choices=['Sequence', 'UniProt ID', 'Gene symbol'],
814
  info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
815
  value='Sequence',
816
- scale=3, interactive=True
817
  )
818
  target_id = gr.Textbox(show_label=False, visible=False,
819
  interactive=True, scale=4,
@@ -823,19 +822,9 @@ To predict interactions/binding affinities of a single target against a library
823
  interactive=True, scale=4,
824
  info='Query a sequence on UniProt with a gene symbol.')
825
  target_organism = gr.Textbox(
826
- info='Organism common name or scientific name (default: Human).',
827
- placeholder='Human', show_label=False,
828
  visible=False, interactive=True, scale=4, )
829
- with gr.Column():
830
- HelpTip(
831
- "Identify the protein family by conducting sequence alignment. "
832
- "You may select General if you find the alignment score unsatisfactory."
833
- )
834
- drug_screen_target_family = gr.Dropdown(
835
- choices=list(TARGET_FAMILY_MAP.keys()),
836
- value='General',
837
- label='Select Input Protein Family (Optional)', interactive=True)
838
- # with gr.Column(scale=1, min_width=24):
839
 
840
  with gr.Row():
841
  with gr.Column():
@@ -844,14 +833,39 @@ To predict interactions/binding affinities of a single target against a library
844
  size='lg')
845
  target_query_btn = gr.Button(value='Query the sequence', variant='primary',
846
  visible=False)
847
- target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
848
 
849
  target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
 
 
850
  example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
 
 
 
851
 
852
  with gr.Row():
853
  with gr.Column():
854
- drug_library = gr.Dropdown(label='Select a Compound Library',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  choices=list(DRUG_LIBRARY_MAP.keys()))
856
  with gr.Row():
857
  gr.File(label='Example SDF compound library',
@@ -861,24 +875,33 @@ To predict interactions/binding affinities of a single target against a library
861
  drug_library_upload_btn = gr.UploadButton(
862
  label='Upload a custom library', variant='primary')
863
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
864
- drug_screen_task = gr.Dropdown(list(TASK_MAP.keys()), label='Select a Prediction Task',
 
 
 
 
 
 
865
  value='Compound-protein interaction')
 
 
866
  with gr.Column():
867
- HelpTip("We recommend the appropriate model for your use case based on model performance "
868
- "in drug-target interaction or binding affinity prediction. "
869
- "The models were benchmarked on different target families "
870
- "and real-world data scenarios.")
871
- drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
872
  screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
873
-
874
- # drug_screen_email = gr.Textbox(
875
- # label='Email (optional)',
876
- # info="Your email will be used to send you notifications when your job finishes."
877
- # )
 
878
 
879
  with gr.Row(visible=True):
 
880
  # drug_screen_clr_btn = gr.ClearButton(size='lg')
881
- drug_screen_btn = gr.Button(value='SCREEN', variant='primary', size='lg')
882
  # TODO Modify the pd df directly with df['X2'] = target
883
 
884
  screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
@@ -914,29 +937,38 @@ Example CSV target library:
914
  with gr.Row():
915
  with gr.Column():
916
  HelpTip(
917
- """Compound molecule in the SMILES format. You may input the SMILES string directly,
918
- upload an SMI file, or upload an SDF file to convert to SMILES. Alternatively,
919
- you may search on databases like NCBI PubChem, ChEMBL, and DrugBank for the SMILES
920
- representing your drug of interest.
921
- """
922
  )
923
  compound_type = gr.Dropdown(
924
- label='Compound Input Type',
925
  choices=['SMILES', 'SDF'],
926
- info='Enter (paste) an SMILES string or upload an SMI file.',
927
  value='SMILES',
928
  interactive=True)
929
  compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
930
 
931
- target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
932
- label='Target Protein Family')
933
-
934
  compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
935
  example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
936
 
937
  with gr.Row():
938
  with gr.Column():
939
- target_library = gr.Dropdown(label='Select a Target Library',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
940
  choices=list(TARGET_LIBRARY_MAP.keys()))
941
  with gr.Row():
942
  gr.File(label='Example FASTA target library',
@@ -946,22 +978,30 @@ Example CSV target library:
946
  target_library_upload_btn = gr.UploadButton(
947
  label='Upload a custom library', variant='primary')
948
  target_library_upload = gr.File(label='Custom target library file', visible=False)
949
- target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Select a Prediction Task',
 
 
 
 
 
 
 
950
  value='Compound-protein interaction')
951
 
 
952
  with gr.Column():
953
- HelpTip("We recommend the appropriate model for your use case based on model performance "
954
- "in drug-target interaction or binding affinity prediction. "
955
- "The models were benchmarked on different target families "
956
- "and real-world data scenarios.")
957
- target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
958
  identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
959
 
960
- # with gr.Row():
961
- # target_identify_email = gr.Textbox(
962
- # label='Email (optional)',
963
- # info="Your email will be used to send you notifications when your job finishes."
964
- # )
 
965
 
966
  with gr.Row(visible=True):
967
  # target_identify_clr_btn = gr.ClearButton(size='lg')
@@ -1327,6 +1367,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1327
  screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
1328
  else:
1329
  screen_df = process_drug_library_upload(library_upload)
 
1330
  if len(screen_df) >= CUSTOM_DATASET_MAX_LEN:
1331
  raise gr.Error(f'The uploaded compound library has more records '
1332
  f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
@@ -1564,3 +1605,5 @@ if __name__ == "__main__":
1564
  demo.launch(
1565
  show_api=False,
1566
  )
 
 
 
60
 
61
  UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
62
 
63
+ CUSTOM_DATASET_MAX_LEN = 10000
64
 
65
  CSS = """
66
  .help-tip {
 
353
 
354
  PRESET_MAP = {
355
  'DeepDTA': 'deep_dta',
356
+ 'DeepConvDTI': 'deep_conv_dti',
357
  'GraphDTA': 'graph_dta',
358
  'MGraphDTA': 'm_graph_dta',
359
  'HyperAttentionDTI': 'hyper_attention_dti',
 
403
 
404
 
405
  def process_target_fasta(sequence):
406
+ lines = sequence.strip().split("\n")
407
+ if lines[0].startswith(">"):
408
+ lines = lines[1:]
409
+ return ''.join(lines).split(">")[0]
410
+ # record = SeqIO.parse(io.StringIO(sequence), "fasta")[0]
411
+ # return str(record.seq)
412
 
413
 
414
  def send_email(receiver, msg):
 
749
  else:
750
  raise gr.Error('Currently only CSV and SDF files are supported as compound libraries.')
751
  validate_columns(screen_df, ['X1'])
752
+ return screen_df
753
 
754
 
755
  def target_library_from_fasta(fasta_path):
 
783
  code_background_fill='white',
784
  )
785
 
786
+ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS) as demo:
787
  run_state = gr.State(value=False)
788
  screen_flag = gr.State(value=False)
789
  identify_flag = gr.State(value=False)
 
802
  with gr.Row():
803
  with gr.Column():
804
  HelpTip(
805
+ "Enter (paste) a amino acid sequence below manually or upload a FASTA file."
806
+ "If multiple entities are in the FASTA, only the first will be used."
807
+ "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for the sequence."
 
808
  )
809
  with gr.Row():
810
  target_input_type = gr.Dropdown(
811
+ label='Step 1. Select Target Input Type and Input',
812
  choices=['Sequence', 'UniProt ID', 'Gene symbol'],
813
  info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
814
  value='Sequence',
815
+ scale=4, interactive=True
816
  )
817
  target_id = gr.Textbox(show_label=False, visible=False,
818
  interactive=True, scale=4,
 
822
  interactive=True, scale=4,
823
  info='Query a sequence on UniProt with a gene symbol.')
824
  target_organism = gr.Textbox(
825
+ info='Organism scientific name (default: Homo sapiens).',
826
+ placeholder='Homo sapiens', show_label=False,
827
  visible=False, interactive=True, scale=4, )
 
 
 
 
 
 
 
 
 
 
828
 
829
  with gr.Row():
830
  with gr.Column():
 
833
  size='lg')
834
  target_query_btn = gr.Button(value='Query the sequence', variant='primary',
835
  visible=False)
 
836
 
837
  target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
838
+ # with gr.Row():
839
+ # with gr.Column():
840
  example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
841
+ # with gr.Column():
842
+ # gr.File(label='Example FASTA file',
843
+ # value='data/examples/MAPK14.fasta', interactive=False)
844
 
845
  with gr.Row():
846
  with gr.Column():
847
+ HelpTip(
848
+ "Click Auto-detect to identify the protein family using sequence alignment. "
849
+ "This optional step allows applying a family-specific model instead of a all-family model (general)."
850
+ "Manually select general if the alignment results are unsatisfactory."
851
+ )
852
+ drug_screen_target_family = gr.Dropdown(
853
+ choices=list(TARGET_FAMILY_MAP.keys()),
854
+ value='General',
855
+ label='Step 2. Select Input Protein Family (Optional)', interactive=True)
856
+ # with gr.Column(scale=1, min_width=24):
857
+
858
+ with gr.Row():
859
+ with gr.Column():
860
+ target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
861
+
862
+ with gr.Row():
863
+ with gr.Column():
864
+ HelpTip(
865
+ "Select a preset compound library (e.g., DrugBank)."
866
+ "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, or use an SDF file."
867
+ )
868
+ drug_library = gr.Dropdown(label='Step 3. Select or Upload a Compound Library',
869
  choices=list(DRUG_LIBRARY_MAP.keys()))
870
  with gr.Row():
871
  gr.File(label='Example SDF compound library',
 
875
  drug_library_upload_btn = gr.UploadButton(
876
  label='Upload a custom library', variant='primary')
877
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
878
+ with gr.Row():
879
+ with gr.Column():
880
+ HelpTip(
881
+ "Interaction prediction provides you binding probability score between the target of interest and each compound in the library,"
882
+ "while affinity prediction directly estimates their binding strength measured using IC50."
883
+ )
884
+ drug_screen_task = gr.Dropdown(list(TASK_MAP.keys()), label='Step 4. Select a Prediction Task',
885
  value='Compound-protein interaction')
886
+
887
+ with gr.Row():
888
  with gr.Column():
889
+ HelpTip("Select your preferred model, or click Recommend for the best-performing model based on the selected task, family, and whether the target was trained."
890
+ "Please refer to documentation for detailed benchamrk results."
891
+ )
892
+ drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Step 5. Select a Preset Model')
 
893
  screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
894
+ with gr.Row():
895
+ with gr.Column():
896
+ drug_screen_email = gr.Textbox(
897
+ label='Step 6. Email (Optional)',
898
+ info="If an email is provided, a notification email will be sent to you when your job is completed."
899
+ )
900
 
901
  with gr.Row(visible=True):
902
+ with gr.Column():
903
  # drug_screen_clr_btn = gr.ClearButton(size='lg')
904
+ drug_screen_btn = gr.Button(value='SCREEN', variant='primary', size='lg')
905
  # TODO Modify the pd df directly with df['X2'] = target
906
 
907
  screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
 
937
  with gr.Row():
938
  with gr.Column():
939
  HelpTip(
940
+ "Enter (paste) a compound SMILES below manually or upload a SDF file."
941
+ "If multiple entities are in the SDF, only the first will be used."
942
+ "SMILES can be obtained by searching for the compound of interest in databases such as NCBI, PubChem and and ChEMBL."
 
 
943
  )
944
  compound_type = gr.Dropdown(
945
+ label='Step 1. Select Compound Input Type and Input',
946
  choices=['SMILES', 'SDF'],
947
+ info='Enter (paste) an SMILES string or upload an SDF file.',
948
  value='SMILES',
949
  interactive=True)
950
  compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
951
 
 
 
 
952
  compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
953
  example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
954
 
955
  with gr.Row():
956
  with gr.Column():
957
+ HelpTip(
958
+ "By default, models trained on all protein families (general) will be applied."
959
+ "If the proteins in the target library of interest all belong to the same protein family, manually selecting the family is supported."
960
+ )
961
+ target_identify_target_family = gr.Dropdown(choices=list(TARGET_FAMILY_MAP.keys()),
962
+ value='General',
963
+ label='Step 2. Select Target Protein Family (Optional)')
964
+
965
+ with gr.Row():
966
+ with gr.Column():
967
+ HelpTip(
968
+ "Select a preset target library (e.g., ChEMBL33_human_proteins)."
969
+ "Alternatively, upload a CSV file with a column named X2 containing tareget protein sequences, or use an FASTA file."
970
+ )
971
+ target_library = gr.Dropdown(label='Step 3. Select or Upload a Target Library',
972
  choices=list(TARGET_LIBRARY_MAP.keys()))
973
  with gr.Row():
974
  gr.File(label='Example FASTA target library',
 
978
  target_library_upload_btn = gr.UploadButton(
979
  label='Upload a custom library', variant='primary')
980
  target_library_upload = gr.File(label='Custom target library file', visible=False)
981
+
982
+ with gr.Row():
983
+ with gr.Column():
984
+ HelpTip(
985
+ "Interaction prediction provides you binding probability score between the target of interest and each compound in the library,"
986
+ "while affinity prediction directly estimates their binding strength measured using IC50."
987
+ )
988
+ target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Step 4. Select a Prediction Task',
989
  value='Compound-protein interaction')
990
 
991
+ with gr.Row():
992
  with gr.Column():
993
+ HelpTip("Select your preferred model, or click Recommend for the best-performing model based on the selected task, family, and whether the compound was trained."
994
+ "Please refer to documentation for detailed benchamrk results."
995
+ )
996
+ target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Step 5. Select a Preset Model')
 
997
  identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
998
 
999
+ with gr.Row():
1000
+ with gr.Column():
1001
+ target_identify_email = gr.Textbox(
1002
+ label='Step 6. Email (Optional)',
1003
+ info="If an email is provided, a notification email will be sent to you when your job is completed."
1004
+ )
1005
 
1006
  with gr.Row(visible=True):
1007
  # target_identify_clr_btn = gr.ClearButton(size='lg')
 
1367
  screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
1368
  else:
1369
  screen_df = process_drug_library_upload(library_upload)
1370
+ print(screen_df.shape)
1371
  if len(screen_df) >= CUSTOM_DATASET_MAX_LEN:
1372
  raise gr.Error(f'The uploaded compound library has more records '
1373
  f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
 
1605
  demo.launch(
1606
  show_api=False,
1607
  )
1608
+
1609
+ #%%