libokj committed · verified
Commit c95ea1d · 1 Parent(s): ccdad94

Update app.py

Files changed (1):
  1. app.py +379 -112

app.py CHANGED
@@ -27,8 +27,8 @@ from pandarallel import pandarallel
27
  import requests
28
  from requests.adapters import HTTPAdapter, Retry
29
  from markdown import markdown
30
- from rdkit import Chem
31
- from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
32
  from rdkit.Chem.Scaffolds import MurckoScaffold
33
  import seaborn as sns
34
 
@@ -196,6 +196,13 @@ TARGET_FAMILY_MAP = {
196
  'Nuclear Receptor': 'nuclear_receptor',
197
  'Ion Channel': 'ion_channel',
198
  'Others': 'others',
199
  }
200
 
201
  TARGET_LIBRARY_MAP = {
@@ -247,7 +254,7 @@ def remove_job_record(job_id):
247
  # Delete the job from the database
248
  db.remove(Job.id == job_id)
249
  # Delete the corresponding files
250
- files = glob.glob(f"/data/{job_id}*")
251
  for file_path in files:
252
  if os.path.exists(file_path):
253
  os.remove(file_path)
@@ -265,7 +272,7 @@ def check_expiry():
265
  # Delete the job from the database
266
  db.remove(Job.id == job['id'])
267
  # Delete the corresponding file
268
- files = glob.glob(f"/data/{job['id']}*")
269
  for file_path in files:
270
  if os.path.exists(file_path):
271
  os.remove(file_path)
@@ -278,8 +285,63 @@ def check_expiry():
278
  send_email(job)
279
 
280
 
281
- scheduler.add_job(check_expiry, 'interval', hours=1)
282
- scheduler.start()
283
 
284
 
285
  def lipinski(mol):
@@ -635,46 +697,155 @@ using the job id. You will also receive an email notification once the job is do
635
  raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
636
 
637
 
638
- def submit_predict(predict_filepath, task, preset, target_family, state):
639
  job_id = state['id']
640
  status = "RUNNING"
641
  error = None
642
  task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
643
  predictions_file = None
644
  try:
645
- target_family = TARGET_FAMILY_MAP[target_family]
646
 
647
- predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_{target_family}_predictions.csv'
648
 
649
- task = TASK_MAP[task]
650
- preset = PRESET_MAP[preset]
651
-
652
- prediction_df = pd.DataFrame()
653
- cfg = hydra.compose(
654
- config_name="webserver_inference",
655
- overrides=[f"task={task}",
656
- f"preset={preset}",
657
- f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
658
- f"data.data_file='{str(predict_filepath)}'"])
659
- # with concurrent.futures.ThreadPoolExecutor() as executor:
660
- # future = executor.submit(predict, cfg)
661
- # try:
662
- # predictions, _ = future.result(timeout=4*60*60)
663
- # except concurrent.futures.TimeoutError:
664
- # raise gr.Error("Prediction timed out.")
665
- predictions, _ = predict(cfg)
666
- predictions = [pd.DataFrame(prediction) for prediction in predictions]
667
- prediction_df = pd.concat([prediction_df, pd.concat(predictions, ignore_index=True)])
668
- prediction_df.set_index('N', inplace=True)
669
- orig_df = pd.read_csv(
670
- predict_filepath,
671
- usecols=lambda x: x not in ['X1', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES',
672
- 'X2', 'ID2',
673
- 'Y', 'Y^']
674
- )
675
- prediction_df = pd.merge(prediction_df, orig_df, left_index=True, right_index=True, how='left')
676
 
677
- prediction_df.to_csv(predictions_file)
678
  status = "COMPLETED"
679
 
680
  return {run_state: False}
@@ -714,19 +885,21 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
714
  task = 'Compound-Protein Binding Affinity'
715
 
716
  df = pd.read_csv(file)
 
717
  if 'N' in df.columns:
718
  df.set_index('N', inplace=True)
 
719
  if not any(col in ['X1', 'X2'] for col in df.columns):
720
  gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
721
  return {analyze_btn: gr.Button(interactive=False)}
 
722
  if 'X1' in df.columns:
723
- df['Scaffold SMILES'] = df['X1'].parallel_apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
724
- df['Scaffold'] = df['Scaffold SMILES'].parallel_apply(
725
- lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
726
- # Add a new column with RDKit molecule objects
727
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
728
  df['Compound'] = df['X1'].parallel_apply(
729
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
730
 
731
  # DF_FOR_REPORT = df.copy()
732
 
@@ -752,7 +925,7 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
752
  return {analyze_btn: gr.Button(interactive=False)}
753
 
754
 
755
- def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm=True)):
756
  df_html = df.copy(deep=True)
757
  column_aliases = COLUMN_ALIASES.copy()
758
  cols_left = list(pd.Index(
@@ -763,9 +936,9 @@ def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm
763
  if isinstance(task, str):
764
  column_aliases.update({
765
  'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
766
- else 'Actual Binding Affinity',
767
  'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
768
- else 'Predicted Binding Affinity'
769
  })
770
 
771
  ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
@@ -803,12 +976,17 @@ def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm
803
 
804
  elif 'Y^' in df_html.columns:
805
  job = 'Interaction Pair Inference'
806
- if 'Compound' in df_html.columns:
807
  df_html['Compound'] = df_html['Compound'].parallel_apply(
808
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
809
- if 'Scaffold' in df_html.columns:
810
  df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
811
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
 
 
812
 
813
  df_html.rename(columns=column_aliases, inplace=True)
814
  df_html.index.name = 'Index'
@@ -1276,7 +1454,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1276
  "Interaction prediction provides you binding probability score between the target of "
1277
  "interest and each compound in the library, "
1278
  "while affinity prediction directly estimates their binding strength measured using "
1279
- "IC50."
1280
  )
1281
  drug_screen_task = gr.Dropdown(
1282
  list(TASK_MAP.keys()),
@@ -1313,17 +1491,24 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1313
  drug_library_upload_btn = gr.UploadButton(
1314
  label='OR Upload Your Own Library', variant='primary')
1315
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1316
  with gr.Row():
1317
  with gr.Column():
1318
  drug_screen_email = gr.Textbox(
1319
- label='Step 6. Input Your Email Address (Optional)',
1320
  info="Your email address will be used to notify you of the status of your job. "
1321
  "If you cannot receive the email, please check your spam/junk folder."
1322
  )
1323
 
1324
  with gr.Row(visible=True):
1325
  with gr.Column():
1326
- # drug_screen_clr_btn = gr.ClearButton(size='lg')
1327
  drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
1328
  # TODO Modify the pd df directly with df['X2'] = target
1329
 
@@ -1359,26 +1544,25 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1359
  example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
1360
 
1361
  with gr.Row():
1362
- with gr.Column(visible=False):
1363
  HelpTip(
1364
  "By default, models trained on all protein families (general) will be applied. "
1365
- # "If the proteins in the target library of interest all belong to the same protein "
1366
- # "family, manually selecting the family is supported."
1367
  )
1368
  target_identify_target_family = gr.Dropdown(
1369
- choices=['General'], value='General',
1370
- label='Target Family')
1371
-
1372
- with gr.Row():
1373
  with gr.Column():
1374
  HelpTip(
1375
  "Interaction prediction provides you binding probability score between the target of "
1376
  "interest and each compound in the library, while affinity prediction directly "
1377
- "estimates their binding strength measured using IC50."
1378
  )
1379
  target_identify_task = gr.Dropdown(
1380
  list(TASK_MAP.keys()),
1381
- label='Step 2. Select a Prediction Task',
1382
  value='Compound-Protein Interaction')
1383
 
1384
  with gr.Column():
@@ -1389,8 +1573,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1389
  "Please refer to the documentation for detailed benchmark results."
1390
  )
1391
  target_identify_preset = gr.Dropdown(
1392
- list(PRESET_MAP.keys()),
1393
- label='Step 3. Select a Preset Model')
1394
  identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
1395
  variant='primary')
1396
  with gr.Row():
@@ -1403,7 +1587,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1403
  "and can be downloaded by clicking the lower right corner."
1404
  )
1405
  target_library = gr.Dropdown(
1406
- label='Step 4. Select a Preset Target Library',
1407
  choices=list(TARGET_LIBRARY_MAP.keys()))
1408
  with gr.Row():
1409
  gr.File(label='Example FASTA target library',
@@ -1414,16 +1598,23 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1414
  label='OR Upload Your Own Library', variant='primary')
1415
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1416
 
1417
  with gr.Row():
1418
  with gr.Column():
1419
  target_identify_email = gr.Textbox(
1420
- label='Step 5. Input Your Email Address (Optional)',
1421
  info="Your email address will be used to notify you of the status of your job. "
1422
  "If you cannot receive the email, please check your spam/junk folder."
1423
  )
1424
 
1425
  with gr.Row(visible=True):
1426
- # target_identify_clr_btn = gr.ClearButton(size='lg')
1427
  target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
1428
  size='lg')
1429
 
@@ -1501,7 +1692,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1501
  "Interaction prediction provides you binding probability score "
1502
  "between the target of interest and each compound in the library, "
1503
  "while affinity prediction directly estimates their binding strength "
1504
- "measured using IC50."
1505
  )
1506
  pair_infer_task = gr.Dropdown(
1507
  list(TASK_MAP.keys()),
@@ -1525,7 +1716,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1525
  "If you cannot receive the email, please check your spam/junk folder.")
1526
 
1527
  with gr.Row(visible=True):
1528
- # pair_infer_clr_btn = gr.ClearButton(size='lg')
1529
  pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
1530
 
1531
  infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
@@ -1546,25 +1737,33 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1546
  Please first `Preview` the report, then `Generate` and download a CSV report
1547
  or an interactive HTML report below if you wish to access the full report.
1548
  ''')
 
 
1549
  with gr.Row():
1550
- with gr.Column():
1551
  file_for_report = gr.File(interactive=True, type='filepath')
1552
  report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
1553
- label='Specify the Task Labels in the Upload Dataset')
1554
- raw_df = gr.State(value=pd.DataFrame())
1555
- report_df = gr.State(value=pd.DataFrame())
1556
- scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Scores')
1557
- filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Filters')
 
1559
  with gr.Row():
1560
- # clear_btn = gr.ClearButton(size='lg')
1561
- analyze_btn = gr.Button('Preview Top 30 Records', variant='primary', size='lg',
1562
- interactive=False)
1563
 
1564
  with gr.Row():
1565
  with gr.Column(scale=3):
1566
  html_report = gr.HTML() # label='Results', visible=True)
1567
- ranking_pie_chart = gr.Plot(visible=False)
1568
 
1569
  with gr.Row():
1570
  with gr.Column():
@@ -1584,8 +1783,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1584
  if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
1585
 
1586
  You will be redirected to Chemical Property Report for carrying out further analysis and
1587
- generating the full report if the job is done. If the Lookup fails to respond, please come back
1588
- in five minutes, refresh the page, and try again.
1589
  ''')
1590
  with gr.Column():
1591
  pred_lookup_id = gr.Textbox(
@@ -1689,8 +1888,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1689
 
1690
  def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
1691
  try:
1692
- aligner = PairwiseAligner(scoring='blastp', mode='local')
1693
- alignment_df = pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv')
1694
 
1695
  processed_fasta = process_target_fasta(fasta)
1696
 
@@ -1698,18 +1897,20 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1698
  exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
1699
  if not exact_match.empty:
1700
  row = exact_match.iloc[0]
1701
- return gr.Dropdown(value=row['protein_family'],
1702
- info=f"Reason: Exact match found with {row['ID2']} from family {row['protein_family']}")
 
1703
 
1704
  # If no exact match, then calculate alignment score
1705
  def align_score(query):
1706
- return aligner.align(processed_fasta, query).score
 
1707
 
1708
  alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
1709
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1710
- return gr.Dropdown(value=row['protein_family'],
1711
- info=f"Reason: Best BLASTP score ({row['score']}) "
1712
- f"with {row['ID2']} from family {row['protein_family']}")
1713
  except Exception as e:
1714
  gr.Warning("Failed to detect the protein family due to error: " + str(e))
1715
 
@@ -1772,7 +1973,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1772
  scenario_general = "Unseen Target"
1773
 
1774
  seen_targets_family = pd.read_csv(
1775
- f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family]}_{task.lower()}_random_split.csv')
1776
  if process_target_fasta(fasta) in seen_targets_family['X2'].values:
1777
  scenario_family = "Seen Target"
1778
  else:
@@ -1787,12 +1988,16 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1787
  filtered_df = pd.concat([filtered_df_general, filtered_df_family])
1788
 
1789
  row = filtered_df.loc[filtered_df[score].idxmax()]
1790
 
1791
  return {drug_screen_preset:
1792
  gr.Dropdown(value=row['Model'],
1793
  info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
1794
- f"model with the best {score} ({float(row[score]):.3f}) "
1795
- f"in the {row['Scenario']} scenario on {row['Family']}."),
1796
  drug_screen_target_family:
1797
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
1798
 
@@ -1848,9 +2053,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1848
  gr.Warning('Please enter a valid SMILES for model recommendation.')
1849
  return None
1850
 
1851
- seen_drugs = pd.read_csv(
1852
- f'data/benchmarks/seen_drugs/all_families_full_{task.lower()}_random_split.csv')
1853
- if rdkit_canonicalize(smiles) in seen_drugs['X1'].values:
1854
  scenario = "Seen Compound"
1855
  else:
1856
  scenario = "Unseen Compound"
@@ -1863,8 +2068,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1863
 
1864
  return gr.Dropdown(value=row['Model'],
1865
  info=f"Reason: {scenario} in training; choosing the model "
1866
- f"with the best {score} ({float(row[score]):.3f}) "
1867
- f"in the {scenario} scenario.")
1868
 
1869
 
1870
  identify_preset_recommend_btn.click(fn=identify_recommend_model,
@@ -1965,7 +2169,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1965
 
1966
  job_id = str(uuid4())
1967
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
1968
- screen_df.to_csv(temp_file, index=False)
1969
  if temp_file.is_file():
1970
  job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
1971
  return {screen_data_for_predict: str(temp_file),
@@ -1995,7 +2199,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1995
 
1996
  job_id = str(uuid4())
1997
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
1998
- identify_df.to_csv(temp_file, index=False)
1999
  if temp_file.is_file():
2000
  job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
2001
  return {identify_data_for_predict: str(temp_file),
@@ -2043,7 +2247,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2043
  f'than the allowed maximum {DATASET_MAX_LEN}.')
2044
 
2045
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2046
- infer_df.to_csv(temp_file, index=False)
2047
 
2048
  else:
2049
  raise gr.Error('Should upload a compound-protein pair dataset, or '
@@ -2093,10 +2297,54 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2093
  drug_screen_click.success(
2094
  fn=submit_predict,
2095
  inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
2096
- drug_screen_target_family, run_state, ],
2097
  outputs=[run_state, ]
2098
  )
2099
 
2100
  target_identify_click = target_identify_btn.click(
2101
  fn=target_identify_validate,
2102
  inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
@@ -2125,7 +2373,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2125
  target_identify_click.success(
2126
  fn=submit_predict,
2127
  inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
2128
- target_identify_target_family, run_state, ], # , target_identify_email],
2129
  outputs=[run_state, ]
2130
  )
2131
 
@@ -2200,6 +2448,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2200
  report_df_change = file_for_report.change(
2201
  fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
2202
  concurrency_limit=100,
2203
  )
2204
 
2205
  file_for_report.upload(
@@ -2214,8 +2465,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2214
  file_for_report.clear(
2215
  fn=lambda: [gr.Button(interactive=False)] * 3 +
2216
  [gr.File(visible=False, value=None)] * 2 +
2217
- [gr.Dropdown(visible=False, value=None),
2218
- gr.HTML(visible=False)],
2219
  outputs=[
2220
  csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file, report_task, html_report
2221
  ]
@@ -2234,11 +2485,23 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2234
  outputs=analyze_btn)
2235
 
2236
 
2237
- def create_csv_report_file(df, file_report, progress=gr.Progress(track_tqdm=True)):
2238
  try:
2239
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2240
- filename = f"/data/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
2241
- df.drop(labels=['Compound', 'Scaffold'], axis=1).to_csv(filename, index=True, na_rep='')
 
 
2242
 
2243
  return gr.File(filename)
2244
  except Exception as e:
@@ -2246,28 +2509,32 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2246
  return None
2247
 
2248
 
2249
- def create_html_report_file(df, file_report, task, progress=gr.Progress(track_tqdm=True)):
2250
  try:
2251
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2252
- filename = f"/data/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html"
2253
- create_html_report(df, filename, task)
2254
  return gr.File(filename, visible=True)
2255
  except Exception as e:
2256
  gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
2257
  return None
2258
 
2259
 
2260
- html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
 
2261
  csv_generate.click(
2262
- lambda: [gr.Button(visible=False), gr.File(visible=True)], outputs=[csv_generate, csv_download_file],
2263
- ).then(fn=create_csv_report_file, inputs=[report_df, file_for_report],
2264
  outputs=csv_download_file, show_progress='full')
2265
  html_generate.click(
2266
- lambda: [gr.Button(visible=False), gr.File(visible=True)], outputs=[html_generate, html_download_file],
2267
- ).then(fn=create_html_report_file, inputs=[report_df, file_for_report, report_task],
2268
  outputs=html_download_file, show_progress='full')
2269
 
 
2270
  if __name__ == "__main__":
 
2271
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
2272
- pandarallel.initialize(progress_bar=True)
2273
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
 
 
 
27
  import requests
28
  from requests.adapters import HTTPAdapter, Retry
29
  from markdown import markdown
30
+ from rdkit import Chem, DataStructs
31
+ from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen, AllChem
32
  from rdkit.Chem.Scaffolds import MurckoScaffold
33
  import seaborn as sns
34
 
 
196
  'Nuclear Receptor': 'nuclear_receptor',
197
  'Ion Channel': 'ion_channel',
198
  'Others': 'others',
199
+ # 'general': 'general',
200
+ # 'kinase': 'kinase',
201
+ # 'non-kinase enzyme': 'non_kinase_enzyme',
202
+ # 'membrane receptor': 'membrane_receptor',
203
+ # 'nuclear Receptor': 'nuclear_receptor',
204
+ # 'ion channel': 'ion_channel',
205
+ # 'others': 'others',
206
  }
207
 
208
  TARGET_LIBRARY_MAP = {
 
254
  # Delete the job from the database
255
  db.remove(Job.id == job_id)
256
  # Delete the corresponding files
257
+ files = glob.glob(f"{SERVER_DATA_DIR}/{job_id}*")
258
  for file_path in files:
259
  if os.path.exists(file_path):
260
  os.remove(file_path)
 
272
  # Delete the job from the database
273
  db.remove(Job.id == job['id'])
274
  # Delete the corresponding file
275
+ files = glob.glob(f"{SERVER_DATA_DIR}/{job['id']}*")
276
  for file_path in files:
277
  if os.path.exists(file_path):
278
  os.remove(file_path)
 
285
  send_email(job)
286
 
287
 
288
+ @cache
289
+ def max_tanimoto_similarity(smi, seen_smiles):
290
+ if smi is None:
291
+ return 0
292
+ mol = Chem.MolFromSmiles(smi)
293
+ if mol is None:
294
+ return 0
295
+ mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
296
+ max_sim = 0
297
+ for smiles in seen_smiles:
298
+ mol_seen = Chem.MolFromSmiles(smiles)
299
+ mol_seen_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_seen, radius=2, nBits=2048)
300
+ sim = DataStructs.TanimotoSimilarity(mol_ecfp, mol_seen_ecfp)
301
+ if sim == 1:
302
+ return 1
303
+ max_sim = max(sim, max_sim)
304
+ return max_sim
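The max_tanimoto_similarity helper above compares radius-2, 2048-bit Morgan fingerprints with Tanimoto similarity and short-circuits on an exact match. A minimal standalone sketch of the same idea (hypothetical SMILES; it uses RDKit's bulk similarity call instead of the helper's explicit loop):

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

query = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")  # aspirin
seen = [Chem.MolFromSmiles(s) for s in ("c1ccccc1O", "CCO")]

# Radius-2 (ECFP4-like), 2048-bit Morgan fingerprints, as in the helper above
query_fp = AllChem.GetMorganFingerprintAsBitVect(query, radius=2, nBits=2048)
seen_fps = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2, nBits=2048) for m in seen]

# One Tanimoto similarity per reference fingerprint; keep the maximum
max_sim = max(DataStructs.BulkTanimotoSimilarity(query_fp, seen_fps))
print(round(max_sim, 3))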
305
+
306
+
307
+ @cache
308
+ def max_sequence_identity(seq, seen_fastas):
309
+ if seq is None:
310
+ return 0
311
+ aligner = PairwiseAligner()
312
+ aligner.mode = 'local'
313
+ max_id = 0
314
+ for fasta in seen_fastas:
315
+ alignment = aligner.align(seq, fasta)
316
+ identity = alignment.score / max(len(seq), len(fasta))
317
+ if identity == 1:
318
+ return 1
319
+ max_id = max(identity, max_id)
320
+ return max_id
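max_sequence_identity above approximates percent identity by dividing a local alignment score by the length of the longer sequence; with Biopython's PairwiseAligner defaults (match = 1, mismatch = 0, no gap penalties) that score is roughly the number of identically aligned residues. A minimal sketch with hypothetical sequences:

from Bio.Align import PairwiseAligner

aligner = PairwiseAligner()
aligner.mode = "local"

query = "MKTAYIAKQR"
reference = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"

# Local alignment score normalized by the longer sequence, as in the helper above
score = aligner.align(query, reference).score
identity = score / max(len(query), len(reference))
print(round(identity, 3))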
321
+
322
+
323
+ @cache
324
+ def get_seen_smiles(family, task):
325
+ seen_smiles = pd.read_csv(
326
+ f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
327
+ return seen_smiles['X1'].tolist()
328
+
329
+
330
+ @cache
331
+ def get_seen_fastas(family, task):
332
+ seen_fastas = pd.read_csv(
333
+ f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
334
+ return seen_fastas['X2'].tolist()
335
+
336
+
337
+ @cache
338
+ def get_fasta_family_map():
339
+ usecols = ['X2', 'ID2', 'Target Family']
340
+ fasta_family_map = pd.concat([
341
+ pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv', usecols=usecols),
342
+ pd.read_csv('data/target_libraries/idmapping_not_in_chembl.csv', usecols=usecols)
343
+ ]).drop_duplicates(subset=['X2'], keep='first')
344
+ return fasta_family_map
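These helpers are wrapped in @cache (presumably functools.cache), which hashes its arguments; that is why call sites later in this commit pass seen_smiles and seen_fastas as tuple(...) rather than lists. A small illustrative sketch (count_unique and smiles_list are hypothetical names):

from functools import cache

@cache
def count_unique(items: tuple) -> int:
    return len(set(items))

smiles_list = ["CCO", "c1ccccc1", "CCO"]
print(count_unique(tuple(smiles_list)))  # lists are unhashable, so convert first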
345
 
346
 
347
  def lipinski(mol):
 
697
  raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
698
 
699
 
700
+ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
701
  job_id = state['id']
702
  status = "RUNNING"
703
  error = None
704
  task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
705
  predictions_file = None
706
+
707
+ df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
708
+ orig_df = pd.read_csv(predict_filepath)
709
+ alignment_df = get_fasta_family_map()
710
+ prediction_df = pd.DataFrame()
711
+
712
+ @cache
713
+ def detect_family(query):
714
+ # Check for an exact match first
715
+ exact_match = alignment_df[alignment_df['X2'] == query]
716
+ if not exact_match.empty:
717
+ row = exact_match.iloc[0]
718
+ return row['Target Family']
719
+ # If no exact match, then calculate alignment score
720
+ else:
721
+ aligner = PairwiseAligner(mode='local')
722
+
723
+ def align_score(target):
724
+ alignment = aligner.align(query, target)
725
+ return alignment.score / max(len(query), len(target))
726
+
727
+ alignment_df['score'] = alignment_df['X2'].apply(align_score)
728
+ row = alignment_df.loc[alignment_df['score'].idxmax()]
729
+ return row['Target Family']
730
+
731
+ if 'Target Family' not in orig_df.columns:
732
+ orig_df['Target Family'] = None
733
+ orig_df.loc[
734
+ orig_df['Target Family'].isna(), 'Target Family'
735
+ ] = orig_df.loc[
736
+ orig_df['Target Family'].isna(), 'X2'
737
+ ].parallel_apply(detect_family)
738
+
739
+ detect_family.cache_clear()
740
+
741
+ orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
742
+ annotated_df = orig_df[~orig_df['Y'].isna()].copy()
743
+ annotated_df.rename(columns={'Y': 'Y^'}, inplace=True)
744
+ annotated_df['Prediction Source'] = 'Training Data'
745
+ # Resave the unannotated data
746
+ unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y', 'Target Family'], axis=1)
747
+ if not unannotated_df.empty:
748
+ unannotated_df.to_csv(predict_filepath, index=False, na_rep='')
749
+ else:
750
+ annotated_df.to_csv(predictions_file, index=False, na_rep='')
751
+ status = "COMPLETED"
752
+ return {run_state: False}
753
+
754
+ columns_to_drop = ['ID1', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^']
755
+ columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns]
756
+ orig_df.drop(columns_to_drop, axis=1, inplace=True)
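The block above splits the input into pairs already labelled in the training data (served back directly as "Training Data" predictions) and pairs that still need model inference. A minimal pandas sketch of that split, with hypothetical rows:

import pandas as pd

queries = pd.DataFrame({"X1": ["CCO", "CCN"], "X2": ["MKTAYIAKQR", "MKTAYIAKQR"]})
training = pd.DataFrame({"X1": ["CCO"], "X2": ["MKTAYIAKQR"], "Y": [0.92]})

# Left merge on (X1, X2) pulls in known labels where they exist
merged = queries.merge(training, on=["X1", "X2"], how="left")
annotated = merged[merged["Y"].notna()].rename(columns={"Y": "Y^"})
unannotated = merged[merged["Y"].isna()].drop(columns=["Y"])

print(annotated)    # answered from training data
print(unannotated)  # written back out as the model's input file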
757
+
758
  try:
759
+ if target_family != 'Family-Specific Auto-Recommendation':
760
+ target_family_value = TARGET_FAMILY_MAP[target_family.title()]
761
+ task_value = TASK_MAP[task]
762
+ preset_value = PRESET_MAP[preset]
763
+ predictions_file = (f'{SERVER_DATA_DIR}/'
764
+ f'{job_id}_{task_file_abbr[task]}_{preset_value}_{target_family_value}_predictions.csv')
765
+
766
+ cfg = hydra.compose(
767
+ config_name="webserver_inference",
768
+ overrides=[f"task={task_value}",
769
+ f"preset={preset_value}",
770
+ f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt",
771
+ f"data.data_file='{str(predict_filepath)}'"])
772
+
773
+ predictions, _ = predict(cfg)
774
+ predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
775
+ predictions['Prediction Source'] = f'{preset} ({target_family})'
776
+ prediction_df = pd.concat([prediction_df, predictions])
777
 
778
+ else:
779
+ predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_auto_predictions.csv'
780
+ task_value = TASK_MAP[task]
781
+ score = TASK_METRIC_MAP[task]
782
+ benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
783
+ predict_df = pd.read_csv(predict_filepath)
784
+
785
+ for family, subset in predict_df.groupby('Target Family'):
786
+ predict_subset_filepath = f'{SERVER_DATA_DIR}/{job_id}_{family}_input.csv'
787
+ subset.to_csv(predict_subset_filepath, index=False, na_rep='')
788
+ seen_compounds = get_seen_smiles(family, task_value)
789
+
790
+ if subset['X1'].iloc[0] in seen_compounds:
791
+ scenario = "Seen Compound"
792
+ else:
793
+ scenario = "Unseen Compound"
794
+
795
+ filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
796
+ & (benchmark_df['Scenario'] == scenario)]
797
+
798
+ preset = filtered_df.loc[filtered_df[score].idxmax(), 'preset']
799
+ preset_value = PRESET_MAP[preset]
800
+
801
+ target_family = TARGET_FAMILY_MAP[family.title()]
802
+ cfg = hydra.compose(
803
+ config_name="webserver_inference",
804
+ overrides=[f"task={task_value}",
805
+ f"preset={preset_value}",
806
+ f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
807
+ f"data.data_file='{str(predict_subset_filepath)}'"])
808
+
809
+ predictions, _ = predict(cfg)
810
+ predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
811
+ predictions['Prediction Source'] = f'{preset} ({family})'
812
+ prediction_df = pd.concat([prediction_df, predictions])
813
+
814
+ prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
815
+ prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
816
+
817
+ # prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
818
+ # lambda group: group.parallel_apply(
819
+ # max_tanimoto_similarity,
820
+ # seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
821
+ # )
822
+ # ).values
823
+ #
824
+ # prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
825
+ # lambda group: group.parallel_apply(
826
+ # max_sequence_identity,
827
+ # seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
828
+ # )
829
+ # ).values
830
+ if "Include Max. Tanimoto Similarity" in opts:
831
+ for family in prediction_df['Target Family'].unique():
832
+ prediction_df.loc[
833
+ prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = prediction_df.loc[
834
+ prediction_df['Target Family'] == family, 'X1'].parallel_apply(
835
+ max_tanimoto_similarity,
836
+ seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
837
+ )
838
 
839
+ if "Include Max. Sequence Identity" in opts:
840
+ for family in prediction_df['Target Family'].unique():
841
+ prediction_df.loc[
842
+ prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = prediction_df.loc[
843
+ prediction_df['Target Family'] == family, 'X2'].parallel_apply(
844
+ max_sequence_identity,
845
+ seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
846
+ )
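The two optional columns above are filled family by family: rows are masked with .loc on 'Target Family' so each group is compared only against its own family's reference set. A small sketch of that assignment pattern (hypothetical data and a stand-in metric in place of max_tanimoto_similarity):

import pandas as pd

df = pd.DataFrame({"Target Family": ["Kinase", "Kinase", "Ion Channel"],
                   "X1": ["CCO", "CCN", "c1ccccc1"]})
reference = {"Kinase": ("CCO",), "Ion Channel": ("CCCC",)}

for family in df["Target Family"].unique():
    mask = df["Target Family"] == family
    # Stand-in metric: 1.0 if the compound is in the family's reference set
    df.loc[mask, "Max. Tanimoto Similarity"] = df.loc[mask, "X1"].apply(
        lambda smi: 1.0 if smi in reference[family] else 0.0)
print(df)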
847
 
848
+ prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
849
  status = "COMPLETED"
850
 
851
  return {run_state: False}
 
885
  task = 'Compound-Protein Binding Affinity'
886
 
887
  df = pd.read_csv(file)
888
+
889
  if 'N' in df.columns:
890
  df.set_index('N', inplace=True)
891
+
892
  if not any(col in ['X1', 'X2'] for col in df.columns):
893
  gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
894
  return {analyze_btn: gr.Button(interactive=False)}
895
+
896
  if 'X1' in df.columns:
 
897
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
898
  df['Compound'] = df['X1'].parallel_apply(
899
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
900
+ df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
901
+ df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
902
+
903
 
904
  # DF_FOR_REPORT = df.copy()
905
 
 
925
  return {analyze_btn: gr.Button(interactive=False)}
926
 
927
 
928
+ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)):
929
  df_html = df.copy(deep=True)
930
  column_aliases = COLUMN_ALIASES.copy()
931
  cols_left = list(pd.Index(
 
936
  if isinstance(task, str):
937
  column_aliases.update({
938
  'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
939
+ else 'Actual Binding Affinity pIC50 [nM]',
940
  'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
941
+ else 'Predicted Binding Affinity (pIC50 [nM])'
942
  })
943
 
944
  ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
 
976
 
977
  elif 'Y^' in df_html.columns:
978
  job = 'Interaction Pair Inference'
979
+ if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
980
  df_html['Compound'] = df_html['Compound'].parallel_apply(
981
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
982
+ else:
983
+ df_html.drop(['Compound'], axis=1, inplace=True)
984
+
985
+ if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
986
  df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
987
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
988
+ else:
989
+ df_html.drop(['Scaffold'], axis=1, inplace=True)
990
 
991
  df_html.rename(columns=column_aliases, inplace=True)
992
  df_html.index.name = 'Index'
 
1454
  "Interaction prediction provides you binding probability score between the target of "
1455
  "interest and each compound in the library, "
1456
  "while affinity prediction directly estimates their binding strength measured using "
1457
+ "pIC<sub>50</sub> in units of nM."
1458
  )
1459
  drug_screen_task = gr.Dropdown(
1460
  list(TASK_MAP.keys()),
 
1491
  drug_library_upload_btn = gr.UploadButton(
1492
  label='OR Upload Your Own Library', variant='primary')
1493
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1494
+ with gr.Column():
1495
+ drug_screen_opts = gr.CheckboxGroup(
1496
+ ['Include Max. Tanimoto Similarity'],
1497
+ label='Step 6. Select Additional Options',
1498
+ info="Calculating the maximum Tanimoto similarity of the library compounds to the "
1499
+ "training dataset is an experimental feature and may take a considerable amount of time."
1500
+ )
1501
  with gr.Row():
1502
  with gr.Column():
1503
  drug_screen_email = gr.Textbox(
1504
+ label='Step 7. Input Your Email Address (Optional)',
1505
  info="Your email address will be used to notify you of the status of your job. "
1506
  "If you cannot receive the email, please check your spam/junk folder."
1507
  )
1508
 
1509
  with gr.Row(visible=True):
1510
  with gr.Column():
1511
+ drug_screen_clr_btn = gr.ClearButton(size='lg')
1512
  drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
1513
  # TODO Modify the pd df directly with df['X2'] = target
1514
 
 
1544
  example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
1545
 
1546
  with gr.Row():
1547
+ with gr.Column(visible=True):
1548
  HelpTip(
1549
  "By default, models trained on all protein families (general) will be applied. "
1550
+ "If you upload a target library containing proteins all in the same family, "
1551
+ "you may manually select a Target Family."
1552
  )
1553
  target_identify_target_family = gr.Dropdown(
1554
+ choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()),
1555
+ value='General',
1556
+ label='Step 2. Select Target Family')
 
1557
  with gr.Column():
1558
  HelpTip(
1559
  "Interaction prediction provides you binding probability score between the target of "
1560
  "interest and each compound in the library, while affinity prediction directly "
1561
+ "estimates their binding strength measured using pIC<sub>50</sub> in units of nM."
1562
  )
1563
  target_identify_task = gr.Dropdown(
1564
  list(TASK_MAP.keys()),
1565
+ label='Step 3. Select a Prediction Task',
1566
  value='Compound-Protein Interaction')
1567
 
1568
  with gr.Column():
 
1573
  "Please refer to the documentation for detailed benchmark results."
1574
  )
1575
  target_identify_preset = gr.Dropdown(
1576
+ ['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()),
1577
+ label='Step 4. Select a Preset Model')
1578
  identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
1579
  variant='primary')
1580
  with gr.Row():
 
1587
  "and can be downloaded by clicking the lower right corner."
1588
  )
1589
  target_library = gr.Dropdown(
1590
+ label='Step 5. Select a Preset Target Library',
1591
  choices=list(TARGET_LIBRARY_MAP.keys()))
1592
  with gr.Row():
1593
  gr.File(label='Example FASTA target library',
 
1598
  label='OR Upload Your Own Library', variant='primary')
1599
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1600
 
1601
+ with gr.Column():
1602
+ target_identify_opts = gr.CheckboxGroup(
1603
+ ['Include Max. Sequence Identity'],
1604
+ label='Step 6. Select Additional Options',
1605
+ info="Calculating the maximum sequence identity of the library protein to the "
1606
+ "training dataset is an experimental feature and may take a considerable amount of time."
1607
+ )
1608
  with gr.Row():
1609
  with gr.Column():
1610
  target_identify_email = gr.Textbox(
1611
+ label='Step 7. Input Your Email Address (Optional)',
1612
  info="Your email address will be used to notify you of the status of your job. "
1613
  "If you cannot receive the email, please check your spam/junk folder."
1614
  )
1615
 
1616
  with gr.Row(visible=True):
1617
+ target_identify_clr_btn = gr.ClearButton(size='lg')
1618
  target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
1619
  size='lg')
1620
 
 
1692
  "Interaction prediction provides you binding probability score "
1693
  "between the target of interest and each compound in the library, "
1694
  "while affinity prediction directly estimates their binding strength "
1695
+ "measured using pIC<sub>50</sub> in units of nM."
1696
  )
1697
  pair_infer_task = gr.Dropdown(
1698
  list(TASK_MAP.keys()),
 
1716
  "If you cannot receive the email, please check your spam/junk folder.")
1717
 
1718
  with gr.Row(visible=True):
1719
+ pair_infer_clr_btn = gr.ClearButton(size='lg')
1720
  pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
1721
 
1722
  infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
 
1737
  Please first `Preview` the report, then `Generate` and download a CSV report
1738
  or an interactive HTML report below if you wish to access the full report.
1739
  ''')
1740
+ raw_df = gr.State(value=pd.DataFrame())
1741
+ report_df = gr.State(value=pd.DataFrame())
1742
  with gr.Row():
1743
+ with gr.Column(scale=1):
1744
  file_for_report = gr.File(interactive=True, type='filepath')
1745
  report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
1746
+ label='Specify the Task Labels in the Uploaded Dataset')
1747
+ with gr.Column(scale=2):
1748
+ with gr.Row():
1749
+ scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
1750
+ filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
1751
+ with gr.Accordion('Report Generate Options', open=False):
1752
+ with gr.Row():
1753
+ csv_sep = gr.Radio(label='CSV Delimiter',
1754
+ choices=['Comma', 'Tab'], value='Comma')
1755
+ html_opts = gr.CheckboxGroup(label='HTML Report Options',
1756
+ choices=['Exclude Molecular Graph', 'Exclude Scaffold Graph'])
1757
 
1758
  with gr.Row():
1759
+ report_clr_btn = gr.ClearButton(size='lg')
1760
+ analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary',
1761
+ size='lg', interactive=False)
1762
 
1763
  with gr.Row():
1764
  with gr.Column(scale=3):
1765
  html_report = gr.HTML() # label='Results', visible=True)
1766
+ ranking_pie_chart = gr.Plot(visible=False)
1767
 
1768
  with gr.Row():
1769
  with gr.Column():
 
1783
  if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
1784
 
1785
  You will be redirected to Chemical Property Report for carrying out further analysis and
1786
+ generating the full report when the job is done. If the Lookup fails to respond, please wait for a
1787
+ few minutes and refresh the page to try again.
1788
  ''')
1789
  with gr.Column():
1790
  pred_lookup_id = gr.Textbox(
 
1888
 
1889
  def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
1890
  try:
1891
+ aligner = PairwiseAligner(mode='local')
1892
+ alignment_df = get_fasta_family_map()
1893
 
1894
  processed_fasta = process_target_fasta(fasta)
1895
 
 
1897
  exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
1898
  if not exact_match.empty:
1899
  row = exact_match.iloc[0]
1900
+ return gr.Dropdown(
1901
+ value=row['Target Family'],
1902
+ info=f"Reason: Exact match found with {row['ID2']} from family {row['Target Family']}")
1903
 
1904
  # If no exact match, then calculate alignment score
1905
  def align_score(query):
1906
+ alignment = aligner.align(processed_fasta, query)
1907
+ return alignment.score / max(len(processed_fasta), len(query))
1908
 
1909
  alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
1910
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1911
+ return gr.Dropdown(value=row['Target Family'],
1912
+ info=f"Reason: Best sequence identity ({row['score']}) "
1913
+ f"with {row['ID2']} from family {row['Target Family']}")
1914
  except Exception as e:
1915
  gr.Warning("Failed to detect the protein family due to error: " + str(e))
1916
 
 
1973
  scenario_general = "Unseen Target"
1974
 
1975
  seen_targets_family = pd.read_csv(
1976
+ f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
1977
  if process_target_fasta(fasta) in seen_targets_family['X2'].values:
1978
  scenario_family = "Seen Target"
1979
  else:
 
1988
  filtered_df = pd.concat([filtered_df_general, filtered_df_family])
1989
 
1990
  row = filtered_df.loc[filtered_df[score].idxmax()]
1991
+ if row['Scenario'] == 'Seen Target':
1992
+ scenario = "Seen Target (>=0.85 sequence identity)"
1993
+ elif row['Scenario'] == 'Unseen Target':
1994
+ scenario = "Unseen Target (<0.85 sequence identity)"
1995
 
1996
  return {drug_screen_preset:
1997
  gr.Dropdown(value=row['Model'],
1998
  info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
1999
+ f"model with the best {score} in the {scenario} scenario "
2000
+ f"on {row['Family']}."),
2001
  drug_screen_target_family:
2002
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
2003
 
 
2053
  gr.Warning('Please enter a valid SMILES for model recommendation.')
2054
  return None
2055
 
2056
+ seen_compounds = pd.read_csv(
2057
+ f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv')
2058
+ if rdkit_canonicalize(smiles) in seen_compounds['X1'].values:
2059
  scenario = "Seen Compound"
2060
  else:
2061
  scenario = "Unseen Compound"
 
2068
 
2069
  return gr.Dropdown(value=row['Model'],
2070
  info=f"Reason: {scenario} in training; choosing the model "
2071
+ f"with the best {score} in the {scenario} scenario.")
 
2072
 
2073
 
2074
  identify_preset_recommend_btn.click(fn=identify_recommend_model,
 
2169
 
2170
  job_id = str(uuid4())
2171
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2172
+ screen_df.to_csv(temp_file, index=False, na_rep='')
2173
  if temp_file.is_file():
2174
  job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
2175
  return {screen_data_for_predict: str(temp_file),
 
2199
 
2200
  job_id = str(uuid4())
2201
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2202
+ identify_df.to_csv(temp_file, index=False, na_rep='')
2203
  if temp_file.is_file():
2204
  job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
2205
  return {identify_data_for_predict: str(temp_file),
 
2247
  f'than the allowed maximum {DATASET_MAX_LEN}.')
2248
 
2249
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2250
+ infer_df.to_csv(temp_file, index=False, na_rep='')
2251
 
2252
  else:
2253
  raise gr.Error('Should upload a compound-protein pair dataset, or '
 
2297
  drug_screen_click.success(
2298
  fn=submit_predict,
2299
  inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
2300
+ drug_screen_target_family, drug_screen_opts, run_state, ],
2301
  outputs=[run_state, ]
2302
  )
2303
 
2304
+ drug_screen_clr_btn.click(
2305
+ lambda: ['General'] + [None] * 5,
2306
+ outputs=[drug_screen_target_family,
2307
+ target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email])
2308
+
2309
+ target_identify_clr_btn.click(
2310
+ lambda: ['General'] + [None] * 5,
2311
+ outputs=[target_identify_target_family,
2312
+ compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email])
2313
+
2314
+ pair_infer_clr_btn.click(
2315
+ lambda: ['General'] + [None] * 4,
2316
+ outputs=[pair_infer_target_family,
2317
+ infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_email])
2318
+
2319
+ report_clr_btn.click(
2320
+ lambda: ['General'] + [None] * 4,
2321
+ outputs=[scores,
2322
+ target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email])
2323
+
2324
+
2325
+ def update_preset(family, preset):
2326
+ if family == 'Family-Specific Auto-Recommendation':
2327
+ return 'Family-Specific Auto-Recommendation'
2328
+ elif preset == 'Family-Specific Auto-Recommendation':
2329
+ return None
2330
+ else:
2331
+ return preset
2332
+
2333
+ def update_family(family, preset):
2334
+ if preset == 'Family-Specific Auto-Recommendation':
2335
+ return 'Family-Specific Auto-Recommendation'
2336
+ elif family == 'Family-Specific Auto-Recommendation':
2337
+ return None
2338
+ else:
2339
+ return family
2340
+
2341
+ target_identify_target_family.change(
2342
+ fn=update_preset, inputs=[target_identify_target_family, target_identify_preset],
2343
+ outputs=target_identify_preset, show_progress='hidden')
2344
+ target_identify_preset.change(
2345
+ fn=update_family, inputs=[target_identify_target_family, target_identify_preset],
2346
+ outputs=target_identify_target_family, show_progress='hidden')
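update_preset and update_family keep the Target Family and Preset dropdowns mutually consistent: picking 'Family-Specific Auto-Recommendation' in one forces it in the other, and switching away clears the counterpart. A stripped-down, standalone Gradio sketch of the same wiring (component names and choices here are hypothetical):

import gradio as gr

AUTO = "Family-Specific Auto-Recommendation"

def sync_other(this_value, other_value):
    if this_value == AUTO:
        return AUTO
    if other_value == AUTO:
        return None
    return other_value

with gr.Blocks() as sync_demo:
    family = gr.Dropdown([AUTO, "Kinase", "General"], label="Target Family")
    preset = gr.Dropdown([AUTO, "Preset A", "Preset B"], label="Preset Model")
    # Each change hands (changed value, counterpart value) to the same handler
    family.change(sync_other, inputs=[family, preset], outputs=preset, show_progress="hidden")
    preset.change(sync_other, inputs=[preset, family], outputs=family, show_progress="hidden")

# sync_demo.launch()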
2347
+
2348
  target_identify_click = target_identify_btn.click(
2349
  fn=target_identify_validate,
2350
  inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
 
2373
  target_identify_click.success(
2374
  fn=submit_predict,
2375
  inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
2376
+ target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email],
2377
  outputs=[run_state, ]
2378
  )
2379
 
 
2448
  report_df_change = file_for_report.change(
2449
  fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
2450
  concurrency_limit=100,
2451
+ ).then(
2452
+ fn=lambda: [gr.Button(interactive=True)] * 2,
2453
+ outputs=[csv_generate, html_generate],
2454
  )
2455
 
2456
  file_for_report.upload(
 
2465
  file_for_report.clear(
2466
  fn=lambda: [gr.Button(interactive=False)] * 3 +
2467
  [gr.File(visible=False, value=None)] * 2 +
2468
+ [gr.Dropdown(visible=False, value=None), gr.HTML(visible=False)],
2469
+ cancels=[report_df_change],
2470
  outputs=[
2471
  csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file, report_task, html_report
2472
  ]
 
2485
  outputs=analyze_btn)
2486
 
2487
 
2488
+ def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
2489
+ csv_sep_map = {
2490
+ 'Comma': ',',
2491
+ 'Tab': '\t',
2492
+ }
2493
+ Y_colname = 'Y^'
2494
+ if isinstance(task, str):
2495
+ if task == 'Compound-Protein Interaction':
2496
+ Y_colname = 'Y^_pIC50'
2497
+ elif task == 'Compound-Protein Binding Affinity':
2498
+ Y_colname = 'Y^_prob'
2499
  try:
2500
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2501
+ filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
2502
+ df.rename(columns={'Y^': Y_colname}).drop(
2503
+ labels=['Compound', 'Scaffold'], axis=1
2504
+ ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep])
2505
 
2506
  return gr.File(filename)
2507
  except Exception as e:
 
2509
  return None
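For the CSV path above, the delimiter radio maps straight onto pandas' sep argument, and the RDKit image columns are dropped before export. A compact sketch with a hypothetical frame and output path:

import pandas as pd

csv_sep_map = {"Comma": ",", "Tab": "\t"}
df = pd.DataFrame({"X1": ["CCO"], "Y^": [0.87], "Compound": [None], "Scaffold": [None]})

out = (df.rename(columns={"Y^": "Y^_prob"})        # task-dependent column name
         .drop(labels=["Compound", "Scaffold"], axis=1))
out.to_csv("report.csv", index=False, na_rep="", sep=csv_sep_map["Comma"])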
2510
 
2511
 
2512
+ def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)):
2513
  try:
2514
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2515
+ filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html"
2516
+ create_html_report(df, filename, task, opts)
2517
  return gr.File(filename, visible=True)
2518
  except Exception as e:
2519
  gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
2520
  return None
2521
 
2522
 
2523
+ # html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
2524
+
2525
  csv_generate.click(
2526
+ lambda: [gr.File(visible=True)], outputs=[csv_download_file],
2527
+ ).then(fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep],
2528
  outputs=csv_download_file, show_progress='full')
2529
  html_generate.click(
2530
+ lambda: [gr.File(visible=True)], outputs=[html_download_file],
2531
+ ).then(fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts],
2532
  outputs=html_download_file, show_progress='full')
2533
 
2534
+
2535
  if __name__ == "__main__":
2536
+ pandarallel.initialize()
2537
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
 
2538
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
2539
+ scheduler.add_job(check_expiry, 'interval', hours=1)
2540
+ scheduler.start()
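The cleanup job here is registered after demo.queue(...).launch(...), which blocks by default, so in practice a background scheduler is usually started before the blocking call (or launch is made non-blocking). A minimal sketch assuming APScheduler's BackgroundScheduler and the check_expiry/demo objects defined above:

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.add_job(check_expiry, "interval", hours=1)
scheduler.start()  # runs in a daemon thread alongside the web server

demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
scheduler.shutdown()  # reached once the Gradio server exits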