LTEnjoy commited on
Commit
52da96f
1 Parent(s): 973b006

Upload 21 files

Browse files
bin/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Place the Foldseek binary file here
demo/__init__.py ADDED
File without changes
demo/config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_dir: /sujin/Models/ProTrek/ProTrek_650M_UniRef50
2
+ faiss_config:
3
+ IO_FLAG_MMAP: True
4
+ sequence_index_dir:
5
+ - name: UniRef50
6
+ index_dir: /mnt/5t/faiss_index/UniRef50/ProTrek_650M_UniRef50/sequence
7
+ - name: Swiss-Prot
8
+ index_dir: /sujin/Datasets/ProTrek/faiss_index/SwissProt/ProTrek_650M_UniRef50/sequence
9
+ - name: PDB
10
+ index_dir: /sujin/Datasets/ProTrek/faiss_index/PDB/ProTrek_650M_UniRef50/sequence
11
+ - name: Uncharacterized
12
+ index_dir: /mnt/5t/faiss_index/Uncharacterized/ProTrek_650M_UniRef50/sequence
13
+
14
+ structure_index_dir:
15
+ - name: Swiss-Prot
16
+ index_dir: /sujin/Datasets/ProTrek/faiss_index/SwissProt/ProTrek_650M_UniRef50/structure
17
+ - name: PDB
18
+ index_dir: /sujin/Datasets/ProTrek/faiss_index/PDB/ProTrek_650M_UniRef50/structure
19
+
20
+ text_index_dir:
21
+ - name: UniProt
22
+ index_dir: /mnt/5t/faiss_index/UniRef50/ProTrek_650M_UniRef50/text
23
+ - name: Swiss-Prot
24
+ index_dir: /sujin/Datasets/ProTrek/faiss_index/SwissProt/ProTrek_650M_UniRef50/text
25
+
26
+ #model_dir: /sujin/Models/ProTrek/ProTrek_35M_UniRef50
27
+ #
28
+ #faiss_config:
29
+ # IO_FLAG_MMAP: True
30
+ #
31
+ #sequence_index_dir:
32
+ ## - name: UniRef50
33
+ ## index_dir: /sujin/Datasets/ProTrek/faiss_index/UniRef50/ProTrek_650M_UniRef50/sequence
34
+ # - name: Swiss-Prot
35
+ # index_dir: /sujin/Datasets/ProTrek/faiss_index/SwissProt/ProTrek_650M_UniRef50/sequence
36
+ ## - name: PDB
37
+ ## index_dir: /sujin/Datasets/ProTrek/faiss_index/PDB/ProTrek_650M_UniRef50/sequence
38
+ #
39
+ #structure_index_dir:
40
+ # - name: Swiss-Prot
41
+ # index_dir: /sujin/Datasets/ProTrek/faiss_index/SwissProt/ProTrek_650M_UniRef50/structure
42
+ ## - name: PDB
43
+ ## index_dir: /sujin/Datasets/ProTrek/faiss_index/PDB/ProTrek_650M_UniRef50/structure
44
+ #
45
+ #text_index_dir:
46
+ ## - name: UniProt
47
+ ## index_dir: /sujin/Datasets/ProTrek/faiss_index/UniRef50/ProTrek_650M_UniRef50/text
48
+ # - name: Swiss-Prot
49
+ # index_dir: /sujin/Datasets/ProTrek/faiss_index/SwissProt/ProTrek_650M_UniRef50/text
demo/modules/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Package initializer for ``demo.modules``.

This file historically doubled as a CLI stub; ``main`` and ``get_args`` are
kept for backward compatibility but currently do nothing useful.
"""
import sys  # kept: was used by a removed sys.path tweak; harmless to retain

import argparse


def main():
    """Entry-point placeholder; intentionally does nothing (yet)."""
    pass


def get_args():
    """Parse command-line arguments (no options are defined yet).

    Returns:
        argparse.Namespace: The parsed (empty) argument namespace.
    """
    parser = argparse.ArgumentParser()
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    main()
demo/modules/blocks.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from utils.foldseek_util import get_struc_seq
4
+
5
+
6
+ ####################################################
7
+ # gradio blocks #
8
+ ####################################################
9
def upload_pdb_button(visible: bool = True, chain_visible: bool = True):
    """
    Build an upload button (plus a chain-selection textbox) for pdb/cif files.

    Args:
        visible: Whether the upload button is shown initially.
        chain_visible: Whether the chain textbox is shown initially.

    Returns:
        Tuple of (upload button, chain textbox) gradio components.
    """
    with gr.Column(scale=0):
        # Textbox selecting which chain gets extracted from the uploaded file
        chain_box = gr.Textbox(
            label="Chain (to be extracted from the pdb file)",
            value="A",
            visible=chain_visible,
            interactive=True,
        )

        upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", visible=visible)

    return upload_btn, chain_box
25
+
26
+
27
+ ####################################################
28
+ # Trigger functions #
29
+ ####################################################
30
def parse_pdb_file(input_type: str, file: str, chain: str) -> str:
    """
    Parse the uploaded structure file and return the requested sequence.

    Args:
        input_type: Type of input. If "sequence", the amino-acid sequence is
            returned; any other value returns the Foldseek structural sequence.

        file: Path to the uploaded .pdb/.cif file

        chain: Chain to be extracted from the structure file

    Returns:
        Protein sequence, or the lower-cased Foldseek sequence

    Raises:
        gr.Error: If the chain cannot be extracted (wrong chain id,
            unreadable file, or missing Foldseek binary).
    """
    try:
        # get_struc_seq returns a dict keyed by chain id; each value holds
        # (aa_seq, foldseek_seq, combined_seq)
        parsed_seqs = get_struc_seq("bin/foldseek", file, [chain])[chain]
    except Exception as e:
        # Chain the original exception so the real cause (bad chain id,
        # corrupt file, missing bin/foldseek) is not lost in the logs.
        raise gr.Error(f"Failed to extract chain '{chain}' from the uploaded file. "
                       f"Please check the chain id and the file, and make sure the "
                       f"Foldseek binary is placed at bin/foldseek.") from e

    if input_type == "sequence":
        return parsed_seqs[0]
    else:
        # Foldseek 3Di tokens are lower-cased for the structure encoder
        return parsed_seqs[1].lower()
53
+
54
+
55
def set_upload_visible(visible: bool) -> gr.Interface:
    """
    Toggle the visibility of the upload button.

    Args:
        visible: Whether the component should be shown.

    Returns:
        gr.Interface: A gradio update setting the component's visibility.
    """
    return gr.update(visible=visible)
demo/modules/compute_score.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+
4
+ from .init_model import model
5
+ from .blocks import upload_pdb_button, parse_pdb_file
6
+
7
+
8
# Modalities supported on either side of the comparison
input_types = ["sequence", "structure", "text"]

# Example inputs for each modality, used to populate the examples dataset.
# NOTE(review): "structure" entries look like Foldseek 3Di strings, not
# amino-acid sequences — confirm against the structure encoder's tokenizer.
input_examples = {
    "sequence": [
        "MQLQRLGAPLLKRLVGGCIRQSTAPIMPCVVVSGSGGFLTPVRTYMPLPNDQSDFSPYIEIDLPSESRIQSLHKSGLAAQEWVACEKVHGTNFGIYLINQGDHEVVRFAKRSGIMDPNENFFGYHILIDEFTAQIRILNDLLKQKYGLSRVGRLVLNGELFGAKYKHPLVPKSEKWCTLPNGKKFPIAGVQIQREPFPQYSPELHFFAFDIKYSVSGAEEDFVLLGYDEFVEFSSKVPNLLYARALVRGTLDECLAFDVENFMTPLPALLGLGNYPLEGNLAEGVVIRHVRRGDPAVEKHNVSTIIKLRCSSFMELKHPGKQKELKETFIDTVRSGALRRVRGNVTVISDSMLPQVEAAANDLLLNNVSDGRLSNVLSKIGREPLLSGEVSQVDVALMLAKDALKDFLKEVDSLVLNTTLAFRKLLITNVYFESKRLVEQKWKELMQEEAAAQSEAIPPLSPAAPTKGE",
        "MSLSTEQMLRDYPRSMQINGQIPKNAIHETYGNDGVDVFIAGSGPIGATYAKLCVEAGLRVVMVEIGAADSFYAVNAEEGTAVPYVPGYHKKNEIEFQKDIDRFVNVIKGALQQVSVPVRNQNVPTLDPGAWSAPPGSSAISNGKNPHQREFENLSAEAVTRGVGGMSTHWTCSTPRIHPPMESLPGIGRPKLSNDPAEDDKEWNELYSEAERLIGTSTKEFDESIRHTLVLRSLQDAYKDRQRIFRPLPLACHRLKNAPEYVEWHSAENLFHSIYNDDKQKKLFTLLTNHRCTRLALTGGYEKKIGAAEVRNLLATRNPSSQLDSYIMAKVYVLASGAIGNPQILYNSGFSGLQVTPRNDSLIPNLGRYITEQPMAFCQIVLRQEFVDSVRDDPYGLPWWKEAVAQHIAKNPTDALPIPFRDPEPQVTTPFTEEHPWHTQIHRDAFSYGAVGPEVDSRVIVDLRWFGATDPEANNLLVFQNDVQDGYSMPQPTFRYRPSTASNVRARKMMADMCEVASNLGGYLPTSPPQFMDPGLALHLAGTTRIGFDKATTVADNNSLVWDFANLYVAGNGTIRTGFGENPTLTSMCHAIKSARSIINTLKGGTDGKNTGEHRNL",
        "MGVHECPAWLWLLLSLLSLPLGLPVLGAPPRLICDSRVLERYLLEAKEAENITTGCAEHCSLNENITVPDTKVNFYAWKRMEVGQQAVEVWQGLALLSEAVLRGQALLVNSSQPWEPLQLHVDKAVSGLRSLTTLLRALGAQKEAISPPDAASAAPLRTITADTFRKLFRVYSNFLRGKLKLYTGEACRTGDR"
    ],

    "structure": [
        "ddddddddddddddddddddddddddddddddpdpddpddpqpdddfddpdqqlddadddfaaddpvqvvlcvvvvvlqakkfkwfdadffkkkwkwadpdpdidifidtnvgtdglqpddllclvcvvlsvqlvvllqvvvcvvvvapafrmkmfiwgkdalddpfppadadpdwhagsvgdidgsvpgdrdddpaqhahsdiaietewiwiarnsdpvriqtafqvvvcvsqvprpphhyidgqfmggnllnlldpqqpaaqlrnqqvvnqvgddpprggqfikmfrrpprppvvcvsvrhgihtdghlvnvcvvdppcsvvcccnrcvprnvvscvvvvndhdtdvlsrhhpvlsvllvqllvlldpvllvvldvvvdlpclqvvvqdllnsllsslvvsvvvsvvpddpvnvpgdpvsvvvssvsssvsssvvsvvcvvvvnvvsvvvvvvvddppdpdddpddd",
        "dpdplvvqppdddplqappppfaadpvcvlvdpvaaaeeeeaqallsllllllclvlvgfyeyefqaeqpdwdddpddvpdddftqtqfapcqppvclqpqqvllvvqvvfwdwqeaefdqpppvpddppddhddppdgdddqqhdppfdpqqdlgqatwgghrntcqnhdpqfddawadadpvahqgtfdaldpdpvvrvvlvvvllvvlcvqlvkdqclqvpflqqcllqvllcvvcvvppwhkgggtgswhadpvhsldirhttsssscvvqrvdpssvssydyhyskhqqewhaghdpfgetawtkiarnccvvpvpdrgihigghrfyeypralprvllrcvssvqalqdpggdprhnqdqffalkwfwwkkkfkfffdpvsqvcqcvppppdpssnvqlvvqcvvcvpdpgsgdssrakhfmwtdadpvqqktktwidghhndddddppddpsrmimimiihwafrdrqfgwgfdppgdhpvrttrihtrddgdpvsvvsvvvrlvvsvvssvstgdtdprgpididrrnsvnlieqrqaedddsvngqayqlqhgpsyphygyfdrnhrngigngdcvsvrssssvsnsvvsscvvvvdpdddppdddddd",
        "ddppppdcvvvvvvvvvppppppvppldplvvlldvvllvvqlvllvvllvvcvvpdpnfflqdwqkafdlddpvvvvvpddlllllqlllvrlvsllvrlvsslvslvpdpdrdvvnnvssvvlnvssvvvnvssvslvsvvsnppddppprdddgdididrgssvssvsvssnsvgsvvvssvvssvvvvd"
    ],

    "text": [
        "RNA-editing ligase in kinetoplastid mitochondrial.",
        "Oxidase which catalyzes the oxidation of various aldopyranoses and disaccharides.",
        "Erythropoietin for regulation of erythrocyte proliferation and differentiation."
    ]
}

# Default example pairs shown at start-up: (sequence, text) combinations.
# Rebound by change_input_type when the user switches modalities.
samples = [[s1, s2] for s1, s2 in zip(input_examples["sequence"], input_examples["text"])]
31
+
32
+
33
def compute_score(input_type_1: str, input_1: str, input_type_2: str, input_2: str):
    """
    Compute the similarity score between two inputs of arbitrary modalities.

    Args:
        input_type_1: Modality of the first input ("sequence", "structure" or "text").
        input_1: First input string.
        input_type_2: Modality of the second input.
        input_2: Second input string.

    Returns:
        The similarity score (representation dot product scaled by the model
        temperature), formatted with 4 decimal places.
    """
    def _encode(modality: str, value: str):
        # Dispatch to the encoder matching the modality; anything that is
        # not "sequence"/"structure" is treated as free text.
        if modality == "sequence":
            return model.get_protein_repr([value])
        if modality == "structure":
            return model.get_structure_repr([value])
        return model.get_text_repr([value])

    with torch.no_grad():
        repr_1 = _encode(input_type_1, input_1)
        repr_2 = _encode(input_type_2, input_2)
        score = repr_1 @ repr_2.T / model.temperature

    return f"{score.item():.4f}"
50
+
51
+
52
def change_input_type(choice_1: str, choice_2: str):
    """
    Refresh the example dataset and upload-button visibility after either
    input-type dropdown changes.

    Args:
        choice_1: Newly selected modality of input 1.
        choice_2: Newly selected modality of input 2.

    Returns:
        Updates for (examples, input_1, input_2, upload_btn_1, chain_box_1,
        upload_btn_2, chain_box_2): refreshed examples, cleared inputs, and
        upload widgets hidden for text inputs.
    """
    # Rebuild the shared example pairs for the new modality combination
    global samples
    samples = [list(pair) for pair in zip(input_examples[choice_1], input_examples[choice_2])]

    # Sequence/structure inputs can be filled from an uploaded file; text cannot
    visible_1 = choice_1 != "text"
    visible_2 = choice_2 != "text"

    return (
        gr.update(samples=samples),
        "",
        "",
        gr.update(visible=visible_1),
        gr.update(visible=visible_1),
        gr.update(visible=visible_2),
        gr.update(visible=visible_2),
    )
73
+
74
+
75
+ # Load example from dataset
76
def load_example(example_id):
    """Return the example pair (input 1, input 2) at the clicked dataset index."""
    return samples[example_id]
78
+
79
+
80
+ # Build the block for computing protein-text similarity
81
def build_score_computation():
    """
    Build the gradio UI for computing a similarity score between two inputs
    of arbitrary modalities (sequence / structure / text).

    Lays out two input rows (textbox + modality dropdown + optional pdb
    upload), an examples dataset, and wires the change/click events.
    """
    gr.Markdown(f"# Compute similarity score between two modalities")
    with gr.Row(equal_height=True):
        with gr.Column():
            # Compute similarity score between sequence and text
            with gr.Row():
                input_1 = gr.Textbox(label="Input 1")

                # Choose the type of input 1
                input_type_1 = gr.Dropdown(input_types, label="Input type", value="sequence",
                                           interactive=True, visible=True)

                # Provide an upload button to upload a pdb file
                upload_btn_1, chain_box_1 = upload_pdb_button(visible=True)
                upload_btn_1.upload(parse_pdb_file, inputs=[input_type_1, upload_btn_1, chain_box_1], outputs=[input_1])

            with gr.Row():
                input_2 = gr.Textbox(label="Input 2")

                # Choose the type of input 2
                input_type_2 = gr.Dropdown(input_types, label="Input type", value="text",
                                           interactive=True, visible=True)

                # Provide an upload button to upload a pdb file
                upload_btn_2, chain_box_2 = upload_pdb_button(visible=False)
                upload_btn_2.upload(parse_pdb_file, inputs=[input_type_2, upload_btn_2, chain_box_2], outputs=[input_2])

            # Provide examples (type="index" so the click handler receives an int)
            examples = gr.Dataset(samples=samples, type="index", components=[input_1, input_2], label="Input examples")

            # Add click event to examples
            examples.click(fn=load_example, inputs=[examples], outputs=[input_1, input_2])

            compute_btn = gr.Button(value="Compute")

            # Change examples based on input type
            input_type_1.change(fn=change_input_type, inputs=[input_type_1, input_type_2],
                                outputs=[examples, input_1, input_2, upload_btn_1, chain_box_1,
                                         upload_btn_2, chain_box_2])

            input_type_2.change(fn=change_input_type, inputs=[input_type_1, input_type_2],
                                outputs=[examples, input_1, input_2, upload_btn_1, chain_box_1,
                                         upload_btn_2, chain_box_2])

        # Output label showing the computed score
        similarity_score = gr.Label(label="similarity score")

    compute_btn.click(fn=compute_score, inputs=[input_type_1, input_1, input_type_2, input_2],
                      outputs=[similarity_score])
demo/modules/init_model.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import pandas as pd
4
+ import os
5
+ import yaml
6
+ import glob
7
+
8
+ from easydict import EasyDict
9
+ from utils.constants import sequence_level
10
+ from model.ProTrek.protrek_trimodal_model import ProTrekTrimodalModel
11
+ from tqdm import tqdm
12
+
13
+
14
def load_model():
    """
    Instantiate the ProTrek tri-modal model from the directory given by
    ``config.model_dir`` and switch it to eval mode.

    Returns:
        The loaded ProTrekTrimodalModel. Sub-encoders are created without
        their own pretrained weights because the full checkpoint is loaded.
    """
    model_dir = config.model_dir

    # Sub-model configs are discovered by naming convention inside model_dir
    protein_config = glob.glob(f"{model_dir}/esm2_*")[0]
    structure_config = glob.glob(f"{model_dir}/foldseek_*")[0]
    text_config = f"{model_dir}/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    checkpoint = glob.glob(f"{model_dir}/*.pt")[0]

    loaded = ProTrekTrimodalModel(
        protein_config=protein_config,
        text_config=text_config,
        structure_config=structure_config,
        load_protein_pretrained=False,
        load_text_pretrained=False,
        from_checkpoint=checkpoint,
    )
    loaded.eval()
    return loaded
27
+
28
+
29
def load_faiss_index(index_path: str):
    """
    Read a faiss index from disk and force inner-product scoring.

    Args:
        index_path: Path to the serialized faiss index.

    Returns:
        The loaded faiss index (memory-mapped when IO_FLAG_MMAP is set in the
        config, which avoids loading the whole index into RAM).
    """
    extra_flags = (faiss.IO_FLAG_MMAP,) if config.faiss_config.IO_FLAG_MMAP else ()
    index = faiss.read_index(index_path, *extra_flags)

    # Scores are embedding dot products, so use the inner-product metric
    index.metric_type = faiss.METRIC_INNER_PRODUCT
    return index
37
+
38
+
39
def _load_db_indexes(db_list, index_name: str) -> dict:
    """Load '<index_name>.index' plus 'ids.tsv' for every database in db_list."""
    loaded = {}
    for db in tqdm(db_list, desc=f"Loading {index_name} index..."):
        index_dir = db["index_dir"]

        db_index = load_faiss_index(f"{index_dir}/{index_name}.index")

        # ids.tsv maps faiss row number -> entry id (e.g. UniProt accession)
        ids = pd.read_csv(f"{index_dir}/ids.tsv", sep="\t", header=None).values.flatten()

        loaded[db["name"]] = {"index": db_index, "ids": ids}
    return loaded


def load_index():
    """
    Load all faiss indexes declared in the config.

    Returns:
        Tuple of:
          - all_index: {"sequence"|"structure"|"text": {db_name: ...}} where
            sequence/structure entries hold {"index", "ids"} and text entries
            hold one {"index", "ids"} dict per subsection.
          - valid_subsections: {db_name: sorted list of subsections that
            actually have an index file on disk}.
    """
    # Protein sequence and structure indexes share the same layout
    all_index = {
        "sequence": _load_db_indexes(config.sequence_index_dir, "sequence"),
        "structure": _load_db_indexes(config.structure_index_dir, "structure"),
    }

    # Text indexes: one per UniProt-style subsection, per database.
    # "Global" is an extra pseudo-subsection covering the whole entry text;
    # adding it here (once, not per database) keeps the loop below uniform.
    sequence_level.add("Global")

    all_index["text"] = {}
    valid_subsections = {}
    for db in tqdm(config.text_index_dir, desc="Loading text index..."):
        db_name = db["name"]
        text_dir = f"{db['index_dir']}/subsections"
        all_index["text"][db_name] = {}
        valid_subsections[db_name] = set()

        for subsection in tqdm(sequence_level):
            stem = subsection.replace(' ', '_')
            index_path = f"{text_dir}/{stem}.index"
            # Not every subsection is indexed on disk (e.g. "Taxonomic
            # lineage" is deliberately excluded); skip the missing ones.
            if not os.path.exists(index_path):
                continue

            text_index = load_faiss_index(index_path)
            text_ids = pd.read_csv(f"{text_dir}/{stem}_ids.tsv", sep="\t", header=None).values.flatten()

            all_index["text"][db_name][subsection] = {"index": text_index, "ids": text_ids}
            valid_subsections[db_name].add(subsection)

    # Present subsections in a stable, sorted order for the UI dropdown
    for db_name in valid_subsections:
        valid_subsections[db_name] = sorted(valid_subsections[db_name])

    return all_index, valid_subsections
101
+
102
+
103
# Load the config file.
# NOTE(review): path handling assumes POSIX separators; rsplit("/", 3) strips
# ".../demo/modules/init_model.py" to obtain the project root — confirm on
# non-POSIX deployments.
root_dir = __file__.rsplit("/", 3)[0]
config_path = f"{root_dir}/demo/config.yaml"
with open(config_path, 'r', encoding='utf-8') as r:
    config = EasyDict(yaml.safe_load(r))

# Device used for model inference (assumes a CUDA-capable GPU is available)
device = "cuda"

print("Loading model...")
model = load_model()
model.to(device)

# Faiss indexes plus the text subsections that actually exist on disk;
# imported by the search/compute modules at startup.
all_index, valid_subsections = load_index()
print("Done...")
# Debug stub: uncomment the two lines below (and comment the loading above)
# to start the UI without the heavy model/index loading.
# model = None
# all_index, valid_subsections = {"text": {}, "sequence": {"UniRef50": None}, "structure": {"UniRef50": None}}, {}
demo/modules/search.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+
7
+ from scipy.stats import norm
8
+ from .init_model import model, all_index, valid_subsections
9
+ from .blocks import upload_pdb_button, parse_pdb_file
10
+
11
+
12
# Temporary files used to hand results/plots to the gradio download/image widgets
tmp_file_path = "/tmp/results.tsv"
tmp_plot_path = "/tmp/histogram.svg"

# Samples for input (rebound by change_input_type when the modality changes)
samples = [
    ["Proteins with zinc bindings."],
    ["Proteins locating at cell membrane."],
    ["Protein that serves as an enzyme."]
]

# Databases for different modalities: tracks the currently selected database
# per output modality; mutated in place by change_db_type.
now_db = {
    "sequence": list(all_index["sequence"].keys())[0],
    "structure": list(all_index["structure"].keys())[0],
    "text": list(all_index["text"].keys())[0]
}
28
+
29
+
30
def clear_results():
    """Reset the results markdown and hide the download button and histogram."""
    empty_markdown = ""
    return empty_markdown, gr.update(visible=False), gr.update(visible=False)
32
+
33
+
34
def plot(scores) -> None:
    """
    Save a histogram of the score distribution, overlaid with a fitted
    normal curve, to ``tmp_plot_path``.

    Args:
        scores: Array-like of similarity scores for the whole database.
    """
    # Density histogram of all scores
    plt.hist(scores, bins=100, density=True, alpha=0.6)
    plt.title('Distribution of similarity scores in the database', fontsize=15)
    plt.xlabel('Similarity score', fontsize=15)
    plt.ylabel('Density', fontsize=15)

    # Overlay a Gaussian fitted to the empirical distribution
    mu, std = norm.fit(scores)
    xmin, xmax = plt.xlim()
    _, ymax = plt.ylim()
    grid = np.linspace(xmin, xmax, 100)
    plt.plot(grid, norm.pdf(grid, mu, std))

    # Annotate the database size in the top-right corner
    plt.text(xmax, 0.9 * ymax, f"Total number: {len(scores)}", ha='right', fontsize=12)

    # Persist as svg for the gradio image widget, then clear the axes so the
    # next call starts from an empty plot
    plt.savefig(tmp_plot_path)
    plt.cla()
60
+
61
+
62
+ # Search from database
63
def search(input: str, nprobe: int, topk: int, input_type: str, query_type: str, subsection_type: str):
    """
    Embed the input and retrieve the top-k matches from the selected database.

    Args:
        input: Query string (sequence, Foldseek string, or free text).
        nprobe: Number of IVF clusters to probe (ignored for flat indexes).
        topk: Number of results to return.
        input_type: Modality of the query ("sequence", "structure" or "text").
        query_type: Modality of the results to retrieve.
        subsection_type: Text subsection to search when query_type is "text".

    Returns:
        Tuple of (markdown results table, download-button update, histogram
        image update).

    Raises:
        gr.Error: If nprobe exceeds the index's cluster count, or topk
            exceeds the database size.
    """
    # Model methods are named get_protein_repr / get_structure_repr /
    # get_text_repr, so "sequence" maps to "protein"
    input_modality = input_type.replace("sequence", "protein")
    with torch.no_grad():
        input_embedding = getattr(model, f"get_{input_modality}_repr")([input]).cpu().numpy()

    # Pick the index + id mapping for the currently selected database
    db = now_db[query_type]
    if query_type == "text":
        index = all_index["text"][db][subsection_type]["index"]
        ids = all_index["text"][db][subsection_type]["ids"]

    else:
        index = all_index[query_type][db]["index"]
        ids = all_index[query_type][db]["ids"]

    # nprobe only applies to IVF indexes
    if check_index_ivf(query_type, subsection_type):
        if index.nlist < nprobe:
            raise gr.Error(f"The number of clusters to search must be less than or equal to the number of clusters in the index ({index.nlist}).")
        else:
            index.nprobe = nprobe

    if topk > index.ntotal:
        raise gr.Error(f"You cannot retrieve more than the database size ({index.ntotal}).")

    # Retrieve all scores to plot the distribution
    # (faiss returns results sorted by decreasing score)
    scores, ranks = index.search(input_embedding, index.ntotal)
    scores, ranks = scores[0], ranks[0]

    # Remove inf values (faiss pads unreachable entries with sentinel scores)
    selector = scores > -1
    scores = scores[selector]
    ranks = ranks[selector]
    # Convert raw inner products to temperature-scaled logits
    scores = scores / model.temperature.item()
    plot(scores)

    top_scores = scores[:topk]
    top_ranks = ranks[:topk]

    # ranks = [list(range(topk))]
    # ids = ["P12345"] * topk
    # scores = torch.randn(topk).tolist()

    # Write the results to a temporary file for downloading
    with open(tmp_file_path, "w") as w:
        w.write("Id\tMatching score\n")
        for i in range(topk):
            rank = top_ranks[i]
            w.write(f"{ids[rank]}\t{top_scores[i]}\n")

    # Get topk ids, rendered as markdown links where a public page exists
    topk_ids = []
    for rank in top_ranks:
        now_id = ids[rank]
        if query_type == "text":
            topk_ids.append(now_id)
        else:
            if db != "PDB":
                # Provide link to uniprot website
                topk_ids.append(f"[{now_id}](https://www.uniprot.org/uniprotkb/{now_id})")
            else:
                # Provide link to pdb website (id format is "<pdb>-<chain>")
                pdb_id = now_id.split("-")[0]
                topk_ids.append(f"[{now_id}](https://www.rcsb.org/structure/{pdb_id})")

    # Only render the first 1000 rows in the UI; the file has everything
    limit = 1000
    df = pd.DataFrame({"Id": topk_ids[:limit], "Matching score": top_scores[:limit]})
    if len(topk_ids) > limit:
        info_df = pd.DataFrame({"Id": ["Download the file to check all results"], "Matching score": ["..."]},
                               index=[1000])
        df = pd.concat([df, info_df], axis=0)

    output = df.to_markdown()
    return (output,
            gr.DownloadButton(label="Download results", value=tmp_file_path, visible=True, scale=0),
            gr.update(value=tmp_plot_path, visible=True))
137
+
138
+
139
def change_input_type(choice: str):
    """
    Swap the example dataset and upload-button visibility after the input
    modality changes.

    Args:
        choice: Newly selected input modality ("sequence", "structure" or "text").

    Returns:
        Updates for (examples, input box, upload button, chain box).
    """
    # Change examples if input type is changed (module-level so load_example
    # picks up the refreshed list)
    global samples
    if choice == "text":
        samples = [
            ["Proteins with zinc bindings."],
            ["Proteins locating at cell membrane."],
            ["Protein that serves as an enzyme."]
        ]

    elif choice == "sequence":
        samples = [
            ["MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK"],
            ["MITLDWEKANGLITTVVQDATTKQVLMVAYMNQESLAKTMATGETWFWSRSRKTLWHKGATSGNIQTVKTIAVDCDADTLLVTVDPAGPACHTGHISCFYRHYPEGKDLT"],
            ["MDLKQYVSEVQDWPKPGVSFKDITTIMDNGEAYGYATDKIVEYAKDRDVDIVVGPEARGFIIGCPVAYSMGIGFAPVRKEGKLPREVIRYEYDLEYGTNVLTMHKDAIKPGQRVLITDDLLATGGTIEAAIKLVEKLGGIVVGIAFIIELKYLNGIEKIKDYDVMSLISYDE"]
        ]

    elif choice == "structure":
        # Foldseek 3Di strings
        samples = [
            ["dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd"],
            ["dddadcpvpvqkakefeaeppprdtadiaiagpvqvvvcvvpqwhwgqdpvvrdidgqcpvpvqiwrwddwdaddnrryiytythtpahsdpvrhvhpppadvvgpddpd"],
            ["dplvvqwdwdaqpphhpdtdthcvscvvppvslvvqlvvvlvvcvvqvaqeeeeepdqrcsnrvsscvvvvhyywykyfpppddaawdwdwdddppgitiiithlpseaaageyeyegaeqalqprvlrvvvrcvvnnyddaeyeyqeyevcrvncvsvvvhhydyvyydpd"]
        ]

    # Set visibility of upload button (only file-backed modalities can upload)
    if choice == "text":
        visible = False
    else:
        visible = True

    return gr.update(samples=samples), "", gr.update(visible=visible), gr.update(visible=visible)
170
+
171
+
172
+ # Load example from dataset
173
def load_example(example_id):
    """Return the example text at the clicked index (each sample is a 1-item list)."""
    selected = samples[example_id]
    return selected[0]
175
+
176
+
177
+ # Change the visibility of subsection type
178
def change_output_type(query_type: str, subsection_type: str):
    """
    Adjust the UI after the output (query) type changes.

    Args:
        query_type: Newly selected output modality ("sequence", "structure" or "text").
        subsection_type: Currently selected text subsection (used to locate
            the active index when query_type is "text").

    Returns:
        Updates for (subsection dropdown, nprobe slider, database dropdown):
        the subsection dropdown is only shown for text queries, the nprobe
        slider only when the active index is IVF, and the database dropdown
        is repopulated with the databases available for the new modality.
    """
    nprobe_visible = check_index_ivf(query_type, subsection_type)
    subsection_visible = query_type == "text"

    return (
        gr.update(visible=subsection_visible),
        gr.update(visible=nprobe_visible),
        gr.update(choices=list(all_index[query_type].keys()), value=now_db[query_type])
    )
187
+
188
+
189
def check_index_ivf(index_type: str, subsection_type: str = None) -> bool:
    """
    Check if the currently selected index for a modality is of IVF type,
    i.e. supports the ``nprobe`` search parameter.

    Args:
        index_type: Type of index ("sequence", "structure" or "text").
        subsection_type: If the "index_type" is "text", get the index based on the subsection type.

    Returns:
        Whether the index is of IVF type or not.

    Raises:
        ValueError: If index_type is not a known modality (the original code
            failed here with an opaque UnboundLocalError).
    """
    db = now_db[index_type]
    if index_type in ("sequence", "structure"):
        index = all_index[index_type][db]["index"]
    elif index_type == "text":
        index = all_index["text"][db][subsection_type]["index"]
    else:
        raise ValueError(f"Unknown index type: {index_type}")

    # Only IVF-family faiss indexes expose an nprobe attribute
    return hasattr(index, "nprobe")
211
+
212
+
213
def change_db_type(query_type: str, subsection_type: str, db_type: str):
    """
    Change the database to search.
    Args:
        query_type: The output type ("sequence", "structure" or "text").
        subsection_type: Currently selected text subsection (used to check
            whether the newly active index is IVF).
        db_type: The database to search.
    """
    # Remember the selection per modality (module-level state read by search)
    now_db[query_type] = db_type

    # Text databases expose different subsections; repopulate the dropdown
    if query_type == "text":
        subsection_update = gr.update(choices=list(valid_subsections[now_db["text"]]), value="Function")
    else:
        subsection_update = gr.update(visible=False)

    # The nprobe slider is only meaningful for IVF indexes
    nprobe_visible = check_index_ivf(query_type, subsection_type)
    return subsection_update, gr.update(visible=nprobe_visible)
229
+
230
+
231
+ # Build the searching block
232
def build_search_module():
    """
    Build the gradio UI for database search: input controls (modality radio,
    database/subsection dropdowns, nprobe/topk sliders, pdb upload), the
    examples dataset, and the results pane with download button and score
    histogram. Wires all change/click events.
    """
    gr.Markdown(f"# Search from Swiss-Prot database (the whole UniProt database will be supported soon)")
    with gr.Row(equal_height=True):
        with gr.Column():
            # Set input type
            input_type = gr.Radio(["sequence", "structure", "text"], label="Input type (e.g. 'text' means searching based on text descriptions)", value="text")

            with gr.Row():
                # Set output type
                query_type = gr.Radio(
                    ["sequence", "structure", "text"],
                    label="Output type (e.g. 'sequence' means returning qualified sequences)",
                    value="sequence",
                    scale=2,
                )

                # If the output type is "text", provide an option to choose the subsection of text
                subsection_type = gr.Dropdown(valid_subsections[now_db["text"]], label="Subsection of text", value="Function",
                                              interactive=True, visible=False, scale=0)

                db_type = gr.Dropdown(all_index["sequence"].keys(), label="Database", value=now_db["sequence"],
                                      interactive=True, visible=True, scale=0)

            with gr.Row():
                # Input box
                input = gr.Text(label="Input")

                # Provide an upload button to upload a pdb file
                upload_btn, chain_box = upload_pdb_button(visible=False)
                upload_btn.upload(parse_pdb_file, inputs=[input_type, upload_btn, chain_box], outputs=[input])

            # If the index is of IVF type, provide an option to choose the number of clusters.
            nprobe_visible = check_index_ivf(query_type.value)
            nprobe = gr.Slider(1, 1000000, 1000, step=1, visible=nprobe_visible,
                               label="Number of clusters to search (lower value for faster search and higher value for more accurate search)")

            # Add event listener to output type
            query_type.change(fn=change_output_type, inputs=[query_type, subsection_type],
                              outputs=[subsection_type, nprobe, db_type])

            # Add event listener to db type
            db_type.change(fn=change_db_type, inputs=[query_type, subsection_type, db_type],
                           outputs=[subsection_type, nprobe])

            # Choose topk results
            topk = gr.Slider(1, 1000000, 5, step=1, label="Retrieve top k results")

            # Provide examples (type="index" so load_example receives an int)
            examples = gr.Dataset(samples=samples, components=[input], type="index", label="Input examples")

            # Add click event to examples
            examples.click(fn=load_example, inputs=[examples], outputs=input)

            # Change examples based on input type
            input_type.change(fn=change_input_type, inputs=[input_type], outputs=[examples, input, upload_btn, chain_box])

            with gr.Row():
                search_btn = gr.Button(value="Search")
                clear_btn = gr.Button(value="Clear")

    with gr.Row():
        with gr.Column():
            results = gr.Markdown(label="results", height=450)
            download_btn = gr.DownloadButton(label="Download results", visible=False)

        # Plot the distribution of scores
        histogram = gr.Image(label="Histogram of matching scores", type="filepath", scale=1, visible=False)

    search_btn.click(fn=search, inputs=[input, nprobe, topk, input_type, query_type, subsection_type],
                     outputs=[results, download_btn, histogram])

    clear_btn.click(fn=clear_results, outputs=[results, download_btn, histogram])
demo/modules/tmalign.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+
4
+ from .blocks import upload_pdb_button
5
+ from utils.downloader import download_pdb, download_af2
6
+
7
+
8
# Project root; assumes POSIX separators ("<root>/demo/modules/tmalign.py")
root_dir = __file__.rsplit("/", 3)[0]
# Remote sources a structure id can be fetched from
structure_types = ["AlphaFoldDB", "PDB"]
10
+
11
+
12
def upload_structure(file: str):
    """Forward the uploaded file path into the structure textbox."""
    uploaded_path = file
    return uploaded_path
14
+
15
+
16
def get_structure_path(structure: str, structure_type: str) -> str:
    """
    Resolve the user input to a local structure file path, downloading the
    structure into the demo cache if necessary.

    Args:
        structure: Either an absolute path to an uploaded file, a Uniprot ID
            (for AlphaFoldDB) or a PDB ID (for PDB).
        structure_type: Where to fetch the structure from when an ID is given
            ("AlphaFoldDB" or "PDB"); ignored for manually uploaded files.

    Returns:
        Path to a local .pdb/.cif file.

    Raises:
        gr.Error: If the structure type is not recognized.
    """
    # If the structure is manually uploaded it arrives as an absolute path.
    # startswith also safely handles empty input, which structure[0] crashed on.
    if structure.startswith("/"):
        return structure

    # If the structure is a Uniprot ID, download the structure from AlphaFoldDB
    if structure_type == "AlphaFoldDB":
        save_path = f"{root_dir}/demo/cache/{structure}.pdb"
        if not os.path.exists(save_path):
            download_af2(structure, "pdb", save_path)
        return save_path

    # If the structure is a PDB ID, download the structure from PDB
    if structure_type == "PDB":
        save_path = f"{root_dir}/demo/cache/{structure}.cif"
        if not os.path.exists(save_path):
            download_pdb(structure, "cif", save_path)
        return save_path

    # Previously this fell through and returned None, which surfaced later as
    # a confusing TMalign failure; fail loudly with a clear message instead.
    raise gr.Error(f"Unknown structure type: {structure_type}")
34
+
35
+
36
def tmalign(structure_1: str, structure_type_1: str, structure_2: str, structure_type_2: str):
    """
    Run TMalign on two structures and return its raw text report.

    Args:
        structure_1: First structure (path or ID, see get_structure_path).
        structure_type_1: Source of the first structure when an ID is given.
        structure_2: Second structure (path or ID).
        structure_type_2: Source of the second structure when an ID is given.

    Returns:
        TMalign's stdout (contains the TM-score report).
    """
    # Local import: only this function needs subprocess
    import subprocess

    structure_path_1 = get_structure_path(structure_1, structure_type_1)
    structure_path_2 = get_structure_path(structure_2, structure_type_2)

    # Use an argument list instead of a shell string (os.popen) so paths with
    # spaces or shell metacharacters cannot break or inject into the command,
    # and so the pipe is properly closed when the process exits.
    result = subprocess.run(
        ["bin/TMalign", structure_path_1, structure_path_2],
        capture_output=True,
        text=True,
    )
    return result.stdout
45
+
46
+
47
+ # Build the block for computing protein-text similarity
48
def build_TMalign():
    """
    Build the gradio UI for computing the TM-score between two protein
    structures: two input rows (id/path textbox + source dropdown + upload
    button), a compute button and a read-only output area.
    """
    gr.Markdown(f"# Calculate TM-score between two protein structures")
    with gr.Row(equal_height=True):
        with gr.Column():
            # Compute similarity score between sequence and text
            with gr.Row():
                structure_1 = gr.Textbox(label="Protein structure 1 (input Uniprot ID or PDB ID or upload a pdb file)")

                structure_type_1 = gr.Dropdown(structure_types, label="Structure type (if the structure is manually uploaded, ignore this field)",
                                               value="AlphaFoldDB", interactive=True, visible=True)

                # Provide an upload button to upload a pdb file
                upload_btn_1, _ = upload_pdb_button(visible=True, chain_visible=False)
                upload_btn_1.upload(upload_structure, inputs=[upload_btn_1], outputs=[structure_1])

            with gr.Row():
                structure_2 = gr.Textbox(label="Protein structure 2 (input Uniprot ID or PDB ID or upload a pdb file)")

                structure_type_2 = gr.Dropdown(structure_types, label="Structure type (if the structure is manually uploaded, ignore this field)",
                                               value="AlphaFoldDB", interactive=True, visible=True)

                # Provide an upload button to upload a pdb file
                upload_btn_2, _ = upload_pdb_button(visible=True, chain_visible=False)
                upload_btn_2.upload(upload_structure, inputs=[upload_btn_2], outputs=[structure_2])

            compute_btn = gr.Button(value="Compute TM-score")
            tmscore = gr.TextArea(label="TM-score", interactive=False)

            compute_btn.click(tmalign, inputs=[structure_1, structure_type_1, structure_2, structure_type_2],
                              outputs=[tmscore])
78
+
demo/run.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
# Make the repository root importable so "modules.*" (and its "utils.*"
# imports) resolve when this file is executed directly as a script.
# NOTE(review): assumes POSIX separators in __file__ — confirm on Windows.
root_dir = __file__.rsplit("/", 2)[0]
if root_dir not in sys.path:
    sys.path.append(root_dir)

import gradio as gr

from modules.search import build_search_module
from modules.compute_score import build_score_computation
from modules.tmalign import build_TMalign


# Build demo: a single page stacking the three tools vertically
with gr.Blocks() as demo:
    build_search_module()
    build_score_computation()
    build_TMalign()


if __name__ == '__main__':
    # Run the demo
    demo.launch()
model/ProTrek/protein_encoder.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch

from tqdm import tqdm
from torch.nn.functional import normalize
from transformers import EsmConfig, EsmForMaskedLM, EsmTokenizer


class ProteinEncoder(torch.nn.Module):
    """ESM-based protein sequence encoder with a linear projection head."""

    def __init__(self,
                 config_path: str,
                 out_dim: int,
                 load_pretrained: bool = True,
                 gradient_checkpointing: bool = False):
        """
        Args:
            config_path: Path to the ESM config/checkpoint directory
            out_dim: Output dimension of the protein representation
            load_pretrained: Whether to load pretrained weights
            gradient_checkpointing: Whether to use gradient checkpointing
        """
        super().__init__()
        config = EsmConfig.from_pretrained(config_path)

        # Either restore pretrained weights or build a fresh model from config.
        if load_pretrained:
            self.model = EsmForMaskedLM.from_pretrained(config_path)
        else:
            self.model = EsmForMaskedLM(config)

        # Projection from the backbone hidden size to the shared repr space.
        self.out = torch.nn.Linear(config.hidden_size, out_dim)

        self.model.esm.encoder.gradient_checkpointing = gradient_checkpointing

        # The contact head is never used for representation learning.
        self.model.esm.contact_head = None

        # Rotary embeddings make learned position embeddings redundant.
        if config.position_embedding_type == "rotary":
            self.model.esm.embeddings.position_embeddings = None

        self.tokenizer = EsmTokenizer.from_pretrained(config_path)

    def get_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
        """
        Compute L2-normalized representations for a list of protein sequences.
        Args:
            proteins: A list of protein sequences
            batch_size: Batch size for inference
            verbose: Whether to display a progress bar

        NOTE(review): no torch.no_grad() here — callers that only need
        embeddings should wrap this in an inference context themselves.
        """
        device = next(self.parameters()).device

        batch_starts = range(0, len(proteins), batch_size)
        if verbose:
            batch_starts = tqdm(batch_starts, desc="Computing protein embeddings")

        chunks = []
        for start in batch_starts:
            batch = proteins[start:start + batch_size]
            encoded = self.tokenizer.batch_encode_plus(batch,
                                                       return_tensors="pt",
                                                       padding=True)
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            repr_chunk, _ = self.forward(encoded)
            chunks.append(repr_chunk)

        return normalize(torch.cat(chunks, dim=0), dim=-1)

    def forward(self, inputs: dict, get_mask_logits: bool = False):
        """
        Encode protein sequence into protein representation
        Args:
            inputs: A dictionary containing the following keys:
                - input_ids: [batch, seq_len]
                - attention_mask: [batch, seq_len]
            get_mask_logits: Whether to return the logits for masked tokens

        Returns:
            protein_repr: [batch, protein_repr_dim]
            mask_logits : [batch, seq_len, vocab_size] or None
        """
        hidden = self.model.esm(**inputs).last_hidden_state
        # Pool with the first (CLS/BOS) token, then project.
        pooled = self.out(hidden[:, 0, :])

        # MLM logits are only computed when explicitly requested.
        mask_logits = self.model.lm_head(hidden) if get_mask_logits else None

        return pooled, mask_logits
model/ProTrek/protrek_trimodal_model.py ADDED
@@ -0,0 +1,874 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.distributed as dist
3
+ import torchmetrics
4
+ import json
5
+ import math
6
+ import numpy as np
7
+ import os
8
+ import copy
9
+ import faiss
10
+ import time
11
+ import pandas as pd
12
+ import random
13
+
14
+ from tqdm import tqdm
15
+ from .protein_encoder import ProteinEncoder
16
+ from .structure_encoder import StructureEncoder
17
+ from .text_encoder import TextEncoder
18
+ from ..abstract_model import AbstractModel
19
+ from ..model_interface import register_model
20
+ from utils.mpr import MultipleProcessRunnerSimplifier
21
+ from torch.nn.functional import normalize, cross_entropy
22
+ from utils.constants import residue_level, sequence_level
23
+ from sklearn.metrics import roc_auc_score
24
+
25
+
26
def multilabel_cross_entropy(logits, labels):
    """
    Compute cross entropy loss for multilabel classification. See
    "https://arxiv.org/pdf/2208.02955.pdf" (ZLPR loss).

    For each sample the loss is log(1 + sum_{i in neg, j in pos} exp(s_i - s_j)),
    averaged over samples.

    Args:
        logits: [num_samples, num_classes] raw scores
        labels: [num_samples, num_classes] binary (0/1) label matrix

    Returns:
        Scalar tensor: mean ZLPR loss over the batch.

    Raises:
        ZeroDivisionError: if ``logits`` is empty (unchanged from before).
    """
    loss = 0
    for pred, label in zip(logits, labels):
        pos_logits = pred[label == 1]
        neg_logits = pred[label == 0]

        # diff[i, j] = neg_i - pos_j
        diff = neg_logits.unsqueeze(-1) - pos_logits

        # log(1 + sum(exp(diff))) == logsumexp([0, diff...]).
        # The logsumexp form is numerically stable: the previous
        # torch.log(1 + torch.exp(diff).sum()) overflowed to inf whenever a
        # negative logit exceeded a positive one by ~90+ in fp32.
        zero = diff.new_zeros(1)
        loss = loss + torch.logsumexp(torch.cat([zero, diff.flatten()]), dim=0)

    return loss / len(logits)
56
+
57
+
58
+ @register_model
59
+ class ProTrekTrimodalModel(AbstractModel):
60
    def __init__(self,
                 protein_config: str,
                 text_config: str,
                 structure_config: str = None,
                 repr_dim: int = 1024,
                 temperature: float = 0.07,
                 load_protein_pretrained: bool = True,
                 load_text_pretrained: bool = True,
                 use_mlm_loss: bool = False,
                 use_zlpr_loss: bool = False,
                 use_saprot: bool = False,
                 gradient_checkpointing: bool = False,
                 **kwargs):
        """
        Args:
            protein_config: Path to the config file for protein sequence encoder

            text_config: Path to the config file for text encoder

            structure_config: Path to the config file for structure encoder

            repr_dim: Output dimension of the protein and text representation

            temperature: Temperature for softmax

            load_protein_pretrained: Whether to load pretrained weights for protein encoder

            load_text_pretrained: Whether to load pretrained weights for text encoder

            use_mlm_loss: Whether to use masked language modeling loss

            use_zlpr_loss: Whether to use zlpr loss. See "https://arxiv.org/pdf/2208.02955.pdf"

            use_saprot: Whether to use SaProt as protein encoder

            gradient_checkpointing: Whether to use gradient checkpointing for protein encoder
        """
        # All settings are stored before calling the base constructor because
        # initialize_model()/initialize_metrics() read them.
        self.protein_config = protein_config
        self.structure_config = structure_config
        self.text_config = text_config
        self.repr_dim = repr_dim
        # Scalar here; initialize_model() replaces it with a learnable Parameter.
        self.temperature = temperature
        self.load_protein_pretrained = load_protein_pretrained
        self.load_text_pretrained = load_text_pretrained
        self.use_mlm_loss = use_mlm_loss
        self.use_zlpr_loss = use_zlpr_loss
        self.use_saprot = use_saprot
        self.gradient_checkpointing = gradient_checkpointing
        # NOTE(review): AbstractModel.__init__ presumably invokes
        # initialize_model() and initialize_metrics() — confirm in abstract_model.py.
        super().__init__(**kwargs)
109
+
110
+ def initialize_metrics(self, stage: str) -> dict:
111
+ return_dict = {
112
+ f"{stage}_protein_text_acc": torchmetrics.Accuracy(),
113
+ f"{stage}_text_protein_acc": torchmetrics.Accuracy(),
114
+ }
115
+
116
+ if self.use_mlm_loss:
117
+ return_dict[f"{stage}_protein_mask_acc"] = torchmetrics.Accuracy(ignore_index=-1)
118
+ if self.structure_config is not None:
119
+ return_dict[f"{stage}_structure_mask_acc"] = torchmetrics.Accuracy(ignore_index=-1)
120
+
121
+ if self.structure_config is not None:
122
+ return_dict[f"{stage}_structure_protein_acc"] = torchmetrics.Accuracy()
123
+ return_dict[f"{stage}_structure_text_acc"] = torchmetrics.Accuracy()
124
+ return_dict[f"{stage}_text_structure_acc"] = torchmetrics.Accuracy()
125
+ return_dict[f"{stage}_protein_structure_acc"] = torchmetrics.Accuracy()
126
+
127
+ return return_dict
128
+
129
    def initialize_model(self):
        """Instantiate the modality encoders, the learnable temperature, and
        the container used for checkpoint save/load."""
        # Initialize encoders
        self.protein_encoder = ProteinEncoder(self.protein_config,
                                              self.repr_dim,
                                              self.load_protein_pretrained,
                                              self.gradient_checkpointing)

        self.text_encoder = TextEncoder(self.text_config,
                                        self.repr_dim,
                                        self.load_text_pretrained,
                                        self.gradient_checkpointing)

        # Learnable temperature (replaces the scalar stored in __init__)
        self.temperature = torch.nn.Parameter(torch.tensor(self.temperature))

        # self.model is used for saving and loading
        # NOTE(review): ParameterList holding nn.Modules (not Parameters) is
        # unconventional — verify it registers submodules as intended.
        self.model = torch.nn.ParameterList([self.temperature,
                                             self.protein_encoder,
                                             self.text_encoder])

        # If the structure encoder is specified
        if self.structure_config is not None:
            self.structure_encoder = StructureEncoder(self.structure_config, self.repr_dim)
            self.model.append(self.structure_encoder)
153
+
154
    def get_text_repr(self, texts: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
        # Thin delegation: batched embedding of a list of texts.
        return self.text_encoder.get_repr(texts, batch_size, verbose)
156
+
157
    def get_structure_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
        # Thin delegation: batched embedding of foldseek structure sequences.
        return self.structure_encoder.get_repr(proteins, batch_size, verbose)
159
+
160
    def get_protein_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
        # Thin delegation: batched, L2-normalized embedding of protein sequences.
        return self.protein_encoder.get_repr(proteins, batch_size, verbose)
162
+
163
+ def forward(self, protein_inputs: dict, text_inputs: dict, structure_inputs: dict = None):
164
+ """
165
+ Args:
166
+ protein_inputs: A dictionary for protein encoder
167
+ structure_inputs: A dictionary for structure encoder
168
+ text_inputs : A dictionary for text encoder
169
+ """
170
+ protein_repr, protein_mask_logits = self.protein_encoder(protein_inputs, self.use_mlm_loss)
171
+ text_repr = self.text_encoder(text_inputs)
172
+
173
+ outputs = [text_repr, protein_repr, protein_mask_logits]
174
+
175
+ if self.structure_config is not None:
176
+ structure_repr, structure_mask_logits = self.structure_encoder(structure_inputs, self.use_mlm_loss)
177
+ outputs += [structure_repr, structure_mask_logits]
178
+
179
+ return outputs
180
+
181
    def loss_func(self, stage: str, outputs, labels):
        """Contrastive (InfoNCE) loss over all modality pairs, plus optional
        MLM loss. Representations are gathered across GPUs so every rank
        contrasts its local batch against the global batch."""
        if self.structure_config is not None:
            text_repr, protein_repr, protein_mask_logits, structure_repr, structure_mask_logits = outputs
        else:
            text_repr, protein_repr, protein_mask_logits = outputs

        device = text_repr.device

        text_repr = normalize(text_repr, dim=-1)
        protein_repr = normalize(protein_repr, dim=-1)

        # Gather representations from all GPUs (detached: gradients only flow
        # through the local batch on each rank).
        all_protein_repr = self.all_gather(protein_repr).view(-1, protein_repr.shape[-1]).detach()
        all_text_repr = self.all_gather(text_repr).view(-1, text_repr.shape[-1]).detach()

        if self.structure_config is not None:
            structure_repr = normalize(structure_repr, dim=-1)
            all_structure_repr = self.all_gather(structure_repr).view(-1, structure_repr.shape[-1]).detach()

        # NOTE: an earlier multilabel (ZLPR) variant that matched proteins to
        # candidate-text sets lived here; it was disabled in favor of plain
        # cross entropy with diagonal (index-matched) positives.

        # Batch size
        rank = dist.get_rank()
        bs = text_repr.shape[0]

        # The positive for local sample i is global row rank*bs + i, i.e. the
        # same sample's other-modality embedding in the gathered matrix.
        bs_labels = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to(device)

        # Every ordered modality pair gets its own contrastive term.
        if self.structure_config is not None:
            pairs = {
                "protein": ["structure", "text"],
                "structure": ["protein", "text"],
                "text": ["protein", "structure"]
            }
        else:
            pairs = {
                "protein": ["text"],
                "text": ["protein"]
            }

        loss_list = []
        for k, values in pairs.items():
            for v in values:
                # Local-batch vs. global-batch similarity, temperature-scaled.
                # eval() resolves e.g. "protein_repr" / "all_text_repr" from
                # the locals above — brittle but intentional here.
                sim = torch.matmul(eval(f"{k}_repr"), eval(f"all_{v}_repr").T).div(self.temperature)

                loss = cross_entropy(sim, bs_labels)
                self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(sim.detach(), bs_labels)
                loss_list.append(loss)

        # Masked language modeling loss
        if self.use_mlm_loss:
            k_label = [("protein", labels["seq_labels"])]
            if self.structure_config is not None:
                k_label.append(("structure", labels["struc_labels"]))

            for k, label in k_label:
                logits = eval(f"{k}_mask_logits")
                # merge the first and second dimension of logits
                logits = logits.view(-1, logits.shape[-1])
                label = label.flatten().to(device)
                # -1 marks unmasked positions; they contribute no loss.
                mlm_loss = cross_entropy(logits, label, ignore_index=-1)
                loss_list.append(mlm_loss)
                self.metrics[stage][f"{stage}_{k}_mask_acc"].update(logits.detach(), label)

        # Each contrastive/MLM term is weighted equally.
        loss = sum(loss_list) / len(loss_list)

        if stage == "train":
            log_dict = self.get_log_dict("train")
            log_dict["train_loss"] = loss
            self.log_info(log_dict)

            # Reset train metrics
            self.reset_metrics("train")

        return loss
325
+
326
+ def padded_gather(self, tensor: torch.Tensor):
327
+ """
328
+ Gather tensors from all GPUs, allowing different shapes at the batch dimension.
329
+ """
330
+
331
+ # Get the size of the tensor
332
+ size = tensor.shape[0]
333
+ all_sizes = self.all_gather(torch.tensor(size, device=tensor.device))
334
+ max_size = max(all_sizes)
335
+
336
+ # Pad the tensor
337
+ if size != max_size:
338
+ tmp = torch.zeros(max_size, tensor.shape[-1], dtype=tensor.dtype, device=tensor.device)
339
+ tmp[:size] = tensor
340
+ tensor = tmp
341
+
342
+ padded_tensor = self.all_gather(tensor).view(-1, tensor.shape[-1])
343
+ tensor = padded_tensor[:sum(all_sizes)]
344
+
345
+ return tensor
346
+
347
    def _get_protein_indices(self):
        """Embed every evaluation protein (sharded across ranks) and build a
        faiss inner-product index over the gathered embeddings.

        Embeddings come from ProteinEncoder.get_repr, which L2-normalizes, so
        inner product equals cosine similarity."""
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        if self.use_saprot:
            # SaProt input interleaves amino-acid and foldseek tokens: a1b1a2b2...
            proteins = []
            for sub_dict in self.uniprot2label.values():
                aa_seq = sub_dict["seq"]
                foldseek_seq = sub_dict["foldseek"]
                assert len(aa_seq) == len(foldseek_seq)
                seq = "".join([a + b for a, b in zip(aa_seq, foldseek_seq)])
                proteins.append(seq)

        else:
            proteins = [sub_dict["seq"] for sub_dict in self.uniprot2label.values()]

        # Each rank embeds a contiguous shard; the last shard may be shorter.
        span = math.ceil(len(proteins) / world_size)
        sub_proteins = proteins[rank * span: (rank + 1) * span]

        # Display the progress bar on the rank 0 process
        verbose = self.trainer.local_rank == 0
        # Get protein representations (batch_size=1: sequences vary in length)
        sub_protein_repr = self.protein_encoder.get_repr(sub_proteins, batch_size=1, verbose=verbose)
        protein_repr = self.padded_gather(sub_protein_repr)

        # Construct faiss index
        d = protein_repr.shape[-1]
        protein_indices = faiss.IndexFlatIP(d)
        protein_indices.add(protein_repr.cpu().numpy())
        return protein_indices
377
+
378
    def _get_structure_indices(self):
        """Same as _get_protein_indices but embeds the foldseek structure
        sequences with the structure encoder."""
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        proteins = [sub_dict["foldseek"] for sub_dict in self.uniprot2label.values()]
        # Shard the workload across ranks, then gather the results.
        span = math.ceil(len(proteins) / world_size)
        sub_proteins = proteins[rank * span: (rank + 1) * span]

        # Display the progress bar on the rank 0 process
        verbose = self.trainer.local_rank == 0
        # Get protein representations
        sub_protein_repr = self.structure_encoder.get_repr(sub_proteins, batch_size=1, verbose=verbose)
        protein_repr = self.padded_gather(sub_protein_repr)

        # Construct faiss index (inner product over normalized vectors = cosine)
        d = protein_repr.shape[-1]
        structure_indices = faiss.IndexFlatIP(d)
        structure_indices.add(protein_repr.cpu().numpy())
        return structure_indices
397
+
398
    def _get_text_indices(self):
        """Embed the texts of every subsection (sharded across ranks) and
        build one faiss inner-product index per subsection, plus a combined
        "Total" index aggregated via the label2text["Total"] mapping."""
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        # Display the progress bar on the rank 0 process
        verbose = self.trainer.local_rank == 0
        if verbose:
            iterator = tqdm(self.label2text.keys(), desc="Get text representations")
        else:
            iterator = self.label2text.keys()

        text_embeddings = {}
        for subsection in iterator:
            # "Total" is derived from the per-subsection embeddings below.
            if subsection == "Total":
                continue

            texts = []
            for text_list in self.label2text[subsection].values():
                # Only use the first text for efficiency
                texts.append(text_list[0:1])

            # Shard the texts of this subsection across ranks.
            span = math.ceil(len(texts) / world_size)
            texts = texts[rank * span: (rank + 1) * span]
            embeddings = []
            for text_list in texts:
                # Mean-pool the (single-element) text list, then normalize.
                text_repr = self.text_encoder.get_repr(text_list)
                mean_repr = text_repr.mean(dim=0, keepdim=True)
                norm_repr = torch.nn.functional.normalize(mean_repr, dim=-1)
                embeddings.append(norm_repr)

            if len(embeddings) > 0:
                embeddings = torch.cat(embeddings, dim=0)
            else:
                # This rank got an empty shard; contribute a 0-row tensor so
                # padded_gather still works.
                embeddings = torch.zeros(0, self.repr_dim, dtype=self.dtype, device=self.device)

            text_repr = self.padded_gather(embeddings)
            text_embeddings[subsection] = text_repr

        # Aggregate text embeddings for global retrieval; entries in
        # label2text["Total"] are "subsection|index" pointers.
        total_embeddings = []
        for idx in self.label2text["Total"].values():
            subsection, i = idx.split("|")
            total_embeddings.append(text_embeddings[subsection][int(i)])

        text_embeddings["Total"] = torch.stack(total_embeddings)

        # Construct faiss index
        text_indices = {}
        for subsection, text_repr in text_embeddings.items():
            d = text_repr.shape[-1]
            text_indices[subsection] = faiss.IndexFlatIP(d)
            text_indices[subsection].add(text_repr.cpu().numpy())

        return text_indices
452
+
453
    def _protein2text(self, modality: str, protein_indices, text_indices: dict):
        """Protein->text retrieval evaluation over Swiss-Prot entries.

        For each (protein, subsection) pair, rank all texts of the subsection
        by similarity to the protein embedding and compute AP, MRR, and AUC;
        work is sharded across ranks and across worker processes."""
        def do(process_id, idx, row, writer):
            # One evaluation unit: a protein against one subsection's texts.
            subsection, uniprot_id, prob_idx, label = row

            # Retrieve ranking results (exhaustive search over the index)
            p_embedding = protein_indices.reconstruct(prob_idx).reshape(1, -1)
            text_inds = text_indices[subsection]
            sim_scores, rank_inds = text_inds.search(p_embedding, text_inds.ntotal)
            sim_scores, rank_inds = sim_scores[0], rank_inds[0]

            # Calculate Average Precision(AP)
            ranks = []
            label = set(label)
            for i, rk in enumerate(rank_inds):
                # Find the rank of this label in all labels
                if rk in label:
                    ranks.append(i + 1)

            ranks = np.array(ranks)
            ap = np.mean([(i + 1) / rank for i, rank in enumerate(ranks)])

            # Calculate Mean Reciprocal Rank(MRR)
            # NOTE(review): ranks[0] raises IndexError if no ground-truth text
            # was found in the index — presumably impossible by construction;
            # confirm.
            best_rank = ranks[0]
            mrr = 1 / best_rank

            # Calculate the AUC (degenerate all-pos/all-neg cases score 0)
            true_labels = np.zeros_like(sim_scores)
            true_labels[ranks - 1] = 1
            if true_labels.sum() == 0 or true_labels.sum() == true_labels.shape[0]:
                auc = 0
            else:
                auc = roc_auc_score(true_labels, sim_scores)

            output = json.dumps([ap, mrr, auc])
            writer.write(output + "\n")

        # Build evaluation rows: only Swiss-Prot proteins that carry the
        # subsection annotation.
        inputs = []
        swissprot_subsections = set()
        for subsection in text_indices.keys():
            for i, (uniprot_id, labels) in enumerate(self.uniprot2label.items()):
                if uniprot_id in self.swissprot_ids:
                    if subsection in labels:
                        swissprot_subsections.add(subsection)
                        label = labels[subsection]
                        inputs.append((subsection, uniprot_id, i, label))

        # Randomly shuffle the inputs (fixed seed: identical order on every rank,
        # which the rank-sharding below relies on)
        random.seed(20000812)
        random.shuffle(inputs)

        # Split inputs into chunks for parallel processing
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        span = math.ceil(len(inputs) / world_size)
        sub_inputs = inputs[rank * span: (rank + 1) * span]

        # Display the progress bar on the rank 0 process
        verbose = self.trainer.local_rank == 0
        if verbose:
            print("Evaluating on each subsection...")
        # NOTE(review): hard-coded machine-specific scratch path — should come
        # from config or tempfile.
        tmp_path = f"/sujin/PycharmProjects/Pretraining/{time.time()}_{rank}.tsv"
        mpr = MultipleProcessRunnerSimplifier(sub_inputs, do, save_path=tmp_path, n_process=8, verbose=verbose,
                                              return_results=True)
        outputs = mpr.run()
        os.remove(tmp_path)

        # Aggregate results from all ranks into one tensor.
        tensor_outputs = []
        for output in outputs:
            ap, mrr, auc = json.loads(output)
            tensor_outputs.append([float(ap), float(mrr), float(auc)])

        tensor_outputs = torch.tensor(tensor_outputs, dtype=torch.float32, device=self.device)
        tensor_outputs = self.padded_gather(tensor_outputs)

        # Record results per subsection (rows align with `inputs` because all
        # ranks shuffled with the same seed).
        avg_results = {}
        for subsection in swissprot_subsections:
            avg_results[subsection] = {"map": [],
                                       "mrr": [],
                                       "auc": []}

        for input, output in zip(inputs, tensor_outputs):
            ap, mrr, auc = output
            subsection, _, _, _ = input

            avg_results[subsection]["map"].append(ap.cpu().item())
            avg_results[subsection]["mrr"].append(mrr.cpu().item())
            avg_results[subsection]["auc"].append(auc.cpu().item())

        results = {
            f"{modality}2Text_Total_mrr": np.mean(avg_results["Total"]["mrr"]),
            f"{modality}2Text_Total_map": np.mean(avg_results["Total"]["map"]),
            f"{modality}2Text_Total_auc": np.mean(avg_results["Total"]["auc"]),
        }

        # Average the precision and recall for each level
        for level, labels in [("residue-level", residue_level),
                              ("sequence-level", sequence_level),
                              ("all", residue_level | sequence_level)]:

            mrrs = []
            maps = []
            aucs = []
            for subsection in labels:
                if subsection in avg_results:
                    mrrs.append(np.mean(avg_results[subsection]["mrr"]))
                    maps.append(np.mean(avg_results[subsection]["map"]))
                    aucs.append(np.mean(avg_results[subsection]["auc"]))

            results[f"{modality}2Text_{level}_mrr"] = np.mean(mrrs)
            results[f"{modality}2Text_{level}_map"] = np.mean(maps)
            results[f"{modality}2Text_{level}_auc"] = np.mean(aucs)

        return results
569
+
570
    def _text2protein(self, modality: str, protein_indices, text_indices: dict):
        """Text->protein retrieval evaluation, the mirror of _protein2text.

        For each distinct text, rank all proteins by similarity and score AP,
        MRR, and AUC against the set of proteins annotated with that text."""
        def do(process_id, idx, row, writer):
            # One evaluation unit: a text against all protein embeddings.
            subsection, text_id, label = row

            # Retrieve ranking results (exhaustive search over the index)
            t_embedding = text_indices[subsection].reconstruct(text_id).reshape(1, -1)
            sim_scores, rank_inds = protein_indices.search(t_embedding, protein_indices.ntotal)
            sim_scores, rank_inds = sim_scores[0], rank_inds[0]

            # Calculate Average Precision(AP)
            ranks = []
            label = set(label)
            for i, rk in enumerate(rank_inds):
                # Find the rank of this label in all labels
                if rk in label:
                    ranks.append(i + 1)

            ranks = np.array(ranks)
            ap = np.mean([(i + 1) / rank for i, rank in enumerate(ranks)])

            # Calculate Mean Reciprocal Rank(MRR)
            # NOTE(review): ranks[0] raises IndexError if no annotated protein
            # appears in the index — presumably impossible by construction.
            best_rank = ranks[0]
            mrr = 1 / best_rank

            # Calculate the AUC (degenerate all-pos/all-neg cases score 0)
            true_labels = np.zeros_like(sim_scores)
            true_labels[ranks - 1] = 1
            if true_labels.sum() == 0 or true_labels.sum() == true_labels.shape[0]:
                auc = 0
            else:
                auc = roc_auc_score(true_labels, sim_scores)

            output = json.dumps([ap, mrr, auc])
            writer.write(output + "\n")

        # Invert uniprot2label: for each subsection, map text_id -> list of
        # protein row indices annotated with it.
        text2label = {}
        swissprot_subsections = set()
        for i, (uniprot_id, subsections) in enumerate(self.uniprot2label.items()):
            # Only evaluate the texts in Swiss-Prot
            if uniprot_id not in self.swissprot_ids:
                continue

            for subsection, text_ids in subsections.items():
                # "seq"/"foldseek" hold sequences, not annotations.
                if subsection == "seq" or subsection == "foldseek":
                    continue

                swissprot_subsections.add(subsection)
                if subsection not in text2label:
                    text2label[subsection] = {}

                for text_id in text_ids:
                    text2label[subsection][text_id] = text2label[subsection].get(text_id, []) + [i]

        inputs = []
        for subsection in swissprot_subsections:
            for i, (text_id, label) in enumerate(text2label[subsection].items()):
                inputs.append((subsection, text_id, label))

        # Randomly shuffle the inputs (fixed seed: identical order on all ranks)
        random.seed(20000812)
        random.shuffle(inputs)

        # Split inputs into chunks for parallel processing
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        span = math.ceil(len(inputs) / world_size)
        sub_inputs = inputs[rank * span: (rank + 1) * span]

        # Display the progress bar on the rank 0 process
        verbose = self.trainer.local_rank == 0
        if verbose:
            print("Evaluating on each text...")

        # Add time stamp to the temporary file name to avoid conflicts
        # NOTE(review): hard-coded machine-specific scratch path — should come
        # from config or tempfile.
        tmp_path = f"/sujin/PycharmProjects/Pretraining/{time.time()}_{rank}.tsv"
        mpr = MultipleProcessRunnerSimplifier(sub_inputs, do, save_path=tmp_path, n_process=8, verbose=verbose,
                                              return_results=True)
        outputs = mpr.run()
        os.remove(tmp_path)

        # Aggregate results from all ranks into one tensor.
        tensor_outputs = []
        for output in outputs:
            ap, mrr, auc = json.loads(output)
            tensor_outputs.append([float(ap), float(mrr), float(auc)])

        tensor_outputs = torch.tensor(tensor_outputs, dtype=torch.float32, device=self.device)
        tensor_outputs = self.padded_gather(tensor_outputs)

        # Record results per subsection (rows align with `inputs` because all
        # ranks shuffled with the same seed).
        avg_results = {}
        for subsection in swissprot_subsections:
            avg_results[subsection] = {"map": [],
                                       "mrr": [],
                                       "auc": []}

        for input, output in zip(inputs, tensor_outputs):
            ap, mrr, auc = output
            subsection, _, _ = input

            avg_results[subsection]["map"].append(ap.cpu().item())
            avg_results[subsection]["mrr"].append(mrr.cpu().item())
            avg_results[subsection]["auc"].append(auc.cpu().item())

        results = {
            f"Text2{modality}_Total_mrr": np.mean(avg_results["Total"]["mrr"]),
            f"Text2{modality}_Total_map": np.mean(avg_results["Total"]["map"]),
            f"Text2{modality}_Total_auc": np.mean(avg_results["Total"]["auc"]),
        }

        # Average the precision and recall for each level
        for level, labels in [("residue-level", residue_level),
                              ("sequence-level", sequence_level),
                              ("all", residue_level | sequence_level)]:

            mrrs = []
            maps = []
            aucs = []
            for subsection in labels:
                if subsection in avg_results:
                    mrrs.append(np.mean(avg_results[subsection]["mrr"]))
                    maps.append(np.mean(avg_results[subsection]["map"]))
                    aucs.append(np.mean(avg_results[subsection]["auc"]))

            results[f"Text2{modality}_{level}_mrr"] = np.mean(mrrs)
            results[f"Text2{modality}_{level}_map"] = np.mean(maps)
            results[f"Text2{modality}_{level}_auc"] = np.mean(aucs)

        return results
700
+
701
+ def retrieval_eval(self) -> dict:
702
+ # Get protein representations
703
+ protein_indices = self._get_protein_indices()
704
+
705
+ # Get structure representations
706
+ # if self.structure_config is not None:
707
+ # structure_embeddings = self._get_structure_embeddings()
708
+
709
+ # Get text representations
710
+ text_indices = self._get_text_indices()
711
+
712
+ # Retrieve texts for each protein
713
+ results = {}
714
+ results.update(self._protein2text("Sequence", protein_indices, text_indices))
715
+ # if self.structure_config is not None:
716
+ # results.update(self._protein2text("Structure", structure_embeddings, text_embeddings))
717
+ # results.update(self._text2protein("Structure", structure_embeddings, text_embeddings))
718
+
719
+ # Retrieve proteins for each text
720
+ results.update(self._text2protein("Sequence", protein_indices, text_indices))
721
+
722
+ return results
723
+
724
    def _apply_bert_mask(self, tokens, tokenizer, mask_ratio):
        """Apply BERT-style 80/10/10 masking to a token list.

        Returns (masked_tokens, labels) where labels has length
        len(tokens) + 2 to line up with the CLS/EOS tokens the tokenizer adds,
        and is -1 at every unmasked position. Retries until at least one token
        is masked."""
        while True:
            masked_tokens = copy.copy(tokens)
            # +2 accounts for the special tokens added at encode time.
            labels = torch.full((len(tokens) + 2,), -1, dtype=torch.long)
            # NOTE(review): vocab includes special tokens, which can therefore
            # be drawn as "random" replacements — confirm this is intended.
            vocab = [k for k in tokenizer.get_vocab().keys()]

            for i in range(len(tokens)):
                token = tokens[i]

                prob = random.random()
                if prob < mask_ratio:
                    # Rescale prob to [0, 1) for the 80/10/10 split below.
                    prob /= mask_ratio
                    labels[i + 1] = tokenizer.convert_tokens_to_ids(token)

                    if prob < 0.8:
                        # 80% random change to mask token
                        if self.use_saprot:
                            # SaProt tokens are aa+structure pairs: mask only
                            # the amino-acid half, keep the structure letter.
                            token = "#" + token[-1]
                        else:
                            token = tokenizer.mask_token
                    elif prob < 0.9:
                        # 10% chance to change to random token
                        token = random.choice(vocab)
                    else:
                        # 10% chance to keep current token
                        pass

                    masked_tokens[i] = token

            # Check if there is at least one masked token
            if (labels != -1).any():
                return masked_tokens, labels
756
+
757
+ def mlm_eval(self) -> float:
758
+ world_size = dist.get_world_size()
759
+ rank = dist.get_rank()
760
+
761
+ if self.use_saprot:
762
+ proteins = []
763
+ for sub_dict in self.uniprot2label.values():
764
+ aa_seq = sub_dict["seq"]
765
+ foldseek_seq = sub_dict["foldseek"]
766
+ assert len(aa_seq) == len(foldseek_seq)
767
+ seq = "".join([a + b for a, b in zip(aa_seq, foldseek_seq)])
768
+ proteins.append(seq)
769
+
770
+ else:
771
+ proteins = [sub_dict["seq"] for sub_dict in self.uniprot2label.values()]
772
+
773
+ span = math.ceil(len(proteins) / world_size)
774
+ sub_proteins = proteins[rank * span: (rank + 1) * span]
775
+
776
+ # Display the progress bar on the rank 0 process
777
+ if self.trainer.local_rank == 0:
778
+ iterator = tqdm(sub_proteins, desc="Computing mlm...")
779
+ else:
780
+ iterator = sub_proteins
781
+
782
+ total = torch.tensor([0], dtype=torch.long, device=self.device)
783
+ correct = torch.tensor([0], dtype=torch.long, device=self.device)
784
+ for seq in iterator:
785
+ tokens = self.protein_encoder.tokenizer.tokenize(seq)
786
+ masked_tokens, labels = self._apply_bert_mask(tokens, self.protein_encoder.tokenizer, 0.15)
787
+ seq = " ".join(masked_tokens)
788
+
789
+ inputs = self.protein_encoder.tokenizer(seq, return_tensors="pt")
790
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
791
+ _, logits = self.protein_encoder(inputs, get_mask_logits=True)
792
+
793
+ logits = logits.squeeze(0)
794
+ labels = labels.to(self.device)
795
+
796
+ selecor = labels != -1
797
+ preds = logits.argmax(dim=-1)[selecor]
798
+ labels = labels[selecor]
799
+
800
+ total += len(preds)
801
+ correct += (preds == labels).sum()
802
+
803
+ # Gather all results
804
+ total = self.padded_gather(total).sum()
805
+ correct = self.padded_gather(correct).sum()
806
+
807
+ acc = correct / total
808
+ return acc.cpu().item()
809
+
810
+ def _load_eval_data(self, stage):
811
+ # Load the data
812
+ lmdb_dir = eval(f"self.trainer.datamodule.{stage}_lmdb")
813
+ uniprot2label_path = os.path.join(lmdb_dir, "uniprot2label.json")
814
+ label2text_path = os.path.join(lmdb_dir, "label2text.json")
815
+ swissprot_id_path = os.path.join(lmdb_dir, "swissprot_ids.tsv")
816
+
817
+ self.uniprot2label = json.load(open(uniprot2label_path, "r"))
818
+ self.label2text = json.load(open(label2text_path, "r"))
819
+ self.swissprot_ids = set(pd.read_csv(swissprot_id_path, sep="\t", header=None).values.flatten().tolist())
820
+ self.k = 3
821
+
822
+ def on_test_start(self):
823
+ self._load_eval_data("test")
824
+
825
+ log_dict = self.retrieval_eval()
826
+ log_dict = {"test_" + k: v for k, v in log_dict.items()}
827
+ if self.use_mlm_loss:
828
+ log_dict["test_mask_acc"] = self.mlm_eval()
829
+ self.log_info(log_dict)
830
+ print(log_dict)
831
+
832
+ def on_validation_start(self):
833
+ # Clear the cache
834
+ torch.cuda.empty_cache()
835
+
836
+ self._load_eval_data("valid")
837
+
838
+ log_dict = self.retrieval_eval()
839
+ log_dict = {"valid_" + k: v for k, v in log_dict.items()}
840
+ if self.use_mlm_loss:
841
+ log_dict["valid_mask_acc"] = self.mlm_eval()
842
+ self.log_info(log_dict)
843
+
844
+ self.check_save_condition(self.step, mode="max")
845
+
846
+ def test_step(self, batch, batch_idx):
847
+ return
848
+
849
+ def validation_step(self, batch, batch_idx):
850
+ return
851
+
852
+ def on_train_epoch_end(self):
853
+ super().on_train_epoch_end()
854
+ # Re-sample the subset of the training data
855
+ if self.trainer.datamodule.train_dataset.fixed_dataset_num is not None:
856
+ self.trainer.datamodule.train_dataset.sample_subset()
857
+
858
+ # def test_epoch_end(self, outputs):
859
+ # log_dict = self.get_log_dict("test")
860
+ # log_dict["test_loss"] = torch.cat(self.all_gather(outputs), dim=-1).mean()
861
+ #
862
+ # print(log_dict)
863
+ # self.log_info(log_dict)
864
+ #
865
+ # self.reset_metrics("test")
866
+ #
867
+ # def validation_epoch_end(self, outputs):
868
+ # log_dict = self.get_log_dict("valid")
869
+ # log_dict["valid_loss"] = torch.cat(self.all_gather(outputs), dim=-1).mean()
870
+ #
871
+ # self.log_info(log_dict)
872
+ # self.reset_metrics("valid")
873
+ # self.check_save_condition(log_dict["valid_loss"], mode="min")
874
+
model/ProTrek/structure_encoder.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from tqdm import tqdm
4
+ from transformers import EsmConfig, EsmForMaskedLM, EsmTokenizer
5
+ from torch.nn.functional import normalize
6
+
7
+
8
class StructureEncoder(torch.nn.Module):
    """ESM-based encoder mapping protein structural sequences to fixed-size embeddings."""

    def __init__(self, config_path: str, out_dim: int, gradient_checkpointing: bool = False):
        """
        Args:
            config_path: Path to the config file

            out_dim: Output dimension of the structure representation

            gradient_checkpointing: Whether to use gradient checkpointing
        """
        super().__init__()
        config = EsmConfig.from_pretrained(config_path)
        self.model = EsmForMaskedLM(config)
        self.out = torch.nn.Linear(config.hidden_size, out_dim)

        # Enable/disable gradient checkpointing on the ESM encoder
        self.model.esm.encoder.gradient_checkpointing = gradient_checkpointing

        # The contact head is not needed for representation learning
        self.model.esm.contact_head = None

        # Rotary embeddings make learned position embeddings redundant
        if config.position_embedding_type == "rotary":
            self.model.esm.embeddings.position_embeddings = None

        self.tokenizer = EsmTokenizer.from_pretrained(config_path)

    def get_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
        """
        Compute L2-normalized structure representations for the given proteins.

        Args:
            proteins: A list of protein structural sequences
            batch_size: Batch size for inference
            verbose: Whether to show a progress bar
        """
        device = next(self.parameters()).device

        batch_starts = range(0, len(proteins), batch_size)
        if verbose:
            batch_starts = tqdm(batch_starts, desc="Computing protein embeddings")

        chunks = []
        for start in batch_starts:
            batch = self.tokenizer.batch_encode_plus(proteins[start: start + batch_size],
                                                     return_tensors="pt",
                                                     padding=True)
            batch = {k: v.to(device) for k, v in batch.items()}
            repr_batch, _ = self.forward(batch)
            chunks.append(repr_batch)

        return normalize(torch.cat(chunks, dim=0), dim=-1)

    def forward(self, inputs: dict, get_mask_logits: bool = False):
        """
        Encode protein structure into a protein representation.

        Args:
            inputs: A dictionary containing the following keys:
                 - input_ids: [batch, seq_len]
                 - attention_mask: [batch, seq_len]
            get_mask_logits: Whether to return the logits for masked tokens

        Returns:
            reprs: [batch, out_dim]
            mask_logits: [batch, seq_len, vocab_size] or None
        """
        hidden = self.model.esm(**inputs).last_hidden_state
        # Use the first (CLS) token as the sequence-level representation
        reprs = self.out(hidden[:, 0, :])

        mask_logits = self.model.lm_head(hidden) if get_mask_logits else None
        return reprs, mask_logits
model/ProTrek/text_encoder.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from tqdm import tqdm
4
+ from torch.nn.functional import normalize
5
+ from transformers import BertConfig, BertModel, BertTokenizer
6
+
7
+
8
class TextEncoder(torch.nn.Module):
    """BERT-based encoder mapping text descriptions to fixed-size embeddings."""

    def __init__(self,
                 config_path: str,
                 out_dim: int,
                 load_pretrained: bool = True,
                 gradient_checkpointing: bool = False):
        """
        Args:
            config_path: Path to the config file

            out_dim: Output dimension of the text representation

            load_pretrained: Whether to load pretrained weights

            gradient_checkpointing: Whether to enable gradient checkpointing
        """
        super().__init__()
        config = BertConfig.from_pretrained(config_path)
        # The pooling layer is unused: the CLS hidden state is projected directly
        if load_pretrained:
            self.model = BertModel.from_pretrained(config_path, add_pooling_layer=False)
        else:
            self.model = BertModel(config, add_pooling_layer=False)
        self.out = torch.nn.Linear(config.hidden_size, out_dim)

        # Enable/disable gradient checkpointing on the BERT encoder
        self.model.encoder.gradient_checkpointing = gradient_checkpointing

        self.tokenizer = BertTokenizer.from_pretrained(config_path)

    def get_repr(self, texts: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
        """
        Compute L2-normalized text representations for the given texts.

        Args:
            texts: A list of strings
            batch_size: Batch size for inference
            verbose: Whether to show a progress bar
        """
        device = next(self.parameters()).device

        batch_starts = range(0, len(texts), batch_size)
        if verbose:
            batch_starts = tqdm(batch_starts, desc="Computing text embeddings")

        chunks = []
        for start in batch_starts:
            batch = self.tokenizer.batch_encode_plus(texts[start: start + batch_size],
                                                     return_tensors="pt",
                                                     truncation=True,
                                                     max_length=512,
                                                     padding=True)
            batch = {k: v.to(device) for k, v in batch.items()}
            chunks.append(self(batch))

        return normalize(torch.cat(chunks, dim=0), dim=-1)

    def forward(self, inputs: dict):
        """
        Encode text into a text representation.

        Args:
            inputs: A dictionary containing the following keys:
                 - input_ids: [batch, seq_len]
                 - attention_mask: [batch, seq_len]
                 - token_type_ids: [batch, seq_len]

        Returns:
            text_repr: [batch, out_dim]
        """
        # Project the first (CLS) token hidden state into the shared space
        cls_state = self.model(**inputs).last_hidden_state[:, 0, :]
        return self.out(cls_state)
model/abstract_model.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import abc
3
+ import os
4
+ import copy
5
+
6
+ import pytorch_lightning as pl
7
+ from utils.lr_scheduler import *
8
+ from torch import distributed as dist
9
+
10
+
11
class AbstractModel(pl.LightningModule):
    """
    Base LightningModule for all models in this project.

    Subclasses must implement ``initialize_model``, ``forward``,
    ``initialize_metrics`` and ``loss_func``. This base class supplies
    optimizer/scheduler construction, checkpoint save/load (with or
    without DeepSpeed), manual step/epoch bookkeeping and metric logging.
    """

    def __init__(self,
                 lr_scheduler_kwargs: dict = None,
                 optimizer_kwargs: dict = None,
                 save_path: str = None,
                 from_checkpoint: str = None,
                 load_prev_scheduler: bool = False,
                 save_weights_only: bool = True,):
        """

        Args:
            lr_scheduler_kwargs: Kwargs for the learning-rate scheduler (expects keys "class" and "init_lr")
            optimizer_kwargs: Kwargs for the optimizer (expects keys "class" and "weight_decay")
            save_path: Where to save the trained model
            from_checkpoint: Load the model from this checkpoint path
            load_prev_scheduler: Whether to restore scheduler/optimizer state from the checkpoint
            save_weights_only: Whether to save only weights (vs. also optimizer and lr_scheduler state)

        """
        super().__init__()
        self.initialize_model()

        self.metrics = {}
        for stage in ["train", "valid", "test"]:
            stage_metrics = self.initialize_metrics(stage)
            # Register metrics as attributes so Lightning moves them to the right device
            for metric_name, metric in stage_metrics.items():
                setattr(self, metric_name, metric)

            self.metrics[stage] = stage_metrics

        if lr_scheduler_kwargs is None:
            # Default lr_scheduler: constant learning rate of 0
            self.lr_scheduler_kwargs = {
                "class": "ConstantLRScheduler",
                "init_lr": 0,
            }
            print("No lr_scheduler_kwargs provided. The default learning rate is 0.")

        else:
            self.lr_scheduler_kwargs = lr_scheduler_kwargs

        if optimizer_kwargs is None:
            # Default optimizer: AdamW with standard transformer betas
            self.optimizer_kwargs = {
                "class": "AdamW",
                "betas": (0.9, 0.98),
                "weight_decay": 0.01,
            }
            print("No optimizer_kwargs provided. The default optimizer is AdamW.")
        else:
            self.optimizer_kwargs = optimizer_kwargs
        self.init_optimizers()

        self.save_path = save_path
        self.save_weights_only = save_weights_only

        # temp_step is used for accumulating gradients: self.step only
        # advances once per accumulate_grad_batches optimizer steps
        self.temp_step = 0
        self.step = 0
        self.epoch = 0

        self.load_prev_scheduler = load_prev_scheduler
        self.from_checkpoint = from_checkpoint
        if from_checkpoint:
            self.load_checkpoint(from_checkpoint)

    @abc.abstractmethod
    def initialize_model(self) -> None:
        """
        All model initialization should be done here
        Note that the whole model must be named as "self.model" for model saving and loading
        """
        raise NotImplementedError

    @abc.abstractmethod
    def forward(self, *args, **kwargs):
        """
        Forward propagation
        """
        raise NotImplementedError

    @abc.abstractmethod
    def initialize_metrics(self, stage: str) -> dict:
        """
        Initialize metrics for each stage
        Args:
            stage: "train", "valid" or "test"

        Returns:
            A dictionary of metrics for the stage. Keys are metric names and values are metric objects
        """
        raise NotImplementedError

    @abc.abstractmethod
    def loss_func(self, stage: str, outputs, labels) -> torch.Tensor:
        """

        Args:
            stage: "train", "valid" or "test"
            outputs: model outputs for calculating loss
            labels: labels for calculating loss

        Returns:
            loss

        """
        raise NotImplementedError

    @staticmethod
    def load_weights(model, weights):
        """
        Non-strict weight loading: copies matching keys into the model's
        state dict and prints (in red) any missed or unused parameters.
        """
        model_dict = model.state_dict()

        unused_params = []
        missed_params = list(model_dict.keys())

        for k, v in weights.items():
            if k in model_dict.keys():
                model_dict[k] = v
                missed_params.remove(k)

            else:
                unused_params.append(k)

        if len(missed_params) > 0:
            print(f"\033[31mSome weights of {type(model).__name__} were not "
                  f"initialized from the model checkpoint: {missed_params}\033[0m")

        if len(unused_params) > 0:
            print(f"\033[31mSome weights of the model checkpoint were not used: {unused_params}\033[0m")

        model.load_state_dict(model_dict)

    # NOTE(review): this signature targets a recent pytorch-lightning
    # release; a 1.9.5-compatible variant is kept commented out below.
    def optimizer_step(
            self,
            epoch: int,
            batch_idx: int,
            optimizer,
            optimizer_closure=None,
    ) -> None:
        super().optimizer_step(epoch, batch_idx, optimizer, optimizer_closure)

        # Advance the logical training step once per accumulation window
        self.temp_step += 1
        if self.temp_step == self.trainer.accumulate_grad_batches:
            self.step += 1
            self.temp_step = 0

    # For pytorch-lightning 1.9.5
    # def optimizer_step(
    #         self,
    #         epoch: int,
    #         batch_idx: int,
    #         optimizer,
    #         optimizer_idx: int = 0,
    #         optimizer_closure=None,
    #         on_tpu: bool = False,
    #         using_native_amp: bool = False,
    #         using_lbfgs: bool = False,
    # ) -> None:
    #     super().optimizer_step(
    #         epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs
    #     )
    #     self.temp_step += 1
    #     if self.temp_step == self.trainer.accumulate_grad_batches:
    #         self.step += 1
    #         self.temp_step = 0

    def on_train_epoch_end(self):
        # Manual epoch counter, used for logging and checkpoint metadata
        self.epoch += 1

    def training_step(self, batch, batch_idx):
        """Standard training step: forward, compute loss, log it."""
        inputs, labels = batch

        # optimizer = torch.optim.AdamW(self.model.parameters(), lr=5e-4, weight_decay=0.01, betas=(0.9, 0.98))
        # for _ in range(1000):
        #     outputs = self(**inputs)
        #     loss = self.loss_func('train', outputs, labels)
        #     loss.backward()
        #     optimizer.step()
        #     optimizer.zero_grad()
        #
        # raise

        outputs = self(**inputs)
        loss = self.loss_func('train', outputs, labels)

        self.log("loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Validation step: collect per-batch loss into self.valid_outputs."""
        inputs, labels = batch
        outputs = self(**inputs)
        loss = self.loss_func('valid', outputs, labels)
        self.valid_outputs.append(loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Test step: collect per-batch loss into self.test_outputs."""
        inputs, labels = batch
        outputs = self(**inputs)

        loss = self.loss_func('test', outputs, labels)
        self.test_outputs.append(loss)
        return loss

    def on_train_start(self) -> None:
        # Load previous scheduler (the attribute name contains a historic
        # typo, "prev_schechuler"; it is set by load_checkpoint)
        if getattr(self, "prev_schechuler", None) is not None:
            try:
                self.step = self.prev_schechuler["global_step"]
                self.epoch = self.prev_schechuler["epoch"]
                self.best_value = self.prev_schechuler["best_value"]
                self.lr_scheduler.load_state_dict(self.prev_schechuler["lr_scheduler"])
                print(f"Previous training global step: {self.step}")
                print(f"Previous training epoch: {self.epoch}")
                print(f"Previous best value: {self.best_value}")
                print(f"Previous lr_scheduler: {self.prev_schechuler['lr_scheduler']}")

                # Load optimizer state
                if hasattr(self.trainer.strategy, "deepspeed_engine"):
                    # For DeepSpeed strategy: the engine restores optimizer state itself
                    try:
                        self.trainer.strategy.deepspeed_engine.load_checkpoint(self.from_checkpoint)
                    except Exception as e:
                        print(e)

                else:
                    # For DDP strategy
                    self.optimizer.load_state_dict(self.prev_schechuler["optimizer"])

            except Exception as e:
                print(e)
                raise Exception("Error in loading previous scheduler. Please set load_prev_scheduler=False")

    def on_validation_epoch_start(self) -> None:
        # Fresh container for per-batch validation losses
        setattr(self, "valid_outputs", [])

    def on_test_epoch_start(self) -> None:
        # Fresh container for per-batch test losses
        setattr(self, "test_outputs", [])

    def load_checkpoint(self, from_checkpoint: str) -> None:
        """
        Args:
            from_checkpoint: Path to checkpoint.
        """

        # If ``from_checkpoint`` is a directory, load the checkpoint in it
        # (expected layout: <dir>/<dirname>.pt, as written by check_save_condition)
        if os.path.isdir(from_checkpoint):
            basename = os.path.basename(from_checkpoint)
            from_checkpoint = os.path.join(from_checkpoint, f"{basename}.pt")

        state_dict = torch.load(from_checkpoint, map_location=self.device)
        self.load_weights(self.model, state_dict["model"])

        if self.load_prev_scheduler:
            # Everything except the weights (step/epoch/optimizer/scheduler)
            # is consumed later by on_train_start
            state_dict.pop("model")
            self.prev_schechuler = state_dict

    def save_checkpoint(self, save_path: str, save_info: dict = None, save_weights_only: bool = True) -> None:
        """
        Save model to save_path
        Args:
            save_path: Path to save model
            save_info: Other info to save
            save_weights_only: Whether only save model weights
        """
        dir = os.path.dirname(save_path)
        os.makedirs(dir, exist_ok=True)

        state_dict = {} if save_info is None else save_info
        state_dict["model"] = self.model.state_dict()

        # Convert model weights to fp32 (e.g. after mixed-precision training)
        for k, v in state_dict["model"].items():
            state_dict["model"][k] = v.float()

        if not save_weights_only:
            state_dict["global_step"] = self.step
            state_dict["epoch"] = self.epoch
            state_dict["best_value"] = getattr(self, f"best_value", None)
            state_dict["lr_scheduler"] = self.lr_schedulers().state_dict()

            # If not using DeepSpeed, save optimizer state
            # (DeepSpeed saves its own sharded optimizer state separately)
            if not hasattr(self.trainer.strategy, "deepspeed_engine"):
                state_dict["optimizer"] = self.optimizers().optimizer.state_dict()

        torch.save(state_dict, save_path)

    def check_save_condition(self, now_value: float, mode: str, save_info: dict = None) -> None:
        """
        Check whether to save model. If save_path is not None and now_value is the best, save model.
        Args:
            now_value: Current metric value
            mode: "min" or "max", meaning whether the lower the better or the higher the better
            save_info: Other info to save
        """

        assert mode in ["min", "max"], "mode should be 'min' or 'max'"

        if self.save_path is not None:
            # In case there are variables to be included in the save path
            # NOTE(review): the path is evaluated as an f-string (it may
            # reference e.g. {self.step}); eval on an untrusted save_path is
            # unsafe — confirm paths come only from trusted config.
            save_path = eval(f"f'{self.save_path}'")

            dir = os.path.dirname(save_path)
            os.makedirs(dir, exist_ok=True)

            # Check whether to save model: skip if the current value does not
            # improve on the best value seen so far
            best_value = getattr(self, f"best_value", None)
            if best_value is not None:
                if mode == "min" and now_value >= best_value or mode == "max" and now_value <= best_value:
                    return

            setattr(self, "best_value", now_value)

            # For DeepSpeed strategy
            if hasattr(self.trainer.strategy, "deepspeed_engine"):
                if not self.save_weights_only:
                    self.trainer.strategy.deepspeed_engine.save_checkpoint(save_path, tag="deepspeed_ckpt")

                # Save a complete checkpoint (rank 0 only, inside save_path)
                if dist.get_rank() == 0:
                    basename = os.path.basename(save_path)
                    ckpt_path = os.path.join(save_path, f"{basename}.pt")
                    self.save_checkpoint(ckpt_path, save_info, self.save_weights_only)

            # For normal situation
            else:
                if dist.get_rank() == 0:
                    self.save_checkpoint(save_path, save_info, self.save_weights_only)

    def reset_metrics(self, stage) -> None:
        """
        Reset metrics for given stage
        Args:
            stage: "train", "valid" or "test"
        """
        for metric in self.metrics[stage].values():
            metric.reset()

    def get_log_dict(self, stage: str) -> dict:
        """
        Get log dict for the stage
        Args:
            stage: "train", "valid" or "test"

        Returns:
            A dictionary of metrics for the stage. Keys are metric names and values are metric values

        """
        return {name: metric.compute() for name, metric in self.metrics[stage].items()}

    def log_info(self, info: dict) -> None:
        """
        Record metrics during training and testing (rank 0 only)
        Args:
            info: dict of metrics
        """
        if getattr(self, "logger", None) is not None and dist.get_rank() == 0:
            info["learning_rate"] = self.lr_scheduler.get_last_lr()[0]
            info["epoch"] = self.epoch
            self.logger.log_metrics(info, step=self.step)

    def init_optimizers(self):
        """Build self.optimizer and self.lr_scheduler from the kwargs dicts."""
        copy_optimizer_kwargs = copy.deepcopy(self.optimizer_kwargs)

        # No decay for layer norm and bias
        no_decay = ['LayerNorm.weight', 'bias']
        weight_decay = copy_optimizer_kwargs.pop("weight_decay")

        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        # Resolve the optimizer class by name from torch.optim
        # NOTE(review): eval-based resolution; the class names come from
        # config — confirm configs are trusted.
        optimizer_cls = eval(f"torch.optim.{copy_optimizer_kwargs.pop('class')}")
        self.optimizer = optimizer_cls(optimizer_grouped_parameters,
                                       lr=self.lr_scheduler_kwargs['init_lr'],
                                       **copy_optimizer_kwargs)

        # The scheduler class is resolved by name, presumably from the
        # star-imported utils.lr_scheduler module
        tmp_kwargs = copy.deepcopy(self.lr_scheduler_kwargs)
        lr_scheduler = tmp_kwargs.pop("class")
        self.lr_scheduler = eval(lr_scheduler)(self.optimizer, **tmp_kwargs)

    def configure_optimizers(self):
        # Step the scheduler every optimizer step (not every epoch)
        return {"optimizer": self.optimizer,
                "lr_scheduler": {"scheduler": self.lr_scheduler,
                                 "interval": "step",
                                 "frequency": 1}
                }
model/model_interface.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import glob
4
+
5
+
6
+ # register all available models through *_model.py files
7
+ # def construct_model():
8
+ # model_dir = os.path.dirname(__file__)
9
+ #
10
+ # # lists all model files
11
+ # model_list = []
12
+ # for root, _, names in os.walk(model_dir):
13
+ # for name in names:
14
+ # if name.endswith('_model.py'):
15
+ # sub_dirs = root.replace(model_dir, '').split(os.sep)
16
+ # model_list.append((sub_dirs, name[:-3]))
17
+ #
18
+ # # load model_config.yaml, controlling which models to be loaded
19
+ # model_config = yaml.safe_load(open(f"{model_dir}/model_config.yaml", "r"))
20
+ #
21
+ # if model_config["verbose"]:
22
+ # print("*" * 30 + f" Loading model " + "*" * 30)
23
+ #
24
+ # # register models
25
+ # for sub_dirs, name in model_list:
26
+ # if name in model_config["models"]:
27
+ # if len(sub_dirs) > 1:
28
+ # cmd = f"from {'.'.join(sub_dirs)} import {name}"
29
+ # else:
30
+ # cmd = f"from . import {name}"
31
+ #
32
+ # exec(cmd)
33
+ #
34
+ # if model_config["verbose"]:
35
+ # info = f"Loaded model: {name}"
36
+ # print(f"\033[32m{info}\033[0m")
37
+ # else:
38
+ # if model_config["verbose"]:
39
+ # info = f"Skipped model: {name}"
40
+ # print(f"\033[31m{info}\033[0m")
41
+ #
42
+ # if model_config["verbose"]:
43
+ # print("*" * 75)
44
+ #
45
+ #
46
+ # # register function as a wrapper for all models
47
+ # def register_model(cls):
48
+ # model_dict[cls.__name__] = cls
49
+ # return cls
50
+ #
51
+ #
52
+ # model_dict = {}
53
+ # construct_model()
54
+ #
55
+ #
56
+ # class ModelInterface:
57
+ # @classmethod
58
+ # def get_available_models(cls):
59
+ # return model_dict.keys()
60
+ #
61
+ # @classmethod
62
+ # def init_model(cls, model: str, **kwargs):
63
+ # """
64
+ #
65
+ # Args:
66
+ # model : Class name of model you want to use. Must be in model_dict.keys()
67
+ # **kwargs: Kwargs for model initialization
68
+ #
69
+ # Returns: Corresponding model
70
+ #
71
+ # """
72
+ # assert model in model_dict.keys(), f"class {model} doesn't exist!"
73
+ # return model_dict[model](**kwargs)
74
+
75
+
76
+ ########################################################################
77
+ # Version 2 #
78
+ ########################################################################
79
+ # register function as a wrapper for all models
80
def register_model(cls):
    """Class decorator: record *cls* as the most recently defined model class."""
    global now_cls
    now_cls = cls
    return cls
84
+
85
+
86
# Module-level handle pointing at the most recently registered model class
# (written by the ``register_model`` decorator, read by ModelInterface).
now_cls = None
87
+
88
+
89
class ModelInterface:
    """Factory that instantiates a model class given the path of its .py file."""

    @classmethod
    def init_model(cls, model_py_path: str, **kwargs):
        """

        Args:
            model_py_path: Py file Path of model you want to use.
            **kwargs: Kwargs for model initialization

        Returns: Corresponding model
        """
        # Importing the model module triggers its @register_model decorator,
        # which stores the model class in the module-level ``now_cls``.
        # NOTE(review): the path is split on os.sep and exec'd as a relative
        # import — assumes a platform-native, package-relative path; confirm
        # callers never pass absolute or mixed-separator paths.
        sub_dirs = model_py_path.split(os.sep)
        cmd = f"from {'.' + '.'.join(sub_dirs[:-1])} import {sub_dirs[-1]}"
        exec(cmd)

        # ``now_cls`` was just set as a side effect of the import above
        return now_cls(**kwargs)
utils/constants.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+
3
+
4
# The 20 standard amino acids (one-letter codes, alphabetical order)
aa_list = list("ACDEFGHIKLMNPQRSTVWY")
aa_set = set(aa_list)

# Foldseek vocabularies: amino-acid characters and 3Di structure characters,
# each ending with "#" as the special placeholder/mask symbol
foldseek_seq_vocab = "ACDEFGHIKLMNPQRSTVWY#"
foldseek_struc_vocab = "pynwrqhgdlvtmfsaeikc#"

# Alphabet used to compose structure-token strings
struc_unit = "abcdefghijklmnopqrstuvwxyz"


def create_vocab(size: int) -> dict:
    """
    Build a structure-token vocabulary of ``size`` tokens plus a trailing
    "#" token.

    Tokens are fixed-length strings over ``struc_unit``; the length is the
    smallest one whose combinations can cover ``size`` tokens.

    Args:
        size: Number of regular tokens in the vocabulary

    Returns:
        Mapping of index -> token string, with index ``size`` mapped to "#".
    """
    # Smallest token length with len(struc_unit) ** token_len >= size
    token_len = 1
    while len(struc_unit) ** token_len < size:
        token_len += 1

    vocab = {}
    for idx, chars in enumerate(itertools.product(struc_unit, repeat=token_len)):
        vocab[idx] = "".join(chars)
        if len(vocab) == size:
            # Append the special "#" token right after the last regular one
            vocab[idx + 1] = "#"
            return vocab
33
+
34
# ProTrek
# UniProt annotation subsections that describe specific residues or regions
# of a sequence (positional annotations)
residue_level = {"Active site", "Binding site", "Site", "DNA binding", "Natural variant", "Mutagenesis",
                 "Transmembrane", "Topological domain", "Intramembrane", "Signal peptide", "Propeptide",
                 "Transit peptide",
                 "Chain", "Peptide", "Modified residue", "Lipidation", "Glycosylation", "Disulfide bond",
                 "Cross-link",
                 "Domain", "Repeat", "Compositional bias", "Region", "Coiled coil", "Motif"}

# UniProt annotation subsections that describe the protein as a whole
# (non-positional annotations)
sequence_level = {"Function", "Miscellaneous", "Caution", "Catalytic activity", "Cofactor", "Activity regulation",
                  "Biophysicochemical properties", "Pathway", "Involvement in disease", "Allergenic properties",
                  "Toxic dose", "Pharmaceutical use", "Disruption phenotype", "Subcellular location",
                  "Post-translational modification", "Subunit", "Domain (non-positional annotation)",
                  "Sequence similarities", "RNA Editing", "Tissue specificity", "Developmental stage", "Induction",
                  "Biotechnology", "Polymorphism", "GO annotation", "Proteomes", "Protein names", "Gene names",
                  "Organism", "Taxonomic lineage", "Virus host"}

# Subset of subsections whose values are free-form descriptive text
raw_text_level = {"Function", "Subunit", "Tissue specificity", "Disruption phenotype", "Post-translational modification",
                  "Induction", "Miscellaneous", "Sequence similarities", "Developmental stage",
                  "Domain (non-positional annotation)", "Activity regulation", "Caution", "Polymorphism", "Toxic dose",
                  "Allergenic properties", "Pharmaceutical use", "Cofactor", "Biophysicochemical properties",
                  "Subcellular location", "RNA Editing"}
utils/downloader.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
+ from utils.mpr import MultipleProcessRunner
5
+ from tqdm import tqdm
6
+
7
+
8
class Downloader(MultipleProcessRunner):
    """
    Download files that have a unified resource locator.

    Each item of ``data`` is an identifier substituted into both ``base_url``
    and ``save_path`` via ``str.format``.
    """

    def __init__(self, base_url, save_path, overwrite=False, skip_error_info=False, **kwargs):
        """

        Args:
            base_url: URL template with one '{}' placeholder for the identifier
            save_path: saving-path template with one '{}' placeholder
            overwrite: whether overwrite existing files
            skip_error_info: if True, suppress per-file download error messages
        """
        super().__init__(**kwargs)

        self.base_url = base_url
        self.save_path = save_path
        self.overwrite = overwrite
        self.skip_error_info = skip_error_info

        if not overwrite:
            # remove existing files in data
            self.data = [uniprot for uniprot in tqdm(self.data, desc="Filtering out existing files...")
                         if not os.path.exists(self.save_path.format(uniprot))]

    def _aggregate(self, final_path: str, sub_paths):
        # Files are written directly to their final location; nothing to merge.
        pass

    # FIX: renamed from `_target_static` so it actually overrides the abstract
    # `_target` hook that MultipleProcessRunner.run() invokes; with the old
    # name, run() fell through to the abstract method and raised
    # NotImplementedError.
    def _target(self, process_id, data, sub_path, *args):
        for i, uniprot in enumerate(data):
            url = self.base_url.format(uniprot)
            save_path = self.save_path.format(uniprot)

            # shell cmd to download files; on failure remove the (possibly
            # partial) output file and report the broken URL
            wget = f"wget -q -o /dev/null {url} -O {save_path}"

            rm = f"rm {save_path}"
            err = f"echo 'Error: {url} cannot be downloaded!'"
            if self.skip_error_info:
                err += ">/dev/null"

            os.system(f"{wget} || ({rm} && {err})")

            self.terminal_progress_bar(process_id, i + 1, len(data), f"Process{process_id} Downloading files...")

    # Backward-compatible alias for the previous hook name.
    _target_static = _target

    def run(self):
        """
        Run this function to download files
        """
        super().run()

    def __len__(self):
        return len(self.data)

    @staticmethod
    def clear_empty_files(path):
        """
        Remove zero-byte files (failed downloads) in ``path``.

        Returns:
            cnt: number of files removed
        """
        cnt = 0
        for file in tqdm(os.listdir(path), desc="Clearing empty files..."):
            if os.path.getsize(os.path.join(path, file)) == 0:
                os.remove(os.path.join(path, file))
                cnt += 1
        print(f"Removed {cnt} empty files")
        return cnt
72
+
73
+
74
class AlphaDBDownloader(Downloader):
    """
    Download predicted structure files from the AlphaFold2 database.
    """
    def __init__(self, uniprot_ids, type: str, save_dir: str, **kwargs):
        """

        Args:
            uniprot_ids: Uniprot ids
            type: Which type of files to download. Must be one of ['pdb', 'mmcif', 'plddt', "pae"]
            save_dir: Saving directory
            **kwargs:
        """

        # (url template, local file-name template) per download type
        templates = {
            "pdb": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.pdb", "{}.pdb"),
            "mmcif": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.cif", "{}.cif"),
            "plddt": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-confidence_v4.json", "{}.json"),
            "pae": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-predicted_aligned_error_v4.json", "{}.json"),
        }
        base_url, file_name = templates[type]

        super().__init__(data=uniprot_ids,
                         base_url=base_url,
                         save_path=os.path.join(save_dir, file_name),
                         **kwargs)
105
+
106
+
107
class PDBDownloader(Downloader):
    """
    Download structure files from PDB.
    """
    def __init__(self, pdb_ids, type: str, save_dir: str, **kwargs):
        """

        Args:
            pdb_ids: PDB ids
            type: Which type of files to download. Must be one of ['pdb', 'mmcif']
            save_dir: Saving directory
        """

        # (url template, local file-name template) per download type
        templates = {
            "pdb": ("https://files.rcsb.org/download/{}.pdb", "{}.pdb"),
            "mmcif": ("https://files.rcsb.org/download/{}.cif", "{}.cif"),
        }
        base_url, file_name = templates[type]

        super().__init__(data=pdb_ids,
                         base_url=base_url,
                         save_path=os.path.join(save_dir, file_name),
                         **kwargs)
134
+
135
+
136
class CATHDownloader(Downloader):
    def __init__(self, cath_ids, save_dir, **kwargs):
        """
        Download domain structure files from CATH.
        Args:
            cath_ids: CATH ids
            save_dir: Saving directory
        """

        super().__init__(
            data=cath_ids,
            base_url="http://www.cathdb.info/version/v4_3_0/api/rest/id/{}.pdb",
            save_path=os.path.join(save_dir, "{}.pdb"),
            **kwargs,
        )
149
+
150
+
151
def download_pdb(pdb_id: str, format: str, save_path: str):
    """
    Download a single structure file from PDB.
    Args:
        pdb_id: PDB id
        format: File format, must be one of ['pdb', 'cif']
        save_path: Saving path
    """

    url = f"https://files.rcsb.org/download/{pdb_id}.{format}"
    # Fetch quietly; if wget fails, drop the partial file and report the URL.
    fetch_cmd = f"wget -q -o /dev/null {url} -O {save_path}"
    cleanup_cmd = f"rm {save_path}"
    report_cmd = f"echo 'Error: {url} cannot be downloaded!'"
    os.system(f"{fetch_cmd} || ({cleanup_cmd} && {report_cmd})")
165
+
166
+
167
def download_af2(uniprot_id: str, format: str, save_path: str):
    """
    Download a single file from the AlphaFold2 database.
    Args:
        uniprot_id: Uniprot id
        format: File format, must be one of ['pdb', 'cif', 'plddt', 'pae']
        save_path: Saving path
    """

    # FIX: the format was previously appended directly to "...model_v4.",
    # which produced invalid URLs for 'plddt' and 'pae' (those live under
    # confidence_v4.json / predicted_aligned_error_v4.json — cf. the url_dict
    # in AlphaDBDownloader). 'pdb' and 'cif' URLs are unchanged.
    file_map = {
        "pdb": "model_v4.pdb",
        "cif": "model_v4.cif",
        "plddt": "confidence_v4.json",
        "pae": "predicted_aligned_error_v4.json",
    }
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-{file_map[format]}"
    wget = f"wget -q -o /dev/null {url} -O {save_path}"
    rm = f"rm {save_path}"
    err = f"echo 'Error: {url} cannot be downloaded!'"
    os.system(f"{wget} || ({rm} && {err})")
utils/foldseek_util.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import numpy as np
5
+ import re
6
+ import sys
7
+ sys.path.append(".")
8
+
9
+
10
+ # Get structural seqs from pdb file
11
# Get structural seqs from pdb file
def get_struc_seq(foldseek,
                  path,
                  chains: list = None,
                  process_id: int = 0,
                  plddt_mask: bool = False,
                  plddt_threshold: float = 70.,
                  foldseek_verbose: bool = False) -> dict:
    """
    Extract per-chain structural sequences from a structure file with foldseek.

    Args:
        foldseek: Binary executable file of foldseek

        path: Path to pdb file

        chains: Chains to be extracted from pdb file. If None, all chains will be extracted.

        process_id: Process ID for temporary files. This is used for parallel processing.

        plddt_mask: If True, mask regions with plddt < plddt_threshold. plddt scores are from the pdb file.

        plddt_threshold: Threshold for plddt. If plddt is lower than this value, the structure will be masked.

        foldseek_verbose: If True, foldseek will print verbose messages.

    Returns:
        seq_dict: A dict of structural seqs. The keys are chain IDs. The values are tuples of
                  (seq, struc_seq, combined_seq).
    """
    assert os.path.exists(foldseek), f"Foldseek not found: {foldseek}"
    assert os.path.exists(path), f"PDB file not found: {path}"

    # Unique temp name so concurrent processes don't clobber each other
    tmp_save_path = f"get_struc_seq_{process_id}_{time.time()}.tsv"
    if foldseek_verbose:
        cmd = f"{foldseek} structureto3didescriptor --threads 1 --chain-name-mode 1 {path} {tmp_save_path}"
    else:
        cmd = f"{foldseek} structureto3didescriptor -v 0 --threads 1 --chain-name-mode 1 {path} {tmp_save_path}"
    os.system(cmd)

    # FIX: plddt extraction is loop-invariant; it was previously re-parsed
    # from the pdb file for every output line.
    plddts = extract_plddt(path) if plddt_mask else None

    seq_dict = {}
    name = os.path.basename(path)
    try:
        with open(tmp_save_path, "r") as r:
            for i, line in enumerate(r):
                desc, seq, struc_seq = line.split("\t")[:3]

                # Mask low plddt
                if plddt_mask:
                    # NOTE(review): plddts covers the whole file, so this only
                    # holds for single-chain structures (e.g. AlphaFold models)
                    assert len(plddts) == len(struc_seq), f"Length mismatch: {len(plddts)} != {len(struc_seq)}"

                    # Mask regions with plddt < threshold
                    indices = np.where(plddts < plddt_threshold)[0]
                    np_seq = np.array(list(struc_seq))
                    np_seq[indices] = "#"
                    struc_seq = "".join(np_seq)

                # foldseek prefixes the chain with the file name, e.g. "file.pdb_A"
                name_chain = desc.split(" ")[0]
                chain = name_chain.replace(name, "").split("_")[-1]

                if chains is None or chain in chains:
                    if chain not in seq_dict:
                        # Interleave amino acid and lower-cased 3Di letters
                        combined_seq = "".join([a + b.lower() for a, b in zip(seq, struc_seq)])
                        seq_dict[chain] = (seq, struc_seq, combined_seq)
    finally:
        # Always clean up foldseek's temporary output, even if parsing fails
        if os.path.exists(tmp_save_path):
            os.remove(tmp_save_path)
        if os.path.exists(tmp_save_path + ".dbtype"):
            os.remove(tmp_save_path + ".dbtype")

    return seq_dict
77
+
78
+
79
def extract_plddt(pdb_path: str) -> np.ndarray:
    """
    Extract plddt scores from pdb file.

    Per-residue plddt is the mean of the values found in the second-to-last
    whitespace field of each ATOM record (the B-factor column in
    AlphaFold-style files). NOTE(review): this whitespace-based parsing is
    tailored to AlphaFold output — confirm before using on other PDB sources,
    where fixed-width columns can fuse differently.

    Args:
        pdb_path: Path to pdb file.

    Returns:
        plddts: plddt scores.
    """
    with open(pdb_path, "r") as r:
        # residue position -> list of per-atom plddt values
        plddt_dict = {}
        for line in r:
            # Collapse runs of spaces so records can be split on single blanks
            line = re.sub(' +', ' ', line).strip()
            splits = line.split(" ")

            if splits[0] == "ATOM":
                # If position < 1000
                if len(splits[4]) == 1:
                    pos = int(splits[5])

                # If position >= 1000, the blank will be removed, e.g. "A 999" -> "A1000"
                # So the length of splits[4] is not 1
                else:
                    pos = int(splits[4][1:])

                # assumes plddt sits in the second-to-last field — TODO confirm
                plddt = float(splits[-2])

                if pos not in plddt_dict:
                    plddt_dict[pos] = [plddt]
                else:
                    plddt_dict[pos].append(plddt)

        # Average atom-level values into one score per residue position
        plddts = np.array([np.mean(v) for v in plddt_dict.values()])
        return plddts
113
+
114
+
115
if __name__ == '__main__':
    # Smoke test with hard-coded local paths.
    foldseek = "/sujin/bin/foldseek"
    # test_path = "/sujin/Datasets/PDB/all/6xtd.cif"
    test_path = "/sujin/Datasets/FLIP/meltome/af2_structures/A0A061ACX4.pdb"
    # FIX: get_struc_seq has no `plddt_path` parameter (plddt values are read
    # from the pdb file itself), so the old call raised a TypeError. Enable
    # masking via `plddt_mask` instead.
    res = get_struc_seq(foldseek, test_path, plddt_mask=True, plddt_threshold=70.)
    print(res["A"][1].lower())
utils/lr_scheduler.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ from torch.optim.lr_scheduler import _LRScheduler, CosineAnnealingLR
4
+
5
+
6
class ConstantLRScheduler(_LRScheduler):
    def __init__(self,
                 optimizer,
                 last_epoch: int = -1,
                 verbose: bool = False,
                 init_lr: float = 0.,
                 ):
        """
        Scheduler that keeps the learning rate fixed at ``init_lr``.
        Args:
            optimizer: Optimizer

            last_epoch: The index of last epoch. Default: -1

            verbose: If ``True``, prints a message to stdout for each update. Default: ``False``

            init_lr: Initial learning rate
        """

        self.init_lr = init_lr
        super().__init__(optimizer, last_epoch, verbose)

    def state_dict(self):
        # Exclude the optimizer reference so the state is serializable
        return {key: value for key, value in self.__dict__.items() if key != "optimizer"}

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)

    def get_lr(self):
        if not self._get_lr_called_within_step:
            raise RuntimeError(
                "To get the last learning rate computed by the scheduler, use "
                "get_last_lr()"
            )

        # Same constant rate for every param group
        return [self.init_lr] * len(self.optimizer.param_groups)
43
+
44
+
45
class CosineAnnealingLRScheduler(_LRScheduler):
    def __init__(self,
                 optimizer,
                 last_epoch: int = -1,
                 verbose: bool = False,
                 init_lr: float = 0.,
                 max_lr: float = 4e-4,
                 final_lr: float = 4e-5,
                 warmup_steps: int = 2000,
                 cosine_steps: int = 10000,
                 ):
        """
        This is an implementation of cosine annealing learning rate scheduler.
        Args:
            optimizer: Optimizer

            last_epoch: The index of last epoch. Default: -1

            verbose: If ``True``, prints a message to stdout for each update. Default: ``False``

            init_lr: Initial learning rate

            max_lr: Maximum learning rate after warmup

            final_lr: Final learning rate after decay

            warmup_steps: Number of steps for warmup

            cosine_steps: Number of steps for cosine annealing
        """

        self.init_lr = init_lr
        self.max_lr = max_lr
        self.final_lr = final_lr
        self.warmup_steps = warmup_steps
        self.cosine_steps = cosine_steps
        super(CosineAnnealingLRScheduler, self).__init__(optimizer, last_epoch, verbose)

    def state_dict(self):
        # Exclude the optimizer reference so the state is serializable
        state_dict = {k: v for k, v in self.__dict__.items() if k not in ["optimizer"]}
        return state_dict

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)

    def get_lr(self):
        if not self._get_lr_called_within_step:
            raise RuntimeError(
                "To get the last learning rate computed by the scheduler, use "
                "get_last_lr()"
            )

        step_no = self.last_epoch

        # Strict '<' avoids a 0/0 when warmup_steps == 0; at
        # step_no == warmup_steps the cosine branch also yields exactly max_lr
        # (cos(0) == 1), so boundary values are unchanged.
        if step_no < self.warmup_steps:
            # Linear warmup from init_lr to max_lr
            lr = self.init_lr + step_no / self.warmup_steps * (self.max_lr - self.init_lr)

        else:
            # Cosine annealing from max_lr down to final_lr.
            # FIX: clamp progress at 1.0 — previously the cosine kept evolving
            # past `cosine_steps`, so the lr climbed back towards max_lr
            # instead of settling at final_lr.
            progress = min((step_no - self.warmup_steps) / self.cosine_steps, 1.0)
            lr = self.final_lr + 0.5 * (self.max_lr - self.final_lr) \
                 * (1 + math.cos(math.pi * progress))

        return [lr for group in self.optimizer.param_groups]
107
+
108
+
109
class Esm2LRScheduler(_LRScheduler):
    def __init__(self,
                 optimizer,
                 last_epoch: int = -1,
                 verbose: bool = False,
                 init_lr: float = 0.,
                 max_lr: float = 4e-4,
                 final_lr: float = 4e-5,
                 warmup_steps: int = 2000,
                 start_decay_after_n_steps: int = 500000,
                 end_decay_after_n_steps: int = 5000000,
                 on_use: bool = True,
                 ):
        """
        ESM2's learning rate schedule: linear warmup to ``max_lr``, a plateau,
        then linear decay to ``final_lr``.
        Args:
            optimizer: Optimizer

            last_epoch: The index of last epoch. Default: -1

            verbose: If ``True``, prints a message to stdout for each update. Default: ``False``

            init_lr: Initial learning rate

            max_lr: Maximum learning rate after warmup

            final_lr: Final learning rate after decay

            warmup_steps: Number of steps for warmup

            start_decay_after_n_steps: Start decay after this number of steps

            end_decay_after_n_steps: End decay after this number of steps

            on_use: Whether to use this scheduler. If ``False``, the scheduler will not change the learning rate
                    and will only use the ``init_lr``. Default: ``True``
        """

        self.init_lr = init_lr
        self.max_lr = max_lr
        self.final_lr = final_lr
        self.warmup_steps = warmup_steps
        self.start_decay_after_n_steps = start_decay_after_n_steps
        self.end_decay_after_n_steps = end_decay_after_n_steps
        self.on_use = on_use
        super(Esm2LRScheduler, self).__init__(optimizer, last_epoch, verbose)

    def state_dict(self):
        # Exclude the optimizer reference so the state is serializable
        return {key: value for key, value in self.__dict__.items() if key != "optimizer"}

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)

    def get_lr(self):
        if not self._get_lr_called_within_step:
            raise RuntimeError(
                "To get the last learning rate computed by the scheduler, use "
                "get_last_lr()"
            )

        # Disabled: fall back to the optimizer's base learning rates
        if not self.on_use:
            return list(self.base_lrs)

        step_no = self.last_epoch
        n_groups = len(self.optimizer.param_groups)

        # Phase 1: linear warmup from init_lr to max_lr
        if step_no <= self.warmup_steps:
            rate = self.init_lr + step_no / self.warmup_steps * (self.max_lr - self.init_lr)
            return [rate] * n_groups

        # Phase 2: plateau at max_lr
        if step_no <= self.start_decay_after_n_steps:
            return [self.max_lr] * n_groups

        # Phase 3: linear decay towards final_lr
        if step_no <= self.end_decay_after_n_steps:
            decay_span = self.end_decay_after_n_steps - self.start_decay_after_n_steps
            portion = (step_no - self.start_decay_after_n_steps) / decay_span
            return [self.max_lr - portion * (self.max_lr - self.final_lr)] * n_groups

        # Phase 4: hold at final_lr
        return [self.final_lr] * n_groups
utils/mpr.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import os
3
+ import time
4
+ import sys
5
+
6
+
7
+ from tqdm import tqdm
8
+ from math import ceil
9
+
10
+
11
class MultipleProcessRunner:
    """
    Abstract class for running tasks with multiple processes.
    There are three abstract methods that should be implemented:
        1. __len__()    : return the length of data
        2. _target()    : target function for each process
        3. _aggregate() : aggregate results from each process
    """

    def __init__(self,
                 data,
                 save_path=None,
                 n_process=1,
                 verbose=True,
                 total_only=True,
                 log_step=1,
                 start_method='fork'):
        """
        Args:
            data      : data to be processed that can be sliced

            save_path : final output path (None if nothing is written to disk)

            n_process : number of process

            verbose   : if True, display progress bar

            total_only: If True, only total progress bar is displayed

            log_step  : For total progress bar, Next log will be printed when
                        ``current iteration`` - ``last log iteration`` >= log_step

            start_method: start method for multiprocessing
        """
        self.data = data
        self.save_path = save_path
        self.n_process = n_process
        self.verbose = verbose
        self.total_only = total_only
        self.log_step = log_step
        self.start_method = start_method

        # get terminal width to format output; fall back to None when no
        # terminal is attached (e.g. output piped to a file)
        try:
            self.terminal_y = os.get_terminal_size()[0]

        except Exception as e:
            print(e)
            print("Can't get terminal size, set terminal_y = None")
            self.terminal_y = None

    def _s2hms(self, seconds: float):
        """
        Convert seconds into "hh:mm:ss" format.
        """
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)

        return "%02d:%02d:%02d" % (h, m, s)

    def _display_time(self, st_time, now, total):
        # Elapsed / estimated-remaining time plus iteration rate
        ed_time = time.time()
        running_time = ed_time - st_time
        rest_time = running_time * (total - now) / now
        iter_sec = f"{now / running_time:.2f}it/s" if now > running_time else f"{running_time / now:.2f}s/it"

        return f' [{self._s2hms(running_time)} < {self._s2hms(rest_time)}, {iter_sec}]'

    def _display_bar(self, now, total, length):
        # Render "[####____]" with `length` slots; `now` is clamped to `total`
        now = now if now <= total else total
        num = now * length // total
        progress_bar = '[' + '#' * num + '_' * (length - num) + ']'
        return progress_bar

    def _display_all(self, now, total, desc, st_time):
        # make a progress bar
        length = 50
        progress_bar = self._display_bar(now, total, length)
        time_display = self._display_time(st_time, now, total)

        display = f'{desc}{progress_bar} {int(now / total * 100):02d}% {now}/{total}{time_display}'

        # Clean a line: pad to terminal width, or shrink the bar to fit
        width = self.terminal_y if self.terminal_y is not None else 100
        num_space = width - len(display)
        if num_space > 0:
            display += ' ' * num_space
        else:
            length += num_space
            progress_bar = self._display_bar(now, total, length)
            display = f'{desc}{progress_bar} {int(now / total * 100):02d}% {now}/{total}{time_display}'

        # Set color (red via ANSI escape)
        display = f"\033[31m{display}\033[0m"

        return display

    # Print progress bar at specific position in terminal
    def terminal_progress_bar(self,
                              process_id: int,
                              now: int,
                              total: int,
                              desc: str = ''):
        """
        Args:
            process_id: process id
            now: now iteration number
            total: total iteration number
            desc: description
        """
        st_time = self.process_st_time[process_id]

        # Aggregate total information
        self.counts[process_id] = now
        self._total_display(self.process_st_time["total"])

        if not self.total_only:
            process_display = self._display_all(now, total, desc, st_time)
            if self.terminal_y is not None:
                # \x1b7 / \x1b8 save and restore the cursor around the write
                sys.stdout.write(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8")
                sys.stdout.flush()
            else:
                print(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8", flush=True)

    # Print global information
    def _total_display(self, st_time):
        # total_display_callable acts as a best-effort lock so only one
        # process writes the total line at a time
        if self.total_display_callable.value == 1:
            self.total_display_callable.value = 0

            cnt = sum([self.counts[i] for i in range(self.n_process)])
            if cnt - self.last_cnt.value >= self.log_step:
                total_display = self._display_all(cnt, self.__len__(), f"Total: ", st_time)
                self.last_cnt.value = cnt

                x = self.n_process + 1 if not self.total_only else 0
                # if self.terminal_y is not None:
                #     sys.stdout.write(f"\x1b7\x1b[{x};{0}f{total_display}\x1b8")
                #     sys.stdout.flush()
                # else:
                #     print(f"\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True)
                print(f"\r\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True, end="")

            self.total_display_callable.value = 1

    def run(self):
        """
        The function is used to run a multi-process task
        Returns: return the result of function '_aggregate()'
        """

        import multiprocess as mp
        mp.set_start_method(self.start_method, force=True)

        # total number of data that is already processed
        self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})

        # record start time for each process
        self.process_st_time = {"total": time.time()}

        # set a lock to call total number display
        self.total_display_callable = mp.Value('d', 1)

        # Save last log iteration number
        self.last_cnt = mp.Value('d', 0)

        num_per_process = ceil(self.__len__() / self.n_process)

        if self.save_path is not None:
            file_name, suffix = os.path.splitext(self.save_path)

        process_list = []
        sub_paths = []
        for i in range(self.n_process):
            st = i * num_per_process
            ed = st + num_per_process

            # construct slice and sub path for sub process
            data_slice = self.data[st: ed]

            sub_path = None
            # Create a directory to save sub-results
            if self.save_path is not None:
                save_dir = f"{file_name}{suffix}_temp"
                os.makedirs(save_dir, exist_ok=True)
                sub_path = f"{save_dir}/temp_{i}{suffix}"

            # construct sub process
            input_args = (i, data_slice, sub_path)
            self.process_st_time[i] = time.time()
            p = mp.Process(target=self._target, args=input_args)
            p.start()

            process_list.append(p)
            sub_paths.append(sub_path)

        for p in process_list:
            p.join()

        # aggregate results and remove temporary directory
        results = self._aggregate(self.save_path, sub_paths)
        if self.save_path is not None:
            save_dir = f"{file_name}{suffix}_temp"
            os.rmdir(save_dir)

        return results

    def parallel_run(self):
        """
        Same as run(), but dispatches the work through joblib instead of
        spawning raw processes.
        """
        import multiprocess as mp
        from joblib import Parallel, delayed

        # total number of data that is already processed
        self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})

        # record start time for each process
        self.process_st_time = {"total": time.time()}

        # set a lock to call total number display
        self.total_display_callable = mp.Value('d', 1)

        # Save last log iteration number
        self.last_cnt = mp.Value('d', 0)

        num_per_process = ceil(self.__len__() / self.n_process)

        if self.save_path is not None:
            file_name, suffix = os.path.splitext(self.save_path)

        sub_paths = []
        input_arg_list = []
        for i in range(self.n_process):
            st = i * num_per_process
            ed = st + num_per_process

            # construct slice and sub path for sub process
            data_slice = self.data[st: ed]

            sub_path = None
            # Create a directory to save sub-results
            if self.save_path is not None:
                save_dir = f"{file_name}{suffix}_temp"
                os.makedirs(save_dir, exist_ok=True)
                sub_path = f"{save_dir}/temp_{i}{suffix}"

            # construct sub process
            input_args = (i, data_slice, sub_path)
            self.process_st_time[i] = time.time()

            sub_paths.append(sub_path)
            input_arg_list.append(input_args)

        # Start parallel processing.
        # FIX: unpack the argument tuple — previously the whole tuple was
        # passed as a single positional argument, so _target(process_id,
        # data, sub_path) raised a TypeError.
        Parallel(n_jobs=self.n_process)(delayed(self._target)(*input_args) for input_args in input_arg_list)

        # aggregate results and remove temporary directory
        results = self._aggregate(self.save_path, sub_paths)
        if self.save_path is not None:
            save_dir = f"{file_name}{suffix}_temp"
            os.rmdir(save_dir)

        return results

    @abc.abstractmethod
    def _aggregate(self, final_path: str, sub_paths):
        """
        This function is used to aggregate results from sub processes into a file

        Args:
            final_path: path to save final results
            sub_paths : list of sub paths

        Returns: None or desirable results specified by user
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _target(self, process_id, data, sub_path):
        """
        The main body to operate data in one process

        Args:
            process_id: process id
            data      : data slice
            sub_path  : sub path to save results
        """
        raise NotImplementedError

    @abc.abstractmethod
    def __len__(self):
        raise NotImplementedError
+
306
+
307
class MultipleProcessRunnerSimplifier(MultipleProcessRunner):
    """
    A simplified version of MultipleProcessRunner.

    The user only implements the callable ``do``; it is invoked automatically
    for every item once ``run`` is called. If ``save_path`` is specified, each
    process opens a writer on its own sub path, results are written there, and
    everything is aggregated into ``save_path`` at the end.

    The procedure is equivalent to:
        ...
        with open(sub_path, 'w') as w:
            for i, d in enumerate(data):
                self.do(process_id, i, d, w)  # You can write results into the file.
        ...

    The 'do' callable has the signature:
        def do(process_id, idx, data, writer):
            ...

    If 'save_path' is None, the argument 'writer' will be set to None.
    """

    def __init__(self,
                 data,
                 do,
                 save_path=None,
                 n_process=1,
                 verbose=True,
                 total_only=True,
                 log_step=1,
                 return_results=False,
                 start_method='fork'):

        super().__init__(data=data,
                         save_path=save_path,
                         n_process=n_process,
                         verbose=verbose,
                         total_only=total_only,
                         log_step=log_step,
                         start_method=start_method)
        self.do = do
        self.return_results = return_results

    def run(self):
        # Timestamp keys the per-process temp file names used when results
        # must be returned without a save_path.
        self.start_time = time.time()
        return super().run()

    def _tmp_path(self, idx):
        # Temp file shared between _target (writer) and _aggregate (reader)
        return f"MultipleProcessRunnerSimplifier_{self.start_time}_{idx}.tmp"

    def _aggregate(self, final_path: str, sub_paths):
        collected = []

        writer = open(final_path, 'w') if final_path is not None else None

        iterator = enumerate(sub_paths)
        if self.verbose:
            iterator = tqdm(iterator, "Aggregating results...")

        for idx, sub_path in iterator:
            if sub_path is None and self.return_results:
                sub_path = self._tmp_path(idx)

            if sub_path is None:
                continue

            with open(sub_path, 'r') as reader:
                for line in reader:
                    if writer is not None:
                        writer.write(line)

                    if self.return_results:
                        # Drop the trailing newline
                        collected.append(line[:-1])

            os.remove(sub_path)

        return collected

    def _target(self, process_id, data, sub_path):
        if sub_path is None and self.return_results:
            sub_path = self._tmp_path(process_id)

        writer = open(sub_path, 'w') if sub_path is not None else None
        for idx, item in enumerate(data):
            self.do(process_id, idx, item, writer)
            if self.verbose:
                self.terminal_progress_bar(process_id, idx + 1, len(data), f"Process{process_id} running...")

        if writer is not None:
            writer.close()

    def __len__(self):
        return len(self.data)
397
+