Spaces:

VaultChem
/

molvault

Running

App Files Community

VaultChem committed on Feb 2, 2024

Commit

b9fbcbc

verified ·

1 Parent(s): 80007ba

updated app format

Browse files

Files changed (2) hide show

app.py +6 -6
chemdata.py +2 -55

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ from rdkit.Chem.Draw import rdMolDraw2D
 import pandas as pd
 from st_keyup import st_keyup
-st.set_page_config(layout="centered", page_title="VaultChem")
 def local_css(file_name):
@@ -60,7 +60,7 @@ formatted_text = (
     "<h1 style='text-align: center;'>"
     "<span style='color: red;'>Pharmacokinetics</span>"
     "<span style='color: black;'> of </span>"
-    "<span style='color: blue;'>🤫confidential🤫</span>"
     "<span style='color: black;'> molecules</span>"
     "</h1>"
 )
@@ -82,9 +82,9 @@ The server on which the prediction is computed will never see the molecule in cl
 Why is this **magic**? Because this is equivalent to computing the prediction on the molecule in clear text, but without sharing the molecule with the server.
 Even if organization "B" - or in fact any other party - would try to steal the data, they would only see the encrypted molecular data.
 **Only the party that has the private key (organization "A") can decrypt the prediction**. This is possible using a method called "Fully Homomorphic Encryption" (FHE).
-This special encryption scheme allows to perform computations on encrypted data.
-We use the open-source library <a href="https://docs.zama.ai/concrete-ml" target="_blank">Concrete ML</a> to develop safe and robust encryption technology.
 The code used for the FHE prediction is available in the open-source library
 \n
@@ -103,7 +103,7 @@ st.divider()
 st.markdown(
     "<p style='text-align: center; color: grey;'>"
-    + img_to_html("scheme2.png", width="80%")
     + "</p>",
     unsafe_allow_html=True,
 )
@@ -652,7 +652,7 @@ if __name__ == "__main__":
 st.markdown(
     """
     <div style="width: 100%; text-align: center; padding: 10px;">
-        The app was built with <a href="https://docs.zama.ai/concrete-ml" target="_blank">Concrete ML</a>,
         an open-source library by <a href="https://www.zama.ai/" target="_blank">Zama</a>.
     </div>
     """,

 import pandas as pd
 from st_keyup import st_keyup
+st.set_page_config(layout="wide", page_title="VaultChem")
 def local_css(file_name):
     "<h1 style='text-align: center;'>"
     "<span style='color: red;'>Pharmacokinetics</span>"
     "<span style='color: black;'> of </span>"
+    "<span style='color: blue;'>🤫confidential</span>"
     "<span style='color: black;'> molecules</span>"
     "</h1>"
 )
 Why is this **magic**? Because this is equivalent to computing the prediction on the molecule in clear text, but without sharing the molecule with the server.
 Even if organization "B" - or in fact any other party - would try to steal the data, they would only see the encrypted molecular data.
 **Only the party that has the private key (organization "A") can decrypt the prediction**. This is possible using a method called "Fully Homomorphic Encryption" (FHE).
+This special encryption scheme allows to perform computations on encrypted data, to learn more about FHE, click [here](https://fhe.org/resources/).
+We use the open-source library <a href="https://github.com/zama-ai/concrete-ml" target="_blank">Concrete-ML</a> to develop safe and robust encryption technology.
 The code used for the FHE prediction is available in the open-source library
 \n
 st.markdown(
     "<p style='text-align: center; color: grey;'>"
+    + img_to_html("scheme2.png", width="65%")
     + "</p>",
     unsafe_allow_html=True,
 )
 st.markdown(
     """
     <div style="width: 100%; text-align: center; padding: 10px;">
+        The app was built with <a href="https://github.com/zama-ai/concrete-ml" target="_blank">Concrete-ML</a>,
         an open-source library by <a href="https://www.zama.ai/" target="_blank">Zama</a>.
     </div>
     """,

chemdata.py CHANGED Viewed

@@ -153,60 +153,6 @@ def compute_descriptors_from_smiles_list(SMILES):
     return np.array(X)
-class ProcessToxChemData:
-    def __init__(self, bits=256):
-        self.bits = int(bits)
-        if not os.path.exists("data"):
-            os.makedirs("data")
-        self.save_file = "data/" + "save_file_Tox" + str(self.bits) + ".pkl"
-        if os.path.exists(self.save_file):
-            with open(self.save_file, "rb") as file:
-                self.adjusted_valid_entries_per_task = pickle.load(file)
-        else:
-            url = "https://github.com/deepchem/deepchem/blob/master/datasets/tox21.csv.gz?raw=true"
-            response = requests.get(url)
-            content = gzip.decompress(response.content)
-            self.df = pd.read_csv(BytesIO(content))
-            self.process()
-            self.save_adjusted_data()
-    def process(self):
-        self.adjusted_valid_entries_per_task = {}
-        # Iterating through each task column and extracting valid entries
-        for task in self.df.columns[
-            :-2
-        ]:  # Excluding mol_id and smiles from the iteration
-            valid_entries = self.df.dropna(subset=[task])[["mol_id", "smiles", task]]
-            valid_entries["fps"] = valid_entries["smiles"].apply(
-                lambda x: generate_fingerprint(x, radius=2, bits=self.bits)
-            )
-            valid_entries = valid_entries.dropna(subset=["fps"])
-            valid_entries["descriptors"] = valid_entries["smiles"].apply(
-                lambda x: compute_descriptors_from_smiles_list([x])[0]
-            )
-            valid_entries = valid_entries.dropna(subset=["descriptors"])
-            # Shuffle the rows
-            valid_entries = valid_entries.sample(frac=1, random_state=42).reset_index(
-                drop=True
-            )
-            self.adjusted_valid_entries_per_task[task] = valid_entries
-            self.adjusted_valid_entries_per_task[
-                task
-            ] = self.adjusted_valid_entries_per_task[task].rename(columns={task: "y"})
-    def save_adjusted_data(self):
-        with open(self.save_file, "wb") as file:
-            pickle.dump(self.adjusted_valid_entries_per_task, file)
-    def get_X_y(self, task):
-        X = np.float_(np.stack(self.adjusted_valid_entries_per_task[task].fps.values))
-        y = self.adjusted_valid_entries_per_task[task].y.values.astype(int)
-        return X, y
 class ProcessADMEChemData:
     def __init__(self, bits=512, radius=2):
         self.bits = int(bits)
@@ -291,7 +237,8 @@ def load_ADME_data(task, bits=256, radius=2):
     """
     data = ProcessADMEChemData(bits=bits, radius=radius)
     X, y = data.get_X_y(task)
-    return train_test_split(X, y, test_size=0.2, random_state=42)
 class ProcessGenericChemData:

     return np.array(X)
 class ProcessADMEChemData:
     def __init__(self, bits=512, radius=2):
         self.bits = int(bits)
     """
     data = ProcessADMEChemData(bits=bits, radius=radius)
     X, y = data.get_X_y(task)
+    SMILES = data.adjusted_valid_entries_per_task[task]["smiles"].values
+    return train_test_split(SMILES,X, y, test_size=0.2, random_state=42)
 class ProcessGenericChemData: