Spaces:

ChemFM
/

molecular_property_prediction

Sleeping

App Files Files Community

feiyang-cai committed on Oct 18, 2024

Commit

8b9fe11

1 Parent(s): c6866a7

add more detailed time calculation

Browse files

Files changed (1) hide show

utils.py +44 -42

utils.py CHANGED Viewed

@@ -132,33 +132,32 @@ class DataCollator(object):
         return self.sme.augment([molecule])[0]
     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-        sources = []
-        targets = []
-        for example in instances:
-            smiles = example['smiles'].strip()
-            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
-            # get the properties except the smiles and mol_id cols
-            #props = [example[col] if example[col] is not None else np.nan for col in sorted(example.keys()) if col not in ['smiles', 'is_aug']]
-            source = f"{self.molecule_start_str}{smiles}{self.end_str}"
-            sources.append(source)
-        # Tokenize
-        tokenized_sources_with_prompt = self.tokenizer(
-            sources,
-            max_length=self.source_max_len,
-            truncation=True,
-            add_special_tokens=False,
-        )
-        input_ids = [torch.tensor(tokenized_source) for tokenized_source in tokenized_sources_with_prompt['input_ids']]
-        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
-        data_dict = {
-            'input_ids': input_ids,
-            'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
-        }
         return data_dict
@@ -259,13 +258,14 @@ class MolecularPropertyPredictionModel():
     def predict(self, valid_df, task_type):
         with calculateDuration("predicting"):
-            test_dataset = Dataset.from_pandas(valid_df)
-            # construct the dataloader
-            test_loader = torch.utils.data.DataLoader(
-                test_dataset,
-                batch_size=16,
-                collate_fn=self.data_collator,
-            )
             # predict
             y_pred = []
@@ -302,17 +302,19 @@ class MolecularPropertyPredictionModel():
         with calculateDuration("predicting a file"):
             # we should add the index first
             df = df.reset_index()
-            # we need to check the SMILES strings are valid, the invalid ones will be moved to the last
-            valid_idx = []
-            invalid_idx = []
-            for idx, smiles in enumerate(df['smiles']):
-                if Chem.MolFromSmiles(smiles):
-                    valid_idx.append(idx)
-                else:
-                    invalid_idx.append(idx)
-            valid_df = df.loc[valid_idx]
-            # get the smiles list
-            valid_df_smiles = valid_df['smiles'].tolist()
             input_df = pd.DataFrame(valid_df_smiles, columns=['smiles'])
             results = self.predict(input_df, task_type)

         return self.sme.augment([molecule])[0]
     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        with calculateDuration("DataCollator"):
+            sources = []
+            for example in instances:
+                smiles = example['smiles'].strip()
+                smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
+                # get the properties except the smiles and mol_id cols
+                #props = [example[col] if example[col] is not None else np.nan for col in sorted(example.keys()) if col not in ['smiles', 'is_aug']]
+                source = f"{self.molecule_start_str}{smiles}{self.end_str}"
+                sources.append(source)
+            # Tokenize
+            tokenized_sources_with_prompt = self.tokenizer(
+                sources,
+                max_length=self.source_max_len,
+                truncation=True,
+                add_special_tokens=False,
+            )
+            input_ids = [torch.tensor(tokenized_source) for tokenized_source in tokenized_sources_with_prompt['input_ids']]
+            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+            data_dict = {
+                'input_ids': input_ids,
+                'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
+            }
         return data_dict
     def predict(self, valid_df, task_type):
         with calculateDuration("predicting"):
+            with calculateDuration("construct dataloader"):
+                test_dataset = Dataset.from_pandas(valid_df)
+                # construct the dataloader
+                test_loader = torch.utils.data.DataLoader(
+                    test_dataset,
+                    batch_size=16,
+                    collate_fn=self.data_collator,
+                )
             # predict
             y_pred = []
         with calculateDuration("predicting a file"):
             # we should add the index first
             df = df.reset_index()
+            with calculateDuration("pre-checking SMILES"):
+                # we need to check the SMILES strings are valid, the invalid ones will be moved to the last
+                valid_idx = []
+                invalid_idx = []
+                for idx, smiles in enumerate(df['smiles']):
+                    if Chem.MolFromSmiles(smiles):
+                        valid_idx.append(idx)
+                    else:
+                        invalid_idx.append(idx)
+                valid_df = df.loc[valid_idx]
+                # get the smiles list
+                valid_df_smiles = valid_df['smiles'].tolist()
             input_df = pd.DataFrame(valid_df_smiles, columns=['smiles'])
             results = self.predict(input_df, task_type)