Spaces:

ronaldahmed
/

ccl_win

Sleeping

App Files Files Community

ronaldahmed committed on Jan 12, 2024

Commit

73342a7

verified ·

1 Parent(s): 65798f3

Update ccl_win.py

Browse files

Files changed (1) hide show

ccl_win.py +101 -66

ccl_win.py CHANGED Viewed

@@ -64,34 +64,46 @@ Examples:
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 WINDOW_SIZE = 3
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class ccl_win(evaluate.Measurement):
-    """TODO: Short description of my evaluation module."""
-    def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
-        return evaluate.MeasurementInfo(
-            # This is the description that will appear on the modules page.
-            module_type="measurement",
-            description=_DESCRIPTION,
-            citation=_CITATION,
-            inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('string'),
-            }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
-        )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
     def preprocess_adjacent_window(self,preds):
         pred_list = []
@@ -114,37 +126,7 @@ class ccl_win(evaluate.Measurement):
         return pred_list,lens
-    def _compute(self, predictions, dataset="arxiv", batch_size: int = 16, device=None, use_aggregator=True):
-        """Returns the scores"""
-        MODEL_CACHE_DIR = "/home/rcardena/.cache/huggingface/"
-        BASEDIR = "/bask/projects/j/jlxi8926-auto-sum/rcardenas/tools/ccl_win"
-        if getpass.getuser() == "s1987051":
-            MODEL_CACHE_DIR="/disk/ocean/rcardenas/tools/huggingface/"
-        elif getpass.getuser() == "rcardena":
-            MODEL_CACHE_DIR="/gfs/team/nlp/users/rcardena/tools/huggingface/"
-        elif getpass.getuser() == "gvhr8913":
-            MODEL_CACHE_DIR="/bask/projects/j/jlxi8926-auto-sum/rcardenas/cache"
-        if device is not None:
-            # assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
-            if device == "gpu":
-                device = "cuda"
-        else:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-        results = []
-        sent_lens = [len(x.split("\n")) for x in predictions]
-        aggregator = None
-        if use_aggregator:
-            np.random.seed(42)
-            aggregator = scoring.BootstrapAggregator()
-        tokenizer = AutoTokenizer.from_pretrained("roberta-large")
-        model = AutoModelForSequenceClassification.from_pretrained(os.path.join(BASEDIR,dataset))
-        model.to(device)
-        model.eval()
         pred_list,len_by_sample = self.preprocess_adjacent_window(predictions)
@@ -153,27 +135,80 @@ class ccl_win(evaluate.Measurement):
         with torch.no_grad():
             for b in range(0,n_preds,batch_size):
                 strides = [x.lower() for x in pred_list[b:b+batch_size]]
-                tinput = tokenizer(strides,padding=True,truncation=True,max_length=512,return_tensors="pt")
-                tinput = {k:v.to(device) for k,v in tinput.items()}
-                output = model(**tinput)
                 probs = torch.softmax(output.logits,dim=-1).detach().cpu().numpy()
                 scores.extend(probs[:,0].tolist())
             #
         offset = 0
         for i,_len in enumerate(len_by_sample):
             score = float(np.mean(scores[offset:offset+_len])) if sent_lens[i]>1 else 0.
-            if use_aggregator:
-                aggregator.add_scores({"loc_coh_ccl": score})
-            else:
-                results.append(score)
             offset += _len
         #
         outres = {}
         if use_aggregator:
             res = aggregator.aggregate()
             for k in res:   outres[k] = res[k].mid
         else:
             outres = {"loc_coh_ccl": results}
-        return outres

 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 WINDOW_SIZE = 3
+@contextmanager
+def filter_logging_context():
+    def filter_log(record):
+        return False if "This IS expected if you are initializing" in record.msg else True
+    logger = datasets.utils.logging.get_logger("transformers.modeling_utils")
+    logger.addFilter(filter_log)
+    try:
+        yield
+    finally:
+        logger.removeFilter(filter_log)
+class Scorer:
+    def __init__(
+        self,
+        model_type=None,
+        batch_size=64,
+        device=None,
+        use_fast_tokenizer=False):
+        if device is not None:
+            # assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
+            if device == "gpu":
+                device = "cuda"
+        else:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = device
+        self.model_type = model_type
+        self.batch_size = batch_size
+        self._tokenizer = AutoTokenizer.from_pretrained("roberta-large")
+        self._model = AutoModelForSequenceClassification.from_pretrained(model_type)
+        self._model.to(device)
+        self._model.eval()
+    @property
+    def hash(self):
+        return self.model_type
     def preprocess_adjacent_window(self,preds):
         pred_list = []
         return pred_list,lens
+    def score(self,predictions):
         pred_list,len_by_sample = self.preprocess_adjacent_window(predictions)
         with torch.no_grad():
             for b in range(0,n_preds,batch_size):
                 strides = [x.lower() for x in pred_list[b:b+batch_size]]
+                tinput = self._tokenizer(strides,padding=True,truncation=True,max_length=512,return_tensors="pt")
+                tinput = {k:v.to(self.device) for k,v in tinput.items()}
+                output = self._model(**tinput)
                 probs = torch.softmax(output.logits,dim=-1).detach().cpu().numpy()
                 scores.extend(probs[:,0].tolist())
             #
+        results = []
         offset = 0
         for i,_len in enumerate(len_by_sample):
             score = float(np.mean(scores[offset:offset+_len])) if sent_lens[i]>1 else 0.
+            results.append(score)
             offset += _len
         #
+        return results
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class ccl_win(evaluate.Measurement):
+    """TODO: Short description of my evaluation module."""
+    def _info(self):
+        # TODO: Specifies the evaluate.EvaluationModuleInfo object
+        return evaluate.MeasurementInfo(
+            # This is the description that will appear on the modules page.
+            module_type="measurement",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features({
+                'predictions': datasets.Value('string'),
+            }),
+            # Homepage of the module for documentation
+            homepage="http://module.homepage",
+            # Additional links to the codebase or references
+            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
+            reference_urls=["http://path.to.reference.url/new_module"]
+        )
+    def _download_and_prepare(self, dl_manager):
+        """Optional: download external resources useful to compute the scores"""
+        # TODO: Download external resources if needed
+        pass
+    def _compute(self, predictions, dataset="arxiv", batch_size: int = 16, device=None, use_aggregator=True):
+        """Returns the scores"""
+        hashcode = dataset
+        with filter_logging_context():
+            if not hasattr(self, "cached_scorer") or self.cached_scorer.hash != hashcode:
+                self.cached_scorer = Scorer(
+                    model_type=dataset,
+                    batch_size=batch_size,
+                    device=device,
+                )
+        results = self.cached_scorer.score(predictions)
         outres = {}
+        aggregator = None
         if use_aggregator:
+            np.random.seed(42)
+            aggregator = scoring.BootstrapAggregator()
+            for score in results:
+                aggregator.add_scores({"loc_coh_ccl": score})
+            #
             res = aggregator.aggregate()
             for k in res:   outres[k] = res[k].mid
         else:
             outres = {"loc_coh_ccl": results}
+        return outres