bowdbeg committed on
Commit
6d01d6a
1 Parent(s): bdc074b

implemented

Files changed (3)
  1. .gitignore +133 -0
  2. __main__.py +56 -0
  3. patch_series.py +99 -22
.gitignore ADDED
@@ -0,0 +1,133 @@
+ .vscode
+ data/
+ output/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
__main__.py ADDED
@@ -0,0 +1,56 @@
+ import json
+ import logging
+ import time
+ from argparse import ArgumentParser
+
+ import evaluate
+ import numpy as np
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+ parser = ArgumentParser(
+     description="Compute the matching series score between two time series frozen in a numpy array"
+ )
+ parser.add_argument("predictions", type=str, help="Path to the numpy array containing the predictions")
+ parser.add_argument("references", type=str, help="Path to the numpy array containing the references")
+ parser.add_argument("--output", type=str, help="Path to the output file")
+ parser.add_argument("--batch_size", type=int, help="Batch size to use for the computation")
+ parser.add_argument("--num_processes", type=int, help="Number of processes to use for the computation", default=1)
+ parser.add_argument("--dtype", type=str, help="Data type to use for the computation", default="float32")
+ parser.add_argument("--debug", action="store_true", help="Debug mode")
+ args = parser.parse_args()
+
+ if not args.predictions or not args.references:
+     raise ValueError("You must provide the path to the predictions and references numpy arrays")
+
+
+ predictions = np.load(args.predictions).astype(args.dtype)
+ references = np.load(args.references).astype(args.dtype)
+
+ if args.debug:
+     predictions = predictions[:1000]
+     references = references[:1000]
+
+ logger.info(f"predictions shape: {predictions.shape}")
+ logger.info(f"references shape: {references.shape}")
+
+ import patch_series
+
+ s = time.time()
+ metric = patch_series.patch_series()
+ # metric = evaluate.load("patch_series.py")
+ results = metric.compute(
+     predictions=predictions,
+     references=references,
+     batch_size=args.batch_size,
+     num_processes=args.num_processes,
+     return_each_features=True,
+     return_coverages=True,
+     dtype=args.dtype,
+ )
+ logger.info(f"Time taken: {time.time() - s}")
+
+ print(json.dumps(results))
+ if args.output:
+     with open(args.output, "w") as f:
+         json.dump(results, f)
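
As a sanity check of the new entry point, here is a minimal, hypothetical invocation sketch. The file names, shapes, and values are made up; the (batch, sequence_length, num_features) layout is the one patch_series._compute validates, and running the repository directory with `python .` relies on this __main__.py:

import subprocess

import numpy as np

# Toy inputs; axis layout (batch, sequence_length, num_features) matches what the metric expects.
np.save("predictions.npy", np.random.rand(8, 32, 4).astype("float32"))
np.save("references.npy", np.random.rand(8, 32, 4).astype("float32"))

# __main__.py makes the repository directory itself runnable as a script.
subprocess.run(
    ["python", ".", "predictions.npy", "references.npy", "--output", "scores.json"],
    check=True,
)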
patch_series.py CHANGED
@@ -13,9 +13,14 @@
  # limitations under the License.
  """TODO: Add a description here."""

- import evaluate
+ import logging
+ from typing import List, Optional, Union
+
  import datasets
+ import evaluate
+ import numpy as np

+ logger = logging.getLogger(__name__)

  # TODO: Add BibTeX citation
  _CITATION = """\
@@ -53,13 +58,13 @@ Examples:
      {'accuracy': 1.0}
  """

- # TODO: Define external resources urls if needed
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-

  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
  class patch_series(evaluate.Metric):
-     """TODO: Short description of my evaluation module."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.matching_series_metric = evaluate.load("bowdbeg/matching_series")

      def _info(self):
          # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -70,26 +75,98 @@ class patch_series(evaluate.Metric):
              citation=_CITATION,
              inputs_description=_KWARGS_DESCRIPTION,
              # This defines the format of each prediction and reference
-             features=datasets.Features({
-                 'predictions': datasets.Value('int64'),
-                 'references': datasets.Value('int64'),
-             }),
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.Value("int64"),
+                     "references": datasets.Value("int64"),
+                 }
+             ),
              # Homepage of the module for documentation
              homepage="http://module.homepage",
              # Additional links to the codebase or references
              codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-             reference_urls=["http://path.to.reference.url/new_module"]
+             reference_urls=["http://path.to.reference.url/new_module"],
          )

-     def _download_and_prepare(self, dl_manager):
-         """Optional: download external resources useful to compute the scores"""
-         # TODO: Download external resources if needed
-         pass
-
-     def _compute(self, predictions, references):
-         """Returns the scores"""
-         # TODO: Compute the different scores of the module
-         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-         return {
-             "accuracy": accuracy,
-         }
+     def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[dict]:
+         """Route feature inputs to _compute and pass the remaining keyword arguments through."""
+         all_kwargs = {"predictions": predictions, "references": references, **kwargs}
+         if predictions is None and references is None:
+             missing_kwargs = {k: None for k in self._feature_names() if k not in all_kwargs}
+             all_kwargs.update(missing_kwargs)
+         else:
+             missing_inputs = [k for k in self._feature_names() if k not in all_kwargs]
+             if missing_inputs:
+                 raise ValueError(
+                     f"Evaluation module inputs are missing: {missing_inputs}. All required inputs are {list(self._feature_names())}"
+                 )
+         inputs = {input_name: all_kwargs[input_name] for input_name in self._feature_names()}
+         compute_kwargs = {k: kwargs[k] for k in kwargs if k not in self._feature_names()}
+         return self._compute(**inputs, **compute_kwargs)
+
+     def _compute(
+         self,
+         predictions: Union[List, np.ndarray],
+         references: Union[List, np.ndarray],
+         patch_length: List[int] = [1],
+         strides: Union[List[int], None] = None,
+         **kwargs,
+     ):
+         """Compute the bowdbeg/matching_series score for each patch size and take the mean."""
+         if strides is None:
+             strides = patch_length
+         assert len(patch_length) == len(strides), "The patch_length and strides should have the same length."
+         predictions = np.array(predictions)
+         references = np.array(references)
+         # raise if either input has a sequence length that some patch size does not divide
+         if not all(predictions.shape[1] % p == 0 for p in patch_length) or not all(
+             references.shape[1] % p == 0 for p in patch_length
+         ):
+             raise ValueError("The patch_length should divide the length of the predictions and references.")
+         if len(predictions.shape) != 3:
+             raise ValueError("Predictions should have shape (batch_size, sequence_length, num_features)")
+         if len(patch_length) == 0:
+             raise ValueError("The patch_length should be a list of integers.")
+         res_sum: Union[None, dict] = None
+         orig_pred_shape = predictions.shape
+         orig_ref_shape = references.shape
+         for patch, stride in zip(patch_length, strides):
+             # create patched predictions and references
+             patched_predictions = self.get_patches(predictions, patch, stride, axis=1)
+             patched_references = self.get_patches(references, patch, stride, axis=1)
+             patched_predictions = patched_predictions.reshape(-1, patch, orig_pred_shape[2])
+             patched_references = patched_references.reshape(-1, patch, orig_ref_shape[2])
+
+             # compute the score for each patch
+             res = self.matching_series_metric.compute(
+                 predictions=patched_predictions, references=patched_references, **kwargs
+             )
+             # sum the results
+             if res_sum is None:
+                 res_sum = res
+             else:
+                 assert isinstance(res_sum, dict)
+                 assert isinstance(res, dict)
+                 # iterate over a copy of the keys so entries can be deleted safely
+                 for key in list(res_sum):
+                     if isinstance(res_sum[key], (list, np.ndarray)):
+                         res_sum[key] = np.array(res_sum[key]) + np.array(res[key])
+                     elif isinstance(res_sum[key], (float, int)):
+                         res_sum[key] += res[key]
+                     else:
+                         logger.warning(f"Unsupported type for key {key}: {type(res_sum[key])}")
+                         del res_sum[key]
+         # take the mean of the results
+         assert isinstance(res_sum, dict)
+         for key in res_sum:
+             if isinstance(res_sum[key], (list, np.ndarray)):
+                 res_sum[key] = np.array(res_sum[key]) / len(patch_length)
+             else:
+                 res_sum[key] /= len(patch_length)
+
+         return res_sum
+
+     @staticmethod
+     def get_patches(series: np.ndarray, patch_length: int, stride: int, axis=0):
+         # sliding_window_view appends the window dimension last; move it next to `axis`
+         # so the caller's reshape(-1, patch, num_features) keeps the axes in order
+         o = np.lib.stride_tricks.sliding_window_view(series, window_shape=patch_length, axis=axis)
+         o = np.moveaxis(o, -1, axis + 1)
+         # keep every `stride`-th window start along `axis` (a bare o[::stride] would
+         # always stride axis 0, regardless of `axis`)
+         o = o.take(np.arange(0, o.shape[axis], stride), axis=axis)
+         return o
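
In short, _compute cuts both series into windows along the time axis for each (patch_length, stride) pair, scores the flattened patches with bowdbeg/matching_series, and averages the per-patch-size results. A minimal sketch of just the windowing step, assuming (batch, sequence_length, num_features) inputs; the helper mirrors get_patches above and the shapes in the comments follow from numpy's sliding_window_view:

import numpy as np

def get_patches(series, patch_length, stride, axis=0):
    # Window along `axis`, restore axis order, then subsample the window starts.
    o = np.lib.stride_tricks.sliding_window_view(series, window_shape=patch_length, axis=axis)
    o = np.moveaxis(o, -1, axis + 1)
    return o.take(np.arange(0, o.shape[axis], stride), axis=axis)

series = np.arange(2 * 8 * 3, dtype="float32").reshape(2, 8, 3)  # (batch=2, seq_len=8, features=3)
patches = get_patches(series, patch_length=4, stride=4, axis=1)
print(patches.shape)  # (2, 2, 4, 3): two non-overlapping length-4 windows per sequence
flat = patches.reshape(-1, 4, 3)  # (batch * n_windows, patch, features), as _compute feeds the sub-metric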