""" | |
============================= | |
Species distribution dataset | |
============================= | |
This dataset represents the geographic distribution of species. | |
The dataset is provided by Phillips et. al. (2006). | |
The two species are: | |
- `"Bradypus variegatus" | |
<http://www.iucnredlist.org/details/3038/0>`_ , | |
the Brown-throated Sloth. | |
- `"Microryzomys minutus" | |
<http://www.iucnredlist.org/details/13408/0>`_ , | |
also known as the Forest Small Rice Rat, a rodent that lives in Peru, | |
Colombia, Ecuador, Peru, and Venezuela. | |
References | |
---------- | |
`"Maximum entropy modeling of species geographic distributions" | |
<http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips, | |
R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006. | |
""" | |
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import logging
from io import BytesIO
from numbers import Integral, Real
from os import PathLike, makedirs, remove
from os.path import exists

import joblib
import numpy as np

from ..utils import Bunch
from ..utils._param_validation import Interval, validate_params
from . import get_data_home
from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath
# The original data can be found at:
# https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip
SAMPLES = RemoteFileMetadata(
    filename="samples.zip",
    url="https://ndownloader.figshare.com/files/5976075",
    checksum="abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28",
)

# The original data can be found at:
# https://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip
COVERAGES = RemoteFileMetadata(
    filename="coverages.zip",
    url="https://ndownloader.figshare.com/files/5976078",
    checksum="4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807",
)

DATA_ARCHIVE_NAME = "species_coverage.pkz"

logger = logging.getLogger(__name__)


def _load_coverage(F, header_length=6, dtype=np.int16):
    """Load a coverage file from an open file object.

    This will return a numpy array of the given dtype.
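
    Examples
    --------
    A minimal sketch on a synthetic 2x3 grid (the real coverage files use
    the same six-line header followed by the data matrix, but are much
    larger):

    >>> from io import BytesIO
    >>> lines = [b"ncols 3", b"nrows 2", b"xllcorner 0", b"yllcorner 0",
    ...          b"cellsize 1", b"NODATA_value -9999", b"1 2 3", b"4 -9999 6"]
    >>> _load_coverage(BytesIO(b"\\n".join(lines)))
    array([[    1,     2,     3],
           [    4, -9999,     6]], dtype=int16)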
""" | |
    header = [F.readline() for _ in range(header_length)]
    make_tuple = lambda t: (t.split()[0], float(t.split()[1]))
    header = dict([make_tuple(line) for line in header])

    M = np.loadtxt(F, dtype=dtype)
    nodata = int(header[b"NODATA_value"])
    if nodata != -9999:
        # Normalize the file-specific missing-data sentinel to the -9999
        # value used throughout this dataset.
        M[M == nodata] = -9999
    return M


def _load_csv(F):
    """Load csv file.

    Parameters
    ----------
    F : file object
        CSV file open in byte mode.

    Returns
    -------
    rec : np.ndarray
        record array representing the data
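
    Examples
    --------
    A minimal sketch on a two-row in-memory CSV (the real files carry the
    same three columns):

    >>> from io import BytesIO
    >>> buf = BytesIO(
    ...     b"species,dd long,dd lat"
    ...     b"\\nbradypus_variegatus,-65.5,-10.2"
    ...     b"\\nmicroryzomys_minutus,-67.8,-16.3"
    ... )
    >>> rec = _load_csv(buf)
    >>> rec.dtype.names
    ('species', 'dd long', 'dd lat')
    >>> len(rec)
    2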
""" | |
    names = F.readline().decode("ascii").strip().split(",")
    rec = np.loadtxt(F, skiprows=0, delimiter=",", dtype="S22,f4,f4")
    rec.dtype.names = names
    return rec


def construct_grids(batch):
    """Construct the map grid from the batch object

    Parameters
    ----------
    batch : Batch object
        The object returned by :func:`fetch_species_distributions`

    Returns
    -------
    (xgrid, ygrid) : 1-D arrays
        The grid corresponding to the values in batch.coverages
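
    Examples
    --------
    A minimal sketch on a hypothetical 3x2 grid (the real dataset uses
    Nx=1212, Ny=1592 and grid_size=0.05):

    >>> from sklearn.utils import Bunch
    >>> batch = Bunch(x_left_lower_corner=-10.0, Nx=3,
    ...               y_left_lower_corner=0.0, Ny=2, grid_size=1.0)
    >>> xgrid, ygrid = construct_grids(batch)
    >>> xgrid
    array([-9., -8., -7.])
    >>> ygrid
    array([1., 2.])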
""" | |
    # x,y coordinates for corner cells
    xmin = batch.x_left_lower_corner + batch.grid_size
    xmax = xmin + (batch.Nx * batch.grid_size)
    ymin = batch.y_left_lower_corner + batch.grid_size
    ymax = ymin + (batch.Ny * batch.grid_size)

    # x coordinates of the grid cells
    xgrid = np.arange(xmin, xmax, batch.grid_size)
    # y coordinates of the grid cells
    ygrid = np.arange(ymin, ymax, batch.grid_size)

    return (xgrid, ygrid)


@validate_params(
    {
        "data_home": [str, PathLike, None],
        "download_if_missing": ["boolean"],
        "n_retries": [Interval(Integral, 1, None, closed="left")],
        "delay": [Interval(Real, 0.0, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def fetch_species_distributions(
    *, data_home=None, download_if_missing=True, n_retries=3, delay=1.0
):
"""Loader for species distribution dataset from Phillips et. al. (2006). | |
Read more in the :ref:`User Guide <species_distribution_dataset>`. | |
Parameters | |
---------- | |
data_home : str or path-like, default=None | |
Specify another download and cache folder for the datasets. By default | |
all scikit-learn data is stored in '~/scikit_learn_data' subfolders. | |
download_if_missing : bool, default=True | |
If False, raise an OSError if the data is not locally available | |
instead of trying to download the data from the source site. | |
n_retries : int, default=3 | |
Number of retries when HTTP errors are encountered. | |
.. versionadded:: 1.5 | |
delay : float, default=1.0 | |
Number of seconds between retries. | |
.. versionadded:: 1.5 | |
Returns | |
------- | |
data : :class:`~sklearn.utils.Bunch` | |
Dictionary-like object, with the following attributes. | |
coverages : array, shape = [14, 1592, 1212] | |
These represent the 14 features measured | |
at each point of the map grid. | |
The latitude/longitude values for the grid are discussed below. | |
Missing data is represented by the value -9999. | |
train : record array, shape = (1624,) | |
The training points for the data. Each point has three fields: | |
- train['species'] is the species name | |
- train['dd long'] is the longitude, in degrees | |
- train['dd lat'] is the latitude, in degrees | |
test : record array, shape = (620,) | |
The test points for the data. Same format as the training data. | |
Nx, Ny : integers | |
The number of longitudes (x) and latitudes (y) in the grid | |
x_left_lower_corner, y_left_lower_corner : floats | |
The (x,y) position of the lower-left corner, in degrees | |
grid_size : float | |
The spacing between points of the grid, in degrees | |
Notes | |
----- | |
This dataset represents the geographic distribution of species. | |
The dataset is provided by Phillips et. al. (2006). | |
The two species are: | |
- `"Bradypus variegatus" | |
<http://www.iucnredlist.org/details/3038/0>`_ , | |
the Brown-throated Sloth. | |
- `"Microryzomys minutus" | |
<http://www.iucnredlist.org/details/13408/0>`_ , | |
also known as the Forest Small Rice Rat, a rodent that lives in Peru, | |
Colombia, Ecuador, Peru, and Venezuela. | |
References | |
---------- | |
* `"Maximum entropy modeling of species geographic distributions" | |
<http://rob.schapire.net/papers/ecolmod.pdf>`_ | |
S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, | |
190:231-259, 2006. | |
Examples | |
-------- | |
>>> from sklearn.datasets import fetch_species_distributions | |
>>> species = fetch_species_distributions() | |
>>> species.train[:5] | |
array([(b'microryzomys_minutus', -64.7 , -17.85 ), | |
(b'microryzomys_minutus', -67.8333, -16.3333), | |
(b'microryzomys_minutus', -67.8833, -16.3 ), | |
(b'microryzomys_minutus', -67.8 , -16.2667), | |
(b'microryzomys_minutus', -67.9833, -15.9 )], | |
dtype=[('species', 'S22'), ('dd long', '<f4'), ('dd lat', '<f4')]) | |
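
    The stacked coverage grids have the shape documented above:

    >>> species.coverages.shape
    (14, 1592, 1212)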

    For a more extended example,
    see :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py`
    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files. These should not be changed
    # unless the data model changes. They will be saved in the pickled
    # archive along with the downloaded data.
    extra_params = dict(
        x_left_lower_corner=-94.8,
        Nx=1212,
        y_left_lower_corner=-56.05,
        Ny=1592,
        grid_size=0.05,
    )
    dtype = np.int16

    archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)

    if not exists(archive_path):
        if not download_if_missing:
            raise OSError("Data not found and `download_if_missing` is False")
        logger.info("Downloading species data from %s to %s" % (SAMPLES.url, data_home))
        samples_path = _fetch_remote(
            SAMPLES, dirname=data_home, n_retries=n_retries, delay=delay
        )
        with np.load(samples_path) as X:  # samples.zip is a valid npz
            for f in X.files:
                fhandle = BytesIO(X[f])
                if "train" in f:
                    train = _load_csv(fhandle)
                if "test" in f:
                    test = _load_csv(fhandle)
        remove(samples_path)

        logger.info(
            "Downloading coverage data from %s to %s" % (COVERAGES.url, data_home)
        )
        coverages_path = _fetch_remote(
            COVERAGES, dirname=data_home, n_retries=n_retries, delay=delay
        )
        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
            coverages = []
            for f in X.files:
                fhandle = BytesIO(X[f])
                logger.debug(" - converting {}".format(f))
                coverages.append(_load_coverage(fhandle))
            coverages = np.asarray(coverages, dtype=dtype)
        remove(coverages_path)

        bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params)
        joblib.dump(bunch, archive_path, compress=9)
    else:
        bunch = joblib.load(archive_path)

    return bunch