|
|
|
|
|
|
|
import gzip |
|
import hashlib |
|
import json |
|
import os |
|
import shutil |
|
import time |
|
from contextlib import closing |
|
from functools import wraps |
|
from os.path import join |
|
from tempfile import TemporaryDirectory |
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union |
|
from urllib.error import HTTPError, URLError |
|
from urllib.request import Request, urlopen |
|
from warnings import warn |
|
|
|
import numpy as np |
|
|
|
from ..utils import Bunch |
|
from ..utils._optional_dependencies import check_pandas_support |
|
from ..utils._param_validation import ( |
|
Integral, |
|
Interval, |
|
Real, |
|
StrOptions, |
|
validate_params, |
|
) |
|
from . import get_data_home |
|
from ._arff_parser import load_arff_from_gzip_file |
|
|
|
__all__ = ["fetch_openml"] |
|
|
|
_OPENML_PREFIX = "https://api.openml.org/" |
|
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2" |
|
_DATA_INFO = "api/v1/json/data/{}" |
|
_DATA_FEATURES = "api/v1/json/data/features/{}" |
|
_DATA_QUALITIES = "api/v1/json/data/qualities/{}" |
|
_DATA_FILE = "data/v1/download/{}" |
|
|
|
OpenmlQualitiesType = List[Dict[str, str]] |
|
OpenmlFeaturesType = List[Dict[str, str]] |
|
|
|
|
|
def _get_local_path(openml_path: str, data_home: str) -> str: |
|
return os.path.join(data_home, "openml.org", openml_path + ".gz") |
|
|
|
|
|
def _retry_with_clean_cache( |
|
openml_path: str, |
|
data_home: Optional[str], |
|
no_retry_exception: Optional[Exception] = None, |
|
) -> Callable: |
|
"""If the first call to the decorated function fails, the local cached |
|
file is removed, and the function is called again. If ``data_home`` is |
|
    ``None``, the function is called only once. A specific exception type can be
    excluded from retrying by passing it as ``no_retry_exception``.
|
""" |
|
|
|
def decorator(f): |
|
@wraps(f) |
|
def wrapper(*args, **kw): |
|
if data_home is None: |
|
return f(*args, **kw) |
|
try: |
|
return f(*args, **kw) |
|
except URLError: |
|
raise |
|
except Exception as exc: |
|
if no_retry_exception is not None and isinstance( |
|
exc, no_retry_exception |
|
): |
|
raise |
|
warn("Invalid cache, redownloading file", RuntimeWarning) |
|
local_path = _get_local_path(openml_path, data_home) |
|
if os.path.exists(local_path): |
|
os.unlink(local_path) |
|
return f(*args, **kw) |
|
|
|
return wrapper |
|
|
|
return decorator |
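

# Illustrative sketch (not part of the original module): how the cache-cleaning
# retry decorator composes with a loader. The endpoint path below is hypothetical
# and ``_open_openml_url`` (defined later in this module) performs the actual
# download and caching.
def _example_retry_with_clean_cache(data_home: Optional[str]) -> Dict:
    openml_path = "api/v1/json/data/42"

    @_retry_with_clean_cache(openml_path, data_home)
    def _load():
        # If the first call fails (e.g. on a corrupted cache file), the decorator
        # removes <data_home>/openml.org/<openml_path>.gz and calls _load() again.
        with closing(_open_openml_url(openml_path, data_home)) as response:
            return json.loads(response.read().decode("utf-8"))

    return _load()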
|
|
|
|
|
def _retry_on_network_error( |
|
n_retries: int = 3, delay: float = 1.0, url: str = "" |
|
) -> Callable: |
|
"""If the function call results in a network error, call the function again |
|
up to ``n_retries`` times with a ``delay`` between each call. If the error |
|
has a 412 status code, don't call the function again as this is a specific |
|
OpenML error. |
|
The url parameter is used to give more information to the user about the |
|
error. |
|
""" |
|
|
|
def decorator(f): |
|
@wraps(f) |
|
def wrapper(*args, **kwargs): |
|
retry_counter = n_retries |
|
while True: |
|
try: |
|
return f(*args, **kwargs) |
|
except (URLError, TimeoutError) as e: |
|
|
|
if isinstance(e, HTTPError) and e.code == 412: |
|
raise |
|
if retry_counter == 0: |
|
raise |
|
warn( |
|
f"A network error occurred while downloading {url}. Retrying..." |
|
) |
|
retry_counter -= 1 |
|
time.sleep(delay) |
|
|
|
return wrapper |
|
|
|
return decorator |
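

# Illustrative sketch (not part of the original module): the network-retry
# decorator is applied to ``urlopen`` at call time. The dataset path used here
# is hypothetical.
def _example_retry_on_network_error(n_retries: int = 3, delay: float = 1.0):
    req = Request(_OPENML_PREFIX + "api/v1/json/data/42")
    req.add_header("Accept-encoding", "gzip")
    # Transient URLError/TimeoutError failures are retried up to n_retries times;
    # an HTTPError with status 412 is re-raised immediately.
    return _retry_on_network_error(n_retries, delay, req.full_url)(urlopen)(req)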
|
|
|
|
|
def _open_openml_url( |
|
openml_path: str, data_home: Optional[str], n_retries: int = 3, delay: float = 1.0 |
|
): |
|
""" |
|
Returns a resource from OpenML.org. Caches it to data_home if required. |
|
|
|
Parameters |
|
---------- |
|
openml_path : str |
|
        OpenML URL that will be accessed. This will be prefixed with
|
_OPENML_PREFIX. |
|
|
|
data_home : str |
|
Directory to which the files will be cached. If None, no caching will |
|
be applied. |
|
|
|
n_retries : int, default=3 |
|
        Number of retries when HTTP errors are encountered. Errors with status
        code 412 won't be retried as they represent generic OpenML errors.
|
|
|
delay : float, default=1.0 |
|
Number of seconds between retries. |
|
|
|
Returns |
|
------- |
|
result : stream |
|
A stream to the OpenML resource. |
|
""" |
|
|
|
def is_gzip_encoded(_fsrc): |
|
return _fsrc.info().get("Content-Encoding", "") == "gzip" |
|
|
|
req = Request(_OPENML_PREFIX + openml_path) |
|
req.add_header("Accept-encoding", "gzip") |
|
|
|
if data_home is None: |
|
fsrc = _retry_on_network_error(n_retries, delay, req.full_url)(urlopen)(req) |
|
if is_gzip_encoded(fsrc): |
|
return gzip.GzipFile(fileobj=fsrc, mode="rb") |
|
return fsrc |
|
|
|
local_path = _get_local_path(openml_path, data_home) |
|
dir_name, file_name = os.path.split(local_path) |
|
if not os.path.exists(local_path): |
|
os.makedirs(dir_name, exist_ok=True) |
|
try: |
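            # Download into a temporary directory inside ``dir_name`` and only move
            # the completed file to ``local_path``, so that a partially downloaded
            # file never ends up in the cache.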
|
|
|
|
|
|
|
|
|
with TemporaryDirectory(dir=dir_name) as tmpdir: |
|
with closing( |
|
_retry_on_network_error(n_retries, delay, req.full_url)(urlopen)( |
|
req |
|
) |
|
) as fsrc: |
|
opener: Callable |
|
if is_gzip_encoded(fsrc): |
|
opener = open |
|
else: |
|
opener = gzip.GzipFile |
|
with opener(os.path.join(tmpdir, file_name), "wb") as fdst: |
|
shutil.copyfileobj(fsrc, fdst) |
|
shutil.move(fdst.name, local_path) |
|
except Exception: |
|
if os.path.exists(local_path): |
|
os.unlink(local_path) |
|
raise |
|
|
|
|
|
|
|
return gzip.GzipFile(local_path, "rb") |
|
|
|
|
|
class OpenMLError(ValueError): |
|
"""HTTP 412 is a specific OpenML error code, indicating a generic error""" |
|
|
|
pass |
|
|
|
|
|
def _get_json_content_from_openml_api( |
|
url: str, |
|
error_message: Optional[str], |
|
data_home: Optional[str], |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
) -> Dict: |
|
""" |
|
Loads json data from the openml api. |
|
|
|
Parameters |
|
---------- |
|
url : str |
|
The URL to load from. Should be an official OpenML endpoint. |
|
|
|
error_message : str or None |
|
The error message to raise if an acceptable OpenML error is thrown |
|
        (an acceptable error is, e.g., a data id that cannot be found; other
        errors, such as a 404, propagate their native error message).
|
|
|
data_home : str or None |
|
Location to cache the response. None if no cache is required. |
|
|
|
n_retries : int, default=3 |
|
        Number of retries when HTTP errors are encountered. Errors with status
        code 412 won't be retried as they represent generic OpenML errors.
|
|
|
delay : float, default=1.0 |
|
Number of seconds between retries. |
|
|
|
Returns |
|
------- |
|
json_data : json |
|
        The JSON result from the OpenML server if the call was successful;
        an exception is raised otherwise.
|
""" |
|
|
|
@_retry_with_clean_cache(url, data_home=data_home) |
|
def _load_json(): |
|
with closing( |
|
_open_openml_url(url, data_home, n_retries=n_retries, delay=delay) |
|
) as response: |
|
return json.loads(response.read().decode("utf-8")) |
|
|
|
try: |
|
return _load_json() |
|
except HTTPError as error: |
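        # 412 is an OpenML-specific error code meaning a generic error (e.g. the
        # dataset was not found); any other HTTP error is re-raised as-is.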
|
|
|
|
|
if error.code != 412: |
|
raise error |
|
|
|
|
|
raise OpenMLError(error_message) |
|
|
|
|
|
def _get_data_info_by_name( |
|
name: str, |
|
version: Union[int, str], |
|
data_home: Optional[str], |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
): |
|
""" |
|
    Utilizes the OpenML dataset listing API to find a dataset by
    name/version.
    OpenML API function:
    https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name
|
|
|
Parameters |
|
---------- |
|
name : str |
|
name of the dataset |
|
|
|
version : int or str |
|
If version is an integer, the exact name/version will be obtained from |
|
OpenML. If version is a string (value: "active") it will take the first |
|
        version from OpenML that is annotated as active. Any string value other
        than "active" is treated as an integer.
|
|
|
data_home : str or None |
|
Location to cache the response. None if no cache is required. |
|
|
|
n_retries : int, default=3 |
|
        Number of retries when HTTP errors are encountered. Errors with status
        code 412 won't be retried as they represent generic OpenML errors.
|
|
|
delay : float, default=1.0 |
|
Number of seconds between retries. |
|
|
|
Returns |
|
------- |
|
first_dataset : json |
|
        JSON representation of the first dataset object that adhered to the
        search criteria.
|
|
|
""" |
|
if version == "active": |
|
|
|
url = _SEARCH_NAME.format(name) + "/status/active/" |
|
error_msg = "No active dataset {} found.".format(name) |
|
json_data = _get_json_content_from_openml_api( |
|
url, |
|
error_msg, |
|
data_home=data_home, |
|
n_retries=n_retries, |
|
delay=delay, |
|
) |
|
res = json_data["data"]["dataset"] |
|
if len(res) > 1: |
|
first_version = version = res[0]["version"] |
|
warning_msg = ( |
|
"Multiple active versions of the dataset matching the name" |
|
f" {name} exist. Versions may be fundamentally different, " |
|
f"returning version {first_version}. " |
|
"Available versions:\n" |
|
) |
|
for r in res: |
|
warning_msg += f"- version {r['version']}, status: {r['status']}\n" |
|
warning_msg += ( |
|
f" url: https://www.openml.org/search?type=data&id={r['did']}\n" |
|
) |
|
warn(warning_msg) |
|
return res[0] |
|
|
|
|
|
url = (_SEARCH_NAME + "/data_version/{}").format(name, version) |
|
try: |
|
json_data = _get_json_content_from_openml_api( |
|
url, |
|
error_message=None, |
|
data_home=data_home, |
|
n_retries=n_retries, |
|
delay=delay, |
|
) |
|
except OpenMLError: |
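        # Nothing was found with the plain query; retry it restricted to
        # deactivated datasets before giving up.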
|
|
|
|
|
|
|
|
|
url += "/status/deactivated" |
|
error_msg = "Dataset {} with version {} not found.".format(name, version) |
|
json_data = _get_json_content_from_openml_api( |
|
url, |
|
error_msg, |
|
data_home=data_home, |
|
n_retries=n_retries, |
|
delay=delay, |
|
) |
|
|
|
return json_data["data"]["dataset"][0] |
|
|
|
|
|
def _get_data_description_by_id( |
|
data_id: int, |
|
data_home: Optional[str], |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
) -> Dict[str, Any]: |
|
|
|
url = _DATA_INFO.format(data_id) |
|
error_message = "Dataset with data_id {} not found.".format(data_id) |
|
json_data = _get_json_content_from_openml_api( |
|
url, |
|
error_message, |
|
data_home=data_home, |
|
n_retries=n_retries, |
|
delay=delay, |
|
) |
|
return json_data["data_set_description"] |
|
|
|
|
|
def _get_data_features( |
|
data_id: int, |
|
data_home: Optional[str], |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
) -> OpenmlFeaturesType: |
|
|
|
|
|
url = _DATA_FEATURES.format(data_id) |
|
error_message = "Dataset with data_id {} not found.".format(data_id) |
|
json_data = _get_json_content_from_openml_api( |
|
url, |
|
error_message, |
|
data_home=data_home, |
|
n_retries=n_retries, |
|
delay=delay, |
|
) |
|
return json_data["data_features"]["feature"] |
|
|
|
|
|
def _get_data_qualities( |
|
data_id: int, |
|
data_home: Optional[str], |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
) -> OpenmlQualitiesType: |
|
|
|
|
|
url = _DATA_QUALITIES.format(data_id) |
|
error_message = "Dataset with data_id {} not found.".format(data_id) |
|
json_data = _get_json_content_from_openml_api( |
|
url, |
|
error_message, |
|
data_home=data_home, |
|
n_retries=n_retries, |
|
delay=delay, |
|
) |
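    # The qualities may be unavailable (e.g. the dataset has not been processed
    # by OpenML yet); fall back to an empty list in that case.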
|
|
|
|
|
return json_data.get("data_qualities", {}).get("quality", []) |
|
|
|
|
|
def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: |
|
"""Get the number of samples from data qualities. |
|
|
|
Parameters |
|
---------- |
|
data_qualities : list of dict |
|
Used to retrieve the number of instances (samples) in the dataset. |
|
|
|
Returns |
|
------- |
|
n_samples : int |
|
The number of samples in the dataset or -1 if data qualities are |
|
unavailable. |
|
""" |
|
|
|
default_n_samples = -1 |
|
|
|
qualities = {d["name"]: d["value"] for d in data_qualities} |
|
return int(float(qualities.get("NumberOfInstances", default_n_samples))) |
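

# Illustrative sketch (not part of the original module): the qualities payload is
# a list of ``{"name": ..., "value": ...}`` dicts and the sample count is read
# from the "NumberOfInstances" entry. The values below are made up.
def _example_get_num_samples() -> int:
    data_qualities = [
        {"name": "NumberOfInstances", "value": "150.0"},
        {"name": "NumberOfFeatures", "value": "5.0"},
    ]
    # int(float("150.0")) -> 150; returns -1 when the quality is missing.
    return _get_num_samples(data_qualities)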
|
|
|
|
|
def _load_arff_response( |
|
url: str, |
|
data_home: Optional[str], |
|
parser: str, |
|
output_type: str, |
|
openml_columns_info: dict, |
|
feature_names_to_select: List[str], |
|
target_names_to_select: List[str], |
|
shape: Optional[Tuple[int, int]], |
|
md5_checksum: str, |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
read_csv_kwargs: Optional[Dict] = None, |
|
): |
|
"""Load the ARFF data associated with the OpenML URL. |
|
|
|
    In addition to loading the data, this function also checks the
    integrity of the file downloaded from OpenML using its MD5 checksum.
|
|
|
Parameters |
|
---------- |
|
url : str |
|
The URL of the ARFF file on OpenML. |
|
|
|
data_home : str |
|
The location where to cache the data. |
|
|
|
parser : {"liac-arff", "pandas"} |
|
The parser used to parse the ARFF file. |
|
|
|
output_type : {"numpy", "pandas", "sparse"} |
|
The type of the arrays that will be returned. The possibilities are: |
|
|
|
- `"numpy"`: both `X` and `y` will be NumPy arrays; |
|
- `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; |
|
- `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a |
|
pandas Series or DataFrame. |
|
|
|
openml_columns_info : dict |
|
The information provided by OpenML regarding the columns of the ARFF |
|
file. |
|
|
|
feature_names_to_select : list of str |
|
The list of the features to be selected. |
|
|
|
target_names_to_select : list of str |
|
The list of the target variables to be selected. |
|
|
|
shape : tuple or None |
|
With `parser="liac-arff"`, when using a generator to load the data, |
|
one needs to provide the shape of the data beforehand. |
|
|
|
md5_checksum : str |
|
The MD5 checksum provided by OpenML to check the data integrity. |
|
|
|
n_retries : int, default=3 |
|
The number of times to retry downloading the data if it fails. |
|
|
|
delay : float, default=1.0 |
|
The delay between two consecutive downloads in seconds. |
|
|
|
read_csv_kwargs : dict, default=None |
|
Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. |
|
        It allows overwriting the default options.
|
|
|
.. versionadded:: 1.3 |
|
|
|
Returns |
|
------- |
|
X : {ndarray, sparse matrix, dataframe} |
|
The data matrix. |
|
|
|
y : {ndarray, dataframe, series} |
|
The target. |
|
|
|
frame : dataframe or None |
|
A dataframe containing both `X` and `y`. `None` if |
|
`output_array_type != "pandas"`. |
|
|
|
categories : list of str or None |
|
The names of the features that are categorical. `None` if |
|
`output_array_type == "pandas"`. |
|
""" |
|
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) |
|
with closing(gzip_file): |
|
md5 = hashlib.md5() |
|
for chunk in iter(lambda: gzip_file.read(4096), b""): |
|
md5.update(chunk) |
|
actual_md5_checksum = md5.hexdigest() |
|
|
|
if actual_md5_checksum != md5_checksum: |
|
raise ValueError( |
|
f"md5 checksum of local file for {url} does not match description: " |
|
f"expected: {md5_checksum} but got {actual_md5_checksum}. " |
|
"Downloaded file could have been modified / corrupted, clean cache " |
|
"and retry..." |
|
) |
|
|
|
def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params): |
|
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) |
|
with closing(gzip_file): |
|
return load_arff_from_gzip_file(gzip_file, **arff_params) |
|
|
|
arff_params: Dict = dict( |
|
parser=parser, |
|
output_type=output_type, |
|
openml_columns_info=openml_columns_info, |
|
feature_names_to_select=feature_names_to_select, |
|
target_names_to_select=target_names_to_select, |
|
shape=shape, |
|
read_csv_kwargs=read_csv_kwargs or {}, |
|
) |
|
try: |
|
X, y, frame, categories = _open_url_and_load_gzip_file( |
|
url, data_home, n_retries, delay, arff_params |
|
) |
|
except Exception as exc: |
|
if parser != "pandas": |
|
raise |
|
|
|
from pandas.errors import ParserError |
|
|
|
if not isinstance(exc, ParserError): |
|
raise |
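        # The parsing error may come from a wrong quote character: the default is
        # a double quote, so retry once with a single quote before failing.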
|
|
|
|
|
|
|
|
|
arff_params["read_csv_kwargs"].update(quotechar="'") |
|
X, y, frame, categories = _open_url_and_load_gzip_file( |
|
url, data_home, n_retries, delay, arff_params |
|
) |
|
|
|
return X, y, frame, categories |
|
|
|
|
|
def _download_data_to_bunch( |
|
url: str, |
|
sparse: bool, |
|
data_home: Optional[str], |
|
*, |
|
as_frame: bool, |
|
openml_columns_info: List[dict], |
|
data_columns: List[str], |
|
target_columns: List[str], |
|
shape: Optional[Tuple[int, int]], |
|
md5_checksum: str, |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
parser: str, |
|
read_csv_kwargs: Optional[Dict] = None, |
|
): |
|
"""Download ARFF data, load it to a specific container and create to Bunch. |
|
|
|
This function has a mechanism to retry/cache/clean the data. |
|
|
|
Parameters |
|
---------- |
|
url : str |
|
The URL of the ARFF file on OpenML. |
|
|
|
sparse : bool |
|
Whether the dataset is expected to use the sparse ARFF format. |
|
|
|
data_home : str |
|
The location where to cache the data. |
|
|
|
as_frame : bool |
|
        Whether or not to return the data as a pandas DataFrame.
|
|
|
openml_columns_info : list of dict |
|
The information regarding the columns provided by OpenML for the |
|
ARFF dataset. The information is stored as a list of dictionaries. |
|
|
|
data_columns : list of str |
|
The list of the features to be selected. |
|
|
|
target_columns : list of str |
|
The list of the target variables to be selected. |
|
|
|
shape : tuple or None |
|
With `parser="liac-arff"`, when using a generator to load the data, |
|
one needs to provide the shape of the data beforehand. |
|
|
|
md5_checksum : str |
|
The MD5 checksum provided by OpenML to check the data integrity. |
|
|
|
n_retries : int, default=3 |
|
        Number of retries when HTTP errors are encountered. Errors with status
        code 412 won't be retried as they represent generic OpenML errors.
|
|
|
delay : float, default=1.0 |
|
Number of seconds between retries. |
|
|
|
parser : {"liac-arff", "pandas"} |
|
The parser used to parse the ARFF file. |
|
|
|
read_csv_kwargs : dict, default=None |
|
Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. |
|
        It allows overwriting the default options.
|
|
|
.. versionadded:: 1.3 |
|
|
|
Returns |
|
------- |
|
data : :class:`~sklearn.utils.Bunch` |
|
Dictionary-like object, with the following attributes. |
|
|
|
X : {ndarray, sparse matrix, dataframe} |
|
The data matrix. |
|
y : {ndarray, dataframe, series} |
|
The target. |
|
frame : dataframe or None |
|
A dataframe containing both `X` and `y`. `None` if |
|
`output_array_type != "pandas"`. |
|
categories : list of str or None |
|
The names of the features that are categorical. `None` if |
|
`output_array_type == "pandas"`. |
|
""" |
|
|
|
features_dict = {feature["name"]: feature for feature in openml_columns_info} |
|
|
|
if sparse: |
|
output_type = "sparse" |
|
elif as_frame: |
|
output_type = "pandas" |
|
else: |
|
output_type = "numpy" |
|
|
|
|
|
_verify_target_data_type(features_dict, target_columns) |
|
for name in target_columns: |
|
column_info = features_dict[name] |
|
n_missing_values = int(column_info["number_of_missing_values"]) |
|
if n_missing_values > 0: |
|
raise ValueError( |
|
f"Target column '{column_info['name']}' has {n_missing_values} missing " |
|
"values. Missing values are not supported for target columns." |
|
) |
|
|
|
no_retry_exception = None |
|
if parser == "pandas": |
|
|
|
|
|
from pandas.errors import ParserError |
|
|
|
no_retry_exception = ParserError |
|
|
|
X, y, frame, categories = _retry_with_clean_cache( |
|
url, data_home, no_retry_exception |
|
)(_load_arff_response)( |
|
url, |
|
data_home, |
|
parser=parser, |
|
output_type=output_type, |
|
openml_columns_info=features_dict, |
|
feature_names_to_select=data_columns, |
|
target_names_to_select=target_columns, |
|
shape=shape, |
|
md5_checksum=md5_checksum, |
|
n_retries=n_retries, |
|
delay=delay, |
|
read_csv_kwargs=read_csv_kwargs, |
|
) |
|
|
|
return Bunch( |
|
data=X, |
|
target=y, |
|
frame=frame, |
|
categories=categories, |
|
feature_names=data_columns, |
|
target_names=target_columns, |
|
) |
|
|
|
|
|
def _verify_target_data_type(features_dict, target_columns): |
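    """Check that all target columns have the same data type (all numeric or all
    categorical) and warn when a target is flagged as ignored or row identifier.
    """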
|
|
|
|
|
if not isinstance(target_columns, list): |
|
raise ValueError("target_column should be list, got: %s" % type(target_columns)) |
|
found_types = set() |
|
for target_column in target_columns: |
|
if target_column not in features_dict: |
|
raise KeyError(f"Could not find target_column='{target_column}'") |
|
if features_dict[target_column]["data_type"] == "numeric": |
|
found_types.add(np.float64) |
|
else: |
|
found_types.add(object) |
|
|
|
|
|
if features_dict[target_column]["is_ignore"] == "true": |
|
warn(f"target_column='{target_column}' has flag is_ignore.") |
|
if features_dict[target_column]["is_row_identifier"] == "true": |
|
warn(f"target_column='{target_column}' has flag is_row_identifier.") |
|
if len(found_types) > 1: |
|
raise ValueError( |
|
"Can only handle homogeneous multi-target datasets, " |
|
"i.e., all targets are either numeric or " |
|
"categorical." |
|
) |
|
|
|
|
|
def _valid_data_column_names(features_list, target_columns): |
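    """Return the feature names usable as data columns, i.e. columns that are not
    targets, not ignored, and not row identifiers.
    """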
|
|
|
|
|
|
|
|
|
valid_data_column_names = [] |
|
for feature in features_list: |
|
if ( |
|
feature["name"] not in target_columns |
|
and feature["is_ignore"] != "true" |
|
and feature["is_row_identifier"] != "true" |
|
): |
|
valid_data_column_names.append(feature["name"]) |
|
return valid_data_column_names |
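

# Illustrative sketch (not part of the original module): only columns that are
# neither targets, nor ignored, nor row identifiers are kept as data columns.
# The feature dicts below are made up but follow the OpenML features schema.
def _example_valid_data_column_names() -> List[str]:
    features_list = [
        {"name": "id", "is_ignore": "false", "is_row_identifier": "true"},
        {"name": "sepal_length", "is_ignore": "false", "is_row_identifier": "false"},
        {"name": "class", "is_ignore": "false", "is_row_identifier": "false"},
    ]
    # "id" is dropped as a row identifier and "class" as a target column,
    # leaving ["sepal_length"].
    return _valid_data_column_names(features_list, ["class"])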
|
|
|
|
|
@validate_params( |
|
{ |
|
"name": [str, None], |
|
"version": [Interval(Integral, 1, None, closed="left"), StrOptions({"active"})], |
|
"data_id": [Interval(Integral, 1, None, closed="left"), None], |
|
"data_home": [str, os.PathLike, None], |
|
"target_column": [str, list, None], |
|
"cache": [bool], |
|
"return_X_y": [bool], |
|
"as_frame": [bool, StrOptions({"auto"})], |
|
"n_retries": [Interval(Integral, 1, None, closed="left")], |
|
"delay": [Interval(Real, 0.0, None, closed="neither")], |
|
"parser": [ |
|
StrOptions({"auto", "pandas", "liac-arff"}), |
|
], |
|
"read_csv_kwargs": [dict, None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def fetch_openml( |
|
name: Optional[str] = None, |
|
*, |
|
version: Union[str, int] = "active", |
|
data_id: Optional[int] = None, |
|
data_home: Optional[Union[str, os.PathLike]] = None, |
|
target_column: Optional[Union[str, List]] = "default-target", |
|
cache: bool = True, |
|
return_X_y: bool = False, |
|
as_frame: Union[str, bool] = "auto", |
|
n_retries: int = 3, |
|
delay: float = 1.0, |
|
parser: str = "auto", |
|
read_csv_kwargs: Optional[Dict] = None, |
|
): |
|
"""Fetch dataset from openml by name or dataset id. |
|
|
|
Datasets are uniquely identified by either an integer ID or by a |
|
combination of name and version (i.e. there might be multiple |
|
versions of the 'iris' dataset). Please give either name or data_id |
|
(not both). In case a name is given, a version can also be |
|
provided. |
|
|
|
Read more in the :ref:`User Guide <openml>`. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
.. note:: EXPERIMENTAL |
|
|
|
The API is experimental (particularly the return value structure), |
|
and might have small backward-incompatible changes without notice |
|
or warning in future releases. |
|
|
|
Parameters |
|
---------- |
|
name : str, default=None |
|
String identifier of the dataset. Note that OpenML can have multiple |
|
datasets with the same name. |
|
|
|
version : int or 'active', default='active' |
|
Version of the dataset. Can only be provided if also ``name`` is given. |
|
If 'active' the oldest version that's still active is used. Since |
|
there may be more than one active version of a dataset, and those |
|
versions may fundamentally be different from one another, setting an |
|
exact version is highly recommended. |
|
|
|
data_id : int, default=None |
|
OpenML ID of the dataset. The most specific way of retrieving a |
|
dataset. If data_id is not given, name (and potential version) are |
|
used to obtain a dataset. |
|
|
|
data_home : str or path-like, default=None |
|
Specify another download and cache folder for the data sets. By default |
|
all scikit-learn data is stored in '~/scikit_learn_data' subfolders. |
|
|
|
target_column : str, list or None, default='default-target' |
|
Specify the column name in the data to use as target. If |
|
        'default-target', the standard target column as stored on the server
|
is used. If ``None``, all columns are returned as data and the |
|
target is ``None``. If list (of strings), all columns with these names |
|
are returned as multi-target (Note: not all scikit-learn classifiers |
|
can handle all types of multi-output combinations). |
|
|
|
cache : bool, default=True |
|
Whether to cache the downloaded datasets into `data_home`. |
|
|
|
return_X_y : bool, default=False |
|
If True, returns ``(data, target)`` instead of a Bunch object. See |
|
below for more information about the `data` and `target` objects. |
|
|
|
as_frame : bool or 'auto', default='auto' |
|
If True, the data is a pandas DataFrame including columns with |
|
appropriate dtypes (numeric, string or categorical). The target is |
|
a pandas DataFrame or Series depending on the number of target_columns. |
|
The Bunch will contain a ``frame`` attribute with the target and the |
|
data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas |
|
        DataFrames or Series as described above.
|
|
|
If `as_frame` is 'auto', the data and target will be converted to |
|
DataFrame or Series as if `as_frame` is set to True, unless the dataset |
|
is stored in sparse format. |
|
|
|
If `as_frame` is False, the data and target will be NumPy arrays and |
|
the `data` will only contain numerical values when `parser="liac-arff"` |
|
where the categories are provided in the attribute `categories` of the |
|
`Bunch` instance. When `parser="pandas"`, no ordinal encoding is made. |
|
|
|
.. versionchanged:: 0.24 |
|
The default value of `as_frame` changed from `False` to `'auto'` |
|
in 0.24. |
|
|
|
n_retries : int, default=3 |
|
Number of retries when HTTP errors or network timeouts are encountered. |
|
        Errors with status code 412 won't be retried as they represent generic
        OpenML errors.
|
|
|
delay : float, default=1.0 |
|
Number of seconds between retries. |
|
|
|
parser : {"auto", "pandas", "liac-arff"}, default="auto" |
|
Parser used to load the ARFF file. Two parsers are implemented: |
|
|
|
- `"pandas"`: this is the most efficient parser. However, it requires |
|
pandas to be installed and can only open dense datasets. |
|
- `"liac-arff"`: this is a pure Python ARFF parser that is much less |
|
memory- and CPU-efficient. It deals with sparse ARFF datasets. |
|
|
|
If `"auto"`, the parser is chosen automatically such that `"liac-arff"` |
|
is selected for sparse ARFF datasets, otherwise `"pandas"` is selected. |
|
|
|
.. versionadded:: 1.2 |
|
.. versionchanged:: 1.4 |
|
The default value of `parser` changes from `"liac-arff"` to |
|
`"auto"`. |
|
|
|
read_csv_kwargs : dict, default=None |
|
        Keyword arguments passed to :func:`pandas.read_csv` when loading the data
        from an ARFF file with the pandas parser. It can be used to overwrite
        some default parameters.
|
|
|
.. versionadded:: 1.3 |
|
|
|
Returns |
|
------- |
|
data : :class:`~sklearn.utils.Bunch` |
|
Dictionary-like object, with the following attributes. |
|
|
|
data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame |
|
The feature matrix. Categorical features are encoded as ordinals. |
|
target : np.array, pandas Series or DataFrame |
|
The regression target or classification labels, if applicable. |
|
Dtype is float if numeric, and object if categorical. If |
|
``as_frame`` is True, ``target`` is a pandas object. |
|
DESCR : str |
|
The full description of the dataset. |
|
feature_names : list |
|
The names of the dataset columns. |
|
        target_names : list
|
The names of the target columns. |
|
|
|
.. versionadded:: 0.22 |
|
|
|
categories : dict or None |
|
Maps each categorical feature name to a list of values, such |
|
            that the value encoded as ``i`` is the i-th in the list. If ``as_frame``
|
is True, this is None. |
|
details : dict |
|
More metadata from OpenML. |
|
frame : pandas DataFrame |
|
Only present when `as_frame=True`. DataFrame with ``data`` and |
|
``target``. |
|
|
|
(data, target) : tuple if ``return_X_y`` is True |
|
|
|
.. note:: EXPERIMENTAL |
|
|
|
This interface is **experimental** and subsequent releases may |
|
change attributes without notice (although there should only be |
|
minor changes to ``data`` and ``target``). |
|
|
|
Missing values in the 'data' are represented as NaN's. Missing values |
|
in 'target' are represented as NaN's (numerical target) or None |
|
(categorical target). |
|
|
|
Notes |
|
----- |
|
The `"pandas"` and `"liac-arff"` parsers can lead to different data types |
|
in the output. The notable differences are the following: |
|
|
|
- The `"liac-arff"` parser always encodes categorical features as `str` objects. |
|
To the contrary, the `"pandas"` parser instead infers the type while |
|
reading and numerical categories will be casted into integers whenever |
|
possible. |
|
- The `"liac-arff"` parser uses float64 to encode numerical features |
|
tagged as 'REAL' and 'NUMERICAL' in the metadata. The `"pandas"` |
|
      parser instead infers whether these numerical features correspond to
      integers and uses pandas' Integer extension dtype.
|
- In particular, classification datasets with integer categories are |
|
typically loaded as such `(0, 1, ...)` with the `"pandas"` parser while |
|
`"liac-arff"` will force the use of string encoded class labels such as |
|
`"0"`, `"1"` and so on. |
|
- The `"pandas"` parser will not strip single quotes - i.e. `'` - from |
|
string columns. For instance, a string `'my string'` will be kept as is |
|
while the `"liac-arff"` parser will strip the single quotes. For |
|
categorical columns, the single quotes are stripped from the values. |
|
|
|
In addition, when `as_frame=False` is used, the `"liac-arff"` parser |
|
returns ordinally encoded data where the categories are provided in the |
|
attribute `categories` of the `Bunch` instance. Instead, `"pandas"` returns |
|
    a NumPy array where the categories are not encoded.
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.datasets import fetch_openml |
|
>>> adult = fetch_openml("adult", version=2) # doctest: +SKIP |
|
>>> adult.frame.info() # doctest: +SKIP |
|
<class 'pandas.core.frame.DataFrame'> |
|
RangeIndex: 48842 entries, 0 to 48841 |
|
Data columns (total 15 columns): |
|
# Column Non-Null Count Dtype |
|
--- ------ -------------- ----- |
|
0 age 48842 non-null int64 |
|
1 workclass 46043 non-null category |
|
2 fnlwgt 48842 non-null int64 |
|
3 education 48842 non-null category |
|
4 education-num 48842 non-null int64 |
|
5 marital-status 48842 non-null category |
|
6 occupation 46033 non-null category |
|
7 relationship 48842 non-null category |
|
8 race 48842 non-null category |
|
9 sex 48842 non-null category |
|
10 capital-gain 48842 non-null int64 |
|
11 capital-loss 48842 non-null int64 |
|
12 hours-per-week 48842 non-null int64 |
|
13 native-country 47985 non-null category |
|
14 class 48842 non-null category |
|
dtypes: category(9), int64(6) |
|
memory usage: 2.7 MB |
|
""" |
|
if cache is False: |
|
|
|
data_home = None |
|
else: |
|
data_home = get_data_home(data_home=data_home) |
|
data_home = join(str(data_home), "openml") |
|
|
|
|
|
|
|
if name is not None: |
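        # OpenML is case-insensitive, but the file-based cache is not; normalize
        # the name to lowercase so equivalent names map to the same cache entry.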
|
|
|
|
|
name = name.lower() |
|
if data_id is not None: |
|
raise ValueError( |
|
"Dataset data_id={} and name={} passed, but you can only " |
|
"specify a numeric data_id or a name, not " |
|
"both.".format(data_id, name) |
|
) |
|
data_info = _get_data_info_by_name( |
|
name, version, data_home, n_retries=n_retries, delay=delay |
|
) |
|
data_id = data_info["did"] |
|
elif data_id is not None: |
|
|
|
if version != "active": |
|
raise ValueError( |
|
"Dataset data_id={} and version={} passed, but you can only " |
|
"specify a numeric data_id or a version, not " |
|
"both.".format(data_id, version) |
|
) |
|
else: |
|
raise ValueError( |
|
"Neither name nor data_id are provided. Please provide name or data_id." |
|
) |
|
|
|
data_description = _get_data_description_by_id(data_id, data_home) |
|
if data_description["status"] != "active": |
|
warn( |
|
"Version {} of dataset {} is inactive, meaning that issues have " |
|
"been found in the dataset. Try using a newer version from " |
|
"this URL: {}".format( |
|
data_description["version"], |
|
data_description["name"], |
|
data_description["url"], |
|
) |
|
) |
|
if "error" in data_description: |
|
warn( |
|
"OpenML registered a problem with the dataset. It might be " |
|
"unusable. Error: {}".format(data_description["error"]) |
|
) |
|
if "warning" in data_description: |
|
warn( |
|
"OpenML raised a warning on the dataset. It might be " |
|
"unusable. Warning: {}".format(data_description["warning"]) |
|
) |
|
|
|
return_sparse = data_description["format"].lower() == "sparse_arff" |
|
as_frame = not return_sparse if as_frame == "auto" else as_frame |
|
if parser == "auto": |
|
parser_ = "liac-arff" if return_sparse else "pandas" |
|
else: |
|
parser_ = parser |
|
|
|
if parser_ == "pandas": |
|
try: |
|
check_pandas_support("`fetch_openml`") |
|
except ImportError as exc: |
|
if as_frame: |
|
err_msg = ( |
|
"Returning pandas objects requires pandas to be installed. " |
|
"Alternatively, explicitly set `as_frame=False` and " |
|
"`parser='liac-arff'`." |
|
) |
|
else: |
|
err_msg = ( |
|
f"Using `parser={parser!r}` with dense data requires pandas to be " |
|
"installed. Alternatively, explicitly set `parser='liac-arff'`." |
|
) |
|
raise ImportError(err_msg) from exc |
|
|
|
if return_sparse: |
|
if as_frame: |
|
raise ValueError( |
|
"Sparse ARFF datasets cannot be loaded with as_frame=True. " |
|
"Use as_frame=False or as_frame='auto' instead." |
|
) |
|
if parser_ == "pandas": |
|
raise ValueError( |
|
f"Sparse ARFF datasets cannot be loaded with parser={parser!r}. " |
|
"Use parser='liac-arff' or parser='auto' instead." |
|
) |
|
|
|
|
|
features_list = _get_data_features(data_id, data_home) |
|
|
|
if not as_frame: |
|
for feature in features_list: |
|
if "true" in (feature["is_ignore"], feature["is_row_identifier"]): |
|
continue |
|
if feature["data_type"] == "string": |
|
raise ValueError( |
|
"STRING attributes are not supported for " |
|
"array representation. Try as_frame=True" |
|
) |
|
|
|
if target_column == "default-target": |
|
|
|
|
|
|
|
target_columns = [ |
|
feature["name"] |
|
for feature in features_list |
|
if feature["is_target"] == "true" |
|
] |
|
elif isinstance(target_column, str): |
|
|
|
target_columns = [target_column] |
|
elif target_column is None: |
|
target_columns = [] |
|
else: |
|
|
|
target_columns = target_column |
|
data_columns = _valid_data_column_names(features_list, target_columns) |
|
|
|
shape: Optional[Tuple[int, int]] |
|
|
|
if not return_sparse: |
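        # The shape counts all features, including ignored ones, so that column
        # indexes stay aligned while parsing the dense ARFF data.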
|
|
|
|
|
data_qualities = _get_data_qualities(data_id, data_home) |
|
shape = _get_num_samples(data_qualities), len(features_list) |
|
else: |
|
shape = None |
|
|
|
|
|
url = _DATA_FILE.format(data_description["file_id"]) |
|
bunch = _download_data_to_bunch( |
|
url, |
|
return_sparse, |
|
data_home, |
|
as_frame=bool(as_frame), |
|
openml_columns_info=features_list, |
|
shape=shape, |
|
target_columns=target_columns, |
|
data_columns=data_columns, |
|
md5_checksum=data_description["md5_checksum"], |
|
n_retries=n_retries, |
|
delay=delay, |
|
parser=parser_, |
|
read_csv_kwargs=read_csv_kwargs, |
|
) |
|
|
|
if return_X_y: |
|
return bunch.data, bunch.target |
|
|
|
description = "{}\n\nDownloaded from openml.org.".format( |
|
data_description.pop("description") |
|
) |
|
|
|
bunch.update( |
|
DESCR=description, |
|
details=data_description, |
|
url="https://www.openml.org/d/{}".format(data_id), |
|
) |
|
|
|
return bunch |
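

# Illustrative sketch (not part of the original module): fetching a dataset by
# data_id and unpacking (X, y) directly. This requires network access and pandas;
# data_id 61 refers to the OpenML "iris" dataset and is used purely as an example.
def _example_fetch_openml():
    X, y = fetch_openml(data_id=61, as_frame=True, parser="pandas", return_X_y=True)
    # For this dataset, X is a DataFrame with 150 rows and 4 feature columns and
    # y is a Series holding the species labels.
    return X, y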
|
|