|
"""Labeled Faces in the Wild (LFW) dataset |
|
|
|
This dataset is a collection of JPEG pictures of famous people collected |
|
over the internet, all details are available on the official website: |
|
|
|
http://vis-www.cs.umass.edu/lfw/ |
|
""" |
|
|
|
|
|
|
|
|
|
import logging |
|
from numbers import Integral, Real |
|
from os import PathLike, listdir, makedirs, remove |
|
from os.path import exists, isdir, join |
|
|
|
import numpy as np |
|
from joblib import Memory |
|
|
|
from ..utils import Bunch |
|
from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params |
|
from ..utils.fixes import tarfile_extractall |
|
from ._base import ( |
|
RemoteFileMetadata, |
|
_fetch_remote, |
|
get_data_home, |
|
load_descr, |
|
) |
|
|
|
logger = logging.getLogger(__name__)

# Remote location of the original (non-aligned) LFW image archive,
# served from a figshare mirror of the official UMass dataset.
ARCHIVE = RemoteFileMetadata(
    filename="lfw.tgz",
    url="https://ndownloader.figshare.com/files/5976018",
    checksum="055f7d9c632d7370e6fb4afc7468d40f970c34a80d4c6f50ffec63f5a8d536c0",
)

# Remote location of the "funneled" (deep-funneled alignment) variant of
# the image archive.
FUNNELED_ARCHIVE = RemoteFileMetadata(
    filename="lfw-funneled.tgz",
    url="https://ndownloader.figshare.com/files/5976015",
    checksum="b47c8422c8cded889dc5a13418c4bc2abbda121092b3533a83306f90d900100a",
)

# Pair-index metadata files shared by both archive variants: the
# development train/test splits and the official 10-fold evaluation pairs.
TARGETS = (
    RemoteFileMetadata(
        filename="pairsDevTrain.txt",
        url="https://ndownloader.figshare.com/files/5976012",
        checksum="1d454dada7dfeca0e7eab6f65dc4e97a6312d44cf142207be28d688be92aabfa",
    ),
    RemoteFileMetadata(
        filename="pairsDevTest.txt",
        url="https://ndownloader.figshare.com/files/5976009",
        checksum="7cb06600ea8b2814ac26e946201cdb304296262aad67d046a16a7ec85d0ff87c",
    ),
    RemoteFileMetadata(
        filename="pairs.txt",
        url="https://ndownloader.figshare.com/files/5976006",
        checksum="ea42330c62c92989f9d7c03237ed5d591365e89b3e649747777b70e692dc1592",
    ),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _check_fetch_lfw(
    data_home=None, funneled=True, download_if_missing=True, n_retries=3, delay=1.0
):
    """Helper function to download any missing LFW data.

    Ensures the metadata files and the (funneled or original) image
    archive are present under ``<data_home>/lfw_home``, downloading and
    extracting them when allowed.

    Parameters
    ----------
    data_home : str or path-like, default=None
        Cache folder for the datasets; resolved by :func:`get_data_home`.
    funneled : bool, default=True
        Whether to use the funneled (aligned) image archive.
    download_if_missing : bool, default=True
        If False, raise OSError instead of downloading missing files.
    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.
    delay : float, default=1.0
        Number of seconds between retries.

    Returns
    -------
    lfw_home : str
        Path of the LFW cache folder.
    data_folder_path : str
        Path of the folder containing the extracted jpeg images.

    Raises
    ------
    OSError
        If a required file is missing and ``download_if_missing`` is False.
    """

    data_home = get_data_home(data_home=data_home)
    lfw_home = join(data_home, "lfw_home")

    if not exists(lfw_home):
        makedirs(lfw_home)

    # Fetch the small pair-index metadata files first; they are needed by
    # both the "people" and "pairs" loaders regardless of the variant.
    for target in TARGETS:
        target_filepath = join(lfw_home, target.filename)
        if not exists(target_filepath):
            if download_if_missing:
                logger.info("Downloading LFW metadata: %s", target.url)
                _fetch_remote(
                    target, dirname=lfw_home, n_retries=n_retries, delay=delay
                )
            else:
                raise OSError("%s is missing" % target_filepath)

    # The two archive variants extract to different folder names.
    if funneled:
        data_folder_path = join(lfw_home, "lfw_funneled")
        archive = FUNNELED_ARCHIVE
    else:
        data_folder_path = join(lfw_home, "lfw")
        archive = ARCHIVE

    if not exists(data_folder_path):
        archive_path = join(lfw_home, archive.filename)
        if not exists(archive_path):
            if download_if_missing:
                logger.info("Downloading LFW data (~200MB): %s", archive.url)
                _fetch_remote(
                    archive, dirname=lfw_home, n_retries=n_retries, delay=delay
                )
            else:
                raise OSError("%s is missing" % archive_path)

        # Deferred import: tarfile is only needed when a fresh archive has
        # to be extracted.
        import tarfile

        logger.debug("Decompressing the data archive to %s", data_folder_path)
        with tarfile.open(archive_path, "r:gz") as fp:
            tarfile_extractall(fp, path=lfw_home)
        # The archive is no longer needed once extracted; free the ~200MB.
        remove(archive_path)

    return lfw_home, data_folder_path
|
|
|
|
|
def _load_imgs(file_paths, slice_, color, resize):
    """Internally used to load images.

    Parameters
    ----------
    file_paths : list of str
        Paths of the jpeg files to load.
    slice_ : tuple of slice or None
        2D (height, width) slice used to crop each 250 x 250 source
        image; ``None`` (or a ``None`` element) falls back to the full
        frame.
    color : bool
        If True keep the 3 RGB channels, otherwise average them to a
        single gray level channel.
    resize : float or None
        Ratio used to resize each cropped picture; ``None`` disables
        resizing.

    Returns
    -------
    faces : ndarray of shape (n_faces, h, w) or (n_faces, h, w, 3)
        The decoded images as float32 values scaled to [0, 1].

    Raises
    ------
    ImportError
        If PIL (Pillow) is not installed.
    RuntimeError
        If an image file cannot be decoded.
    """
    try:
        from PIL import Image
    except ImportError as e:
        # Chain the original error so the real import failure stays
        # visible in the traceback.
        raise ImportError(
            "The Python Imaging Library (PIL) is required to load data "
            "from jpeg files. Please refer to "
            "https://pillow.readthedocs.io/en/stable/installation.html "
            "for installing PIL."
        ) from e

    # Compute the portion of the images to load to respect the slice_
    # parameter given by the caller; LFW source images are 250 x 250.
    default_slice = (slice(0, 250), slice(0, 250))
    if slice_ is None:
        slice_ = default_slice
    else:
        slice_ = tuple(s or ds for s, ds in zip(slice_, default_slice))

    h_slice, w_slice = slice_
    h = (h_slice.stop - h_slice.start) // (h_slice.step or 1)
    w = (w_slice.stop - w_slice.start) // (w_slice.step or 1)

    if resize is not None:
        resize = float(resize)
        h = int(resize * h)
        w = int(resize * w)

    # Allocate the contiguous output array up front.
    n_faces = len(file_paths)
    if not color:
        faces = np.zeros((n_faces, h, w), dtype=np.float32)
    else:
        faces = np.zeros((n_faces, h, w, 3), dtype=np.float32)

    for i, file_path in enumerate(file_paths):
        if i % 1000 == 0:
            logger.debug("Loading face #%05d / %05d", i + 1, n_faces)

        # Use a context manager so the underlying file handle is closed
        # as soon as the pixel data has been decoded — previously up to
        # thousands of handles were left open until garbage collection.
        with Image.open(file_path) as pil_img:
            pil_img = pil_img.crop(
                (w_slice.start, h_slice.start, w_slice.stop, h_slice.stop)
            )
            if resize is not None:
                pil_img = pil_img.resize((w, h))
            face = np.asarray(pil_img, dtype=np.float32)

        if face.ndim == 0:
            raise RuntimeError(
                "Failed to read the image file %s, "
                "Please make sure that libjpeg is installed" % file_path
            )

        # Scale uint8 coded colors to the [0.0, 1.0] floats expected by
        # scikit-learn estimators.
        face /= 255.0
        if not color:
            # Average the RGB channels to a single gray level channel.
            face = face.mean(axis=2)

        faces[i, ...] = face

    return faces
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fetch_lfw_people(
    data_folder_path, slice_=None, color=False, resize=None, min_faces_per_person=0
):
    """Perform the actual data loading for the lfw people dataset

    This operation is meant to be cached by a joblib wrapper.
    """
    # Scan the on-disk layout (one sub-folder per person, one jpeg per
    # picture) and keep only people with enough pictures.
    person_names = []
    file_paths = []
    for entry in sorted(listdir(data_folder_path)):
        candidate_dir = join(data_folder_path, entry)
        if not isdir(candidate_dir):
            continue
        picture_paths = [
            join(candidate_dir, filename)
            for filename in sorted(listdir(candidate_dir))
        ]
        if len(picture_paths) >= min_faces_per_person:
            display_name = entry.replace("_", " ")
            person_names.extend([display_name] * len(picture_paths))
            file_paths.extend(picture_paths)

    n_faces = len(file_paths)
    if n_faces == 0:
        raise ValueError(
            "min_faces_per_person=%d is too restrictive" % min_faces_per_person
        )

    # Turn the (sorted) person names into integer class labels.
    target_names = np.unique(person_names)
    target = np.searchsorted(target_names, person_names)

    faces = _load_imgs(file_paths, slice_, color, resize)

    # Shuffle with a fixed-seed RNG so that samples of the same person are
    # not all contiguous in the returned arrays.
    permutation = np.arange(n_faces)
    np.random.RandomState(42).shuffle(permutation)
    return faces[permutation], target[permutation], target_names
|
|
|
|
|
@validate_params(
    {
        "data_home": [str, PathLike, None],
        "funneled": ["boolean"],
        "resize": [Interval(Real, 0, None, closed="neither"), None],
        "min_faces_per_person": [Interval(Integral, 0, None, closed="left"), None],
        "color": ["boolean"],
        "slice_": [tuple, Hidden(None)],
        "download_if_missing": ["boolean"],
        "return_X_y": ["boolean"],
        "n_retries": [Interval(Integral, 1, None, closed="left")],
        "delay": [Interval(Real, 0.0, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def fetch_lfw_people(
    *,
    data_home=None,
    funneled=True,
    resize=0.5,
    min_faces_per_person=0,
    color=False,
    slice_=(slice(70, 195), slice(78, 172)),
    download_if_missing=True,
    return_X_y=False,
    n_retries=3,
    delay=1.0,
):
    """Load the Labeled Faces in the Wild (LFW) people dataset \
(classification).

    Download it if necessary.

    =================   =======================
    Classes                                5749
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    For a usage example of this dataset, see
    :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`.

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    funneled : bool, default=True
        Download and use the funneled variant of the dataset.

    resize : float or None, default=0.5
        Ratio used to resize the each face picture. If `None`, no resizing is
        performed.

    min_faces_per_person : int, default=0
        The extracted dataset will only retain pictures of people that have at
        least `min_faces_per_person` different pictures.

    color : bool, default=False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid use statistical
        correlation from the background.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : bool, default=False
        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
        object. See below for more information about the `dataset.data` and
        `dataset.target` object.

        .. versionadded:: 0.20

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

        .. versionadded:: 1.5

    delay : float, default=1.0
        Number of seconds between retries.

        .. versionadded:: 1.5

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : numpy array of shape (13233, 2914)
            Each row corresponds to a ravelled face image
            of original size 62 x 47 pixels.
            Changing the ``slice_`` or resize parameters will change the
            shape of the output.
        images : numpy array of shape (13233, 62, 47)
            Each row is a face image corresponding to one of the 5749 people in
            the dataset. Changing the ``slice_``
            or resize parameters will change the shape of the output.
        target : numpy array of shape (13233,)
            Labels associated to each face image.
            Those labels range from 0-5748 and correspond to the person IDs.
        target_names : numpy array of shape (5749,)
            Names of all persons in the dataset.
            Position in array corresponds to the person ID in the target array.
        DESCR : str
            Description of the Labeled Faces in the Wild (LFW) dataset.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of
        shape (n_samples, n_features) with each row representing one
        sample and each column representing the features. The second
        ndarray of shape (n_samples,) containing the target samples.

        .. versionadded:: 0.20

    Examples
    --------
    >>> from sklearn.datasets import fetch_lfw_people
    >>> lfw_people = fetch_lfw_people()
    >>> lfw_people.data.shape
    (13233, 2914)
    >>> lfw_people.target.shape
    (13233,)
    >>> for name in lfw_people.target_names[:5]:
    ...    print(name)
    AJ Cook
    AJ Lamas
    Aaron Eckhart
    Aaron Guiel
    Aaron Patterson
    """
    # Make sure the data is downloaded and extracted locally.
    lfw_home, data_folder_path = _check_fetch_lfw(
        data_home=data_home,
        funneled=funneled,
        download_if_missing=download_if_missing,
        n_retries=n_retries,
        delay=delay,
    )
    logger.debug("Loading LFW people faces from %s", lfw_home)

    # Wrap the loader in a joblib disk cache so repeated calls with the
    # same parameters skip the expensive jpeg decoding.
    m = Memory(location=lfw_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_people)

    # Load and memoize the pixel dataset for the given parameters.
    faces, target, target_names = load_func(
        data_folder_path,
        resize=resize,
        min_faces_per_person=min_faces_per_person,
        color=color,
        slice_=slice_,
    )

    # Flatten each image into a single feature row.
    X = faces.reshape(len(faces), -1)

    fdescr = load_descr("lfw.rst")

    if return_X_y:
        return X, target

    # Pack the results into a Bunch instance.
    return Bunch(
        data=X, images=faces, target=target, target_names=target_names, DESCR=fdescr
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fetch_lfw_pairs(
    index_file_path, data_folder_path, slice_=None, color=False, resize=None
):
    """Perform the actual data loading for the LFW pairs dataset

    This operation is meant to be cached by a joblib wrapper.
    """
    # Parse the index file, keeping only the lines that describe a pair:
    # 3 tab-separated fields for a matching pair, 4 for a mismatched one.
    with open(index_file_path, "rb") as index_file:
        parsed_lines = [line.decode().strip().split("\t") for line in index_file]
    pair_specs = [fields for fields in parsed_lines if len(fields) > 2]
    n_pairs = len(pair_specs)

    # Resolve each spec into two jpeg file paths and a same/different label.
    target = np.zeros(n_pairs, dtype=int)
    file_paths = []
    for i, fields in enumerate(pair_specs):
        n_fields = len(fields)
        if n_fields == 3:
            # Same person: one name and two 1-based picture indices.
            target[i] = 1
            pair = (
                (fields[0], int(fields[1]) - 1),
                (fields[0], int(fields[2]) - 1),
            )
        elif n_fields == 4:
            # Different persons: two (name, 1-based index) entries.
            target[i] = 0
            pair = (
                (fields[0], int(fields[1]) - 1),
                (fields[2], int(fields[3]) - 1),
            )
        else:
            raise ValueError("invalid line %d: %r" % (i + 1, fields))
        for name, idx in pair:
            try:
                person_folder = join(data_folder_path, name)
            except TypeError:
                # Defensive: decode bytes names before joining.
                person_folder = join(data_folder_path, str(name, "UTF-8"))
            filenames = sorted(listdir(person_folder))
            file_paths.append(join(person_folder, filenames[idx]))

    # Load the flat image stack, then fold consecutive images into pairs:
    # (2 * n_pairs, ...) -> (n_pairs, 2, ...).
    pairs = _load_imgs(file_paths, slice_, color, resize)
    pairs = pairs.reshape((pairs.shape[0] // 2, 2) + pairs.shape[1:])

    return pairs, target, np.array(["Different persons", "Same person"])
|
|
|
|
|
@validate_params(
    {
        "subset": [StrOptions({"train", "test", "10_folds"})],
        "data_home": [str, PathLike, None],
        "funneled": ["boolean"],
        "resize": [Interval(Real, 0, None, closed="neither"), None],
        "color": ["boolean"],
        "slice_": [tuple, Hidden(None)],
        "download_if_missing": ["boolean"],
        "n_retries": [Interval(Integral, 1, None, closed="left")],
        "delay": [Interval(Real, 0.0, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def fetch_lfw_pairs(
    *,
    subset="train",
    data_home=None,
    funneled=True,
    resize=0.5,
    color=False,
    slice_=(slice(70, 195), slice(78, 172)),
    download_if_missing=True,
    n_retries=3,
    delay=1.0,
):
    """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).

    Download it if necessary.

    =================   =======================
    Classes                                   2
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    In the official `README.txt`_ this task is described as the
    "Restricted" task.  As I am not sure as to implement the
    "Unrestricted" variant correctly, I left it as unsupported for now.

      .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt

    The original images are 250 x 250 pixels, but the default slice and resize
    arguments reduce them to 62 x 47.

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    subset : {'train', 'test', '10_folds'}, default='train'
        Select the dataset to load: 'train' for the development training
        set, 'test' for the development test set, and '10_folds' for the
        official evaluation set that is meant to be used with a 10-folds
        cross validation.

    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    funneled : bool, default=True
        Download and use the funneled variant of the dataset.

    resize : float, default=0.5
        Ratio used to resize the each face picture.

    color : bool, default=False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid use statistical
        correlation from the background.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

        .. versionadded:: 1.5

    delay : float, default=1.0
        Number of seconds between retries.

        .. versionadded:: 1.5

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (2200, 5828). Shape depends on ``subset``.
            Each row corresponds to 2 ravel'd face images
            of original size 62 x 47 pixels.
            Changing the ``slice_``, ``resize`` or ``subset`` parameters
            will change the shape of the output.
        pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset``
            Each row has 2 face images corresponding
            to same or different person from the dataset
            containing 5749 people. Changing the ``slice_``,
            ``resize`` or ``subset`` parameters will change the shape of the
            output.
        target : numpy array of shape (2200,). Shape depends on ``subset``.
            Labels associated to each pair of images.
            The two label values being different persons or the same person.
        target_names : numpy array of shape (2,)
            Explains the target values of the target array.
            0 corresponds to "Different persons", 1 corresponds to
            "Same person".
        DESCR : str
            Description of the Labeled Faces in the Wild (LFW) dataset.

    Examples
    --------
    >>> from sklearn.datasets import fetch_lfw_pairs
    >>> lfw_pairs_train = fetch_lfw_pairs(subset='train')
    >>> list(lfw_pairs_train.target_names)
    [np.str_('Different persons'), np.str_('Same person')]
    >>> lfw_pairs_train.pairs.shape
    (2200, 2, 62, 47)
    >>> lfw_pairs_train.data.shape
    (2200, 5828)
    >>> lfw_pairs_train.target.shape
    (2200,)
    """
    # Make sure the data is downloaded and extracted locally.
    lfw_home, data_folder_path = _check_fetch_lfw(
        data_home=data_home,
        funneled=funneled,
        download_if_missing=download_if_missing,
        n_retries=n_retries,
        delay=delay,
    )
    logger.debug("Loading %s LFW pairs from %s", subset, lfw_home)

    # Wrap the loader in a joblib disk cache so repeated calls with the
    # same parameters skip the expensive jpeg decoding.
    m = Memory(location=lfw_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_pairs)

    # Map the requested subset to its pair-index metadata file.
    label_filenames = {
        "train": "pairsDevTrain.txt",
        "test": "pairsDevTest.txt",
        "10_folds": "pairs.txt",
    }
    # Defensive check; @validate_params already restricts ``subset`` to
    # these keys.
    if subset not in label_filenames:
        raise ValueError(
            "subset='%s' is invalid: should be one of %r"
            % (subset, list(sorted(label_filenames.keys())))
        )
    index_file_path = join(lfw_home, label_filenames[subset])

    # Load and memoize the pairs dataset for the given parameters.
    pairs, target, target_names = load_func(
        index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_
    )

    fdescr = load_descr("lfw.rst")

    # Pack the results into a Bunch instance.
    return Bunch(
        data=pairs.reshape(len(pairs), -1),
        pairs=pairs,
        target=target,
        target_names=target_names,
        DESCR=fdescr,
    )
|
|