Spaces:
Running
Running
from contextlib import nullcontext
import operator

import numpy as np
from .numeric import uint8, ndarray, dtype
from numpy.compat import os_fspath, is_pathlib_path
from numpy.core.overrides import set_module

__all__ = ['memmap']
# Alias for the ``dtype`` constructor, so it stays reachable inside
# functions that take a parameter which is itself named ``dtype``.
dtypedescr = dtype

# Short mode strings accepted for the ``mode`` argument.
valid_filemodes = ["r", "c", "r+", "w+"]

# The short modes that open the file for both reading and writing.
writeable_filemodes = ["r+", "w+"]

# Long-form spellings of ``mode`` mapped onto their short equivalents.
mode_equivalents = dict(
    readonly="r",
    copyonwrite="c",
    readwrite="r+",
    write="w+",
)
class memmap(ndarray):
    """Create a memory-map to an array stored in a *binary* file on disk.

    Memory-mapped files are used for accessing small segments of large files
    on disk, without reading the entire file into memory.  NumPy's
    memmap's are array-like objects.  This differs from Python's ``mmap``
    module, which uses file-like objects.

    This subclass of ndarray has some unpleasant interactions with
    some operations, because it doesn't quite fit properly as a subclass.
    An alternative to using this subclass is to create the ``mmap``
    object yourself, then create an ndarray with ndarray.__new__ directly,
    passing the object created in its 'buffer=' parameter.

    This class may at some point be turned into a factory function
    which returns a view into an mmap buffer.

    Flush the memmap instance to write the changes to the file. Currently there
    is no API to close the underlying ``mmap``. It is tricky to ensure the
    resource is actually closed, since it may be shared between different
    memmap instances.

    Parameters
    ----------
    filename : str, file-like object, or pathlib.Path instance
        The file name or file object to be used as the array data buffer.
    dtype : data-type, optional
        The data-type used to interpret the file contents.
        Default is `uint8`.
    mode : {'r+', 'r', 'w+', 'c'}, optional
        The file is opened in this mode:

        +------+-------------------------------------------------------------+
        | 'r'  | Open existing file for reading only.                        |
        +------+-------------------------------------------------------------+
        | 'r+' | Open existing file for reading and writing.                 |
        +------+-------------------------------------------------------------+
        | 'w+' | Create or overwrite existing file for reading and writing.  |
        |      | If ``mode == 'w+'`` then `shape` must also be specified.    |
        +------+-------------------------------------------------------------+
        | 'c'  | Copy-on-write: assignments affect data in memory, but       |
        |      | changes are not saved to disk.  The file on disk is         |
        |      | read-only.                                                  |
        +------+-------------------------------------------------------------+

        Default is 'r+'.
    offset : int, optional
        In the file, array data starts at this offset. Since `offset` is
        measured in bytes, it should normally be a multiple of the byte-size
        of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
        file are valid; The file will be extended to accommodate the
        additional data. By default, ``memmap`` will start at the beginning of
        the file, even if ``filename`` is a file pointer ``fp`` and
        ``fp.tell() != 0``.
    shape : int or sequence of ints, optional
        The desired shape of the array. If ``mode == 'r'`` and the number
        of remaining bytes after `offset` is not a multiple of the byte-size
        of `dtype`, you must specify `shape`. By default, the returned array
        will be 1-D with the number of elements determined by file size
        and data-type.
    order : {'C', 'F'}, optional
        Specify the order of the ndarray memory layout:
        :term:`row-major`, C-style or :term:`column-major`,
        Fortran-style.  This only has an effect if the shape is
        greater than 1-D.  The default order is 'C'.

    Attributes
    ----------
    filename : str or pathlib.Path instance
        Path to the mapped file.
    offset : int
        Offset position in the file.
    mode : str
        File mode.

    Methods
    -------
    flush
        Flush any changes in memory to file on disk.
        When you delete a memmap object, flush is called first to write
        changes to disk.

    See Also
    --------
    lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

    Notes
    -----
    The memmap object can be used anywhere an ndarray is accepted.
    Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
    ``True``.

    Memory-mapped files cannot be larger than 2GB on 32-bit systems.

    When a memmap causes a file to be created or extended beyond its
    current size in the filesystem, the contents of the new part are
    unspecified. On systems with POSIX filesystem semantics, the extended
    part will be filled with zero bytes.

    Examples
    --------
    >>> data = np.arange(12, dtype='float32')
    >>> data.resize((3,4))

    This example uses a temporary file so that doctest doesn't write
    files to your directory. You would use a 'normal' filename.

    >>> from tempfile import mkdtemp
    >>> import os.path as path
    >>> filename = path.join(mkdtemp(), 'newfile.dat')

    Create a memmap with dtype and shape that matches our data:

    >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
    >>> fp
    memmap([[0., 0., 0., 0.],
            [0., 0., 0., 0.],
            [0., 0., 0., 0.]], dtype=float32)

    Write data to memmap array:

    >>> fp[:] = data[:]
    >>> fp
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    >>> fp.filename == path.abspath(filename)
    True

    Flushes memory changes to disk in order to read them back

    >>> fp.flush()

    Load the memmap and verify data was stored:

    >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> newfp
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    Read-only memmap:

    >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> fpr.flags.writeable
    False

    Copy-on-write memmap:

    >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
    >>> fpc.flags.writeable
    True

    It's possible to assign to copy-on-write array, but values are only
    written into the memory copy of the array, and not written to disk:

    >>> fpc
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)
    >>> fpc[0,:] = 0
    >>> fpc
    memmap([[  0.,   0.,   0.,   0.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    File on disk is unchanged:

    >>> fpr
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    Offset into a memmap:

    >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
    >>> fpo
    memmap([  4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.], dtype=float32)

    """

    __array_priority__ = -100.0

    @staticmethod
    def _normalize_shape(shape):
        """Return `shape` as a plain tuple of dimensions.

        Parameters
        ----------
        shape : int or sequence of ints
            A single dimension (anything supporting ``operator.index``) or
            a tuple, list or other iterable of dimensions.

        Returns
        -------
        tuple
            ``(n,)`` for a single integer ``n``, otherwise ``tuple(shape)``.

        Raises
        ------
        TypeError
            If `shape` is neither integer-like nor iterable.
        """
        if isinstance(shape, (tuple, list)):
            return tuple(shape)
        try:
            return (operator.index(shape),)
        except TypeError:
            # Not integer-like: treat it as a generic sequence of dimensions.
            return tuple(shape)

    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import os.path

        # Long-form mode names are translated to their short form; anything
        # else must already be one of the short forms.
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError(
                    "mode must be one of {!r} (got {!r})"
                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
                ) from None

        if mode == 'w+' and shape is None:
            raise ValueError("shape must be given")

        if hasattr(filename, 'read'):
            # A file object was passed in: use it as-is and do not close it
            # when the ``with`` block below exits.
            f_ctx = nullcontext(filename)
        else:
            # 'c' is not a mode ``open`` understands, so the file is opened
            # read-only in that case.
            f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')

        with f_ctx as fid:
            # Seek to the end to learn the current file length.
            fid.seek(0, 2)
            flen = fid.tell()
            descr = dtypedescr(dtype)
            itemsize = descr.itemsize

            if shape is None:
                # Infer a 1-D shape from the bytes available after `offset`.
                nbytes = flen - offset
                if nbytes % itemsize:
                    raise ValueError("Size of available data is not a "
                                     "multiple of the data-type size.")
                size = nbytes // itemsize
                shape = (size,)
            else:
                shape = memmap._normalize_shape(shape)
                size = np.intp(1)  # avoid default choice of np.int_, which might overflow
                for k in shape:
                    size *= k

            # Total number of file bytes needed, counted from the file start.
            nbytes = int(offset + size*itemsize)

            if mode in ('w+', 'r+') and flen < nbytes:
                # Extend the file by writing a single byte at the new end.
                fid.seek(nbytes - 1, 0)
                fid.write(b'\0')
                fid.flush()

            if mode == 'c':
                acc = mmap.ACCESS_COPY
            elif mode == 'r':
                acc = mmap.ACCESS_READ
            else:
                acc = mmap.ACCESS_WRITE

            # Round the mapping start down to a multiple of
            # mmap.ALLOCATIONGRANULARITY; the remainder becomes the offset of
            # the array inside the mapped buffer.
            start = offset - offset % mmap.ALLOCATIONGRANULARITY
            nbytes -= start
            array_offset = offset - start
            mm = mmap.mmap(fid.fileno(), nbytes, access=acc, offset=start)

            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
                                   offset=array_offset, order=order)
            self._mmap = mm
            self.offset = offset
            self.mode = mode

            if is_pathlib_path(filename):
                # special case - if we were constructed with a pathlib.path,
                # then filename is a path object, not a string
                self.filename = filename.resolve()
            elif hasattr(fid, "name") and isinstance(fid.name, str):
                # py3 returns int for TemporaryFile().name
                self.filename = os.path.abspath(fid.name)
            # same as memmap copies (e.g. memmap + 1)
            else:
                self.filename = None

        return self

    def __array_finalize__(self, obj):
        # Inherit the mapping metadata only when the new array still shares
        # memory with a memmap; otherwise mark it as detached.
        if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
            self._mmap = obj._mmap
            self.filename = obj.filename
            self.offset = obj.offset
            self.mode = obj.mode
        else:
            self._mmap = None
            self.filename = None
            self.offset = None
            self.mode = None

    def flush(self):
        """
        Write any changes in the array to the file on disk.

        For further information, see `memmap`.

        Parameters
        ----------
        None

        See Also
        --------
        memmap

        """
        if self.base is not None and hasattr(self.base, 'flush'):
            self.base.flush()

    def __array_wrap__(self, arr, context=None):
        arr = super().__array_wrap__(arr, context)

        # Return a memmap if a memmap was given as the output of the
        # ufunc. Leave the arr class unchanged if self is not a memmap
        # to keep original memmap subclasses behavior
        if self is arr or type(self) is not memmap:
            return arr
        # Return scalar instead of 0d memmap, e.g. for np.sum with
        # axis=None
        if arr.shape == ():
            return arr[()]
        # Return ndarray otherwise
        return arr.view(np.ndarray)

    def __getitem__(self, index):
        res = super().__getitem__(index)
        # A memmap result with no mapping attached is handed back as a
        # plain ndarray view instead.
        if type(res) is memmap and res._mmap is None:
            return res.view(type=ndarray)
        return res