|
"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes.""" |
|
|
|
from typing import BinaryIO, Optional |
|
|
|
from .insecure_hashlib import sha1, sha256 |
|
|
|
|
|
def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
    """
    Hash the content of a binary file object with sha256, consuming it chunk by chunk.

    The stream is read from its current position, `chunk_size` bytes at a time, until a
    read returns an empty result. The whole content is therefore never requested at once.

    Args:
        fileobj (file-like object):
            The binary stream to hash, typically obtained with `open(path, "rb")`.
        chunk_size (`int`, *optional*):
            The number of bytes requested from `fileobj` per `read` call.
            Defaults to 1MB (`1024 * 1024` bytes) when `None`.

    Returns:
        `bytes`: the raw (binary, not hexadecimal) sha256 digest of the data read from `fileobj`.
    """
    if chunk_size is None:
        chunk_size = 1024 * 1024

    hasher = sha256()
    exhausted = False
    while not exhausted:
        block = fileobj.read(chunk_size)
        # The last (empty) block is passed to the hasher as well; hashing zero bytes
        # leaves the digest unchanged.
        hasher.update(block)
        # An empty read signals the end of the stream.
        exhausted = not block
    return hasher.digest()
|
|
|
|
|
def git_hash(data: bytes) -> str:
    """
    Computes the git-sha1 hash of the given bytes, the same way git hashes a blob object.

    The sha1 is computed over a `blob <size>` header, terminated by a NUL byte, followed by
    the raw content. This is equivalent to running `git hash-object`.
    See https://git-scm.com/docs/git-hash-object for more details.

    Note: this method is valid for regular files. For LFS files, the proper git hash is supposed to be computed on the
    pointer file content, not the actual file content. However, for simplicity, we directly compare the sha256 of
    the LFS file content when we want to compare LFS files.

    Args:
        data (`bytes`):
            The data to compute the git-hash for.

    Returns:
        `str`: the git-hash of `data` as an hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # Git prefixes blob content with its type and byte length, terminated by a NUL byte.
    header = b"blob %d\0" % len(data)

    hasher = sha1()
    hasher.update(header)
    # Feed the payload separately so it never has to be copied into a larger buffer.
    hasher.update(data)
    return hasher.hexdigest()
|
|