#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fileinput
import hashlib
import sys
from multiprocessing import Pool
def get_hashes_and_lines(raw_line): | |
hash = hashlib.md5(raw_line).hexdigest() | |
return hash, raw_line | |
def main(argv=None):
    """Write each distinct input line to stdout once, hashing in parallel.

    Lines are read as bytes from the given files (or from standard input
    when no files are given), hashed by a pool of worker processes, and
    written to ``sys.stdout.buffer`` the first time their digest is seen.
    Results are consumed with ``imap_unordered``, so output order (and which
    copy of a duplicated line is emitted) is not guaranteed to follow input
    order. Progress is reported on stderr: the running count every
    1,000,000 results and a ``.`` every 100,000 results in between.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default), argparse reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("files", nargs="*", help="input files")
    args = parser.parse_args(argv)

    seen = set()
    # Both resources are context-managed so the input is closed and the
    # worker processes are torn down even if the loop below raises. All
    # results are consumed inside the block, so the pool being terminated
    # on exit does not drop any pending work.
    with fileinput.input(args.files, mode="rb") as h, Pool(args.workers) as pool:
        # Chunks of 1000 lines per task keep inter-process overhead low.
        results = pool.imap_unordered(get_hashes_and_lines, h, 1000)
        for i, (digest, raw_line) in enumerate(results):
            if digest not in seen:
                seen.add(digest)
                sys.stdout.buffer.write(raw_line)
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
    # Terminate the progress line.
    print(file=sys.stderr, flush=True)
# Run the de-duplication only when executed as a script, not on import.
if __name__ == "__main__":
    main()