UnIVAL / fairseq /scripts /count_docs.py
mshukor
init
26fd00c
raw
history blame
1.78 kB
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Count the number of documents and average number of lines and tokens per
document in a large file. Documents should be separated by a single empty line.
"""
import argparse
import gzip
import sys
import numpy as np
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("--gzip", action="store_true")
args = parser.parse_args()
def gopen():
if args.gzip:
return gzip.open(args.input, "r")
else:
return open(args.input, "r", encoding="utf-8")
num_lines = []
num_toks = []
with gopen() as h:
num_docs = 1
num_lines_in_doc = 0
num_toks_in_doc = 0
for i, line in enumerate(h):
if len(line.strip()) == 0: # empty line indicates new document
num_docs += 1
num_lines.append(num_lines_in_doc)
num_toks.append(num_toks_in_doc)
num_lines_in_doc = 0
num_toks_in_doc = 0
else:
num_lines_in_doc += 1
num_toks_in_doc += len(line.rstrip().split())
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
print("found {} docs".format(num_docs))
print("average num lines per doc: {}".format(np.mean(num_lines)))
print("average num toks per doc: {}".format(np.mean(num_toks)))
if __name__ == "__main__":
main()