File size: 1,784 Bytes
26fd00c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Count the number of documents and average number of lines and tokens per
document in a large file. Documents should be separated by a single empty line.
"""
import argparse
import gzip
import sys
import numpy as np
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("--gzip", action="store_true")
args = parser.parse_args()
def gopen():
if args.gzip:
return gzip.open(args.input, "r")
else:
return open(args.input, "r", encoding="utf-8")
num_lines = []
num_toks = []
with gopen() as h:
num_docs = 1
num_lines_in_doc = 0
num_toks_in_doc = 0
for i, line in enumerate(h):
if len(line.strip()) == 0: # empty line indicates new document
num_docs += 1
num_lines.append(num_lines_in_doc)
num_toks.append(num_toks_in_doc)
num_lines_in_doc = 0
num_toks_in_doc = 0
else:
num_lines_in_doc += 1
num_toks_in_doc += len(line.rstrip().split())
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
print("found {} docs".format(num_docs))
print("average num lines per doc: {}".format(np.mean(num_lines)))
print("average num toks per doc: {}".format(np.mean(num_toks)))
if __name__ == "__main__":
main()
|