File size: 27,079 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
#! /usr/bin/env python
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Script contributed by Precision Translation Tools.

"""Run Moses `score` jobs in parallel.

This script is a replacement for `score-parallel.perl`.  The two are similar,
but there are differences in usage.  In addition, this script can be called
directly from Python code without the need to run it as a separate process.
"""

from __future__ import (
    absolute_import,
    print_function,
    unicode_literals,
    )

__metaclass__ = type

from argparse import ArgumentParser
from contextlib import contextmanager
from datetime import datetime
import errno
import gzip
from multiprocessing import Pool
import os
import os.path
import pipes
from shutil import rmtree
from subprocess import check_call
import sys
import tempfile


def get_unicode_type():
    """Pick the text (Unicode) string type for the running interpreter.

    :return: `str` on Python 3 and later, `unicode` on Python 2.
    """
    running_python_3_or_later = sys.version_info[0] > 2
    if running_python_3_or_later:
        # Text strings are the default "str" type here; binary strings
        # are called "bytes".
        return str
    # Python 2: text strings are "unicode", and "str" holds binary data.
    return unicode


# Text string type for this interpreter, as chosen by get_unicode_type().
# Resolved once, when the module is loaded.
UNICODE_TYPE = get_unicode_type()


class CommandLineError(Exception):
    """The command line is invalid.

    Raised for unacceptable option values, and for a qualified executable
    path that does not point to an executable file.
    """


class ProgramFailure(Exception):
    """Failure, not a bug, which is reported neatly to the user.

    Raised, for example, when none of a list of candidate executables can
    be found.
    """


def parse_args(argv=None):
    """Parse command line arguments, return as `Namespace`.

    :param argv: Optional list of argument strings to parse.  When `None`
        (the default), the process's own command line is parsed, as
        before.  Passing a list makes it possible to drive this script
        from Python code without touching `sys.argv`.
    :return: The `Namespace` produced by `ArgumentParser.parse_args`.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        '--extract-file', '-e', metavar='PATH', required=True,
        help=(
            "Path to input file: extract file (e.g. 'extract.sorted.gz' or "
            "'extract.inv.sorted.gz').  Required."))
    parser.add_argument(
        '--lex-file', '-l', metavar='PATH', required=True,
        help=(
            "Path to input file: lex file (e.g. 'lex.f2e' or 'lex.e2f').  "
            "Required."))
    parser.add_argument(
        '--output', '-o', metavar='PATH', required=True,
        help=(
            "Write phrase table to file PATH (e.g. 'phrase-table.half.f2e' "
            "or 'phrase-table.half.e2f').  Required."))
    parser.add_argument(
        '--inverse', '-i', action='store_true',
        help="Inverse scoring.  Defaults to direct scoring.")
    parser.add_argument(
        '--labels-file', '-L', metavar='PATH',
        help="Also write source labels to file PATH.")
    parser.add_argument(
        '--parts-of-speech', '-p', metavar='PATH',
        help="Also write parts-of-speech file to PATH.")
    parser.add_argument(
        '--flexibility-score', '-F', metavar='PATH',
        help="Path to the 'flexibility_score.py' script.  Defaults to none.")
    parser.add_argument(
        '--hierarchical', '-H', action='store_true',
        help="Process hierarchical rules.")
    parser.add_argument(
        '--args', '-a', metavar='ARGUMENTS',
        help="Additional arguments for `score` and `flexibility_score`.")
    parser.add_argument(
        '--sort', '-s', action='store_true',
        help="Sort output file.")
    parser.add_argument(
        '--jobs', '-j', metavar='N', type=int, default=1,
        help="Run up to N jobs in parallel.  Defaults to %(default)s.")
    parser.add_argument(
        '--score-exe', '-x', metavar='PROGRAM',
        help="Name of, or path to, the 'score' executable.")
    parser.add_argument(
        '--sort-command', '-S', metavar='COMMAND-LINE',
        help=(
            "Command line for sorting text files to standard output.  "
            "Must support operation as a pipe, as well as input files named "
            "as command-line arguments."))
    parser.add_argument(
        '--gzip-command', '-z', metavar='PROGRAM',
        help="Path to a gzip or pigz executable.")
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help="Print what's going on.")
    parser.add_argument(
        '--debug', '-d', action='store_true',
        help="Don't delete temporary directories when done.")
    # With argv=None, argparse falls back to sys.argv[1:].
    return parser.parse_args(argv)


def normalize_path(optional_path=None):
    """Tidy up a filesystem path, passing `None` through untouched.

    The path is run through `os.path.normpath`, after which any remaining
    forward slashes and backslashes are replaced with the operating
    system's native path separator.

    :param optional_path: A path string, or `None`.
    :return: `None` if `optional_path` is `None`; otherwise the cleaned-up
        path.  A relative path stays relative and an absolute path stays
        absolute.
    """
    if optional_path is not None:
        cleaned = os.path.normpath(optional_path)
        for separator in ('/', '\\'):
            cleaned = cleaned.replace(separator, os.path.sep)
        return cleaned
    return None


def quote(path):
    """Quote and escape a filename for use in a shell command.

    On POSIX systems this uses the standard library's shell quoting.  The
    Windows implementation is very limited and will break on anything
    more advanced than a space.

    :param path: Filename to quote.
    :return: The filename, quoted for inclusion in a shell command line.
    """
    if os.name == 'posix':
        # The "pipes" module is deprecated and was removed in Python 3.13.
        # Its quote() was the same function as shlex.quote(), so prefer
        # that one and only fall back to pipes on Python 2, where
        # shlex.quote() does not exist.
        try:
            from shlex import quote as shell_quote
        except ImportError:
            from pipes import quote as shell_quote
        return shell_quote(path)
    else:
        # TODO: Improve escaping for Windows.
        return '"%s"' % path


def sanitize_args(args):
    """Validate `args` in place, and fill in defaults that need a lookup.

    Looks up a sort command, a gzip command and the `score` executable
    when they were not given, then normalizes all path-valued options.

    :param args: Arguments namespace.  Modified in place.
    :raise CommandLineError: If the number of jobs is less than 1, or no
        sort or gzip command is available.
    """
    if args.jobs < 1:
        raise CommandLineError("Number of parallel jobs must be 1 or more.")

    if args.sort_command is None:
        sort_candidates = ['neandersort', 'gsort', 'sort']
        args.sort_command = find_first_executable(sort_candidates)
    if args.sort_command is None:
        raise CommandLineError(
            "No 'sort' command is available.  "
            "Choose one using the --sort-command option.")

    if args.gzip_command is None:
        gzip_candidates = ['pigz', 'gzip']
        args.gzip_command = find_first_executable(gzip_candidates)
    if args.gzip_command is None:
        raise CommandLineError(
            "No 'gzip' or 'pigz' command is available.  "
            "Choose one using the --gzip-command option.")

    if args.score_exe is None:
        # The "score" executable may live somewhere in the project
        # directory (the parent of this script's directory), or in the
        # PATH.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        moses_dir = os.path.dirname(script_dir)
        project_dirs = [moses_dir]
        for subdirectory in ('phrase-extract', 'binaries'):
            project_dirs.append(os.path.join(moses_dir, subdirectory))
        args.score_exe = find_first_executable(['score'], project_dirs)

    # Normalize every option that holds a filesystem path.
    path_options = (
        'extract_file',
        'lex_file',
        'output',
        'labels_file',
        'parts_of_speech',
        'flexibility_score',
        'score_exe',
        )
    for option in path_options:
        setattr(args, option, normalize_path(getattr(args, option)))


def add_exe_suffix(program):
    """Compose the platform's file name for an executable.

    :param program: Base name of the program.
    :return: `program` with `.exe` appended on Windows; `program` itself
        on any other system (assumed to be POSIX or similar).
    """
    on_windows = (os.name == 'nt')
    return program + '.exe' if on_windows else program


def find_executable(exe, extra_path=None):
    """Return full path to an executable of the given name, or `None`.

    If the given name is a qualified path to an executable, it will be returned
    unchanged.  A qualified path where no executable is found results in a
    `CommandLineError`.

    :param exe: Name of, or path to, the executable.
    :param extra_path: Optional sequence of directories to search before
        the directories listed in the `PATH` environment variable.
    :return: Path to the first matching executable file, or `None` if
        an unqualified name was not found anywhere.
    :raise CommandLineError: If `exe` includes a path but is not an
        executable file.
    """
    if os.path.sep in exe:
        # The executable name includes a path.  Only one place it can be.
        if not os.path.isfile(exe) or not os.access(exe, os.X_OK):
            raise CommandLineError("Not an executable: '%s'." % exe)
        return exe

    search_dirs = list(extra_path) if extra_path is not None else []
    # PATH may be unset; fall back on the platform's default search path
    # rather than crashing on None.
    search_dirs += os.environ.get('PATH', os.defpath).split(os.pathsep)
    for directory in search_dirs:
        full_path = os.path.join(directory, exe)
        # Directories also pass the X_OK check, so insist on a regular
        # file, same as for qualified paths above.
        if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
            return full_path
    return None


def find_first_executable(candidates, extra_path=None):
    """Locate the first of several candidate programs that is available.

    Candidates are tried in the order given.  Each name gets the
    platform's executable suffix before it is looked up.

    :param candidates: Program names, in order of preference.
    :param extra_path: Optional directories to search ahead of the `PATH`.
    :return: Path to the first candidate that was found.
    :raise ProgramFailure: If none of `candidates` was found.
    """
    for name in candidates:
        located = find_executable(add_exe_suffix(name), extra_path)
        if located is None:
            # Not available; try the next candidate.
            continue
        return located
    tried = ', '.join(candidates)
    raise ProgramFailure(
        "Could not find any of these executables in path: %s." % tried)


def execute_shell(command, verbose=False):
    """Execute the `command` string in a shell.

    The command runs with a copy of the current environment.  On POSIX
    systems, `LC_ALL` is forced to `C` in that copy, so that results
    (from sort commands in particular) are predictable.

    The shell is a full-featured one: pipes, substitution and so on all
    work.  That also means arguments must be quoted/escaped by the caller
    where appropriate!

    :param command: Command line, as a Unicode string.
    :param verbose: Print the command before running it?
    :raise subprocess.CalledProcessError: If the command exits with a
        nonzero status (raised by `check_call`).
    """
    assert isinstance(command, UNICODE_TYPE), (
        "Wrong argument for execute_shell.")
    if verbose:
        print("Executing: %s" % command)
    shell_env = dict(os.environ)
    if os.name == 'posix':
        shell_env.update({'LC_ALL': 'C'})
    check_call(command, shell=True, env=shell_env)


@contextmanager
def tempdir(keep=False):
    """Context manager: temporary directory.

    Creates a fresh temporary directory and yields its path.  On leaving
    the context the directory is removed with all its contents, whether
    the body completed normally or raised, unless `keep` is set.

    :param keep: Leave the directory in place when done?
    """
    directory = tempfile.mkdtemp()
    try:
        yield directory
    finally:
        # Clean up even when the body fails, so that a crash does not
        # leave the directory behind.
        if not keep:
            rmtree(directory)


def make_dirs(path):
    """Equivalent to `mkdir -p -- path`.

    Creates `path` along with any missing parent directories.  It is not
    an error for the directory to exist already.

    :param path: Directory to create.
    :raise OSError: If the directory could not be created, including when
        `path` already exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as error:
        # EEXIST is also reported when a non-directory is in the way.
        # Only an existing directory counts as success.
        if error.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def open_file(path, mode='r'):
    """Open the file at `path`, going through gzip for `.gz` files.

    :param path: File to open.  A name ending in `.gz` is opened with
        `gzip.open`; anything else with the built-in `open`.
    :param mode: Mode string, passed on unchanged.
    :return: The opened file object.
    """
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, mode)


def count_lines(filename):
    """Return the number of lines in `filename`.

    The file is opened through `open_file`, so it may be gzip-compressed.
    """
    with open_file(filename) as stream:
        return sum(1 for _ in stream)


def set_temp_dir():
    """Honour the `$MOSES_TEMP_DIR` environment variable, if it is set.

    When the variable is set, the directory it names is created if
    necessary, and the `tempfile` module is pointed at it.  When it is
    not set, nothing changes.
    """
    configured_dir = os.getenv('MOSES_TEMP_DIR')
    if configured_dir is None:
        return
    make_dirs(configured_dir)
    tempfile.tempdir = configured_dir


def strip_newline(line):
    """Drop a trailing line terminator from `line`, if there is one.

    At most one line feed is removed from the end, and then at most one
    carriage return.

    :param line: A text string.
    :return: `line` without its trailing terminator.
    """
    for terminator in ('\n', '\r'):
        if line.endswith(terminator):
            line = line[:-1]
    return line


def open_chunk_file(split_dir, chunk_number):
    """Open, for writing, the file for one chunk of the extract file.

    :param split_dir: Directory in which to create the chunk file.
    :param chunk_number: Number of the chunk; it becomes part of the file
        name, `extract.<chunk_number>.gz`.
    :return: The opened file.
    """
    chunk_name = 'extract.%d.gz' % chunk_number
    chunk_path = os.path.join(split_dir, chunk_name)
    return open_file(chunk_path, 'w')


def name_context_chunk_file(split_dir, chunk_number):
    """Return the path for one chunk of the extract context file.

    :param split_dir: Directory that holds the chunk files.
    :param chunk_number: Number of the chunk.
    :return: Path `extract.context.<chunk_number>.gz` inside `split_dir`.
    """
    chunk_name = 'extract.context.%d.gz' % chunk_number
    return os.path.join(split_dir, chunk_name)


def extract_source_phrase(line):
    """Extract the source phrase from an extract-file line.

    The source phrase is everything that comes before the first `|||`
    separator.  A line without a separator is returned whole.

    :param line: One line of the extract file; either a text string or a
        binary string.
    :return: The source phrase, as the same type of string as `line`.
    """
    # The separator must be the same kind of string as the line: Python 3
    # refuses to split text on a bytes separator, or vice versa.
    separator = b'|||' if isinstance(line, bytes) else '|||'
    return line.split(separator, 1)[0]


def cut_context_file(last_source_phrase, chunk_file, last_line,
                     context_stream):
    """Copy one chunk's worth of the extract context file to its own file.

    Lines are copied from `context_stream` up to and including the run of
    lines for `last_source_phrase`.  Reading stops at the first line with
    a different source phrase after that run.

    :param last_source_phrase: Last source phrase that should be in the
        chunk.  Stop processing after this source phrase.
    :param chunk_file: Path to the extract context file for this chunk.
        It is written gzip-compressed.
    :param last_line: Line left over from the previous call, which goes
        into this chunk first; or `None` if there is none.
    :param context_stream: Extract context file, opened for reading.
    :return: The line (without its line terminator) that ended this
        chunk, which still needs processing; or `None` if the stream ran
        out first.
    """
    # TODO: Use open_file.
    with gzip.open(chunk_file, 'w') as chunk:
        if last_line is not None:
            chunk.write('%s\n' % last_line)

        # Set once we have seen the last source phrase for this chunk.
        reached_last_phrase = False

        for raw_line in context_stream:
            # Lines read from a gzip file still carry their newline.
            # Carriage returns are not wanted either.
            line = strip_newline(raw_line)
            is_last_phrase = (
                extract_source_phrase(line) == last_source_phrase)
            if reached_last_phrase and not is_last_phrase:
                # First source phrase past our last one: the chunk is
                # complete, and this line belongs to the next one.
                return line
            chunk.write('%s\n' % line)
            if is_last_phrase:
                reached_last_phrase = True
    return None


def split_extract_files(split_dir, extract_file, extract_context_file=None,
                        jobs=1):
    """Split extract file into chunks, so we can process them in parallel.

    Chunks are cut only where the source phrase changes, so that all
    lines for one source phrase end up in the same chunk.

    :param split_dir: A temporary directory where this function can write
        temporary files.  The caller must ensure that this directory will be
        cleaned up after it's done with the files.
    :param extract_file: Path to the extract file (may be gzip-compressed).
    :param extract_context_file: Optional path to the extract context
        file, which is split along the same source phrases.
    :param jobs: Number of parallel jobs to split the work for.
    :return: A list of tuples.  Each tuple holds a partial extract file,
        and the corresponding context file (or `None`).  The files may be
        in `split_dir`, or there may just be the original extract file.
    """
    if jobs == 1:
        # No splitting needed.  Read the original file(s).
        return [(extract_file, extract_context_file)]

    # Otherwise: split files.
    files = []
    num_lines = count_lines(extract_file)
    # Ceiling division.  This has to be "//": plain "/" produces a float
    # on Python 3.
    chunk_size = (num_lines + jobs - 1) // jobs

    line_count = 0
    chunk_number = 0
    prev_source_phrase = None
    last_line_context = None
    chunk_context_file = None
    context_stream = None
    extract_stream = open_file(extract_file)
    try:
        if extract_context_file is not None:
            context_stream = open_file(extract_context_file)
        chunk_file = open_chunk_file(split_dir, chunk_number)
        try:
            for line in extract_stream:
                line_count += 1
                if isinstance(line, bytes):
                    # Binary line, e.g. from a gzip file.  An uncompressed
                    # file opened in text mode yields text already.
                    line = line.decode('utf-8')
                line = strip_newline(line)
                if line_count >= chunk_size:
                    # At or over chunk size.  Cut off at next source
                    # phrase change.
                    source_phrase = extract_source_phrase(line)
                    if prev_source_phrase is None:
                        # Start looking for a different source phrase.
                        prev_source_phrase = source_phrase
                    elif source_phrase != prev_source_phrase:
                        # Hit first new source phrase after chunk limit.
                        # Cut new file(s).
                        chunk_file.close()
                        if context_stream is not None:
                            chunk_context_file = name_context_chunk_file(
                                split_dir, chunk_number)
                            last_line_context = cut_context_file(
                                prev_source_phrase, chunk_context_file,
                                last_line_context, context_stream)
                        files.append((chunk_file.name, chunk_context_file))

                        # Start on new chunk.
                        prev_source_phrase = None
                        line_count = 0
                        chunk_number += 1
                        chunk_file = open_chunk_file(split_dir, chunk_number)
                    # Otherwise: can't cut yet.  Still working on the same
                    # source phrase.
                chunk_file.write(('%s\n' % line).encode('utf-8'))
        finally:
            chunk_file.close()

        # Final chunk: whatever remains of the context file goes with it.
        if context_stream is not None:
            chunk_context_file = name_context_chunk_file(
                split_dir, chunk_number)
            cut_context_file(
                prev_source_phrase, chunk_context_file, last_line_context,
                context_stream)
        files.append((chunk_file.name, chunk_context_file))
    finally:
        extract_stream.close()
        if context_stream is not None:
            context_stream.close()
    return files


def compose_score_command(extract_file, context_file, half_file,
                          flex_half_file, args):
    """Compose command line text to run one instance of `score`.

    If a context file is given, the command line continues with a second
    stage that decompresses the `score` output, pipes it through the
    flexibility-score script, and compresses the result.

    :param extract_file: One chunk of extract file.
    :param context_file: If doing flexibility scoring, one chunk of
        extract context file.  Otherwise, None.
    :param half_file: Path where `score` writes its output for this chunk.
    :param flex_half_file: Path where the compressed flexibility-score
        output for this chunk is written.  Only used if `context_file` is
        given.
    :param args: Arguments namespace.
    :return: The command line, as a single string for the shell.
    """
    # Quote all paths: the command is run through a shell.
    command = [
        quote(args.score_exe),
        quote(extract_file),
        quote(args.lex_file),
        quote(half_file),
        ]
    # The user's additional arguments (args.args) are already included by
    # build_score_args, so they must not be added separately here.
    other_args = build_score_args(args)
    if other_args != '':
        command.append(other_args)
    if context_file is not None:
        command += [
            '&&',
            # The half file has a ".gz" name, so decompress it with the
            # gzip command, not with bzcat.
            quote(args.gzip_command),
            '-c',
            '-d',
            quote(half_file),
            '|',
            quote(args.flexibility_score),
            quote(context_file),
            ]
        if args.inverse:
            command.append('--Inverse')
        if args.hierarchical:
            command.append('--Hierarchical')
        command += [
            '|',
            quote(args.gzip_command),
            '-c',
            '>%s' % quote(flex_half_file),
            ]
    return ' '.join(command)


def score_parallel(split_dir, file_pairs, args):
    """Run the `score` command in parallel.

    :param split_dir: Temporary directory where we can create split files.
    :param file_pairs: Sequence of tuples for the input files, one tuple
        per chunk of the work.  Each tuple consists of a partial extract
        file, and optionally a partial extract context file.
    :param args: Arguments namespace.
    :return: A list of tuples.  Each tuple contains two file paths.  The first
        is for a partial half-phrase-table file.  The second is for the
        corresponding partial flex file, if a context file is given; or
        `None` otherwise.
    :raise Exception: Whatever exception a failed job raised in its
        worker process, e.g. `CalledProcessError` from `execute_shell`.
    """
    partial_files = []
    pending_jobs = []
    # Pool of worker processes for executing the partial "score" invocations
    # concurrently.
    pool = Pool(args.jobs)
    try:
        for chunk_num, file_pair in enumerate(file_pairs):
            half_file = os.path.join(
                split_dir, 'phrase-table.half.%06d.gz' % chunk_num)
            extract_file, context_file = file_pair
            if context_file is None:
                flex_half_file = None
            else:
                flex_half_file = os.path.join(
                    split_dir, 'phrase-table.half.%06d.flex.gz' % chunk_num)
            # Pickling of arguments for the pool is awkward on Windows, so
            # keep them simple.  Compose the command line in the parent
            # process, then hand them to worker processes which execute them.
            command_line = compose_score_command(
                extract_file, context_file, half_file, flex_half_file, args)
            pending_jobs.append(pool.apply_async(
                execute_shell, (command_line, ), {'verbose': args.verbose}))
            partial_files.append((half_file, flex_half_file))
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        pool.join()

    # A job's exception only surfaces when its result is retrieved.
    # Without this, a failed "score" run would go unnoticed and the merge
    # would proceed on incomplete data.
    for job in pending_jobs:
        job.get()
    return partial_files


def merge_and_sort(files, output, sort_command=None, gzip_exe=None,
                   verbose=False):
    """Merge partial files.

    The partial files are combined by sorting them together into one
    compressed output file.  Any existing output file is replaced.

    :param files: Sequence of partial half-phrase-table files.
    :param output: Path for resulting combined phrase-table file.
    :param sort_command: Command line for sorting text.
    :param gzip_exe: Path to the gzip executable.  Not needed when
        `sort_command` is `neandersort`.
    :param verbose: Print the shell command before running it?
    """
    # TODO: The Perl code mentioned "sort" and "flexibility_score" here.
    # What do we do with those?

    files = list(files)
    # Sort whether we're asked to or not, as a way of combining the input
    # files.
    # NOTE(review): this only matches the bare command name.  A full path
    # to neandersort goes through the generic pipeline below - confirm
    # that is intended.
    if sort_command == 'neandersort':
        # Neandersort transparently decompresses input and compresses output.
        check_call([
            'neandersort',
            '-o', output,
            ] + files)
    else:
        # Write with ">", not ">>": appending to a file left over from an
        # earlier run would duplicate its contents in the phrase table.
        command = (
            "%(gzip)s -c -d %(files)s | "
            "%(sort)s | "
            "%(gzip)s -c >%(output)s"
            % {
                'gzip': quote(gzip_exe),
                'sort': sort_command,
                'files': ' '.join(map(quote, files)),
                'output': quote(output),
            })
        execute_shell(command, verbose=verbose)


def build_score_args(args):
    """Build the options part of the `score` command line.

    :param args: Arguments namespace.
    :return: The options as one string, separated by spaces: the flags
        implied by the labels-file, parts-of-speech and inverse settings,
        followed by the additional arguments in `args.args` if any.
    """
    options = []
    if args.labels_file:
        options.extend([
            '--SourceLabels',
            '--SourceLabelCountsLHS',
            '--SourceLabelSet',
            ])
    switches = (
        (args.parts_of_speech, '--PartsOfSpeech'),
        (args.inverse, '--Inverse'),
        )
    for enabled, flag in switches:
        if enabled:
            options.append(flag)
    if args.args is not None:
        options.append(args.args)
    return ' '.join(options)


def list_existing(paths):
    """Return, in the same order, those of the given files which exist.

    :param paths: Iterable of filesystem paths.
    :return: A list of the paths that exist.  This is a real list on every
        Python version, so it can be iterated more than once and tested
        for emptiness.
    """
    return [path for path in paths if os.path.exists(path)]


def compose_coc_path_for(path):
    """Return the path of the COC file that goes with `path`.

    This is simply `path` with `.coc` appended.
    """
    coc_path = '%s.coc' % path
    return coc_path


def read_cocs(path):
    """Load the COC file at `path`.

    :param path: File with one integer per line.
    :return: The integers as a tuple, in file order.
    """
    counts = []
    with open(path) as lines:
        for line in lines:
            counts.append(int(line.rstrip('\r\n')))
    return tuple(counts)


def add_cocs(original, additional):
    """Sum two sequences of COCs, position by position.

    Where one sequence is longer than the other, its extra entries are
    carried over unchanged.

    :param original: Sequence of numbers, or `None`.
    :param additional: Sequence of numbers, or `None`.
    :return: If either argument is `None`, the other one, as is.
        Otherwise a tuple of the sums.
    """
    assert not (original is None and additional is None), "No COCs to add!"
    if original is None:
        return additional
    if additional is None:
        return original

    shared_length = min(len(original), len(additional))
    summed = tuple(
        original[index] + additional[index]
        for index in range(shared_length))
    # At most one of these two tails is non-empty.
    original_tail = tuple(original[shared_length:])
    additional_tail = tuple(additional[shared_length:])
    return summed + original_tail + additional_tail


def merge_coc(files, output):
    """Combine the COC files that go with the given partial files.

    A COC file holds a series of integers, one per line.  The result has
    the same format, with each line holding the sum of the numbers that
    the input files have on that line: line 1 is the sum of all the
    files' first lines, line 2 the sum of their second lines, and so on.

    If the first partial file has no COC file, nothing is written.

    :param files: Non-empty sequence of tuples, one per partial file.
        The first item of each tuple is the file whose COC file is read.
    :param output: Path of the merged COC file to write.
    """
    assert len(files) > 0, "No partial files - no work to do."
    coc_paths = [
        compose_coc_path_for(extract_file)
        for extract_file, _ in files
        ]
    if not os.path.exists(coc_paths[0]):
        # Nothing to merge.
        return

    # TODO: Shouldn't we just fail if any of these files is missing?
    totals = None
    for coc_path in list_existing(coc_paths):
        totals = add_cocs(totals, read_cocs(coc_path))

    # Write to output file.
    with open(output, 'w') as output_stream:
        output_stream.writelines('%d\n' % entry for entry in totals)


def suffix_line_numbers(infile, outfile):
    """Rewrite `infile` to `outfile`; suffix line number to each line.

    The line number is zero-based, and separated from the rest of the line
    by a single space.

    The output is first written to a temporary file next to `outfile`,
    which is then renamed into place.

    :param infile: Path of the text file to read.
    :param outfile: Path of the file to write.
    """
    temp_file = '%s.numbering' % outfile
    with open(infile, 'r') as instream, open(temp_file, 'w') as outstream:
        for line_no, line in enumerate(instream):
            # Take the line terminator off first, or the number would end
            # up on a line of its own.
            outstream.write('%s %d\n' % (line.rstrip('\r\n'), line_no))
    os.rename(temp_file, outfile)


def compose_source_labels_path_for(path):
    """Return the path of the source labels file that goes with `path`.

    This is simply `path` with `.syntaxLabels.src` appended.
    """
    labels_path = '%s.syntaxLabels.src' % path
    return labels_path


def merge_numbered_files(inputs, output, header_lines, sort_command,
                         verbose=False):
    """Sort and merge files `inputs`, add header and line numbers.

    The header lines come first, unsorted, followed by the sorted contents
    of the input files.  Line numbers are then added to the whole.

    :param inputs: Iterable of input files.
    :param output: Output file.
    :param header_lines: Iterable of header lines.
    :param sort_command: Command line for sorting input files.
    :param verbose: Print the shell command before running it?
    """
    # Materialize: the iterable is needed for both a test and a join.
    inputs = list(inputs)
    sort_temp = '%s.sorting' % output
    try:
        with open(sort_temp, 'w') as stream:
            for line in header_lines:
                stream.write(line)
                stream.write('\n')
        if inputs:
            # Without any input files named on its command line, the sort
            # command would read standard input instead.  So only run it
            # when there is something to sort.
            execute_shell(
                "%s %s >>%s" % (
                    sort_command,
                    ' '.join(map(quote, inputs)),
                    quote(sort_temp)),
                verbose=verbose)
        suffix_line_numbers(sort_temp, output)
    finally:
        # Don't leave the intermediate file lying around next to the
        # output.
        if os.path.exists(sort_temp):
            os.remove(sort_temp)


def merge_source_labels(files, output, sort_command, verbose=False):
    """Merge source labels files.

    :param files: Sequence of partial files whose source labels files are
        to be merged.  Each entry is either a path, or a tuple whose first
        item is the path (as in the sequence passed to `merge_coc`).
    :param output: Path of the merged source labels file to write.
    :param sort_command: Command line for sorting the labels files.
    :param verbose: Print shell commands before running them?
    """
    # Entries may be (extract file, context file) tuples.  Formatting a
    # tuple into the labels path would fail, so reduce each entry to its
    # path first.
    paths = [
        entry[0] if isinstance(entry, (tuple, list)) else entry
        for entry in files
        ]
    # TODO: Shouldn't we just fail if any of these files is missing?
    labels_files = list_existing(map(compose_source_labels_path_for, paths))
    header = [
        'GlueTop',
        'GlueX',
        'SSTART',
        'SEND',
        ]
    merge_numbered_files(
        labels_files, output, header, sort_command, verbose=verbose)


def compose_parts_of_speech_path_for(path):
    """Return the path of the parts-of-speech file that goes with `path`.

    This is simply `path` with `.partsOfSpeech` appended.
    """
    parts_path = '%s.partsOfSpeech' % path
    return parts_path


def merge_parts_of_speech(files, output, sort_command, verbose=False):
    """Merge parts-of-speech files into output.

    :param files: Sequence of partial files whose parts-of-speech files
        are to be merged.  Each entry is either a path, or a tuple whose
        first item is the path (as in the sequence passed to `merge_coc`).
    :param output: Path of the merged parts-of-speech file to write.
    :param sort_command: Command line for sorting the input files.
    :param verbose: Print shell commands before running them?
    """
    # Entries may be (extract file, context file) tuples.  Formatting a
    # tuple into the parts-of-speech path would fail, so reduce each entry
    # to its path first.
    paths = [
        entry[0] if isinstance(entry, (tuple, list)) else entry
        for entry in files
        ]
    # TODO: Shouldn't we just fail if any of these files is missing?
    parts_files = list_existing(map(compose_parts_of_speech_path_for, paths))
    header = [
        'SSTART',
        'SEND',
        ]
    merge_numbered_files(
        parts_files, output, header, sort_command, verbose=verbose)


def main():
    """Command-line entry point.

    Parses and checks the command line, splits the extract file into
    chunks, runs `score` on the chunks in parallel, and merges the
    results into the requested output files.
    """
    args = parse_args()
    sanitize_args(args)
    set_temp_dir()

    if args.flexibility_score is None:
        extract_context_file = None
    else:
        # Derive the context file's name from the extract file's name.
        # Substitute in the file name only: a directory that happens to
        # have "extract." in its name must stay as it is.
        extract_dir, extract_name = os.path.split(args.extract_file)
        extract_context_file = os.path.join(
            extract_dir,
            extract_name.replace('extract.', 'extract.context.'))

    if args.verbose:
        print("Started %s." % datetime.now())
        print("Using '%s' for gzip." % args.gzip_command)

    with tempdir(args.debug) as split_dir:
        extract_files = split_extract_files(
            split_dir, args.extract_file,
            extract_context_file=extract_context_file, jobs=args.jobs)

        scored_files = score_parallel(split_dir, extract_files, args)

        if args.verbose:
            sys.stderr.write("Finished score %s.\n" % datetime.now())

        # TODO: Pass on "sort" and "flexibility-score" arguments?
        merge_and_sort(
            [phrase_chunk for phrase_chunk, _ in scored_files], args.output,
            sort_command=args.sort_command, gzip_exe=args.gzip_command,
            verbose=args.verbose)
        merge_coc(extract_files, compose_coc_path_for(args.output))

        if not args.inverse and args.labels_file is not None:
            if args.verbose:
                print("Merging source labels files.")
            merge_source_labels(
                extract_files, args.labels_file,
                sort_command=args.sort_command, verbose=args.verbose)

        if not args.inverse and args.parts_of_speech is not None:
            if args.verbose:
                print("Merging parts-of-speech files.")
            merge_parts_of_speech(
                extract_files, args.parts_of_speech,
                sort_command=args.sort_command, verbose=args.verbose)


if __name__ == '__main__':
    # Running as a script.  The two expected kinds of failure are reported
    # as a single line on standard error, each with its own exit status.
    try:
        main()
    except ProgramFailure as error:
        # A failure that is not a bug: exit status 1.
        sys.stderr.write('%s\n' % error)
        sys.exit(1)
    except CommandLineError as error:
        # Invalid command line: exit status 2.
        sys.stderr.write("Command line error: %s\n" % error)
        sys.exit(2)