#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# Compatible with sri LM-creating script, eg.
#    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
#    lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir"
#    settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
# It should point to the root of the LM toolkit, eg
#    irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
# Set smoothing method in settings, if different from modified Kneser-Ney

use warnings;
use strict;
use FindBin qw($RealBin);
use Getopt::Long;

# ---------------------------------------------------------------------------
# Configuration (defaults may be overridden on the command line).
# ---------------------------------------------------------------------------
my $order = 3; # order of language model (default trigram)
my $corpusPath; # input text data
my $lmPath; # generated language model
my $cores = 2; # number of CPUs used (accepted for compatibility; not used below)
my $irstPath; # bin directory of IRSTLM
my $tempPath = "tmp"; # temp dir
my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons
my $smoothing = "msb"; # smoothing method: wb = witten-bell, sb = kneser-ney, msb = modified-kneser-ney
my $dummy; # sink for options that are accepted but ignored

GetOptions("order=s"  => \$order,
           "text=s"   => \$corpusPath,
           "lm=s"     => \$lmPath,
           "cores=s"  => \$cores,
           "irst-dir=s"  => \$irstPath,
           "temp-dir=s"  => \$tempPath,
           "p=i" => \$pruneSingletons,   # irstlm parameter: prune singletons
           "s=s" => \$smoothing,         # irstlm parameter: smoothing method
           "interpolate!" => \$dummy,    # ignore
           "kndiscount!" => \$dummy      # ignore
           ) or exit 1;

# $order always has a default, so only the path options are mandatory.
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

# Run a shell command, logging it to STDERR first, and abort the script if
# the command could not be started, was killed by a signal, or exited with a
# non-zero status.  Backticks are used (rather than system) so that the
# child's standard output keeps being captured and discarded.
# NOTE: the command string is handed to the shell as-is; paths containing
# whitespace or shell metacharacters are not quoted.
sub run_cmd {
    my ($command) = @_;

    print STDERR "EXECUTING $command\n";
    `$command`;

    if ($? == -1) {
        die("ERROR: failed to execute: $command: $!\n");
    }
    elsif ($? & 127) {
        die(sprintf("ERROR: command died with signal %d: %s\n", $? & 127, $command));
    }
    elsif ($? >> 8) {
        die(sprintf("ERROR: command failed with exit code %d: %s\n", $? >> 8, $command));
    }
    return;
}

# Use a per-process working directory below the requested temp dir.
$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;
die("ERROR: could not create temp dir $tempPath\n") if $? != 0 || !-d $tempPath;

# add <s> and </s>
run_cmd("cat $corpusPath | $irstPath/add-start-end.sh > $tempPath/setagged");

# collect n-gram counts
run_cmd("$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts");

# build lm
my $cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts";
$cmd .= " -ps=no" unless $pruneSingletons;
run_cmd($cmd);

# Clean up.  The LM has already been written at this point, so a failed
# cleanup is reported but does not turn a successful build into an error.
# (When an earlier step dies, the temp dir is deliberately left in place.)
eval { run_cmd("rm -rf $tempPath"); 1 } or warn("WARNING: $@");

print STDERR "FINISH.\n";