|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use strict; |
|
use warnings; |
|
|
|
use Getopt::Long; |
|
|
|
# Treat all text as UTF-8.  ":std" applies the layer to STDIN, STDOUT and
# STDERR; the pragma additionally sets the default layer for the files that
# the diamond operator opens when paths are given on the command line, which
# a plain binmode(STDIN) does not cover.
use open qw(:std :utf8);

# Command-line options and their defaults.
my $usage     = 0;    # -h / --help / --usage: print help and exit
my $lowercase = 0;    # --lc / --lowercase: lowercase every input line
my $n         = 2;    # --n=N: length of the n-grams to collect

GetOptions(
    "n=i"          => \$n,
    "lc|lowercase" => \$lowercase,
    "h|help|usage" => \$usage,
) or exit 1;

if ($usage) {
    print "Usage: $0 [--n=N] [--lc|--lowercase] [FILE ...]\n",
          "\n",
          "Reads whitespace-tokenized text from FILEs (or STDIN) and lists every\n",
          "N-gram (default N=2) whose words, joined without spaces, also occur\n",
          "as a single token in the input.\n",
          "\n",
          "Output columns (tab-separated): n-gram, count of the joined token,\n",
          "count of the n-gram, absolute difference of the two counts.\n";
    exit 0;
}

# An n-gram length below 1 is meaningless; reject it up front instead of
# letting the n-gram extraction run with it.
if ($n < 1) {
    print STDERR "$0: --n must be a positive integer (got $n)\n";
    exit 1;
}
|
|
|
# Read the input line by line, counting every token and every n-gram.
my $nl     = 0;     # number of lines read so far
my $ngrams = {};    # "w1 w2 ... wn" => number of occurrences
my $words  = {};    # token => number of occurrences

while (my $line = <>) {
    $nl++;

    # Progress indicator on STDERR: a dot every 100,000 lines and the
    # running line count every 500,000 lines.
    print STDERR "." if $nl % 100000 == 0;
    print STDERR "($nl)" if $nl % 500000 == 0;

    chomp $line;
    $line = lc($line) if $lowercase;

    # split ' ' skips leading whitespace.  split /\s+/ would return an empty
    # first token for an indented line, and that empty token would be counted
    # as a word and become part of bogus n-grams such as " foo".
    my @tokens = split ' ', $line;

    $words->{$_}++ for @tokens;

    $ngrams = ngrams($n, \@tokens, $ngrams);
}

print STDERR "Done.\n";
|
|
|
|
|
# Build the report: keep each n-gram whose words, written together without
# spaces, were also seen as a single token, and record both counts.
#   tok   - occurrences of the n-gram (the tokenized form)
#   untok - occurrences of the joined single token
#   diff  - absolute difference of the two counts
#   sum   - total of the two counts
my $report;

for my $spaced (keys %$ngrams) {
    # Delete the separating spaces to get the candidate single token.
    (my $joined = $spaced) =~ tr/ //d;

    # Skip n-grams whose joined form never occurs as a token.
    my $untok = $words->{$joined} or next;
    my $tok   = $ngrams->{$spaced};

    $report->{$spaced} = {
        "tok"   => $tok,
        "untok" => $untok,
        "diff"  => abs($untok - $tok),
        "sum"   => $untok + $tok,
    };
}
|
|
|
|
|
# Print one tab-separated row per reported n-gram:
#   n-gram, joined-token count, n-gram count, absolute difference.
# Rows are ordered by ascending difference, then by descending total count.
# The final string comparison breaks remaining ties so that the output order
# does not depend on Perl's per-run hash ordering.
for my $ngr (
    sort {
        $report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
            || $report->{$b}->{"sum"} <=> $report->{$a}->{"sum"}
            || $a cmp $b
    } keys %$report
) {
    my $row = $report->{$ngr};
    print join("\t", $ngr, $row->{"untok"}, $row->{"tok"}, $row->{"diff"}), "\n";
}
|
|
|
# ngrams($n, \@words, $out)
#
# Counts every run of $n consecutive words in @words.
#
#   $n     - n-gram length
#   $words - reference to the list of tokens; it is not modified
#   $out   - hash reference with counts accumulated so far, or undef to
#            start a new one
#
# Each n-gram is keyed by its words joined with a single space, and its
# count in $out is incremented.  Returns the hash reference (the one passed
# in, or a new empty hash if $out was undef).  When $n is less than 1, or
# the list holds fewer than $n words, no counts are added.
sub ngrams {
    my ($n, $words, $out) = @_;

    $out = {} unless defined $out;

    # A non-positive length has no n-grams; without this guard a sliding
    # window of size 0 would never run out of input.
    return $out if !defined $n || $n < 1;

    # Slide a window of $n words over the list by index, which avoids
    # copying the list.  The range is empty when the list is too short.
    my $last_start = @$words - $n;
    for my $start (0 .. $last_start) {
        my $key = join(" ", @{$words}[ $start .. $start + $n - 1 ]);
        $out->{$key}++;
    }

    return $out;
}
|
|