|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Put the :utf8 layer on all three standard streams so that text read from
# STDIN and written to STDOUT/STDERR is handled as characters, not bytes.
binmode($_, ":utf8") for (\*STDIN, \*STDOUT, \*STDERR);
|
|
|
use warnings; |
|
use FindBin qw($RealBin); |
|
use strict; |
|
use utf8; |
|
|
|
# Directory that holds the nonbreaking_prefix.<lang> files, relative to the
# location of this script.
my $mydir = "$RealBin/../../share/nonbreaking_prefixes";

# Prefix => 1 (never break after it) or 2 (do not break only when a number
# follows); filled from the prefix file.
my %NONBREAKING_PREFIX;

# Defaults for the command-line options.
my ($language, $prefixfile) = ("en", "");

# Set to 1 further down when $language is one of the CJK language codes.
my $is_cjk = 0;

# Boolean switches, all off by default (-q, -h, -i, -n, -k respectively).
my ($QUIET, $HELP, $LIST_ITEM, $NOP, $KEEP_LINES) = (0) x 5;
|
|
|
# Consume the command line. -l and -p take the following argument as their
# value; the remaining switches are plain flags. Arguments that match none of
# the patterns are skipped without any message.
while (@ARGV) {
    my $arg = shift @ARGV;
    if    ($arg =~ /^-l$/) { $language   = shift @ARGV; }
    elsif ($arg =~ /^-p$/) { $prefixfile = shift @ARGV; }
    elsif ($arg =~ /^-q$/) { $QUIET      = 1; }
    elsif ($arg =~ /^-h$/) { $HELP       = 1; }
    elsif ($arg =~ /^-i$/) { $LIST_ITEM  = 1; }
    elsif ($arg =~ /^-n$/) { $NOP        = 1; }
    elsif ($arg =~ /^-k$/) { $KEEP_LINES = 1; }
    elsif ($arg =~ /^-b$/) { $|++; }    # unbuffered output on the selected handle
}
|
|
|
# -h: print the option summary on STDOUT and exit before any input is read.
if ($HELP) {
    print <<'END_USAGE';
Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] < textfile > splitfile
-q: quiet mode
-b: no output buffering (for use in bidirectional pipes)
-p: use a custom prefix file, overriding the installed one
-i: avoid splitting on list items (e.g. 1. This is the first)
-n: do not emit <P> after paragraphs
-k: keep existing line boundaries
END_USAGE
    exit;
}
|
# Unless -q was given, announce the tool version and chosen language on STDERR.
unless ($QUIET) {
    print STDERR "Sentence Splitter v3\n", "Language: $language\n";
}
|
|
|
|
|
# Turn on the CJK-specific splitting rules for the yue, zh and ja language codes.
$is_cjk = 1 if grep { $language eq $_ } qw(yue zh ja);
|
|
|
# Decide which prefix file to read. A file given with -p is used as is;
# otherwise the installed file for $language is chosen, falling back to the
# English one (with a warning) and aborting if that is missing too.
if ($prefixfile eq "") {
    $prefixfile = "$mydir/nonbreaking_prefix.$language";

    unless (-e $prefixfile) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        -e $prefixfile
            or die("ERROR: No abbreviations files found in $mydir\n");
    }
}
else {
    print STDERR "Loading non-breaking prefixes from $prefixfile\n";
}
|
|
|
# Read the prefix file into %NONBREAKING_PREFIX.
#
# Each non-empty line that does not start with '#' is one prefix. A line of
# the form "<prefix> #NUMERIC_ONLY#" is stored with value 2, every other
# prefix with value 1.
#
# If the file does not exist nothing is loaded; a warning is printed so that a
# mistyped -p path does not go unnoticed.
if (-e $prefixfile) {
    # Lexical handle instead of a bareword; include the path in the error.
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die "Cannot open $prefixfile: $!";

    while (my $item = <$prefix_fh>) {
        chomp($item);

        # Skip blank lines and comment lines. The emptiness test compares
        # against "" rather than relying on truthiness, so a prefix that is
        # literally "0" is not discarded.
        next if $item eq "" || substr($item, 0, 1) eq "#";

        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
            $NONBREAKING_PREFIX{$1} = 2;
        } else {
            $NONBREAKING_PREFIX{$item} = 1;
        }
    }

    close($prefix_fh);
} else {
    print STDERR "WARNING: Prefix file $prefixfile not found, no non-breaking prefixes loaded\n";
}
|
|
|
|
|
# Main loop: read STDIN line by line.
#  - With -k every input line is split on its own.
#  - Otherwise lines are accumulated (joined by a space) into a paragraph,
#    which is flushed when a blank line or a line consisting of a single
#    <...> tag is seen. A blank line after a non-empty paragraph also emits
#    "<P>" unless -n was given.
my $text = "";
while (defined(my $line = <STDIN>)) {
    chomp $line;

    if ($KEEP_LINES) {
        do_it_for($line, "");
        next;
    }

    my $is_markup = $line =~ /^<.+>$/;
    my $is_blank  = $line =~ /^\s*$/;

    unless ($is_markup || $is_blank) {
        # Ordinary text: keep collecting the current paragraph.
        $text .= $line . " ";
        next;
    }

    # Paragraph boundary: flush what has been collected so far.
    do_it_for($text, $line);
    print "<P>\n" if $NOP == 0 && ($is_blank && $text);
    $text = "";
}

# Flush a final paragraph that was not followed by a boundary line.
do_it_for($text, "") if $text;
|
|
|
|
|
# Print the sentence-split form of a piece of text, followed by a markup line.
#
# Parameters:
#   $text   - text to split; nothing is printed for it when it is undefined
#             or the empty string.
#   $markup - the line that ended the paragraph; it is echoed (with a
#             newline) only if it consists of a single <...> tag.
#
# The text is tested with defined/length rather than plain truthiness so that
# a line whose whole content is "0" (possible in -k mode) is still printed.
sub do_it_for {
    my ($text, $markup) = @_;

    print preprocess($text) if defined $text && length $text;
    print "$markup\n" if ($markup =~ /^<.+>$/);

    return;
}
|
|
|
# Split a piece of text into sentences.
#
# Parameters:
#   $text - the text to split.
#
# Returns the text with "\n" inserted at the detected sentence boundaries;
# the result always ends with a newline.
#
# Reads the file-level settings $language, $is_cjk, $LIST_ITEM and the
# %NONBREAKING_PREFIX table.
sub preprocess {
    my ($text) = @_;

    # Collapse runs of spaces and remove spaces next to newlines and at the
    # beginning/end of the string.
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # Character-class content describing what may start a sentence: uppercase
    # letters and digits, plus script-specific ranges for some languages.
    my $sentence_start = "\\p{IsUpper}0-9";
    $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
    $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
    $sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq "bn" || $language eq "mni");
    $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn";
    $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml";
    $sentence_start .= "\\p{Block: Oriya}" if $language eq "or";
    $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa";
    $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta";
    $sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
    $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
    $sentence_start .= "\\p{Arabic}" if $language eq "fa";

    # Break after ?, !, the Arabic question mark or U+0964/U+0965 when spaces
    # and then (optional opening quotes/brackets and) a sentence starter follow.
    $text =~ s/([?!؟\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;

    # Break after two or more periods followed by a sentence starter.
    $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;

    # Break after end punctuation that is followed by closing quotes/brackets,
    # when a sentence starter comes next.
    $text =~ s/([?!؟\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;

    # Break after end punctuation when the next token begins with at least one
    # quote/bracket character followed by a sentence starter.
    $text =~ s/([?!؟\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;

    if ($is_cjk == 1) {
        # Break after U+3002, U+FF0E, U+FF1F or U+FF01 (optionally followed by
        # one closing quote); no following space is required.
        $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;

        # Break between ASCII/Arabic end punctuation and a following CJK character.
        $text =~ s/([\.?!؟]) *(\p{CJK})/$1\n$2/g;

        # Put spaces around each pair of adjacent punctuation characters.
        $text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
    }

    if ($language eq 'ur') {
        # Urdu: break after . ? ! U+06D4 or a run of periods (plus optional
        # trailing quotes/brackets) when whitespace and then a character in
        # U+0600..U+06FF follow.
        $text =~ s{
            ( (?: [\.\?!\x{06d4}] | \.\.+ )
                [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
            )
            \s+
            ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
                [\x{0600}-\x{06ff}]
            )
        }{$1\n$2}gx;
    }

    # Second pass: look at every word that ends in a period and decide, using
    # the prefix table and the following word, whether a break belongs there.
    # Splitting on horizontal whitespace keeps the "\n" inserted above attached
    # to the words.
    my @words = split(/\h/, $text);

    $text = "";
    # The last word is excluded because every rule looks at the word after it.
    for my $i (0 .. $#words - 1) {
        if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
            # Save the captures now; the matches below overwrite $1/$2.
            my $prefix = $1;
            my $starting_punct = $2;

            if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
                # Known non-breaking prefix directly followed by the period: no break.
            } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
                # Word ends in ".<uppercase letters or hyphens>.": no break.
            } elsif ($LIST_ITEM
                     && ($i == 0 || substr($words[$i-1], -1) eq "\n")
                     && $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
                # With -i: a number, roman numeral or single letter at the
                # start of a line is treated as a list marker: no break.
            } elsif ($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
                # The next word looks like a sentence start: break, except
                # after a numeric-only prefix that is followed by digits.
                $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
            }
        }
        $text = $text.$words[$i]." ";
    }

    # Append the final word. Guarded because @words is empty when the input
    # normalised to an empty string; the unguarded concatenation would append
    # undef and raise an "uninitialized value" warning.
    $text .= $words[-1] if @words;

    # Clean up the spacing again after reassembling the words.
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # Make sure the result ends with a newline.
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
|
|