File size: 9,148 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 |
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Based on a preprocessor written by Philipp Koehn.
use strict;
use warnings;
use utf8;
use FindBin qw($RealBin);

# All three standard streams carry UTF-8 text.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Directory holding the installed nonbreaking_prefix.<language> files,
# located relative to this script.
my $mydir = "$RealBin/../../share/nonbreaking_prefixes";

# Known non-breaking prefixes: value 1 = never break after the prefix,
# value 2 = do not break only when a number follows (#NUMERIC_ONLY#).
my %NONBREAKING_PREFIX = ();

my $language   = "en";   # -l: language code
my $prefixfile = "";     # -p: custom prefix file ("" = use the installed one)
my $is_cjk     = 0;      # set to 1 later for the languages given CJK handling
my $QUIET      = 0;      # -q: quiet mode
my $HELP       = 0;      # -h: print usage and exit
my $LIST_ITEM  = 0;      # -i: avoid splitting on list items
my $NOP        = 0;      # -n: do not emit <P> after paragraphs
my $KEEP_LINES = 0;      # -k: keep existing line boundaries
# Parse the command line.  -l and -p consume the following argument as their
# value; the remaining switches are simple flags.  Arguments that match no
# known option are silently ignored.
while (@ARGV) {
    my $opt = shift @ARGV;
    if ($opt =~ /^-l$/) {
        # Without this check a trailing "-l" would leave $language undefined
        # and trigger "uninitialized value" warnings further down.
        die "ERROR: option -l requires a language code\n" unless @ARGV;
        $language = shift @ARGV;
    }
    elsif ($opt =~ /^-p$/) {
        die "ERROR: option -p requires a prefix file name\n" unless @ARGV;
        $prefixfile = shift @ARGV;
    }
    elsif ($opt =~ /^-q$/) { $QUIET = 1; }
    elsif ($opt =~ /^-h$/) { $HELP = 1; }
    elsif ($opt =~ /^-i$/) { $LIST_ITEM = 1; }
    elsif ($opt =~ /^-n$/) { $NOP = 1; }
    elsif ($opt =~ /^-k$/) { $KEEP_LINES = 1; }
    elsif ($opt =~ /^-b$/) { $| = 1; }    # no output buffering
}
# -h: print the usage summary on STDOUT and stop without reading any input.
if ($HELP) {
    print
        "Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] < textfile > splitfile\n",
        "-q: quiet mode\n",
        "-b: no output buffering (for use in bidirectional pipes)\n",
        "-p: use a custom prefix file, overriding the installed one\n",
        "-i: avoid splitting on list items (e.g. 1. This is the first)\n",
        "-n: do not emit <P> after paragraphs\n",
        "-k: keep existing line boundaries\n";
    exit;
}
# Announce the tool and the selected language on STDERR unless -q was given.
unless ($QUIET) {
    print STDERR "Sentence Splitter v3\n", "Language: $language\n";
}

# Switch on the CJK-specific rules for the language codes yue, zh and ja.
# Korean (ko) is deliberately not in this set.
$is_cjk = 1 if grep { $language eq $_ } qw(yue zh ja);
# Choose the non-breaking prefix file: a file given with -p wins; otherwise
# use the installed file for $language, falling back to the English one.
if ($prefixfile ne "") {
    print STDERR "Loading non-breaking prefixes from $prefixfile\n";
} else {
    $prefixfile = "$mydir/nonbreaking_prefix.$language";
    # Default to English, if we don't have a language-specific prefix file.
    if (!(-e $prefixfile)) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }
}

# Read the prefix file into %NONBREAKING_PREFIX.  Blank lines and lines
# starting with "#" are skipped.  An entry followed by "#NUMERIC_ONLY#" is
# stored with value 2, every other entry with value 1.
if (-e $prefixfile) {
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die "Cannot open $prefixfile: $!";
    while (my $item = <$prefix_fh>) {
        chomp($item);
        next unless $item;
        next if substr($item, 0, 1) eq "#";
        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
            $NONBREAKING_PREFIX{$1} = 2;
        } else {
            $NONBREAKING_PREFIX{$item} = 1;
        }
    }
    close($prefix_fh);
} else {
    # Only reachable for a file given with -p: the installed-file path above
    # dies when nothing is found.  Say so instead of silently running with
    # no prefixes after announcing that they are being loaded.
    print STDERR "WARNING: prefix file $prefixfile does not exist; no non-breaking prefixes loaded\n";
}
## Read STDIN line by line.  With -k every line is split on its own.
## Otherwise lines are collected into a paragraph buffer that is flushed
## whenever a blank line or a line consisting of a single <...> tag is seen.
my $paragraph = "";
while (<STDIN>) {
    chomp;

    # -k: treat each input line as an independent paragraph.
    if ($KEEP_LINES) {
        do_it_for($_, "");
        next;
    }

    # Ordinary text line: add it to the buffer, followed by a space.
    unless (/^<.+>$/ || /^\s*$/) {
        $paragraph .= "$_ ";
        next;
    }

    # Blank line or markup line: process what has been collected so far.
    do_it_for($paragraph, $_);
    # A blank line after real text marks a paragraph end, unless -n was given.
    if ($NOP == 0 && /^\s*$/ && $paragraph) {
        print "<P>\n";
    }
    $paragraph = "";
}

# Flush whatever is still buffered when the input ends.
do_it_for($paragraph, "") if $paragraph;
# Print one processed paragraph, followed by an optional markup line.
#
#   $text   - paragraph text; passed through preprocess() and printed
#             unless it is the empty string.
#   $markup - the input line that ended the paragraph; echoed on its own
#             line only when it looks like a single <...> tag.
sub do_it_for {
    my ($text, $markup) = @_;
    # Compare against "" rather than testing truth: a plain truth test
    # treats the string "0" as false, so under -k an input line containing
    # only "0" would vanish from the output.
    print preprocess($text) if defined $text && $text ne "";
    print "$markup\n" if ($markup =~ /^<.+>$/);
    return;
}
# Split one paragraph into sentences.
#
#   $text - one paragraph of text.
#
# Returns the text with "\n" inserted at every detected sentence boundary,
# runs of spaces collapsed, and a terminating "\n" guaranteed.
#
# Reads the file-level settings $language, $is_cjk, $LIST_ITEM and
# %NONBREAKING_PREFIX; none of them is modified here.
sub preprocess {
    # Argument is one paragraph.
    my($text) = @_;

    # Clean up spaces at head and tail of each line, as well as
    # any double-spacing.
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    ##### Add sentence breaks as needed #####

    # Sentences can start with upper-case, numbers, or Indic characters.
    # $sentence_start is the inside of a character class; it is interpolated
    # into the [...] brackets of the patterns below.
    my $sentence_start = "\\p{IsUpper}0-9";
    $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
    $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
    $sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq "bn" || $language eq "mni");
    $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn";
    $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml";
    $sentence_start .= "\\p{Block: Oriya}" if $language eq "or";
    $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa";
    $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta";
    $sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
    $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
    $sentence_start .= "\\p{Arabic}" if $language eq "fa";

    # we include danda and double danda (U+0964 and U+0965) as sentence split characters

    # Non-period end of sentence markers (?!) followed by sentence starters.
    $text =~ s/([?!؟\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;

    # Multi-dots followed by sentence starters.
    $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;

    # Add breaks for sentences that end with some sort of punctuation
    # inside a quote or parenthetical and are followed by a possible
    # sentence starter punctuation and upper case.
    $text =~ s/([?!؟\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;

    # Add breaks for sentences that end with some sort of punctuation,
    # and are followed by a sentence starter punctuation and upper case.
    $text =~ s/([?!؟\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;

    #NOTE: Korean no longer handled here, cos Korean has spaces.
    if ($is_cjk == 1) {
        # Chinese uses unusual end-of-sentence markers. These are NOT
        # followed by whitespace. Nor is there any idea of capitalization.
        # There does not appear to be any unicode category for full-stops
        # in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01
        #$text =~ s/([。.?!♪])/$1\n/g;
        $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;

        # A normal full-stop or other Western sentence enders followed
        # by an ideograph is an end-of-sentence, always.
        $text =~ s/([\.?!؟]) *(\p{CJK})/$1\n$2/g;

        # Split close-paren-then-comma into two.
        $text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;

        # Chinese does not use any sort of white-space between ideographs.
        # Nominally, each single ideograph corresponds to one word. Add
        # spaces here, so that later processing stages can tokenize readily.
        # Note that this handles mixed latinate+CJK.
        # TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?

        # bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
        # what we want from sentence-splitter!
        #$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
        #$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
        #$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
        #$text =~ s/ +/ /g;
    }

    # Urdu support
    # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
    if ($language eq 'ur') {
        $text =~ s{
            ( (?: [\.\?!\x{06d4}] | \.\.+ )
                [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
            )
            \s+
            ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
                [\x{0600}-\x{06ff}]
            )
        }{$1\n$2}gx;
    }

    # Special punctuation cases are covered. Check all remaining periods.
    # The text is cut into tokens at horizontal whitespace and rebuilt token
    # by token; $i is declared outside the loop because the final token is
    # appended after it.
    my $i;
    my @words = split(/\h/,$text);
    #print "NOW $text\n";
    $text = "";
    for ($i=0;$i<(scalar(@words)-1);$i++) {
        #print "Checking $words[$i] $words[$i+1]\n";
        if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
            # Check if $1 is a known honorific and $2 is empty, never break.
            # Both captures are copied right away because the pattern
            # matches in the elsif conditions below overwrite $1 and $2.
            my $prefix = $1;
            my $starting_punct = $2;
            if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
                # Not breaking;
                ## print "NBP1 $words[$i] $words[$i+1]\n";
            } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
                # Not breaking - upper case acronym
                #print "NBP2 $words[$i] $words[$i+1]\n";
            } elsif ($LIST_ITEM
                && ($i == 0 || substr($words[$i-1], -1) eq "\n")
                && $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
                # Maybe list item - non breaking
                #print "NBP3 $words[$i] $words[$i+1]\n";
            } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
                # The next word has a bunch of initial quotes, maybe a
                # space, then either upper case or a number
                #print "MAYBE $words[$i] $words[$i+1]\n";
                $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
                # We always add a return for these, unless we have a
                # numeric non-breaker and a number start.
            }
        }
        $text = $text.$words[$i]." ";
    }

    # We stopped one token from the end to allow for easy look-ahead.
    # Append it now.  If the paragraph produced no tokens at all (empty or
    # whitespace-only input) there is nothing to append, and touching
    # $words[0] would only raise an "uninitialized value" warning.
    $text = $text.$words[$i] if @words;

    # Clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # Add trailing break.
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
|