sts1 / uroman /bin /de-accent.pl
multimodalart's picture
First commit
7bcf8d7
#!/usr/bin/perl -w
# print_version()
#
# Writes the version banner (script name as invoked, author, date of the
# last change) to STDERR.  Takes no arguments.
sub print_version {
    my @banner = (
        "$0 version 1.1\n",
        " Author: Ulf Hermjakob\n",
        " Last changed: March 14, 2011\n",
    );
    # One print per line, so the output is produced exactly as three
    # separate writes to STDERR.
    foreach my $banner_line (@banner) {
        print STDERR $banner_line;
    }
}
# print_usage()
#
# Writes a short usage summary to STDERR: the invocation line (input is
# read from standard input, output goes to standard output) followed by
# the recognised options.  Takes no arguments.
sub print_usage {
    my @usage = (
        "$0 [options] < with_accents.txt > without_accents.txt\n",
        " -h or -help\n",
        " -v or -version\n",
    );
    # One print per line, mirroring three independent writes to STDERR.
    foreach my $usage_line (@usage) {
        print STDERR $usage_line;
    }
}
# de_accent_string($s)
#
# Returns a copy of $s in which accented Latin letters are replaced by
# their plain ASCII base letters (ligatures and a few special letters
# expand to two letters, e.g. Æ -> Ae, ß -> ss, Þ -> Th), and accented
# Greek and Cyrillic letters are replaced by their unaccented Greek or
# Cyrillic counterparts.  Everything else is passed through unchanged.
#
# Parameters:
#   $s - the text to process.  The function works on UTF-8 *encoded bytes*:
#        every block guard below is a byte-range pattern and every literal
#        is matched as its UTF-8 byte sequence.
# Returns:
#   the de-accented string (the caller's variable is not modified).
sub de_accent_string {
    my ($s) = @_;

    # Keep the non-ASCII literals below byte-oriented even if this sub is
    # ever compiled in a scope where "use utf8" is active; the guards and
    # the \x.. escapes rely on byte semantics.
    no utf8;

    # Each "if" is a cheap pre-check for the UTF-8 lead/continuation bytes
    # of one Unicode block, so lines without such characters skip the whole
    # group of substitutions.

    # Latin-1 (U+00C0..U+00FF)
    if ($s =~ /\xC3[\x80-\xBF]/) {
        $s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
        $s =~ s/Æ/Ae/g;
        $s =~ s/Ç/C/g;
        $s =~ s/Ð/D/g;
        $s =~ s/(È|É|Ê|Ë)/E/g;
        $s =~ s/(Ì|Í|Î|Ï)/I/g;
        $s =~ s/Ñ/N/g;
        $s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
        $s =~ s/(Ù|Ú|Û|Ü)/U/g;
        $s =~ s/Þ/Th/g;
        $s =~ s/Ý/Y/g;
        $s =~ s/(à|á|â|ã|ä|å)/a/g;
        $s =~ s/æ/ae/g;
        $s =~ s/ç/c/g;
        $s =~ s/(è|é|ê|ë)/e/g;
        $s =~ s/(ì|í|î|ï)/i/g;
        $s =~ s/ð/d/g;
        $s =~ s/ñ/n/g;
        # ø is handled like its upper-case counterpart Ø above.
        $s =~ s/(ò|ó|ô|õ|ö|ø)/o/g;
        $s =~ s/ß/ss/g;
        $s =~ s/þ/th/g;
        $s =~ s/(ù|ú|û|ü)/u/g;
        $s =~ s/(ý|ÿ)/y/g;
    }

    # Latin Extended-A (U+0100..U+017F)
    if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
        $s =~ s/(Ā|Ă|Ą)/A/g;
        $s =~ s/(ā|ă|ą)/a/g;
        $s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
        $s =~ s/(ć|ĉ|ċ|č)/c/g;
        $s =~ s/(Ď|Đ)/D/g;
        $s =~ s/(ď|đ)/d/g;
        $s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
        $s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
        $s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
        $s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
        $s =~ s/(Ĥ|Ħ)/H/g;
        $s =~ s/(ĥ|ħ)/h/g;
        $s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
        $s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
        # The IJ ligatures U+0132 / U+0133 are spelled as explicit UTF-8
        # bytes so they can never be confused with (or normalised into) the
        # two plain ASCII letters "IJ" / "ij", which must stay untouched.
        $s =~ s/\xC4\xB2/Ij/g;
        $s =~ s/\xC4\xB3/ij/g;
        $s =~ s/Ĵ/J/g;
        $s =~ s/ĵ/j/g;
        $s =~ s/Ķ/K/g;
        $s =~ s/(ķ|ĸ)/k/g;
        $s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
        $s =~ s/(ĺ|ļ|ľ|ŀ|ł)/l/g;
        $s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
        # \xC5\x89 is U+0149 (n preceded by apostrophe), again written as
        # bytes to keep it a single code point.
        $s =~ s/(ń|ņ|ň|\xC5\x89|ŋ)/n/g;
        $s =~ s/(Ō|Ŏ|Ő)/O/g;
        $s =~ s/(ō|ŏ|ő)/o/g;
        $s =~ s/Œ/Oe/g;
        $s =~ s/œ/oe/g;
        $s =~ s/(Ŕ|Ŗ|Ř)/R/g;
        $s =~ s/(ŕ|ŗ|ř)/r/g;
        $s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
        $s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
        $s =~ s/(Ţ|Ť|Ŧ)/T/g;
        $s =~ s/(ţ|ť|ŧ)/t/g;
        $s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
        $s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
        $s =~ s/Ŵ/W/g;
        $s =~ s/ŵ/w/g;
        $s =~ s/(Ŷ|Ÿ)/Y/g;
        $s =~ s/ŷ/y/g;
        $s =~ s/(Ź|Ż|Ž)/Z/g;
        $s =~ s/(ź|ż|ž)/z/g;
    }

    # Latin Extended Additional (U+1E00..U+1EFF)
    if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
        $s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
        $s =~ s/(ḃ|ḅ|ḇ)/b/g;
        $s =~ s/(ḉ)/c/g;
        $s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
        $s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
        $s =~ s/(ḟ)/f/g;
        $s =~ s/(ḡ)/g/g;
        $s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
        $s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
        $s =~ s/(ḱ|ḳ|ḵ)/k/g;
        $s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
        $s =~ s/(ḿ|ṁ|ṃ)/m/g;
        # Accented n maps to "n" (not "m"), matching the upper-case rule.
        $s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;
        $s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
        $s =~ s/(ṕ|ṗ)/p/g;
        $s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
        $s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
        $s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
        $s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
        $s =~ s/(ṽ|ṿ)/v/g;
        $s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
        $s =~ s/(ẋ|ẍ)/x/g;
        $s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
        $s =~ s/(ẑ|ẓ|ẕ)/z/g;
        $s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
        $s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
        $s =~ s/(Ḉ)/C/g;
        $s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
        $s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
        $s =~ s/(Ḟ)/F/g;
        $s =~ s/(Ḡ)/G/g;
        $s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
        $s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
        $s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
        $s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
        $s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
        $s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
        $s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
        $s =~ s/(Ṕ|Ṗ)/P/g;
        $s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
        $s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
        $s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
        $s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
        $s =~ s/(Ṽ|Ṿ)/V/g;
        $s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
        $s =~ s/(Ẋ|Ẍ)/X/g;
        $s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
        $s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
    }

    # Greek letters (U+0380..U+03FF).  The guard covers both UTF-8 lead
    # bytes of the block: the lower-case accented letters handled below
    # are encoded as \xCE\xAC..\xCE\xB0 and \xCF\x8A..\xCF\x8E, so a guard
    # limited to the upper-case range would skip all-lower-case text.
    if ($s =~ /[\xCE\xCF][\x80-\xBF]/) {
        $s =~ s/ά/α/g;
        $s =~ s/έ/ε/g;
        $s =~ s/ή/η/g;
        $s =~ s/ί/ι/g;
        $s =~ s/ϊ/ι/g;
        $s =~ s/ΐ/ι/g;
        $s =~ s/ό/ο/g;
        $s =~ s/ύ/υ/g;
        $s =~ s/ϋ/υ/g;
        $s =~ s/ΰ/υ/g;
        $s =~ s/ώ/ω/g;
        $s =~ s/Ά/Α/g;
        $s =~ s/Έ/Ε/g;
        $s =~ s/Ή/Η/g;
        $s =~ s/Ί/Ι/g;
        $s =~ s/Ϊ/Ι/g;
        $s =~ s/Ό/Ο/g;
        $s =~ s/Ύ/Υ/g;
        $s =~ s/Ϋ/Υ/g;
        $s =~ s/Ώ/Ω/g;
    }

    # Cyrillic letters (U+0400..U+047F).  As for Greek, both lead bytes
    # are checked: й is \xD0\xB9 and ѐ/ё/ѓ/ќ/ѝ are \xD1\x9x, none of which
    # fall in the upper-case-only range \xD0[\x80-\xAF].
    if ($s =~ /[\xD0\xD1][\x80-\xBF]/) {
        $s =~ s/Ѐ/Е/g;
        $s =~ s/Ё/Е/g;
        $s =~ s/Ѓ/Г/g;
        $s =~ s/Ќ/К/g;
        $s =~ s/Ѝ/И/g;
        $s =~ s/Й/И/g;
        $s =~ s/ѐ/е/g;
        $s =~ s/ё/е/g;
        $s =~ s/ѓ/г/g;
        $s =~ s/ќ/к/g;
        $s =~ s/ѝ/и/g;
        $s =~ s/й/и/g;
    }

    return $s;
}
# --- Script entry point -------------------------------------------------
#
# 1. Consume every command-line argument.  A help or version request
#    prints the corresponding text to STDERR and terminates with exit
#    status 1; anything else is reported on STDERR and ignored.
#    @ARGV is emptied here, so the <> read loop below takes its input from
#    standard input, never from files named on the command line.
for my $option (splice(@ARGV)) {
    if ($option =~ /^-*(h|help)$/i) {
        print_usage();
        exit 1;
    }
    if ($option =~ /^-*(v|version)$/i) {
        print_version();
        exit 1;
    }
    print STDERR "Ignoring unrecognized argument $option\n";
}

# 2. Filter the input line by line: each line is de-accented and written
#    to standard output.
my $lines_seen = 0;    # running count of input lines; not used for output
while (defined(my $input_line = <>)) {
    $lines_seen++;
    my $plain_line = de_accent_string($input_line);
    print $plain_line;
}

exit 0;