Spaces:
Sleeping
Sleeping
package ExtUtils::MakeMaker::Locale; | |
use strict; | |
use warnings; | |
our $VERSION = "7.62"; | |
$VERSION =~ tr/_//d; | |
use base 'Exporter'; | |
our @EXPORT_OK = qw( | |
decode_argv env | |
$ENCODING_LOCALE $ENCODING_LOCALE_FS | |
$ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT | |
); | |
use Encode (); | |
use Encode::Alias (); | |
our $ENCODING_LOCALE; | |
our $ENCODING_LOCALE_FS; | |
our $ENCODING_CONSOLE_IN; | |
our $ENCODING_CONSOLE_OUT; | |
sub DEBUG () { 0 } | |
sub _init { | |
if ($^O eq "MSWin32") { | |
unless ($ENCODING_LOCALE) { | |
# Try to obtain what the Windows ANSI code page is | |
eval { | |
unless (defined &GetConsoleCP) { | |
require Win32; | |
# manually "import" it since Win32->import refuses | |
*GetConsoleCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP; | |
} | |
unless (defined &GetConsoleCP) { | |
require Win32::API; | |
Win32::API->Import('kernel32', 'int GetConsoleCP()'); | |
} | |
if (defined &GetConsoleCP) { | |
my $cp = GetConsoleCP(); | |
$ENCODING_LOCALE = "cp$cp" if $cp; | |
} | |
}; | |
} | |
unless ($ENCODING_CONSOLE_IN) { | |
# only test one since set together | |
unless (defined &GetInputCP) { | |
eval { | |
require Win32; | |
eval { | |
local $SIG{__WARN__} = sub {} if ( "$]" < 5.014 ); # suppress deprecation warning for inherited AUTOLOAD of Win32::GetConsoleCP() | |
Win32::GetConsoleCP(); | |
}; | |
# manually "import" it since Win32->import refuses | |
*GetInputCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP; | |
*GetOutputCP = sub { &Win32::GetConsoleOutputCP } if defined &Win32::GetConsoleOutputCP; | |
}; | |
unless (defined &GetInputCP) { | |
eval { | |
# try Win32::Console module for codepage to use | |
require Win32::Console; | |
*GetInputCP = sub { &Win32::Console::InputCP } | |
if defined &Win32::Console::InputCP; | |
*GetOutputCP = sub { &Win32::Console::OutputCP } | |
if defined &Win32::Console::OutputCP; | |
}; | |
} | |
unless (defined &GetInputCP) { | |
# final fallback | |
*GetInputCP = *GetOutputCP = sub { | |
# another fallback that could work is: | |
# reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP | |
((qx(chcp) || '') =~ /^Active code page: (\d+)/) | |
? $1 : (); | |
}; | |
} | |
} | |
my $cp = GetInputCP(); | |
$ENCODING_CONSOLE_IN = "cp$cp" if $cp; | |
$cp = GetOutputCP(); | |
$ENCODING_CONSOLE_OUT = "cp$cp" if $cp; | |
} | |
} | |
unless ($ENCODING_LOCALE) { | |
eval { | |
require I18N::Langinfo; | |
$ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET()); | |
# Workaround of Encode < v2.25. The "646" encoding alias was | |
# introduced in Encode-2.25, but we don't want to require that version | |
# quite yet. Should avoid the CPAN testers failure reported from | |
# openbsd-4.7/perl-5.10.0 combo. | |
$ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646"; | |
# https://rt.cpan.org/Ticket/Display.html?id=66373 | |
$ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8"; | |
}; | |
$ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN; | |
} | |
# Workaround of Encode < v2.71 for "cp65000" and "cp65001" | |
# The "cp65000" and "cp65001" aliases were added in [Encode v2.71](https://github.com/dankogai/p5-encode/commit/7874bd95aa10967a3b5dbae333d16bcd703ac6c6) | |
# via commit <https://github.com/dankogai/p5-encode/commit/84b9c1101d5251d37e226f80d1c6781718779047>. | |
# This will avoid test failures for Win32 machines using the UTF-7 or UTF-8 code pages. | |
$ENCODING_LOCALE = 'UTF-7' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65000"; | |
$ENCODING_LOCALE = 'utf-8-strict' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65001"; | |
if ($^O eq "darwin") { | |
$ENCODING_LOCALE_FS ||= "UTF-8"; | |
} | |
# final fallback | |
$ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8"; | |
$ENCODING_LOCALE_FS ||= $ENCODING_LOCALE; | |
$ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE; | |
$ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN; | |
unless (Encode::find_encoding($ENCODING_LOCALE)) { | |
my $foundit; | |
if (lc($ENCODING_LOCALE) eq "gb18030") { | |
eval { | |
require Encode::HanExtra; | |
}; | |
if ($@) { | |
die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped"; | |
} | |
$foundit++ if Encode::find_encoding($ENCODING_LOCALE); | |
} | |
die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped" | |
unless $foundit; | |
} | |
# use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT; | |
} | |
_init(); | |
Encode::Alias::define_alias(sub { | |
no strict 'refs'; | |
no warnings 'once'; | |
return ${"ENCODING_" . uc(shift)}; | |
}, "locale"); | |
sub _flush_aliases { | |
no strict 'refs'; | |
for my $a (sort keys %Encode::Alias::Alias) { | |
if (defined ${"ENCODING_" . uc($a)}) { | |
delete $Encode::Alias::Alias{$a}; | |
warn "Flushed alias cache for $a" if DEBUG; | |
} | |
} | |
} | |
sub reinit { | |
$ENCODING_LOCALE = shift; | |
$ENCODING_LOCALE_FS = shift; | |
$ENCODING_CONSOLE_IN = $ENCODING_LOCALE; | |
$ENCODING_CONSOLE_OUT = $ENCODING_LOCALE; | |
_init(); | |
_flush_aliases(); | |
} | |
sub decode_argv { | |
die if defined wantarray; | |
for (@ARGV) { | |
$_ = Encode::decode(locale => $_, @_); | |
} | |
} | |
sub env { | |
my $k = Encode::encode(locale => shift); | |
my $old = $ENV{$k}; | |
if (@_) { | |
my $v = shift; | |
if (defined $v) { | |
$ENV{$k} = Encode::encode(locale => $v); | |
} | |
else { | |
delete $ENV{$k}; | |
} | |
} | |
return Encode::decode(locale => $old) if defined wantarray; | |
} | |
1; | |
__END__ | |
=head1 NAME | |
ExtUtils::MakeMaker::Locale - bundled Encode::Locale | |
=head1 SYNOPSIS | |
use Encode::Locale; | |
use Encode; | |
$string = decode(locale => $bytes); | |
$bytes = encode(locale => $string); | |
if (-t) { | |
binmode(STDIN, ":encoding(console_in)"); | |
binmode(STDOUT, ":encoding(console_out)"); | |
binmode(STDERR, ":encoding(console_out)"); | |
} | |
# Processing file names passed in as arguments | |
my $uni_filename = decode(locale => $ARGV[0]); | |
open(my $fh, "<", encode(locale_fs => $uni_filename)) | |
|| die "Can't open '$uni_filename': $!"; | |
binmode($fh, ":encoding(locale)"); | |
... | |
=head1 DESCRIPTION | |
In many applications it's wise to let Perl use Unicode for the strings it | |
processes. Most of the interfaces Perl has to the outside world are still byte | |
based. Programs therefore need to decode byte strings that enter the program | |
from the outside and encode them again on the way out. | |
The POSIX locale system is used to specify both the language conventions | |
requested by the user and the preferred character set to consume and | |
output. The C<Encode::Locale> module looks up the charset and encoding (called | |
a CODESET in the locale jargon) and arranges for the L<Encode> module to know | |
this encoding under the name "locale". It means bytes obtained from the | |
environment can be converted to Unicode strings by calling C<< | |
Encode::encode(locale => $bytes) >> and converted back again with C<< | |
Encode::decode(locale => $string) >>. | |
Where file systems interfaces pass file names in and out of the program we also | |
need care. The trend is for operating systems to use a fixed file encoding | |
that don't actually depend on the locale; and this module determines the most | |
appropriate encoding for file names. The L<Encode> module will know this | |
encoding under the name "locale_fs". For traditional Unix systems this will | |
be an alias to the same encoding as "locale". | |
For programs running in a terminal window (called a "Console" on some systems) | |
the "locale" encoding is usually a good choice for what to expect as input and | |
output. Some systems allows us to query the encoding set for the terminal and | |
C<Encode::Locale> will do that if available and make these encodings known | |
under the C<Encode> aliases "console_in" and "console_out". For systems where | |
we can't determine the terminal encoding these will be aliased as the same | |
encoding as "locale". The advice is to use "console_in" for input known to | |
come from the terminal and "console_out" for output to the terminal. | |
In addition to arranging for various Encode aliases the following functions and | |
variables are provided: | |
=over | |
=item decode_argv( ) | |
=item decode_argv( Encode::FB_CROAK ) | |
This will decode the command line arguments to perl (the C<@ARGV> array) in-place. | |
The function will by default replace characters that can't be decoded by | |
"\x{FFFD}", the Unicode replacement character. | |
Any argument provided is passed as CHECK to underlying Encode::decode() call. | |
Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the | |
command line arguments can be decoded. See L<Encode/"Handling Malformed Data"> | |
for details on other options for CHECK. | |
=item env( $uni_key ) | |
=item env( $uni_key => $uni_value ) | |
Interface to get/set environment variables. Returns the current value as a | |
Unicode string. The $uni_key and $uni_value arguments are expected to be | |
Unicode strings as well. Passing C<undef> as $uni_value deletes the | |
environment variable named $uni_key. | |
The returned value will have the characters that can't be decoded replaced by | |
"\x{FFFD}", the Unicode replacement character. | |
There is no interface to request alternative CHECK behavior as for | |
decode_argv(). If you need that you need to call encode/decode yourself. | |
For example: | |
my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK); | |
my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK); | |
=item reinit( ) | |
=item reinit( $encoding ) | |
Reinitialize the encodings from the locale. You want to call this function if | |
you changed anything in the environment that might influence the locale. | |
This function will croak if the determined encoding isn't recognized by | |
the Encode module. | |
With argument force $ENCODING_... variables to set to the given value. | |
=item $ENCODING_LOCALE | |
The encoding name determined to be suitable for the current locale. | |
L<Encode> know this encoding as "locale". | |
=item $ENCODING_LOCALE_FS | |
The encoding name determined to be suitable for file system interfaces | |
involving file names. | |
L<Encode> know this encoding as "locale_fs". | |
=item $ENCODING_CONSOLE_IN | |
=item $ENCODING_CONSOLE_OUT | |
The encodings to be used for reading and writing output to the a console. | |
L<Encode> know these encodings as "console_in" and "console_out". | |
=back | |
=head1 NOTES | |
This table summarizes the mapping of the encodings set up | |
by the C<Encode::Locale> module: | |
Encode | | | | |
Alias | Windows | Mac OS X | POSIX | |
------------+---------+--------------+------------ | |
locale | ANSI | nl_langinfo | nl_langinfo | |
locale_fs | ANSI | UTF-8 | nl_langinfo | |
console_in | OEM | nl_langinfo | nl_langinfo | |
console_out | OEM | nl_langinfo | nl_langinfo | |
=head2 Windows | |
Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16 | |
strings) and a byte based API based a character set called ANSI. The | |
regular Perl interfaces to the OS currently only uses the ANSI APIs. | |
Unfortunately ANSI is not a single character set. | |
The encoding that corresponds to ANSI varies between different editions of | |
Windows. For many western editions of Windows ANSI corresponds to CP-1252 | |
which is a character set similar to ISO-8859-1. Conceptually the ANSI | |
character set is a similar concept to the POSIX locale CODESET so this module | |
figures out what the ANSI code page is and make this available as | |
$ENCODING_LOCALE and the "locale" Encoding alias. | |
Windows systems also operate with another byte based character set. | |
It's called the OEM code page. This is the encoding that the Console | |
takes as input and output. It's common for the OEM code page to | |
differ from the ANSI code page. | |
=head2 Mac OS X | |
On Mac OS X the file system encoding is always UTF-8 while the locale | |
can otherwise be set up as normal for POSIX systems. | |
File names on Mac OS X will at the OS-level be converted to | |
NFD-form. A file created by passing a NFC-filename will come | |
in NFD-form from readdir(). See L<Unicode::Normalize> for details | |
of NFD/NFC. | |
Actually, Apple does not follow the Unicode NFD standard since not all | |
character ranges are decomposed. The claim is that this avoids problems with | |
round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for | |
details. | |
=head2 POSIX (Linux and other Unixes) | |
File systems might vary in what encoding is to be used for | |
filenames. Since this module has no way to actually figure out | |
what the is correct it goes with the best guess which is to | |
assume filenames are encoding according to the current locale. | |
Users are advised to always specify UTF-8 as the locale charset. | |
=head1 SEE ALSO | |
L<I18N::Langinfo>, L<Encode>, L<Term::Encoding> | |
=head1 AUTHOR | |
Copyright 2010 Gisle Aas <[email protected]>. | |
This library is free software; you can redistribute it and/or | |
modify it under the same terms as Perl itself. | |
=cut | |