.\" Current File : //opt/RZperl518/man/man3/utf8.3
.\" Automatically generated by Pod::Man 2.27 (Pod::Simple 3.28)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{
.    if \nF \{
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "utf8 3"
.TH utf8 3 "2014-10-01" "perl v5.18.4" "Perl Programmers Reference Guide"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
utf8 \- Perl pragma to enable/disable UTF\-8 (or UTF\-EBCDIC) in source code
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\&    use utf8;
\&    no utf8;
\&
\&    # Convert the internal representation of a Perl scalar to/from UTF\-8.
\&
\&    $num_octets = utf8::upgrade($string);
\&    $success    = utf8::downgrade($string[, FAIL_OK]);
\&
\&    # Change each character of a Perl scalar to/from a series of
\&    # characters that represent the UTF\-8 bytes of each original character.
\&
\&    utf8::encode($string);  # "\ex{100}"  becomes "\exc4\ex80"
\&    utf8::decode($string);  # "\exc4\ex80" becomes "\ex{100}"
\&
\&    $flag = utf8::is_utf8(STRING); # since Perl 5.8.1
\&    $flag = utf8::valid(STRING);
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The \f(CW\*(C`use utf8\*(C'\fR pragma tells the Perl parser to allow \s-1UTF\-8\s0 in the
program text in the current lexical scope (allow UTF-EBCDIC on \s-1EBCDIC\s0 based
platforms).  The \f(CW\*(C`no utf8\*(C'\fR pragma tells Perl to switch back to treating
the source text as literal bytes in the current lexical scope.
.PP
\&\fBDo not use this pragma for anything other than telling Perl that your
script is written in \s-1UTF\-8.\s0\fR The utility functions described below are
directly usable without \f(CW\*(C`use utf8;\*(C'\fR.
.PP
Because it is not possible to reliably tell \s-1UTF\-8\s0 from native 8 bit
encodings, you need either a Byte Order Mark at the beginning of your
source code, or \f(CW\*(C`use utf8;\*(C'\fR, to instruct perl.
.PP
When \s-1UTF\-8\s0 becomes the standard source format, this pragma will
effectively become a no-op.  For convenience in what follows the term
\&\fIUTF-X\fR is used to refer to \s-1UTF\-8\s0 on \s-1ASCII\s0 and \s-1ISO\s0 Latin based
platforms and UTF-EBCDIC on \s-1EBCDIC\s0 based platforms.
.PP
See also the effects of the \f(CW\*(C`\-C\*(C'\fR switch and its cousin, the
\&\f(CW$ENV{PERL_UNICODE}\fR, in perlrun.
.PP
Enabling the \f(CW\*(C`utf8\*(C'\fR pragma has the following effect:
.IP "\(bu" 4
Bytes in the source text that have their high-bit set will be treated
as being part of a literal UTF-X sequence.  This includes most
literals such as identifier names, string constants, and constant
regular expression patterns.
.Sp
On \s-1EBCDIC\s0 platforms characters in the Latin 1 character set are
treated as being part of a literal UTF-EBCDIC character.
.PP
Note that if you have bytes with the eighth bit on in your script
(for example embedded Latin\-1 in your string literals), \f(CW\*(C`use utf8\*(C'\fR
will be unhappy since the bytes are most probably not well-formed
UTF-X.  If you want to have such bytes under \f(CW\*(C`use utf8\*(C'\fR, you can disable
this pragma until the end of the block (or file, if at top level) by
\&\f(CW\*(C`no utf8;\*(C'\fR.
.SS "Utility functions"
.IX Subsection "Utility functions"
The following functions are defined in the \f(CW\*(C`utf8::\*(C'\fR package by the
Perl core.  You do not need to say \f(CW\*(C`use utf8\*(C'\fR to use these and in fact
you should not say that unless you really want to have \s-1UTF\-8\s0 source code.
.IP "\(bu" 4
\&\f(CW$num_octets\fR = utf8::upgrade($string)
.Sp
Converts in-place the internal representation of the string from an octet
sequence in the native encoding (Latin\-1 or \s-1EBCDIC\s0) to \fIUTF-X\fR. The
logical character sequence itself is unchanged.  If \fI\f(CI$string\fI\fR is already
stored as \fIUTF-X\fR, then this is a no-op. Returns the
number of octets necessary to represent the string as \fIUTF-X\fR.  Can be
used to make sure that the \s-1UTF\-8\s0 flag is on, so that \f(CW\*(C`\ew\*(C'\fR or \f(CW\*(C`lc()\*(C'\fR
work as Unicode on strings containing characters in the range 0x80\-0xFF
(on \s-1ASCII\s0 and derivatives).
.Sp
\&\fBNote that this function does not handle arbitrary encodings.\fR
Therefore Encode is recommended for general purposes; see also
Encode.
.IP "\(bu" 4
\&\f(CW$success\fR = utf8::downgrade($string[, \s-1FAIL_OK\s0])
.Sp
Converts in-place the internal representation of the string from
\&\fIUTF-X\fR to the equivalent octet sequence in the native encoding (Latin\-1
or \s-1EBCDIC\s0). The logical character sequence itself is unchanged. If
\&\fI\f(CI$string\fI\fR is already stored as native 8 bit, then this is a no-op.  Can
be used to
make sure that the \s-1UTF\-8\s0 flag is off, e.g. when you want to make sure
that the \fIsubstr()\fR or \fIlength()\fR function works with the usually faster
byte algorithm.
.Sp
Fails if the original \fIUTF-X\fR sequence cannot be represented in the
native 8 bit encoding. On failure dies or, if the value of \f(CW\*(C`FAIL_OK\*(C'\fR is
true, returns false.
.Sp
Returns true on success.
.Sp
\&\fBNote that this function does not handle arbitrary encodings.\fR
Therefore Encode is recommended for general purposes; see also
Encode.
.IP "\(bu" 4
utf8::encode($string)
.Sp
Converts in-place the character sequence to the corresponding octet
sequence in \fIUTF-X\fR. That is, every (possibly wide) character gets
replaced with a sequence of one or more characters that represent the
individual \fIUTF-X\fR bytes of the character.  The \s-1UTF8\s0 flag is turned off.
Returns nothing.
.Sp
.Vb 2
\&    my $a = "\ex{100}"; # $a contains one character, with ord 0x100
\&    utf8::encode($a);  # $a contains two characters, with ords 0xc4 and 0x80
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings.\fR
Therefore Encode is recommended for general purposes; see also
Encode.
.IP "\(bu" 4
\&\f(CW$success\fR = utf8::decode($string)
.Sp
Attempts to convert in-place the octet sequence in \fIUTF-X\fR to the
corresponding character sequence. That is, it replaces each sequence of
characters in the string whose ords represent a valid UTF-X byte
sequence, with the corresponding single character.  The \s-1UTF\-8\s0 flag is
turned on only if the source string contains multiple-byte \fIUTF-X\fR
characters.  If \fI\f(CI$string\fI\fR is invalid as \fIUTF-X\fR, returns false;
otherwise returns true.
.Sp
.Vb 2
\&    my $a = "\exc4\ex80"; # $a contains two characters, with ords 0xc4 and 0x80
\&    utf8::decode($a);   # $a contains one character, with ord 0x100
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings.\fR
Therefore Encode is recommended for general purposes; see also
Encode.
.IP "\(bu" 4
\&\f(CW$flag\fR = utf8::is_utf8(\s-1STRING\s0)
.Sp
(Since Perl 5.8.1)  Test whether \s-1STRING\s0 is encoded internally in \s-1UTF\-8.\s0
Functionally the same as \fIEncode::is_utf8()\fR.
.IP "\(bu" 4
\&\f(CW$flag\fR = utf8::valid(\s-1STRING\s0)
.Sp
[\s-1INTERNAL\s0] Test whether \s-1STRING\s0 is in a consistent state regarding
\&\s-1UTF\-8. \s0 Will return true if it is well-formed \s-1UTF\-8\s0 and has the \s-1UTF\-8\s0 flag
on \fBor\fR if \s-1STRING\s0 is held as bytes (both these states are 'consistent').
The main reason for this routine is to allow Perl's test suite to check
that operations have left strings in a consistent state.  You most
probably want to use \fIutf8::is_utf8()\fR instead.
.PP
\&\f(CW\*(C`utf8::encode\*(C'\fR is like \f(CW\*(C`utf8::upgrade\*(C'\fR, but the \s-1UTF8\s0 flag is
cleared.  See perlunicode for more on the \s-1UTF8\s0 flag and the C \s-1API\s0
functions \f(CW\*(C`sv_utf8_upgrade\*(C'\fR, \f(CW\*(C`sv_utf8_downgrade\*(C'\fR, \f(CW\*(C`sv_utf8_encode\*(C'\fR,
and \f(CW\*(C`sv_utf8_decode\*(C'\fR, which are wrapped by the Perl functions
\&\f(CW\*(C`utf8::upgrade\*(C'\fR, \f(CW\*(C`utf8::downgrade\*(C'\fR, \f(CW\*(C`utf8::encode\*(C'\fR and
\&\f(CW\*(C`utf8::decode\*(C'\fR.  Also, the functions utf8::is_utf8, utf8::valid,
utf8::encode, utf8::decode, utf8::upgrade, and utf8::downgrade are
actually internal, and thus always available, without a \f(CW\*(C`require utf8\*(C'\fR
statement.
.SH "BUGS"
.IX Header "BUGS"
One can have Unicode in identifier names, but not in package/class or
subroutine names.  While some limited functionality towards this does
exist as of Perl 5.8.0, that is more accidental than designed; use of
Unicode for the said purposes is unsupported.
.PP
One reason for this unfinishedness is its (currently) inherent
unportability: since both package names and subroutine names may need
to be mapped to file and directory names, the Unicode capability of
the filesystem becomes important\*(-- and there unfortunately aren't
portable answers.
.SH "SEE ALSO"
.IX Header "SEE ALSO"
perlunitut, perluniintro, perlrun, bytes, perlunicode