summaryrefslogtreecommitdiff
path: root/contrib/perl5/pod
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/perl5/pod')
-rw-r--r--contrib/perl5/pod/Makefile.SH167
-rwxr-xr-xcontrib/perl5/pod/buildtoc.PL492
-rw-r--r--contrib/perl5/pod/perlclib.pod197
-rw-r--r--contrib/perl5/pod/perldebtut.pod721
-rw-r--r--contrib/perl5/pod/perlebcdic.pod1235
-rwxr-xr-xcontrib/perl5/pod/perlmodlib.PL1383
-rw-r--r--contrib/perl5/pod/perlnewmod.pod282
-rw-r--r--contrib/perl5/pod/perlrequick.pod503
-rw-r--r--contrib/perl5/pod/perlretut.pod2504
-rw-r--r--contrib/perl5/pod/perlutil.pod185
10 files changed, 7669 insertions, 0 deletions
diff --git a/contrib/perl5/pod/Makefile.SH b/contrib/perl5/pod/Makefile.SH
new file mode 100644
index 0000000000000..b8c8c8f24c730
--- /dev/null
+++ b/contrib/perl5/pod/Makefile.SH
@@ -0,0 +1,167 @@
+case $CONFIG in # config.sh is only sourced here when CONFIG is empty
+'')
+ if test -f config.sh; then TOP=.; # look in the current directory, then up to four levels up
+ elif test -f ../config.sh; then TOP=..;
+ elif test -f ../../config.sh; then TOP=../..;
+ elif test -f ../../../config.sh; then TOP=../../..;
+ elif test -f ../../../../config.sh; then TOP=../../../..;
+ else
+ echo "Can't find config.sh."; exit 1
+ fi
+ . $TOP/config.sh # source the configuration found above into this shell
+ ;;
+esac
+: This forces SH files to create target in same directory as SH file.
+: This is so that make depend always knows where to find SH derivatives.
+case "$0" in
+*/*) cd `expr X$0 : 'X\(.*\)/'` ;;
+esac
+
+# When started from the top of the source tree, work inside pod/.
+if test -d pod; then
+	cd pod || exit 1
+fi
+# File lists that are substituted into the generated Makefile below.
+POD=`echo *.pod`
+MAN=`echo $POD|sed 's/\.pod/\.man/g'`
+# perltoc gets no HTML page.  The dot is escaped so that only the literal
+# name perltoc.pod is dropped from the list.
+HTML=`echo $POD|sed 's/perltoc\.pod//'|sed 's/\.pod/\.html/g'`
+TEX=`echo $POD|sed 's/\.pod/\.tex/g'`
+
+echo "Extracting pod/Makefile (with variable substitutions)"
+: This section of the file will have variable substitutions done on it.
+: Move anything that needs config subs from !NO!SUBS! section to !GROK!THIS!.
+: Protect any dollar signs and backticks that you do not want interpreted
+: by putting a backslash in front. You may delete these comments.
+
+$spitshell >Makefile <<!GROK!THIS!
+# pod/Makefile
+# This file is derived from pod/Makefile.SH. Any changes made here will
+# be lost the next time you run Configure.
+
+POD = $POD
+
+MAN = $MAN
+
+# no perltoc.html
+HTML = $HTML
+
+TEX = $TEX
+
+!GROK!THIS!
+
+## In the following dollars and backticks do not need the extra backslash.
+$spitshell >>Makefile <<'!NO!SUBS!'
+
+# Scripts extracted from the *.PL templates in this directory.
+CONVERTERS = pod2html pod2latex pod2man pod2text checkpods \
+    pod2usage podchecker podselect
+
+# Root used for cross-references in the generated HTML; change it to suit.
+# The comment lives on its own line on purpose: whitespace between a value
+# and a trailing comment would become part of the value.
+HTMLROOT = /
+POD2HTML = pod2html \
+    --htmlroot=$(HTMLROOT) \
+    --podroot=.. --podpath=pod:lib:ext:vms \
+    --libpods=perlfunc:perlguts:perlvar:perlrun:perlop
+
+# miniperl is used for extraction and conversion; the full perl only for
+# the perlcc-based "compile" target.
+PERL = ../miniperl
+PERLILIB = $(PERL) -I../lib
+REALPERL = ../perl
+
+all: $(CONVERTERS) man # default target: extract the converter scripts, then build the man pages
+
+converters: $(CONVERTERS) # extract the converter scripts only
+
+regen_pods: perlmodlib.pod toc # regenerate perlmodlib.pod and the table of contents
+
+buildtoc: buildtoc.PL perl.pod ../MANIFEST # extract the buildtoc script from its .PL template
+ $(PERLILIB) buildtoc.PL
+
+perltoc.pod: buildtoc # no recipe here: perltoc.pod is written when the toc target runs buildtoc
+
+man: pod2man $(MAN)
+
+html: pod2html $(HTML)
+
+tex: pod2latex $(TEX)
+
+toc: buildtoc # run the extracted script, which (re)writes perltoc.pod
+ $(PERLILIB) buildtoc
+
+# Conversion rules.  Suffix rules must not list prerequisites: POSIX make
+# does not allow them and GNU make ignores them.  The dependency of the
+# generated files on their converter script is therefore spelled out with
+# explicit dependency lines at the end of this section.
+.SUFFIXES: .pm .pod
+
+.SUFFIXES: .man
+
+.pm.man:
+	$(PERL) -I../lib pod2man $*.pm >$*.man
+
+.pod.man:
+	$(PERL) -I../lib pod2man $*.pod >$*.man
+
+.SUFFIXES: .html
+
+.pm.html:
+	$(PERL) -I../lib $(POD2HTML) --infile=$*.pm --outfile=$*.html
+
+.pod.html:
+	$(PERL) -I../lib $(POD2HTML) --infile=$*.pod --outfile=$*.html
+
+.SUFFIXES: .tex
+
+.pm.tex:
+	$(PERL) -I../lib pod2latex $*.pm
+
+.pod.tex:
+	$(PERL) -I../lib pod2latex $*.pod
+
+# Generated documents are out of date whenever their converter is.
+$(MAN): pod2man
+
+$(HTML): pod2html
+
+$(TEX): pod2latex
+
+clean: # remove generated man/HTML/TeX output, pod2html caches and stray build files
+ rm -f $(MAN)
+ rm -f $(HTML)
+ rm -f $(TEX)
+ rm -f pod2html-*cache
+ rm -f *.aux *.log *.exe
+
+realclean: clean # additionally remove the extracted converter scripts
+ rm -f $(CONVERTERS)
+
+distclean: realclean
+
+veryclean: distclean # additionally remove editor backups and .orig files
+ -rm -f *~ *.orig
+
+check: checkpods # run checkpods over every pod listed in $(POD)
+ @echo "checking..."; \
+ $(PERL) -I../lib checkpods $(POD)
+
+# Dependencies.
+# Every converter script is extracted from its .PL template by miniperl and
+# has to be re-extracted whenever the template or Config.pm changes.
+pod2latex: pod2latex.PL ../lib/Config.pm
+	$(PERLILIB) pod2latex.PL
+
+pod2html: pod2html.PL ../lib/Config.pm
+	$(PERLILIB) pod2html.PL
+
+pod2man: pod2man.PL ../lib/Config.pm
+	$(PERLILIB) pod2man.PL
+
+pod2text: pod2text.PL ../lib/Config.pm
+	$(PERLILIB) pod2text.PL
+
+checkpods: checkpods.PL ../lib/Config.pm
+	$(PERLILIB) checkpods.PL
+
+pod2usage: pod2usage.PL ../lib/Config.pm
+	$(PERLILIB) pod2usage.PL
+
+podchecker: podchecker.PL ../lib/Config.pm
+	$(PERLILIB) podchecker.PL
+
+podselect: podselect.PL ../lib/Config.pm
+	$(PERLILIB) podselect.PL
+
+perlmodlib.pod: $(PERL) perlmodlib.PL ../mv-if-diff # rebuilt via perlmodlib.tmp (presumably written by perlmodlib.PL - confirm), then handed to mv-if-diff
+ rm -f perlmodlib.tmp
+ $(PERL) -I ../lib perlmodlib.PL
+ sh ../mv-if-diff perlmodlib.tmp perlmodlib.pod
+
+compile: all # run perlcc (under the full perl) on four of the converters, logging to ../compilelog
+ $(REALPERL) -I../lib ../utils/perlcc -o pod2latex.exe pod2latex -log ../compilelog
+ $(REALPERL) -I../lib ../utils/perlcc -o pod2man.exe pod2man -log ../compilelog
+ $(REALPERL) -I../lib ../utils/perlcc -o pod2text.exe pod2text -log ../compilelog
+ $(REALPERL) -I../lib ../utils/perlcc -o checkpods.exe checkpods -log ../compilelog
+
+!NO!SUBS!
diff --git a/contrib/perl5/pod/buildtoc.PL b/contrib/perl5/pod/buildtoc.PL
new file mode 100755
index 0000000000000..7c5a45018e8e5
--- /dev/null
+++ b/contrib/perl5/pod/buildtoc.PL
@@ -0,0 +1,492 @@
+#!/usr/local/bin/perl
+
+use Config;
+use File::Basename qw(&basename &dirname);
+use Cwd;
+
+# List explicitly here the variables you want Configure to
+# generate. Metaconfig only looks for shell variables, so you
+# have to mention them as if they were shell variables, not
+# %Config entries. Thus you write
+# $startperl
+# to ensure Configure will look for $Config{startperl}.
+
+# This forces PL files to create target in same directory as PL file.
+# This is so that make depend always knows where to find PL derivatives.
+# Remember the starting directory, then work from the directory that holds
+# this .PL file so the extracted script is created next to it.  A failed
+# chdir is fatal: otherwise the output would land in the wrong directory.
+$origdir = cwd;
+chdir(dirname($0))
+    or die "Can't chdir to " . dirname($0) . ": $!";
+# The output name is this file's name without the .PL suffix.
+($file = basename($0)) =~ s/\.PL$//;
+$file =~ s/\.pl$// if ($^O eq 'os2' or $^O eq 'dos'); # "case-forgiving"
+$file =~ s/\.pl$/.com/ if ($^O eq 'VMS'); # "case-forgiving"
+
+open OUT,">$file" or die "Can't create $file: $!";
+
+print "Extracting $file (with variable substitutions)\n";
+
+# In this section, perl variables will be expanded during extraction.
+# You can use $Config{...} to use Configure variables.
+
+print OUT <<"!GROK!THIS!";
+$Config{'startperl'}
+ eval 'exec perl -S \$0 "\$@"'
+ if 0;
+!GROK!THIS!
+
+# In the following, perl variables are not expanded during extraction.
+
+print OUT <<'!NO!SUBS!';
+
+#
+# buildtoc
+#
+# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
+# This file is autogenerated by buildtoc.PL.
+# Edit that file and run it to effect changes.
+#
+# Builds perltoc.pod and sanity checks the list of pods against all
+# of the MANIFEST, perl.pod, and ourselves.
+#
+
+use File::Find;
+use Cwd;
+use Text::Wrap;
+
+sub output ($);
+
+# Enter the pod directory first when started from the top of the source
+# tree.  Everything below ("../MANIFEST", "perl.pod", the *.pod glob) is
+# relative to that directory, so the glob must not run before the chdir.
+if (-d "pod") {
+    die "$0: failed to chdir('pod'): $!\n" unless chdir("pod");
+}
+
+# The pods that actually exist on disk.
+@PODS = glob("*.pod");
+
+@pods = qw(
+ perl
+ perlfaq
+ perltoc
+ perlbook
+
+ perlsyn
+ perldata
+ perlop
+ perlsub
+ perlfunc
+ perlreftut
+ perldsc
+ perlrequick
+ perlpod
+ perlstyle
+ perltrap
+
+ perlrun
+ perldiag
+ perllexwarn
+ perldebtut
+ perldebug
+
+ perlvar
+ perllol
+ perlopentut
+ perlretut
+
+ perlre
+ perlref
+
+ perlform
+
+ perlboot
+ perltoot
+ perltootc
+ perlobj
+ perlbot
+ perltie
+
+ perlipc
+ perlfork
+ perlnumber
+ perlthrtut
+
+ perlport
+ perllocale
+ perlunicode
+ perlebcdic
+
+ perlsec
+
+ perlmod
+ perlmodlib
+ perlmodinstall
+ perlnewmod
+
+ perlfaq1
+ perlfaq2
+ perlfaq3
+ perlfaq4
+ perlfaq5
+ perlfaq6
+ perlfaq7
+ perlfaq8
+ perlfaq9
+
+ perlcompile
+
+ perlembed
+ perldebguts
+ perlxstut
+ perlxs
+ perlclib
+ perlguts
+ perlcall
+ perlutil
+ perlfilter
+ perldbmfilter
+ perlapi
+ perlintern
+ perlapio
+ perltodo
+ perlhack
+
+ perlhist
+ perldelta
+ perl5005delta
+ perl5004delta
+
+ perlaix
+ perlamiga
+ perlbs2000
+ perlcygwin
+ perldos
+ perlepoc
+ perlhpux
+ perlmachten
+ perlmacos
+ perlmpeix
+ perlos2
+ perlos390
+ perlsolaris
+ perlvmesa
+ perlvms
+ perlvos
+ perlwin32
+ );
+
+@ARCHPODS = qw(
+ perlaix
+ perlamiga
+ perlbs2000
+ perlcygwin
+ perldos
+ perlepoc
+ perlhpux
+ perlmachten
+ perlmacos
+ perlmpeix
+ perlos2
+ perlos390
+ perlsolaris
+ perlvmesa
+ perlvms
+ perlvos
+ perlwin32
+ );
+for (@ARCHPODS) { s/$/.pod/ }
+@ARCHPODS{@ARCHPODS} = ();
+
+for (@pods) { s/$/.pod/ }
+@pods{@pods} = ();
+@PODS{@PODS} = ();
+
+open(MANI, "../MANIFEST") || die "$0: opening ../MANIFEST failed: $!";
+while (<MANI>) {
+ if (m!^pod/([^.]+\.pod)\s+!i) { # entries under pod/ ending in .pod (no other dot in the name)
+ push @MANIPODS, $1;
+ }
+}
+close(MANI);
+@MANIPODS{@MANIPODS} = (); # hash of the same names, used for exists() lookups below
+
+open(PERLPOD, "perl.pod") || die "$0: opening perl.pod failed: $!\n";
+while (<PERLPOD>) {
+ if (/^For ease of access, /../^\(If you're intending /) { # only the lines between these two marker lines
+ if (/^\s+(perl\S*)\s+\w/) { # indented rows that start with a perl* name
+ push @PERLPODS, "$1.pod";
+ }
+ }
+}
+close(PERLPOD);
+die "$0: could not find the pod listing of perl.pod\n"
+ unless @PERLPODS;
+@PERLPODS{@PERLPODS} = ();
+
+# Cross-check against ourselves
+# Cross-check against the MANIFEST
+# Cross-check against the perl.pod
+
+# Every pod found on disk should be known to all three listings
+# (architecture-specific pods are excused from the MANIFEST check).
+foreach my $i (sort keys %PODS) {
+    unless (exists $pods{$i}) {
+        warn "$0: $i exists but is unknown by buildtoc\n";
+    }
+    unless (exists $MANIPODS{$i} || exists $ARCHPODS{$i}) {
+        warn "$0: $i exists but is unknown by ../MANIFEST\n";
+    }
+    unless (exists $PERLPODS{$i}) {
+        warn "$0: $i exists but is unknown by perl.pod\n";
+    }
+}
+
+# Conversely, everything each listing mentions should exist on disk.
+foreach my $i (sort keys %pods) {
+    exists $PODS{$i}
+        or warn "$0: $i is known by buildtoc but does not exist\n";
+}
+foreach my $i (sort keys %MANIPODS) {
+    exists $PODS{$i}
+        or warn "$0: $i is known by ../MANIFEST but does not exist\n";
+}
+foreach my $i (sort keys %PERLPODS) {
+    exists $PODS{$i}
+        or warn "$0: $i is known by perl.pod but does not exist\n";
+}
+
+# We are ready to rock.
+open(OUT, ">perltoc.pod") || die "$0: creating perltoc.pod failed: $!";
+
+$/ = '';
+@ARGV = @pods;
+
+($_= <<EOPOD2B) =~ s/^\t//gm && output($_);
+
+ =head1 NAME
+
+ perltoc - perl documentation table of contents
+
+ =head1 DESCRIPTION
+
+ This page provides a brief table of contents for the rest of the Perl
+ documentation set. It is meant to be scanned quickly or grepped
+ through to locate the proper section you're looking for.
+
+ =head1 BASIC DOCUMENTATION
+
+EOPOD2B
+#' make emacs happy
+
+podset(@pods);
+
+find \&getpods => qw(../lib ../ext);
+
+# File::Find callback: collect into @modpods the full path of every .pod or
+# .pm file below the searched trees that contains a "=head1 NAME" section.
+# $_ holds the file name in the current directory, $File::Find::name the
+# full path.  Files that cannot be opened, or that lack a NAME section,
+# only produce a warning.
+sub getpods {
+    if (/\.p(od|m)$/) {
+        # Skip .pm files that have corresponding .pod files, and Functions.pm.
+        return if /(.*)\.pm$/ && -f "$1.pod";
+        my $file = $File::Find::name;
+        return if $file eq '../lib/Pod/Functions.pm'; # Used only by pod itself
+
+        # Report the offending path ($name is not set at this point).
+        die "tut $file" if $file =~ /TUT/;
+        # The trailing NUL keeps two-argument open from trimming the name.
+        unless (open (F, "< $_\0")) {
+            warn "bogus <$file>: $!";
+            system "ls", "-l", $file;
+        }
+        else {
+            my $line;
+            while ($line = <F>) {
+                if ($line =~ /^=head1\s+NAME\b/) {
+                    push @modpods, $file;
+                    close F;
+                    return;
+                }
+            }
+            close F;
+            warn "$0: $file: cannot find =head1 NAME\n";
+        }
+    }
+}
+
+die "no pods" unless @modpods;
+
+# Split the collected files into pragmata (module name starts with a
+# lower-case letter) and ordinary modules; a module name that was already
+# seen is only recorded once.
+foreach my $path (@modpods) {
+    $name = path2modname($path);
+    if ($name =~ /^[a-z]/) {
+        push @pragmata, $path;
+        next;
+    }
+    next if $done{$name}++;
+    push @modules, $path;
+    push @modname, $name;
+}
+
+($_= <<EOPOD2B) =~ s/^\t//gm && output($_);
+
+
+
+ =head1 PRAGMA DOCUMENTATION
+
+EOPOD2B
+
+podset(sort @pragmata);
+
+($_= <<EOPOD2B) =~ s/^\t//gm && output($_);
+
+
+
+ =head1 MODULE DOCUMENTATION
+
+EOPOD2B
+
+podset( @modules[ sort { $modname[$a] cmp $modname[$b] } 0 .. $#modules ] );
+
+($_= <<EOPOD2B) =~ s/^\t//gm;
+
+
+ =head1 AUXILIARY DOCUMENTATION
+
+ Here should be listed all the extra programs' documentation, but they
+ don't all have manual pages yet:
+
+ =over 4
+
+ =item a2p
+
+ =item s2p
+
+ =item find2perl
+
+ =item h2ph
+
+ =item c2ph
+
+ =item h2xs
+
+ =item xsubpp
+
+ =item pod2man
+
+ =item wrapsuid
+
+ =back
+
+ =head1 AUTHOR
+
+ Larry Wall <F<larry\@wall.org>>, with the help of oodles
+ of other folks.
+
+
+EOPOD2B
+output $_;
+output "\n"; # flush $LINE
+exit;
+
+sub podset { # emit a condensed outline of every pod file named in @_
+ local @ARGV = @_; # lets the bare <> below read the given files in turn
+
+ while(<>) { # one record per iteration ($/ is set to '' before the first call)
+ if (s/^=head1 (NAME)\s*/=head2 /) { # a NAME section starts a new document entry
+ $pod = path2modname($ARGV);
+ unhead1();
+ output "\n \n\n=head2 ";
+ $_ = <>; # the record following =head1 NAME (presumably the "name - summary" text)
+ if ( /^\s*$pod\b/ ) {
+ s/$pod\.pm/$pod/; # '.pm' in NAME !?
+ output $_;
+ } else {
+ s/^/$pod, /; # text does not start with the derived name: prefix it
+ output $_;
+ }
+ next;
+ }
+ if (s/^=head1 (.*)/=item $1/) { # any other =head1 becomes an item of the first-level list
+ unhead2();
+ output "=over 4\n\n" unless $inhead1;
+ $inhead1 = 1;
+ output $_; nl(); next;
+ }
+ if (s/^=head2 (.*)/=item $1/) { # =head2 becomes an item of the nested second-level list
+ unitem();
+ output "=over 4\n\n" unless $inhead2;
+ $inhead2 = 1;
+ output $_; nl(); next;
+ }
+ if (s/^=item ([^=].*)/$1/) { # items are flattened into one comma-separated run
+ next if $pod eq 'perldiag';
+ s/^\s*\*\s*$// && next; # skip bare bullet items
+ s/^\s*\*\s*//;
+ s/\n/ /g;
+ s/\s+$//;
+ next if /^[\d.]+$/; # skip items consisting only of digits and dots
+ next if $pod eq 'perlmodlib' && /^ftp:/;
+ ##print "=over 4\n\n" unless $initem;
+ output ", " if $initem;
+ $initem = 1;
+ s/\.$//;
+ s/^-X\b/-I<X>/;
+ output $_; next;
+ }
+ if (s/^=cut\s*\n//) { # =cut closes whatever lists are still open
+ unhead1();
+ next;
+ }
+ }
+}
+
+# Turn a file path such as "../lib/File/Find.pm" into a module name
+# ("File::Find"): drop the .pm/.pod suffix and everything up to and
+# including the first lib/ or ext/ directory, turn slashes into "::", and
+# collapse one doubled component ("IO::IO" becomes "IO").
+sub path2modname {
+    my ($path) = @_;
+    (my $modname = $path) =~ s/\.p(m|od)$//;
+    $modname =~ s-.*?/(lib|ext)/--;
+    $modname =~ s-/-::-g;
+    $modname =~ s/(\w+)::\1/$1/;
+    return $modname;
+}
+
+# Close the first-level list (and anything nested inside it) if one is open.
+sub unhead1 {
+    unhead2();
+    output("\n\n=back\n\n") if $inhead1;
+    $inhead1 = 0;
+}
+
+# Close the second-level list (and any pending item run) if one is open.
+sub unhead2 {
+    unitem();
+    output("\n\n=back\n\n") if $inhead2;
+    $inhead2 = 0;
+}
+
+# End the current comma-separated run of items, if any, with a blank line.
+sub unitem {
+    output("\n\n") if $initem;
+    $initem = 0;
+}
+
+# Emit a single newline through the buffered writer.
+sub nl { output("\n") }
+
+# State shared by successive output() calls.  The script exits before these
+# declarations are reached at run time, so both start out undefined and
+# output() must cope with that.
+my $NEWLINE;	# how many newlines have we seen recently
+my $LINE;	# what remains to be printed
+
+# Buffered writer for the OUT handle.  Text is accumulated in $LINE until a
+# newline arrives, then written through Text::Wrap's wrap().  At most two
+# newlines in a row are written, and chunks without any non-whitespace
+# character are dropped.
+sub output ($) {
+    for (split /(\n)/, shift) {
+        if ($_ eq "\n") {
+            # Test for pending text by length, not truth, so that a line
+            # consisting of just "0" is flushed as well.
+            if (defined $LINE && length $LINE) {
+                print OUT wrap('', '', $LINE);
+                $LINE = '';
+            }
+            if ($NEWLINE < 2) {
+                print OUT;	# $_ is the newline itself
+                $NEWLINE++;
+            }
+        }
+        elsif (/\S/) {
+            $LINE .= $_;
+            $NEWLINE = 0;
+        }
+    }
+}
+
+!NO!SUBS!
+
+close OUT or die "Can't close $file: $!";
+chmod 0755, $file or die "Can't reset permissions for $file: $!\n"; # make the extracted script executable
+exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':'; # exec does not return on success, so the chdir below is then never reached
+chdir $origdir;
diff --git a/contrib/perl5/pod/perlclib.pod b/contrib/perl5/pod/perlclib.pod
new file mode 100644
index 0000000000000..a0f4a80eecd70
--- /dev/null
+++ b/contrib/perl5/pod/perlclib.pod
@@ -0,0 +1,197 @@
+=head1 NAME
+
+perlclib - Internal replacements for standard C library functions
+
+=head1 DESCRIPTION
+
+One thing Perl porters should note is that F<perl> doesn't tend to use that
+much of the C standard library internally; you'll see very little use of,
+for example, the F<ctype.h> functions in there. This is because Perl
+tends to reimplement or abstract standard library functions, so that we
+know exactly how they're going to operate.
+
+This is a reference card for people who are familiar with the C library
+and who want to do things the Perl way; to tell them which functions
+they ought to use instead of the more normal C functions.
+
+=head2 Conventions
+
+In the following tables:
+
+=over 3
+
+=item C<t>
+
+is a type.
+
+=item C<p>
+
+is a pointer.
+
+=item C<n>
+
+is a number.
+
+=item C<s>
+
+is a string.
+
+=back
+
+C<sv>, C<av>, C<hv>, etc. represent variables of their respective types.
+
+=head2 File Operations
+
+Instead of the F<stdio.h> functions, you should use the Perl abstraction
+layer. Instead of C<FILE*> types, you need to be handling C<PerlIO*>
+types. Don't forget that with the new PerlIO layered I/O abstraction
+C<FILE*> types may not even be available. See also the C<perlapio>
+documentation for more information about the following functions:
+
+ Instead Of: Use:
+
+ stdin PerlIO_stdin()
+ stdout PerlIO_stdout()
+ stderr PerlIO_stderr()
+
+ fopen(fn, mode) PerlIO_open(fn, mode)
+ freopen(fn, mode, stream) PerlIO_reopen(fn, mode, perlio) (Deprecated)
+ fflush(stream) PerlIO_flush(perlio)
+ fclose(stream) PerlIO_close(perlio)
+
+=head2 File Input and Output
+
+ Instead Of: Use:
+
+ fprintf(stream, fmt, ...) PerlIO_printf(perlio, fmt, ...)
+
+ [f]getc(stream) PerlIO_getc(perlio)
+ [f]putc(stream, n) PerlIO_putc(perlio, n)
+ ungetc(n, stream) PerlIO_ungetc(perlio, n)
+
+Note that the PerlIO equivalents of C<fread> and C<fwrite> are slightly
+different from their C library counterparts:
+
+ fread(p, size, n, stream) PerlIO_read(perlio, buf, numbytes)
+ fwrite(p, size, n, stream) PerlIO_write(perlio, buf, numbytes)
+
+ fputs(s, stream) PerlIO_puts(perlio, s)
+
+There is no equivalent to C<fgets>; one should use C<sv_gets> instead:
+
+ fgets(s, n, stream) sv_gets(sv, perlio, append)
+
+=head2 File Positioning
+
+ Instead Of: Use:
+
+ feof(stream) PerlIO_eof(perlio)
+ fseek(stream, n, whence) PerlIO_seek(perlio, n, whence)
+ rewind(stream) PerlIO_rewind(perlio)
+
+ fgetpos(stream, p) PerlIO_getpos(perlio, sv)
+ fsetpos(stream, p) PerlIO_setpos(perlio, sv)
+
+ ferror(stream) PerlIO_error(perlio)
+ clearerr(stream) PerlIO_clearerr(perlio)
+
+=head2 Memory Management and String Handling
+
+ Instead Of: Use:
+
+ t* p = malloc(n) New(id, p, n, t)
+ t* p = calloc(n, s) Newz(id, p, n, t)
+ p = realloc(p, n) Renew(p, n, t)
+ memcpy(dst, src, n) Copy(src, dst, n, t)
+ memmove(dst, src, n) Move(src, dst, n, t)
+ memcpy/*(struct foo *) StructCopy(src, dst, t)
+ free(p) Safefree(p)
+
+ strdup(p) savepv(p)
+ strndup(p, n) savepvn(p, n) (Hey, strndup doesn't exist!)
+
+ strstr(big, little) instr(big, little)
+ strcmp(s1, s2) strLE(s1, s2) / strEQ(s1, s2) / strGT(s1,s2)
+ strncmp(s1, s2, n) strnNE(s1, s2, n) / strnEQ(s1, s2, n)
+
+Notice the different order of arguments to C<Copy> and C<Move> than used
+in C<memcpy> and C<memmove>.
+
+Most of the time, though, you'll want to be dealing with SVs internally
+instead of raw C<char *> strings:
+
+ strlen(s) sv_len(sv)
+ strcpy(dt, src) sv_setpv(sv, s)
+ strncpy(dt, src, n) sv_setpvn(sv, s, n)
+ strcat(dt, src) sv_catpv(sv, s)
+ strncat(dt, src, n) sv_catpvn(sv, s, n)
+ sprintf(s, fmt, ...) sv_setpvf(sv, fmt, ...)
+
+Note also the existence of C<sv_catpvf> and C<sv_catpvfn>, combining
+concatenation with formatting.
+
+=head2 Character Class Tests
+
+There are two types of character class tests that Perl implements: one
+type deals in C<char>s and is thus B<not> Unicode aware (and hence
+deprecated unless you B<know> you should use it) and the other type
+deals in C<UV>s and knows about Unicode properties. In the following
+table, C<c> is a C<char>, and C<u> is a Unicode codepoint.
+
+ Instead Of: Use: But better use:
+
+ isalnum(c) isALNUM(c) isALNUM_uni(u)
+ isalpha(c) isALPHA(c) isALPHA_uni(u)
+ iscntrl(c) isCNTRL(c) isCNTRL_uni(u)
+ isdigit(c) isDIGIT(c) isDIGIT_uni(u)
+ isgraph(c) isGRAPH(c) isGRAPH_uni(u)
+ islower(c) isLOWER(c) isLOWER_uni(u)
+ isprint(c) isPRINT(c) isPRINT_uni(u)
+ ispunct(c) isPUNCT(c) isPUNCT_uni(u)
+ isspace(c) isSPACE(c) isSPACE_uni(u)
+ isupper(c) isUPPER(c) isUPPER_uni(u)
+ isxdigit(c) isXDIGIT(c) isXDIGIT_uni(u)
+
+ tolower(c) toLOWER(c) toLOWER_uni(u)
+ toupper(c) toUPPER(c) toUPPER_uni(u)
+
+=head2 F<stdlib.h> functions
+
+ Instead Of: Use:
+
+ atof(s) Atof(s)
+ atol(s) Atol(s)
+ strtod(s, *p) Nothing. Just don't use it.
+ strtol(s, *p, n) Strtol(s, *p, n)
+ strtoul(s, *p, n) Strtoul(s, *p, n)
+
+Notice also the C<scan_bin>, C<scan_hex>, and C<scan_oct> functions in
+F<util.c> for converting strings representing numbers in the respective
+bases into C<NV>s.
+
+In theory C<Strtol> and C<Strtoul> may not be defined if the machine perl is
+built on doesn't actually have strtol and strtoul. But as those 2
+functions are part of the 1989 ANSI C spec we suspect you'll find them
+everywhere by now.
+
+ int rand() double Drand01()
+ srand(n) { seedDrand01((Rand_seed_t)n);
+ PL_srand_called = TRUE; }
+
+ exit(n) my_exit(n)
+ system(s) Don't. Look at pp_system or use my_popen
+
+ getenv(s) PerlEnv_getenv(s)
+ setenv(s, val) my_putenv(s, val)
+
+=head2 Miscellaneous functions
+
+You should not even B<want> to use F<setjmp.h> functions, but if you
+think you do, use the C<JMPENV> stack in F<scope.h> instead.
+
+For C<signal>/C<sigaction>, use C<rsignal(signo, handler)>.
+
+=head1 SEE ALSO
+
+C<perlapi>, C<perlapio>, C<perlguts>
+
diff --git a/contrib/perl5/pod/perldebtut.pod b/contrib/perl5/pod/perldebtut.pod
new file mode 100644
index 0000000000000..e11102e5676ef
--- /dev/null
+++ b/contrib/perl5/pod/perldebtut.pod
@@ -0,0 +1,721 @@
+=head1 NAME
+
+perldebtut - Perl debugging tutorial
+
+=head1 DESCRIPTION
+
+A (very) lightweight introduction to the use of the perl debugger, and a
+pointer to existing, deeper sources of information on the subject of debugging
+perl programs.
+
+There's an extraordinary number of people out there who don't appear to know
+anything about using the perl debugger, though they use the language every
+day.
+This is for them.
+
+
+=head1 use strict
+
+First of all, there's a few things you can do to make your life a lot more
+straightforward when it comes to debugging perl programs, without using the
+debugger at all. To demonstrate, here's a simple script with a problem:
+
+ #!/usr/bin/perl
+
+ $var1 = 'Hello World'; # always wanted to do that :-)
+ $var2 = "$varl\n";
+
+ print $var2;
+ exit;
+
+While this compiles and runs happily, it probably won't do what's expected,
+namely it doesn't print "Hello World\n" at all; It will on the other hand do
+exactly what it was told to do, computers being a bit that way inclined. That
+is, it will print out a newline character, and you'll get what looks like a
+blank line. It looks like there's 2 variables when (because of the typo)
+there's really 3:
+
+ $var1 = 'Hello World'
+ $varl = undef
+ $var2 = "\n"
+
+To catch this kind of problem, we can force each variable to be declared
+before use by pulling in the strict module, by putting 'use strict;' after the
+first line of the script.
+
+Now when you run it, perl complains about the 3 undeclared variables and we
+get four error messages because one variable is referenced twice:
+
+ Global symbol "$var1" requires explicit package name at ./hello line 4.
+ Global symbol "$var2" requires explicit package name at ./hello line 5.
+ Global symbol "$varl" requires explicit package name at ./hello line 5.
+ Global symbol "$var2" requires explicit package name at ./hello line 7.
+ Execution of ./hello aborted due to compilation errors.
+
+Luvverly! and to fix this we declare all variables explicitly and now our
+script looks like this:
+
+ #!/usr/bin/perl
+ use strict;
+
+ my $var1 = 'Hello World';
+ my $varl = '';
+ my $var2 = "$varl\n";
+
+ print $var2;
+ exit;
+
+We then do (always a good idea) a syntax check before we try to run it again:
+
+ > perl -c hello
+ hello syntax OK
+
+And now when we run it, we get "\n" still, but at least we know why. Just
+getting this script to compile has exposed the '$varl' (with the letter 'l')
+variable, and simply changing $varl to $var1 solves the problem.
+
+
+=head1 Looking at data and -w and w
+
+Ok, but how about when you want to really see your data, what's in that
+dynamic variable, just before using it?
+
+ #!/usr/bin/perl
+ use strict;
+
+ my $key = 'welcome';
+ my %data = (
+ 'this' => qw(that),
+ 'tom' => qw(and jerry),
+ 'welcome' => q(Hello World),
+ 'zip' => q(welcome),
+ );
+ my @data = keys %data;
+
+ print "$data{$key}\n";
+ exit;
+
+Looks OK, after it's been through the syntax check (perl -c scriptname), we
+run it and all we get is a blank line again! Hmmmm.
+
+One common debugging approach here, would be to liberally sprinkle a few print
+statements, to add a check just before we print out our data, and another just
+after:
+
+ print "All OK\n" if grep($key, keys %data);
+ print "$data{$key}\n";
+ print "done: '$data{$key}'\n";
+
+And try again:
+
+ > perl data
+ All OK
+
+ done: ''
+
+After much staring at the same piece of code and not seeing the wood for the
+trees for some time, we get a cup of coffee and try another approach. That
+is, we bring in the cavalry by giving perl the 'B<-d>' switch on the command
+line:
+
+ > perl -d data
+ Default die handler restored.
+
+ Loading DB routines from perl5db.pl version 1.07
+ Editor support available.
+
+ Enter h or `h h' for help, or `man perldebug' for more help.
+
+ main::(./data:4): my $key = 'welcome';
+
+Now, what we've done here is to launch the built-in perl debugger on our
+script. It's stopped at the first line of executable code and is waiting for
+input.
+
+Before we go any further, you'll want to know how to quit the debugger: use
+just the letter 'B<q>', not the words 'quit' or 'exit':
+
+ DB<1> q
+ >
+
+That's it, you're back on home turf again.
+
+
+=head1 help
+
+Fire the debugger up again on your script and we'll look at the help menu.
+There's a couple of ways of calling help: a simple 'B<h>' will get you a long
+scrolled list of help, 'B<|h>' (pipe-h) will pipe the help through your pager
+('more' or 'less' probably), and finally, 'B<h h>' (h-space-h) will give you a
+helpful mini-screen snapshot:
+
+ DB<1> h h
+ List/search source lines: Control script execution:
+ l [ln|sub] List source code T Stack trace
+ - or . List previous/current line s [expr] Single step [in expr]
+ w [line] List around line n [expr] Next, steps over subs
+ f filename View source in file <CR/Enter> Repeat last n or s
+ /pattern/ ?patt? Search forw/backw r Return from subroutine
+ v Show versions of modules c [ln|sub] Continue until position
+ Debugger controls: L List break/watch/actions
+ O [...] Set debugger options t [expr] Toggle trace [trace expr]
+ <[<]|{[{]|>[>] [cmd] Do pre/post-prompt b [ln|event|sub] [cnd] Set breakpoint
+ ! [N|pat] Redo a previous command d [ln] or D Delete a/all breakpoints
+ H [-num] Display last num commands a [ln] cmd Do cmd before line
+ = [a val] Define/list an alias W expr Add a watch expression
+ h [db_cmd] Get help on command A or W Delete all actions/watch
+ |[|]db_cmd Send output to pager ![!] syscmd Run cmd in a subprocess
+ q or ^D Quit R Attempt a restart
+ Data Examination: expr Execute perl code, also see: s,n,t expr
+ x|m expr Evals expr in list context, dumps the result or lists methods.
+ p expr Print expression (uses script's current package).
+ S [[!]pat] List subroutine names [not] matching pattern
+ V [Pk [Vars]] List Variables in Package. Vars can be ~pattern or !pattern.
+ X [Vars] Same as "V current_package [Vars]".
+ For more help, type h cmd_letter, or run man perldebug for all docs.
+
+More confusing options than you can shake a big stick at! It's not as bad as
+it looks and it's very useful to know more about all of it, and fun too!
+
+There's a couple of useful ones to know about straight away. You wouldn't
+think we're using any libraries at all at the moment, but 'B<v>' will show
+which modules are currently loaded, by the debugger as well as your script.
+'B<V>' and 'B<X>' show variables in the program by package scope and can be
+constrained by pattern. 'B<m>' shows methods and 'B<S>' shows all subroutines
+(by pattern):
+
+ DB<2>S str
+ dumpvar::stringify
+ strict::bits
+ strict::import
+ strict::unimport
+
+Using 'X' and cousins requires you not to use the type identifiers ($@%), just
+the 'name':
+
+ DB<3>X ~err
+ FileHandle(stderr) => fileno(2)
+
+Remember we're in our tiny program with a problem, we should have a look at
+where we are, and what our data looks like. First of all let's have a window
+on our present position (the first line of code in this case), via the letter
+'B<w>':
+
+ DB<4> w
+ 1 #!/usr/bin/perl
+ 2: use strict;
+ 3
+ 4==> my $key = 'welcome';
+ 5: my %data = (
+ 6 'this' => qw(that),
+ 7 'tom' => qw(and jerry),
+ 8 'welcome' => q(Hello World),
+ 9 'zip' => q(welcome),
+ 10 );
+
+At line number 4 is a helpful pointer, that tells you where you are now. To
+see more code, type 'w' again:
+
+ DB<4> w
+ 8 'welcome' => q(Hello World),
+ 9 'zip' => q(welcome),
+ 10 );
+ 11: my @data = keys %data;
+ 12: print "All OK\n" if grep($key, keys %data);
+ 13: print "$data{$key}\n";
+ 14: print "done: '$data{$key}'\n";
+ 15: exit;
+
+And if you wanted to list line 5 again, type 'l 5', (note the space):
+
+ DB<4> l 5
+ 5: my %data = (
+
+In this case, there's not much to see, but of course normally there's pages of
+stuff to wade through, and 'l' can be very useful. To reset your view to the
+line we're about to execute, type a lone period '.':
+
+ DB<5> .
+ main::(./data_a:4): my $key = 'welcome';
+
+The line shown is the one that is about to be executed B<next>, it hasn't
+happened yet. So while we can print a variable with the letter 'B<p>', at
+this point all we'd get is an empty (undefined) value back. What we need to
+do is to step through the next executable statement with an 'B<s>':
+
+ DB<6> s
+ main::(./data_a:5): my %data = (
+ main::(./data_a:6): 'this' => qw(that),
+ main::(./data_a:7): 'tom' => qw(and jerry),
+ main::(./data_a:8): 'welcome' => q(Hello World),
+ main::(./data_a:9): 'zip' => q(welcome),
+ main::(./data_a:10): );
+
+Now we can have a look at that first ($key) variable:
+
+ DB<7> p $key
+ welcome
+
+line 13 is where the action is, so let's continue down to there via the letter
+'B<c>', which by the way, inserts a 'one-time-only' breakpoint at the given
+line or sub routine:
+
+ DB<8> c 13
+ All OK
+ main::(./data_a:13): print "$data{$key}\n";
+
+We've gone past our check (where 'All OK' was printed) and have stopped just
+before the meat of our task. We could try to print out a couple of variables
+to see what is happening:
+
+ DB<9> p $data{$key}
+
+Not much in there, let's have a look at our hash:
+
+ DB<10> p %data
+ Hello Worldziptomandwelcomejerrywelcomethisthat
+
+ DB<11> p keys %data
+ Hello Worldtomwelcomejerrythis
+
+Well, this isn't very easy to read, and using the helpful manual (B<h h>), the
+'B<x>' command looks promising:
+
+ DB<12> x %data
+ 0 'Hello World'
+ 1 'zip'
+ 2 'tom'
+ 3 'and'
+ 4 'welcome'
+ 5 undef
+ 6 'jerry'
+ 7 'welcome'
+ 8 'this'
+ 9 'that'
+
+That's not much help, a couple of welcomes in there, but no indication of
+which are keys, and which are values, it's just a listed array dump and, in
+this case, not particularly helpful. The trick here, is to use a B<reference>
+to the data structure:
+
+ DB<13> x \%data
+ 0 HASH(0x8194bc4)
+ 'Hello World' => 'zip'
+ 'jerry' => 'welcome'
+ 'this' => 'that'
+ 'tom' => 'and'
+ 'welcome' => undef
+
+The reference is truly dumped and we can finally see what we're dealing with.
+Our quoting was perfectly valid but wrong for our purposes, with 'and jerry'
+being treated as 2 separate words rather than a phrase, thus throwing the
+evenly paired hash structure out of alignment.
+
+The 'B<-w>' switch would have told us about this, had we used it at the start,
+and saved us a lot of trouble:
+
+ > perl -w data
+ Odd number of elements in hash assignment at ./data line 5.
+
+We fix our quoting: 'tom' => q(and jerry), and run it again, this time we get
+our expected output:
+
+ > perl -w data
+ Hello World
+
+
+While we're here, take a closer look at the 'B<x>' command, it's really useful
+and will merrily dump out nested references, complete objects, partial objects
+- just about whatever you throw at it:
+
+Let's make a quick object and x-plode it, first we'll start the debugger:
+it wants some form of input from STDIN, so we give it something non-committal,
+a zero:
+
+ > perl -de 0
+ Default die handler restored.
+
+ Loading DB routines from perl5db.pl version 1.07
+ Editor support available.
+
+ Enter h or `h h' for help, or `man perldebug' for more help.
+
+ main::(-e:1): 0
+
+Now build an on-the-fly object over a couple of lines (note the backslash):
+
+ DB<1> $obj = bless({'unique_id'=>'123', 'attr'=> \
+ cont: {'col' => 'black', 'things' => [qw(this that etc)]}}, 'MY_class')
+
+And let's have a look at it:
+
+ DB<2> x $obj
+ 0 MY_class=HASH(0x828ad98)
+ 'attr' => HASH(0x828ad68)
+ 'col' => 'black'
+ 'things' => ARRAY(0x828abb8)
+ 0 'this'
+ 1 'that'
+ 2 'etc'
+ 'unique_id' => 123
+ DB<3>
+
+Useful, huh? You can eval nearly anything in there, and experiment with bits
+of code or regexes until the cows come home:
+
+ DB<3> @data = qw(this that the other atheism leather theory scythe)
+
+ DB<4> p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data))
+ atheism
+ leather
+ other
+ scythe
+ the
+ theory
+ saw -> 6
+
+If you want to see the command History, type an 'B<H>':
+
+ DB<5> H
+ 4: p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data))
+ 3: @data = qw(this that the other atheism leather theory scythe)
+ 2: x $obj
+ 1: $obj = bless({'unique_id'=>'123', 'attr'=>
+ {'col' => 'black', 'things' => [qw(this that etc)]}}, 'MY_class')
+ DB<5>
+
+And if you want to repeat any previous command, use the exclamation: 'B<!>':
+
+ DB<5> !4
+ p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data))
+ atheism
+ leather
+ other
+ scythe
+ the
+ theory
+ saw -> 12
+
+For more on references see L<perlref> and L<perlreftut>.
+
+
+=head1 Stepping through code
+
+Here's a simple program which converts between Celsius and Fahrenheit, it too
+has a problem:
+
+ #!/usr/bin/perl -w
+ use strict;
+
+ my $arg = $ARGV[0] || '-c20';
+
+ if ($arg =~ /^\-(c|f)((\-|\+)*\d+(\.\d+)*)$/) {
+ my ($deg, $num) = ($1, $2);
+ my ($in, $out) = ($num, $num);
+ if ($deg eq 'c') {
+ $deg = 'f';
+ $out = &c2f($num);
+ } else {
+ $deg = 'c';
+ $out = &f2c($num);
+ }
+ $out = sprintf('%0.2f', $out);
+ $out =~ s/^((\-|\+)*\d+)\.0+$/$1/;
+ print "$out $deg\n";
+ } else {
+ print "Usage: $0 -[c|f] num\n";
+ }
+ exit;
+
+ sub f2c {
+ my $f = shift;
+ my $c = 5 * $f - 32 / 9;
+ return $c;
+ }
+
+ sub c2f {
+ my $c = shift;
+ my $f = 9 * $c / 5 + 32;
+ return $f;
+ }
+
+
+For some reason, the Fahrenheit to Celsius conversion fails to return the
+expected output. This is what it does:
+
+ > temp -c0.72
+ 33.30 f
+
+ > temp -f33.3
+ 162.94 c
+
+Not very consistent! We'll set a breakpoint in the code manually and run it
+under the debugger to see what's going on. A breakpoint is a flag: the
+debugger runs without interruption until it reaches the breakpoint, then it
+stops execution and offers a prompt for further interaction. In normal
+use, these debugger commands are completely ignored, and they are safe - if a
+little messy - to leave in production code.
+
+ my ($in, $out) = ($num, $num);
+ $DB::single=2; # insert at line 9!
+ if ($deg eq 'c')
+ ...
+
+ > perl -d temp -f33.3
+ Default die handler restored.
+
+ Loading DB routines from perl5db.pl version 1.07
+ Editor support available.
+
+ Enter h or `h h' for help, or `man perldebug' for more help.
+
+ main::(temp:4): my $arg = $ARGV[0] || '-c20';
+
+We'll simply continue down to our pre-set breakpoint with a 'B<c>':
+
+ DB<1> c
+ main::(temp:10): if ($deg eq 'c') {
+
+Followed by a window command to see where we are:
+
+ DB<1> w
+ 7: my ($deg, $num) = ($1, $2);
+ 8: my ($in, $out) = ($num, $num);
+ 9: $DB::single=2;
+ 10==> if ($deg eq 'c') {
+ 11: $deg = 'f';
+ 12: $out = &c2f($num);
+ 13 } else {
+ 14: $deg = 'c';
+ 15: $out = &f2c($num);
+ 16 }
+
+And a print to show what values we're currently using:
+
+ DB<1> p $deg, $num
+ f33.3
+
+We can put another break point on any line beginning with a colon, we'll use
+line 17 as that's just as we come out of the subroutine, and we'd like to
+pause there later on:
+
+ DB<2> b 17
+
+There's no feedback from this, but you can see what breakpoints are set by
+using the list 'L' command:
+
+ DB<3> L
+ temp:
+ 17: print "$out $deg\n";
+ break if (1)
+
+Note that to delete a breakpoint you use 'd' or 'D'.
+
+Now we'll continue down into our subroutine, this time rather than by line
+number, we'll use the subroutine name, followed by the now familiar 'w':
+
+ DB<3> c f2c
+ main::f2c(temp:27): my $f = shift;
+
+ DB<4> w
+ 24: exit;
+ 25
+ 26 sub f2c {
+ 27==> my $f = shift;
+ 28: my $c = 5 * $f - 32 / 9;
+ 29: return $c;
+ 30 }
+ 31
+ 32 sub c2f {
+ 33: my $c = shift;
+
+
+Note that if there was a subroutine call between us and line 29, and we wanted
+to B<single-step> through it, we could use the 'B<s>' command, and to step
+over it we would use 'B<n>' which would execute the sub, but not descend into
+it for inspection. In this case though, we simply continue down to line 29:
+
+ DB<4> c 29
+ main::f2c(temp:29): return $c;
+
+And have a look at the return value:
+
+ DB<5> p $c
+ 162.944444444444
+
+This is not the right answer at all, but the sum looks correct. I wonder if
+it's anything to do with operator precedence? We'll try a couple of other
+possibilities with our sum:
+
+ DB<6> p (5 * $f - 32 / 9)
+ 162.944444444444
+
+ DB<7> p 5 * $f - (32 / 9)
+ 162.944444444444
+
+ DB<8> p (5 * $f) - 32 / 9
+ 162.944444444444
+
+ DB<9> p 5 * ($f - 32) / 9
+ 0.722222222222221
+
+:-) that's more like it! Ok, now we can set our return variable and we'll
+return out of the sub with an 'r':
+
+ DB<10> $c = 5 * ($f - 32) / 9
+
+ DB<11> r
+ scalar context return from main::f2c: 0.722222222222221
+
+Looks good, let's just continue off the end of the script:
+
+ DB<12> c
+ 0.72 c
+ Debugged program terminated. Use q to quit or R to restart,
+ use O inhibit_exit to avoid stopping after program termination,
+ h q, h R or h O to get additional info.
+
+A quick fix to the offending line (insert the missing parentheses) in the
+actual program and we're finished.
+
+
+=head1 Placeholder for a, w, t, T
+
+Actions, watch variables, stack traces etc.: on the TODO list.
+
+ a
+
+ W
+
+ t
+
+ T
+
+
+=head1 REGULAR EXPRESSIONS
+
+Ever wanted to know what a regex looked like? You'll need perl compiled with
+the DEBUGGING flag for this one:
+
+ > perl -Dr -e '/^pe(a)*rl$/i'
+ Compiling REx `^pe(a)*rl$'
+ size 17 first at 2
+ rarest char
+ at 0
+ 1: BOL(2)
+ 2: EXACTF <pe>(4)
+ 4: CURLYN[1] {0,32767}(14)
+ 6: NOTHING(8)
+ 8: EXACTF <a>(0)
+ 12: WHILEM(0)
+ 13: NOTHING(14)
+ 14: EXACTF <rl>(16)
+ 16: EOL(17)
+ 17: END(0)
+ floating `'$ at 4..2147483647 (checking floating) stclass `EXACTF <pe>'
+anchored(BOL) minlen 4
+ Omitting $` $& $' support.
+
+ EXECUTING...
+
+ Freeing REx: `^pe(a)*rl$'
+
+Did you really want to know? :-)
+For more gory details on getting regular expressions to work, have a look at
+L<perlre>, L<perlretut>, and to decode the mysterious labels (BOL and CURLYN,
+etc. above), see L<perldebguts>.
+
+
+=head1 OUTPUT TIPS
+
+To get all the output from your error log, and not miss any messages via
+helpful operating system buffering, insert a line like this, at the start of
+your script:
+
+ $|=1;
+
+To watch the tail of a dynamically growing logfile, (from the command line):
+
+ tail -f $error_log
+
+Wrapping all die calls in a handler routine can be useful to see how, and from
+where, they're being called, L<perlvar> has more information:
+
+ BEGIN { $SIG{__DIE__} = sub { require Carp; Carp::confess(@_) } }
+
+Various useful techniques for the redirection of STDOUT and STDERR filehandles
+are explained in L<perlopentut> and L<perlfaq8>.
+
+
+=head1 CGI
+
+Just a quick hint here for all those CGI programmers who can't figure out how
+on earth to get past that 'waiting for input' prompt, when running their CGI
+script from the command-line, try something like this:
+
+ > perl -d my_cgi.pl -nodebug
+
+Of course L<CGI> and L<perlfaq9> will tell you more.
+
+
+=head1 GUIs
+
+The command line interface is tightly integrated with an B<emacs> extension
+and there's a B<vi> interface too.
+
+You don't have to do this all on the command line, though, there are a few GUI
+options out there. The nice thing about these is you can wave a mouse over a
+variable and a dump of its data will appear in an appropriate window, or in a
+popup balloon, no more tiresome typing of 'x $varname' :-)
+
+In particular have a hunt around for the following:
+
+B<ptkdb> perlTK based wrapper for the built-in debugger
+
+B<ddd> data display debugger
+
+B<PerlDevKit> and B<PerlBuilder> are NT specific
+
+NB. (more info on these and others would be appreciated).
+
+
+=head1 SUMMARY
+
+We've seen how to encourage good coding practices with B<use strict> and
+B<-w>. You can run the perl debugger with B<perl -d scriptname> and inspect
+your data from within it using the B<p> and B<x> commands. You can
+walk through your code, set breakpoints with B<b> and step through that code
+with B<s> or B<n>, continue with B<c> and return from a sub with B<r>. Fairly
+intuitive stuff when you get down to it.
+
+There is of course lots more to find out about, this has just scratched the
+surface. The best way to learn more is to use perldoc to find out more about
+the language, to read the on-line help (L<perldebug> is probably the next
+place to go), and of course, experiment.
+
+
+=head1 SEE ALSO
+
+L<perldebug>,
+L<perldebguts>,
+L<perldiag>,
+L<dprofpp>,
+L<perlrun>
+
+
+=head1 AUTHOR
+
+Richard Foley <richard@rfi.net> Copyright (c) 2000
+
+
+=head1 CONTRIBUTORS
+
+Various people have made helpful suggestions and contributions, in particular:
+
+Ronald J Kimball <rjk@linguist.dartmouth.edu>
+
+Hugo van der Sanden <hv@crypt0.demon.co.uk>
+
+Peter Scott <Peter@PSDT.com>
+
diff --git a/contrib/perl5/pod/perlebcdic.pod b/contrib/perl5/pod/perlebcdic.pod
new file mode 100644
index 0000000000000..12ea2f3ef4b16
--- /dev/null
+++ b/contrib/perl5/pod/perlebcdic.pod
@@ -0,0 +1,1235 @@
+=head1 NAME
+
+perlebcdic - Considerations for running Perl on EBCDIC platforms
+
+=head1 DESCRIPTION
+
+An exploration of some of the issues facing Perl programmers
+on EBCDIC based computers. We do not cover localization,
+internationalization, or multi byte character set issues (yet).
+
+Portions that are still incomplete are marked with XXX.
+
+=head1 COMMON CHARACTER CODE SETS
+
+=head2 ASCII
+
+The American Standard Code for Information Interchange is a set of
+integers running from 0 to 127 (decimal) that imply character
+interpretation by the display and other system(s) of computers.
+The range 0..127 can be covered by setting the bits in a 7-bit binary
+number, hence the set is sometimes referred to as "7-bit ASCII".
+ASCII was described by the American National Standards Institute
+document ANSI X3.4-1986. It was also described by ISO 646:1991
+(with localization for currency symbols). The full ASCII set is
+given in the table below as the first 128 elements. Languages that
+can be written adequately with the characters in ASCII include
+English, Hawaiian, Indonesian, Swahili and some Native American
+languages.
+
+There are many character sets that extend the range of integers
+from 0..2**7-1 up to 2**8-1, or 8 bit bytes (octets if you prefer).
+One common one is the ISO 8859-1 character set.
+
+=head2 ISO 8859
+
+The ISO 8859-$n are a collection of character code sets from the
+International Organization for Standardization (ISO) each of which
+adds characters to the ASCII set that are typically found in European
+languages many of which are based on the Roman, or Latin, alphabet.
+
+=head2 Latin 1 (ISO 8859-1)
+
+A particular 8-bit extension to ASCII that includes grave and acute
+accented Latin characters. Languages that can employ ISO 8859-1
+include all the languages covered by ASCII as well as Afrikaans,
+Albanian, Basque, Catalan, Danish, Faroese, Finnish, Norwegian,
+Portuguese, Spanish, and Swedish. Dutch is covered albeit without
+the ij ligature. French is covered too but without the oe ligature.
+German can use ISO 8859-1 but must do so without German-style
+quotation marks. This set is based on Western European extensions
+to ASCII and is commonly encountered in world wide web work.
+In IBM character code set identification terminology ISO 8859-1 is
+also known as CCSID 819 (or sometimes 0819 or even 00819).
+
+=head2 EBCDIC
+
+The Extended Binary Coded Decimal Interchange Code refers to a
+large collection of slightly different single and multi byte
+coded character sets that are different from ASCII or ISO 8859-1
+and typically run on host computers. The EBCDIC encodings derive
+from 8 bit byte extensions of Hollerith punched card encodings.
+The layout on the cards was such that high bits were set for the
+upper and lower case alphabet characters [a-z] and [A-Z], but there
+were gaps within each Latin alphabet range.
+
+Some IBM EBCDIC character sets may be known by character code set
+identification numbers (CCSID numbers) or code page numbers. Leading
+zero digits in CCSID numbers within this document are insignificant.
+E.g. CCSID 0037 may be referred to as 37 in places.
+
+=head2 13 variant characters
+
+Among IBM EBCDIC character code sets there are 13 characters that
+are often mapped to different integer values. Those characters
+are known as the 13 "variant" characters and are:
+
+ \ [ ] { } ^ ~ ! # | $ @ `
+
+=head2 0037
+
+Character code set ID 0037 is a mapping of the ASCII plus Latin-1
+characters (i.e. ISO 8859-1) to an EBCDIC set. 0037 is used
+in North American English locales on the OS/400 operating system
+that runs on AS/400 computers. CCSID 37 differs from ISO 8859-1
+in 237 places, in other words they agree on only 19 code point values.
+
+=head2 1047
+
+Character code set ID 1047 is also a mapping of the ASCII plus
+Latin-1 characters (i.e. ISO 8859-1) to an EBCDIC set. 1047 is
+used under Unix System Services for OS/390, and OpenEdition for VM/ESA.
+CCSID 1047 differs from CCSID 0037 in eight places.
+
+=head2 POSIX-BC
+
+The EBCDIC code page in use on Siemens' BS2000 system is distinct from
+1047 and 0037. It is identified below as the POSIX-BC set.
+
+=head1 SINGLE OCTET TABLES
+
+The following tables list the ASCII and Latin 1 ordered sets including
+the subsets: C0 controls (0..31), ASCII graphics (32..7e), delete (7f),
+C1 controls (80..9f), and Latin-1 (a.k.a. ISO 8859-1) (a0..ff). In the
+table non-printing control character names as well as the Latin 1
+extensions to ASCII have been labelled with character names roughly
+corresponding to I<The Unicode Standard, Version 2.0> albeit with
+substitutions such as s/LATIN// and s/VULGAR// in all cases,
+s/CAPITAL LETTER// in some cases, and s/SMALL LETTER ([A-Z])/\l$1/
+in some other cases (the C<charnames> pragma names unfortunately do
+not list explicit names for the C0 or C1 control characters). The
+"names" of the C1 control set (128..159 in ISO 8859-1) listed here are
+somewhat arbitrary. The differences between the 0037 and 1047 sets are
+flagged with ***. The differences between the 1047 and POSIX-BC sets
+are flagged with ###. All ord() numbers listed are decimal. If you
+would rather see this table listing octal values then run the table
+(that is, the pod version of this document since this recipe may not
+work with a pod2_other_format translation) through:
+
+=over 4
+
+=item recipe 0
+
+=back
+
+ perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \
+ -e '{printf("%s%-9o%-9o%-9o%-9o\n",$1,$2,$3,$4,$5)}' perlebcdic.pod
+
+If you would rather see this table listing hexadecimal values then
+run the table through:
+
+=over 4
+
+=item recipe 1
+
+=back
+
+ perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \
+ -e '{printf("%s%-9X%-9X%-9X%-9X\n",$1,$2,$3,$4,$5)}' perlebcdic.pod
+
+
+ 8859-1
+ chr 0819 0037 1047 POSIX-BC
+ ----------------------------------------------------------------
+ <NULL> 0 0 0 0
+ <START OF HEADING> 1 1 1 1
+ <START OF TEXT> 2 2 2 2
+ <END OF TEXT> 3 3 3 3
+ <END OF TRANSMISSION> 4 55 55 55
+ <ENQUIRY> 5 45 45 45
+ <ACKNOWLEDGE> 6 46 46 46
+ <BELL> 7 47 47 47
+ <BACKSPACE> 8 22 22 22
+ <HORIZONTAL TABULATION> 9 5 5 5
+ <LINE FEED> 10 37 21 21 ***
+ <VERTICAL TABULATION> 11 11 11 11
+ <FORM FEED> 12 12 12 12
+ <CARRIAGE RETURN> 13 13 13 13
+ <SHIFT OUT> 14 14 14 14
+ <SHIFT IN> 15 15 15 15
+ <DATA LINK ESCAPE> 16 16 16 16
+ <DEVICE CONTROL ONE> 17 17 17 17
+ <DEVICE CONTROL TWO> 18 18 18 18
+ <DEVICE CONTROL THREE> 19 19 19 19
+ <DEVICE CONTROL FOUR> 20 60 60 60
+ <NEGATIVE ACKNOWLEDGE> 21 61 61 61
+ <SYNCHRONOUS IDLE> 22 50 50 50
+ <END OF TRANSMISSION BLOCK> 23 38 38 38
+ <CANCEL> 24 24 24 24
+ <END OF MEDIUM> 25 25 25 25
+ <SUBSTITUTE> 26 63 63 63
+ <ESCAPE> 27 39 39 39
+ <FILE SEPARATOR> 28 28 28 28
+ <GROUP SEPARATOR> 29 29 29 29
+ <RECORD SEPARATOR> 30 30 30 30
+ <UNIT SEPARATOR> 31 31 31 31
+ <SPACE> 32 64 64 64
+ ! 33 90 90 90
+ " 34 127 127 127
+ # 35 123 123 123
+ $ 36 91 91 91
+ % 37 108 108 108
+ & 38 80 80 80
+ ' 39 125 125 125
+ ( 40 77 77 77
+ ) 41 93 93 93
+ * 42 92 92 92
+ + 43 78 78 78
+ , 44 107 107 107
+ - 45 96 96 96
+ . 46 75 75 75
+ / 47 97 97 97
+ 0 48 240 240 240
+ 1 49 241 241 241
+ 2 50 242 242 242
+ 3 51 243 243 243
+ 4 52 244 244 244
+ 5 53 245 245 245
+ 6 54 246 246 246
+ 7 55 247 247 247
+ 8 56 248 248 248
+ 9 57 249 249 249
+ : 58 122 122 122
+ ; 59 94 94 94
+ < 60 76 76 76
+ = 61 126 126 126
+ > 62 110 110 110
+ ? 63 111 111 111
+ @ 64 124 124 124
+ A 65 193 193 193
+ B 66 194 194 194
+ C 67 195 195 195
+ D 68 196 196 196
+ E 69 197 197 197
+ F 70 198 198 198
+ G 71 199 199 199
+ H 72 200 200 200
+ I 73 201 201 201
+ J 74 209 209 209
+ K 75 210 210 210
+ L 76 211 211 211
+ M 77 212 212 212
+ N 78 213 213 213
+ O 79 214 214 214
+ P 80 215 215 215
+ Q 81 216 216 216
+ R 82 217 217 217
+ S 83 226 226 226
+ T 84 227 227 227
+ U 85 228 228 228
+ V 86 229 229 229
+ W 87 230 230 230
+ X 88 231 231 231
+ Y 89 232 232 232
+ Z 90 233 233 233
+ [ 91 186 173 187 *** ###
+ \ 92 224 224 188 ###
+ ] 93 187 189 189 ***
+ ^ 94 176 95 106 *** ###
+ _ 95 109 109 109
+ ` 96 121 121 74 ###
+ a 97 129 129 129
+ b 98 130 130 130
+ c 99 131 131 131
+ d 100 132 132 132
+ e 101 133 133 133
+ f 102 134 134 134
+ g 103 135 135 135
+ h 104 136 136 136
+ i 105 137 137 137
+ j 106 145 145 145
+ k 107 146 146 146
+ l 108 147 147 147
+ m 109 148 148 148
+ n 110 149 149 149
+ o 111 150 150 150
+ p 112 151 151 151
+ q 113 152 152 152
+ r 114 153 153 153
+ s 115 162 162 162
+ t 116 163 163 163
+ u 117 164 164 164
+ v 118 165 165 165
+ w 119 166 166 166
+ x 120 167 167 167
+ y 121 168 168 168
+ z 122 169 169 169
+ { 123 192 192 251 ###
+ | 124 79 79 79
+ } 125 208 208 253 ###
+ ~ 126 161 161 255 ###
+ <DELETE> 127 7 7 7
+ <C1 0> 128 32 32 32
+ <C1 1> 129 33 33 33
+ <C1 2> 130 34 34 34
+ <C1 3> 131 35 35 35
+ <C1 4> 132 36 36 36
+ <C1 5> 133 21 37 37 ***
+ <C1 6> 134 6 6 6
+ <C1 7> 135 23 23 23
+ <C1 8> 136 40 40 40
+ <C1 9> 137 41 41 41
+ <C1 10> 138 42 42 42
+ <C1 11> 139 43 43 43
+ <C1 12> 140 44 44 44
+ <C1 13> 141 9 9 9
+ <C1 14> 142 10 10 10
+ <C1 15> 143 27 27 27
+ <C1 16> 144 48 48 48
+ <C1 17> 145 49 49 49
+ <C1 18> 146 26 26 26
+ <C1 19> 147 51 51 51
+ <C1 20> 148 52 52 52
+ <C1 21> 149 53 53 53
+ <C1 22> 150 54 54 54
+ <C1 23> 151 8 8 8
+ <C1 24> 152 56 56 56
+ <C1 25> 153 57 57 57
+ <C1 26> 154 58 58 58
+ <C1 27> 155 59 59 59
+ <C1 28> 156 4 4 4
+ <C1 29> 157 20 20 20
+ <C1 30> 158 62 62 62
+ <C1 31> 159 255 255 95 ###
+ <NON-BREAKING SPACE> 160 65 65 65
+ <INVERTED EXCLAMATION MARK> 161 170 170 170
+ <CENT SIGN> 162 74 74 176 ###
+ <POUND SIGN> 163 177 177 177
+ <CURRENCY SIGN> 164 159 159 159
+ <YEN SIGN> 165 178 178 178
+ <BROKEN BAR> 166 106 106 208 ###
+ <SECTION SIGN> 167 181 181 181
+ <DIAERESIS> 168 189 187 121 *** ###
+ <COPYRIGHT SIGN> 169 180 180 180
+ <FEMININE ORDINAL INDICATOR> 170 154 154 154
+ <LEFT POINTING GUILLEMET> 171 138 138 138
+ <NOT SIGN> 172 95 176 186 *** ###
+ <SOFT HYPHEN> 173 202 202 202
+ <REGISTERED TRADE MARK SIGN> 174 175 175 175
+ <MACRON> 175 188 188 161 ###
+ <DEGREE SIGN> 176 144 144 144
+ <PLUS-OR-MINUS SIGN> 177 143 143 143
+ <SUPERSCRIPT TWO> 178 234 234 234
+ <SUPERSCRIPT THREE> 179 250 250 250
+ <ACUTE ACCENT> 180 190 190 190
+ <MICRO SIGN> 181 160 160 160
+ <PARAGRAPH SIGN> 182 182 182 182
+ <MIDDLE DOT> 183 179 179 179
+ <CEDILLA> 184 157 157 157
+ <SUPERSCRIPT ONE> 185 218 218 218
+ <MASC. ORDINAL INDICATOR> 186 155 155 155
+ <RIGHT POINTING GUILLEMET> 187 139 139 139
+ <FRACTION ONE QUARTER> 188 183 183 183
+ <FRACTION ONE HALF> 189 184 184 184
+ <FRACTION THREE QUARTERS> 190 185 185 185
+ <INVERTED QUESTION MARK> 191 171 171 171
+ <A WITH GRAVE> 192 100 100 100
+ <A WITH ACUTE> 193 101 101 101
+ <A WITH CIRCUMFLEX> 194 98 98 98
+ <A WITH TILDE> 195 102 102 102
+ <A WITH DIAERESIS> 196 99 99 99
+ <A WITH RING ABOVE> 197 103 103 103
+ <CAPITAL LIGATURE AE> 198 158 158 158
+ <C WITH CEDILLA> 199 104 104 104
+ <E WITH GRAVE> 200 116 116 116
+ <E WITH ACUTE> 201 113 113 113
+ <E WITH CIRCUMFLEX> 202 114 114 114
+ <E WITH DIAERESIS> 203 115 115 115
+ <I WITH GRAVE> 204 120 120 120
+ <I WITH ACUTE> 205 117 117 117
+ <I WITH CIRCUMFLEX> 206 118 118 118
+ <I WITH DIAERESIS> 207 119 119 119
+ <CAPITAL LETTER ETH> 208 172 172 172
+ <N WITH TILDE> 209 105 105 105
+ <O WITH GRAVE> 210 237 237 237
+ <O WITH ACUTE> 211 238 238 238
+ <O WITH CIRCUMFLEX> 212 235 235 235
+ <O WITH TILDE> 213 239 239 239
+ <O WITH DIAERESIS> 214 236 236 236
+ <MULTIPLICATION SIGN> 215 191 191 191
+ <O WITH STROKE> 216 128 128 128
+ <U WITH GRAVE> 217 253 253 224 ###
+ <U WITH ACUTE> 218 254 254 254
+ <U WITH CIRCUMFLEX> 219 251 251 221 ###
+ <U WITH DIAERESIS> 220 252 252 252
+ <Y WITH ACUTE> 221 173 186 173 *** ###
+ <CAPITAL LETTER THORN> 222 174 174 174
+ <SMALL LETTER SHARP S> 223 89 89 89
+ <a WITH GRAVE> 224 68 68 68
+ <a WITH ACUTE> 225 69 69 69
+ <a WITH CIRCUMFLEX> 226 66 66 66
+ <a WITH TILDE> 227 70 70 70
+ <a WITH DIAERESIS> 228 67 67 67
+ <a WITH RING ABOVE> 229 71 71 71
+ <SMALL LIGATURE ae> 230 156 156 156
+ <c WITH CEDILLA> 231 72 72 72
+ <e WITH GRAVE> 232 84 84 84
+ <e WITH ACUTE> 233 81 81 81
+ <e WITH CIRCUMFLEX> 234 82 82 82
+ <e WITH DIAERESIS> 235 83 83 83
+ <i WITH GRAVE> 236 88 88 88
+ <i WITH ACUTE> 237 85 85 85
+ <i WITH CIRCUMFLEX> 238 86 86 86
+ <i WITH DIAERESIS> 239 87 87 87
+ <SMALL LETTER eth> 240 140 140 140
+ <n WITH TILDE> 241 73 73 73
+ <o WITH GRAVE> 242 205 205 205
+ <o WITH ACUTE> 243 206 206 206
+ <o WITH CIRCUMFLEX> 244 203 203 203
+ <o WITH TILDE> 245 207 207 207
+ <o WITH DIAERESIS> 246 204 204 204
+ <DIVISION SIGN> 247 225 225 225
+ <o WITH STROKE> 248 112 112 112
+ <u WITH GRAVE> 249 221 221 192 ###
+ <u WITH ACUTE> 250 222 222 222
+ <u WITH CIRCUMFLEX> 251 219 219 219
+ <u WITH DIAERESIS> 252 220 220 220
+ <y WITH ACUTE> 253 141 141 141
+ <SMALL LETTER thorn> 254 142 142 142
+ <y WITH DIAERESIS> 255 223 223 223
+
+If you would rather see the above table in CCSID 0037 order rather than
+ASCII + Latin-1 order then run the table through:
+
+=over 4
+
+=item recipe 2
+
+=back
+
+ perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\
+ -e '{push(@l,$_)}' \
+ -e 'END{print map{$_->[0]}' \
+ -e ' sort{$a->[1] <=> $b->[1]}' \
+ -e ' map{[$_,substr($_,42,3)]}@l;}' perlebcdic.pod
+
+If you would rather see it in CCSID 1047 order then change the number
+42 in the last line to 51, like this:
+
+=over 4
+
+=item recipe 3
+
+=back
+
+ perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\
+ -e '{push(@l,$_)}' \
+ -e 'END{print map{$_->[0]}' \
+ -e ' sort{$a->[1] <=> $b->[1]}' \
+ -e ' map{[$_,substr($_,51,3)]}@l;}' perlebcdic.pod
+
+If you would rather see it in POSIX-BC order then change the number
+51 in the last line to 60, like this:
+
+=over 4
+
+=item recipe 4
+
+=back
+
+ perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\
+ -e '{push(@l,$_)}' \
+ -e 'END{print map{$_->[0]}' \
+ -e ' sort{$a->[1] <=> $b->[1]}' \
+ -e ' map{[$_,substr($_,60,3)]}@l;}' perlebcdic.pod
+
+
+=head1 IDENTIFYING CHARACTER CODE SETS
+
+To determine the character set you are running under from perl one
+could use the return value of ord() or chr() to test one or more
+character values. For example:
+
+ $is_ascii = "A" eq chr(65);
+ $is_ebcdic = "A" eq chr(193);
+
+Also, "\t" is a C<HORIZONTAL TABULATION> character so that:
+
+ $is_ascii = ord("\t") == 9;
+ $is_ebcdic = ord("\t") == 5;
+
+To distinguish EBCDIC code pages try looking at one or more of
+the characters that differ between them. For example:
+
+ $is_ebcdic_37 = "\n" eq chr(37);
+ $is_ebcdic_1047 = "\n" eq chr(21);
+
+Or better still choose a character that is uniquely encoded in any
+of the code sets, e.g.:
+
+ $is_ascii = ord('[') == 91;
+ $is_ebcdic_37 = ord('[') == 186;
+ $is_ebcdic_1047 = ord('[') == 173;
+ $is_ebcdic_POSIX_BC = ord('[') == 187;
+
+However, it would be unwise to write tests such as:
+
+ $is_ascii = "\r" ne chr(13); # WRONG
+ $is_ascii = "\n" ne chr(10); # ILL ADVISED
+
+Obviously the first of these will fail to distinguish most ASCII machines
+from either a CCSID 0037, a 1047, or a POSIX-BC EBCDIC machine since "\r" eq
+chr(13) under all of those coded character sets. But note too that
+because "\n" is chr(13) and "\r" is chr(10) on the Macintosh (which is an
+ASCII machine) the second C<$is_ascii> test will lead to trouble there.
+
+To determine whether or not perl was built under an EBCDIC
+code page you can use the Config module like so:
+
+ use Config;
+ $is_ebcdic = $Config{'ebcdic'} eq 'define';
+
+=head1 CONVERSIONS
+
+=head2 tr///
+
+In order to convert a string of characters from one character set to
+another a simple list of numbers, such as in the right columns in the
+above table, along with perl's tr/// operator is all that is needed.
+The data in the table are in ASCII order hence the EBCDIC columns
+provide easy to use ASCII to EBCDIC operations that are also easily
+reversed.
+
+For example, to convert ASCII to code page 037 take the output of the second
+column from the output of recipe 0 (modified to add \\ characters) and use
+it in tr/// like so:
+
+ $cp_037 =
+ '\000\001\002\003\234\011\206\177\227\215\216\013\014\015\016\017' .
+ '\020\021\022\023\235\205\010\207\030\031\222\217\034\035\036\037' .
+ '\200\201\202\203\204\012\027\033\210\211\212\213\214\005\006\007' .
+ '\220\221\026\223\224\225\226\004\230\231\232\233\024\025\236\032' .
+ '\040\240\342\344\340\341\343\345\347\361\242\056\074\050\053\174' .
+ '\046\351\352\353\350\355\356\357\354\337\041\044\052\051\073\254' .
+ '\055\057\302\304\300\301\303\305\307\321\246\054\045\137\076\077' .
+ '\370\311\312\313\310\315\316\317\314\140\072\043\100\047\075\042' .
+ '\330\141\142\143\144\145\146\147\150\151\253\273\360\375\376\261' .
+ '\260\152\153\154\155\156\157\160\161\162\252\272\346\270\306\244' .
+ '\265\176\163\164\165\166\167\170\171\172\241\277\320\335\336\256' .
+ '\136\243\245\267\251\247\266\274\275\276\133\135\257\250\264\327' .
+ '\173\101\102\103\104\105\106\107\110\111\255\364\366\362\363\365' .
+ '\175\112\113\114\115\116\117\120\121\122\271\373\374\371\372\377' .
+ '\134\367\123\124\125\126\127\130\131\132\262\324\326\322\323\325' .
+ '\060\061\062\063\064\065\066\067\070\071\263\333\334\331\332\237' ;
+
+ my $ebcdic_string = $ascii_string;
+ eval '$ebcdic_string =~ tr/\000-\377/' . $cp_037 . '/';
+
+To convert from EBCDIC 037 to ASCII just reverse the order of the tr///
+arguments like so:
+
+ my $ascii_string = $ebcdic_string;
+ eval '$ascii_string =~ tr/' . $cp_037 . '/\000-\377/';
+
+Similarly one could take the output of the third column from recipe 0 to
+obtain a C<$cp_1047> table. The fourth column of the output from recipe
+0 could provide a C<$cp_posix_bc> table suitable for transcoding as well.
+
+=head2 iconv
+
+XPG operability often implies the presence of an I<iconv> utility
+available from the shell or from the C library. Consult your system's
+documentation for information on iconv.
+
+On OS/390 see the iconv(1) man page. One way to invoke the iconv
+shell utility from within perl would be to:
+
+ # OS/390 example
+ $ascii_data = `echo '$ebcdic_data'| iconv -f IBM-1047 -t ISO8859-1`
+
+or the inverse map:
+
+ # OS/390 example
+ $ebcdic_data = `echo '$ascii_data'| iconv -f ISO8859-1 -t IBM-1047`
+
+For other perl based conversion options see the Convert::* modules on CPAN.
+
+=head2 C RTL
+
+The OS/390 C run time library provides _atoe() and _etoa() functions.
+
+=head1 OPERATOR DIFFERENCES
+
+The C<..> range operator treats certain character ranges with
+care on EBCDIC machines. For example the following array
+will have twenty six elements on either an EBCDIC machine
+or an ASCII machine:
+
+ @alphabet = ('A'..'Z'); # $#alphabet == 25
+
+The bitwise operators such as & ^ | may return different results
+when operating on string or character data in a perl program running
+on an EBCDIC machine than when run on an ASCII machine. Here is
+an example adapted from the one in L<perlop>:
+
+ # EBCDIC-based examples
+ print "j p \n" ^ " a h"; # prints "JAPH\n"
+ print "JA" | " ph\n"; # prints "japh\n"
+ print "JAPH\nJunk" & "\277\277\277\277\277"; # prints "japh\n";
+ print 'p N$' ^ " E<H\n"; # prints "Perl\n";
+
+An interesting property of the 32 C0 control characters
+in the ASCII table is that they can "literally" be constructed
+as control characters in perl, e.g. C<(chr(0) eq "\c@")>,
+C<(chr(1) eq "\cA")>, and so on. Perl on EBCDIC machines has been
+ported to take "\c@" to chr(0) and "\cA" to chr(1) as well, but the
+thirty three characters that result depend on which code page you are
+using. The table below uses the character names from the previous table
+but with substitutions such as s/START OF/S.O./; s/END OF /E.O./;
+s/TRANSMISSION/TRANS./; s/TABULATION/TAB./; s/VERTICAL/VERT./;
+s/HORIZONTAL/HORIZ./; s/DEVICE CONTROL/D.C./; s/SEPARATOR/SEP./;
+s/NEGATIVE ACKNOWLEDGE/NEG. ACK./;. The POSIX-BC and 1047 sets are
+identical throughout this range and differ from the 0037 set at only
+one spot (21 decimal). Note that the C<LINE FEED> character
+may be generated by "\cJ" on ASCII machines but by "\cU" on 1047 or POSIX-BC
+machines and cannot be generated as a C<"\c.letter."> control character on
+0037 machines. Note also that "\c\\" maps to two characters
+not one.
+
+ chr ord 8859-1 0037 1047 && POSIX-BC
+ ------------------------------------------------------------------------
+ "\c?" 127 <DELETE> " " ***><
+ "\c@" 0 <NULL> <NULL> <NULL> ***><
+ "\cA" 1 <S.O. HEADING> <S.O. HEADING> <S.O. HEADING>
+ "\cB" 2 <S.O. TEXT> <S.O. TEXT> <S.O. TEXT>
+ "\cC" 3 <E.O. TEXT> <E.O. TEXT> <E.O. TEXT>
+ "\cD" 4 <E.O. TRANS.> <C1 28> <C1 28>
+ "\cE" 5 <ENQUIRY> <HORIZ. TAB.> <HORIZ. TAB.>
+ "\cF" 6 <ACKNOWLEDGE> <C1 6> <C1 6>
+ "\cG" 7 <BELL> <DELETE> <DELETE>
+ "\cH" 8 <BACKSPACE> <C1 23> <C1 23>
+ "\cI" 9 <HORIZ. TAB.> <C1 13> <C1 13>
+ "\cJ" 10 <LINE FEED> <C1 14> <C1 14>
+ "\cK" 11 <VERT. TAB.> <VERT. TAB.> <VERT. TAB.>
+ "\cL" 12 <FORM FEED> <FORM FEED> <FORM FEED>
+ "\cM" 13 <CARRIAGE RETURN> <CARRIAGE RETURN> <CARRIAGE RETURN>
+ "\cN" 14 <SHIFT OUT> <SHIFT OUT> <SHIFT OUT>
+ "\cO" 15 <SHIFT IN> <SHIFT IN> <SHIFT IN>
+ "\cP" 16 <DATA LINK ESCAPE> <DATA LINK ESCAPE> <DATA LINK ESCAPE>
+ "\cQ" 17 <D.C. ONE> <D.C. ONE> <D.C. ONE>
+ "\cR" 18 <D.C. TWO> <D.C. TWO> <D.C. TWO>
+ "\cS" 19 <D.C. THREE> <D.C. THREE> <D.C. THREE>
+ "\cT" 20 <D.C. FOUR> <C1 29> <C1 29>
+ "\cU" 21 <NEG. ACK.> <C1 5> <LINE FEED> ***
+ "\cV" 22 <SYNCHRONOUS IDLE> <BACKSPACE> <BACKSPACE>
+ "\cW" 23 <E.O. TRANS. BLOCK> <C1 7> <C1 7>
+ "\cX" 24 <CANCEL> <CANCEL> <CANCEL>
+ "\cY" 25 <E.O. MEDIUM> <E.O. MEDIUM> <E.O. MEDIUM>
+ "\cZ" 26 <SUBSTITUTE> <C1 18> <C1 18>
+ "\c[" 27 <ESCAPE> <C1 15> <C1 15>
+ "\c\\" 28 <FILE SEP.>\ <FILE SEP.>\ <FILE SEP.>\
+ "\c]" 29 <GROUP SEP.> <GROUP SEP.> <GROUP SEP.>
+ "\c^" 30 <RECORD SEP.> <RECORD SEP.> <RECORD SEP.> ***><
+ "\c_" 31 <UNIT SEP.> <UNIT SEP.> <UNIT SEP.> ***><
+
+
+=head1 FUNCTION DIFFERENCES
+
+=over 8
+
+=item chr()
+
+chr() must be given an EBCDIC code number argument to yield a desired
+character return value on an EBCDIC machine. For example:
+
+ $CAPITAL_LETTER_A = chr(193);
+
+=item ord()
+
+ord() will return EBCDIC code number values on an EBCDIC machine.
+For example:
+
+ $the_number_193 = ord("A");
+
+=item pack()
+
+The c and C templates for pack() are dependent upon character set
+encoding. Examples of usage on EBCDIC include:
+
+ $foo = pack("CCCC",193,194,195,196);
+ # $foo eq "ABCD"
+ $foo = pack("C4",193,194,195,196);
+ # same thing
+
+ $foo = pack("ccxxcc",193,194,195,196);
+ # $foo eq "AB\0\0CD"
+
+=item print()
+
+One must be careful with scalars and strings that are passed to
+print that contain ASCII encodings. One common place
+for this to occur is in the output of the MIME type header for
+CGI script writing. For example, many perl programming guides
+recommend something similar to:
+
+ print "Content-type:\ttext/html\015\012\015\012";
+ # this may be wrong on EBCDIC
+
+Under the IBM OS/390 USS Web Server for example you should instead
+write that as:
+
+ print "Content-type:\ttext/html\r\n\r\n"; # OK for DGW et alia
+
+That is because the translation from EBCDIC to ASCII is done
+by the web server in this case (such code will not be appropriate for
+the Macintosh however). Consult your web server's documentation for
+further details.
+
+=item printf()
+
+The formats that can convert characters to numbers and vice versa
+will be different from their ASCII counterparts when executed
+on an EBCDIC machine. Examples include:
+
+ printf("%c%c%c",193,194,195); # prints ABC
+
+=item sort()
+
+EBCDIC sort results may differ from ASCII sort results especially for
+mixed case strings. This is discussed in more detail below.
+
+=item sprintf()
+
+See the discussion of printf() above. An example of the use
+of sprintf would be:
+
+ $CAPITAL_LETTER_A = sprintf("%c",193);
+
+=item unpack()
+
+See the discussion of pack() above.
+
+=back
+
+=head1 REGULAR EXPRESSION DIFFERENCES
+
+As of perl 5.005_03 letter range regular expressions such as
+[A-Z] and [a-z] have been especially coded to not pick up gap
+characters. For example, characters such as E<ocirc> C<o WITH CIRCUMFLEX>
+that lie between I and J would not be matched by the
+regular expression range C</[H-K]/>.
+
+If you do want to match the alphabet gap characters in a single octet
+regular expression try matching the hex or octal code such
+as C</\313/> on EBCDIC or C</\364/> on ASCII machines to
+have your regular expression match C<o WITH CIRCUMFLEX>.
+
+Another construct to be wary of is the inappropriate use of hex or
+octal constants in regular expressions. Consider the following
+set of subs:
+
+ sub is_c0 {
+ my $char = substr(shift,0,1);
+ $char =~ /[\000-\037]/;
+ }
+
+ sub is_print_ascii {
+ my $char = substr(shift,0,1);
+ $char =~ /[\040-\176]/;
+ }
+
+ sub is_delete {
+ my $char = substr(shift,0,1);
+ $char eq "\177";
+ }
+
+ sub is_c1 {
+ my $char = substr(shift,0,1);
+ $char =~ /[\200-\237]/;
+ }
+
+ sub is_latin_1 {
+ my $char = substr(shift,0,1);
+ $char =~ /[\240-\377]/;
+ }
+
+The above would be adequate if the concern was only with numeric code points.
+However, the concern may be with characters rather than code points
+and on an EBCDIC machine it may be desirable for constructs such as
+C<if (is_print_ascii("A")) {print "A is a printable character\n";}> to print
+out the expected message. One way to represent the above collection
+of character classification subs that is capable of working across the
+four coded character sets discussed in this document is as follows:
+
+ sub Is_c0 {
+ my $char = substr(shift,0,1);
+ if (ord('^')==94) { # ascii
+ return $char =~ /[\000-\037]/;
+ }
+ if (ord('^')==176) { # 37
+ return $char =~ /[\000-\003\067\055-\057\026\005\045\013-\023\074\075\062\046\030\031\077\047\034-\037]/;
+ }
+ if (ord('^')==95 || ord('^')==106) { # 1047 || posix-bc
+ return $char =~ /[\000-\003\067\055-\057\026\005\025\013-\023\074\075\062\046\030\031\077\047\034-\037]/;
+ }
+ }
+
+ sub Is_print_ascii {
+ my $char = substr(shift,0,1);
+ $char =~ /[ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/;
+ }
+
+ sub Is_delete {
+ my $char = substr(shift,0,1);
+ if (ord('^')==94) { # ascii
+ return $char eq "\177";
+ }
+ else { # ebcdic
+ return $char eq "\007";
+ }
+ }
+
+ sub Is_c1 {
+ my $char = substr(shift,0,1);
+ if (ord('^')==94) { # ascii
+ return $char =~ /[\200-\237]/;
+ }
+ if (ord('^')==176) { # 37
+ return $char =~ /[\040-\044\025\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\004\024\076\377]/;
+ }
+ if (ord('^')==95) { # 1047
+ return $char =~ /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\004\024\076\377]/;
+ }
+ if (ord('^')==106) { # posix-bc
+ return $char =~
+ /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\004\024\076\137]/;
+ }
+ }
+
+ sub Is_latin_1 {
+ my $char = substr(shift,0,1);
+ if (ord('^')==94) { # ascii
+ return $char =~ /[\240-\377]/;
+ }
+ if (ord('^')==176) { # 37
+ return $char =~
+ /[\101\252\112\261\237\262\152\265\275\264\232\212\137\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/;
+ }
+ if (ord('^')==95) { # 1047
+ return $char =~
+ /[\101\252\112\261\237\262\152\265\273\264\232\212\260\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\272\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/;
+ }
+ if (ord('^')==106) { # posix-bc
+ return $char =~
+ /[\101\252\260\261\237\262\320\265\171\264\232\212\272\312\257\241\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\340\376\335\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\300\336\333\334\215\216\337]/;
+ }
+ }
+
+Note however that only the C<Is_print_ascii()> sub is really independent
+of coded character set. Another way to write C<Is_latin_1()> would be
+to use the characters in the range explicitly:
+
+ sub Is_latin_1 {
+ my $char = substr(shift,0,1);
+ $char =~ /[ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]/;
+ }
+
+Although that form may run into trouble in network transit (due to the
+presence of 8 bit characters) or on non ISO-Latin character sets.
+
+=head1 SOCKETS
+
+Most socket programming assumes ASCII character encodings in network
+byte order. Exceptions can include CGI script writing under a
+host web server where the server may take care of translation for you.
+Most host web servers convert EBCDIC data to ISO-8859-1 or Unicode on
+output.
+
+=head1 SORTING
+
+One big difference between ASCII based character sets and EBCDIC ones
+is the relative positions of upper and lower case letters and the
+letters compared to the digits. If sorted on an ASCII based machine the
+two letter abbreviation for a physician comes before the two letter
+abbreviation for drive, that is:
+
+ @sorted = sort(qw(Dr. dr.)); # @sorted holds ('Dr.','dr.') on ASCII,
+ # but ('dr.','Dr.') on EBCDIC
+
+The property of lower case before uppercase letters in EBCDIC is
+even carried to the Latin 1 EBCDIC pages such as 0037 and 1047.
+An example would be that E<Euml> C<E WITH DIAERESIS> (203) comes
+before E<euml> C<e WITH DIAERESIS> (235) on an ASCII machine, but
+the latter (83) comes before the former (115) on an EBCDIC machine.
+(Astute readers will note that the upper case version of E<szlig>
+C<SMALL LETTER SHARP S> is simply "SS" and that the upper case version of
+E<yuml> C<y WITH DIAERESIS> is not in the 0..255 range but it is
+at U+0178 in Unicode, or C<"\x{178}"> in a Unicode enabled Perl).
+
+The sort order will cause differences between results obtained on
+ASCII machines versus EBCDIC machines. What follows are some suggestions
+on how to deal with these differences.
+
+=head2 Ignore ASCII vs. EBCDIC sort differences.
+
+This is the least computationally expensive strategy. It may require
+some user education.
+
+=head2 MONO CASE then sort data.
+
+In order to minimize the expense of mono casing mixed case text, try to
+C<tr///> towards the character set case most employed within the data.
+If the data are primarily UPPERCASE non Latin 1 then apply tr/[a-z]/[A-Z]/
+then sort(). If the data are primarily lowercase non Latin 1 then
+apply tr/[A-Z]/[a-z]/ before sorting. If the data are primarily UPPERCASE
+and include Latin-1 characters then apply:
+
+ tr/[a-z]/[A-Z]/;
+ tr/[àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ]/[ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ]/;
+ s/ß/SS/g;
+
+then sort(). Do note however that such Latin-1 manipulation does not
+address the E<yuml> C<y WITH DIAERESIS> character that will remain at
+code point 255 on ASCII machines, but 223 on most EBCDIC machines
+where it will sort to a place less than the EBCDIC numerals. With a
+Unicode enabled Perl you might try:
+
+ tr/ÿ/\x{178}/;
+
+The strategy of mono casing data before sorting does not preserve the case
+of the data and may not be acceptable for that reason.
+
+=head2 Convert, sort data, then re convert.
+
+This is the most expensive proposition that does not employ a network
+connection.
+
+=head2 Perform sorting on one type of machine only.
+
+This strategy can employ a network connection. As such
+it would be computationally expensive.
+
+=head1 TRANSFORMATION FORMATS
+
+There are a variety of ways of transforming data with an intra character set
+mapping that serve a variety of purposes. Sorting was discussed in the
+previous section and a few of the other more popular mapping techniques are
+discussed next.
+
+=head2 URL decoding and encoding
+
+Note that some URLs have hexadecimal ASCII code points in them in an
+attempt to overcome character or protocol limitation issues. For example
+the tilde character is not on every keyboard hence a URL of the form:
+
+ http://www.pvhp.com/~pvhp/
+
+may also be expressed as either of:
+
+ http://www.pvhp.com/%7Epvhp/
+
+ http://www.pvhp.com/%7epvhp/
+
+where 7E is the hexadecimal ASCII code point for '~'. Here is an example
+of decoding such a URL under CCSID 1047:
+
+ $url = 'http://www.pvhp.com/%7Epvhp/';
+ # this array assumes code page 1047
+ my @a2e_1047 = (
+ 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 21, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 60, 61, 50, 38, 24, 25, 63, 39, 28, 29, 30, 31,
+ 64, 90,127,123, 91,108, 80,125, 77, 93, 92, 78,107, 96, 75, 97,
+ 240,241,242,243,244,245,246,247,248,249,122, 94, 76,126,110,111,
+ 124,193,194,195,196,197,198,199,200,201,209,210,211,212,213,214,
+ 215,216,217,226,227,228,229,230,231,232,233,173,224,189, 95,109,
+ 121,129,130,131,132,133,134,135,136,137,145,146,147,148,149,150,
+ 151,152,153,162,163,164,165,166,167,168,169,192, 79,208,161, 7,
+ 32, 33, 34, 35, 36, 37, 6, 23, 40, 41, 42, 43, 44, 9, 10, 27,
+ 48, 49, 26, 51, 52, 53, 54, 8, 56, 57, 58, 59, 4, 20, 62,255,
+ 65,170, 74,177,159,178,106,181,187,180,154,138,176,202,175,188,
+ 144,143,234,250,190,160,182,179,157,218,155,139,183,184,185,171,
+ 100,101, 98,102, 99,103,158,104,116,113,114,115,120,117,118,119,
+ 172,105,237,238,235,239,236,191,128,253,254,251,252,186,174, 89,
+ 68, 69, 66, 70, 67, 71,156, 72, 84, 81, 82, 83, 88, 85, 86, 87,
+ 140, 73,205,206,203,207,204,225,112,221,222,219,220,141,142,223
+ );
+ $url =~ s/%([0-9a-fA-F]{2})/pack("c",$a2e_1047[hex($1)])/ge;
+
+Conversely, here is a partial solution for the task of encoding such
+a URL under the 1047 code page:
+
+ $url = 'http://www.pvhp.com/~pvhp/';
+ # this array assumes code page 1047
+ my @e2a_1047 = (
+ 0, 1, 2, 3,156, 9,134,127,151,141,142, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19,157, 10, 8,135, 24, 25,146,143, 28, 29, 30, 31,
+ 128,129,130,131,132,133, 23, 27,136,137,138,139,140, 5, 6, 7,
+ 144,145, 22,147,148,149,150, 4,152,153,154,155, 20, 21,158, 26,
+ 32,160,226,228,224,225,227,229,231,241,162, 46, 60, 40, 43,124,
+ 38,233,234,235,232,237,238,239,236,223, 33, 36, 42, 41, 59, 94,
+ 45, 47,194,196,192,193,195,197,199,209,166, 44, 37, 95, 62, 63,
+ 248,201,202,203,200,205,206,207,204, 96, 58, 35, 64, 39, 61, 34,
+ 216, 97, 98, 99,100,101,102,103,104,105,171,187,240,253,254,177,
+ 176,106,107,108,109,110,111,112,113,114,170,186,230,184,198,164,
+ 181,126,115,116,117,118,119,120,121,122,161,191,208, 91,222,174,
+ 172,163,165,183,169,167,182,188,189,190,221,168,175, 93,180,215,
+ 123, 65, 66, 67, 68, 69, 70, 71, 72, 73,173,244,246,242,243,245,
+ 125, 74, 75, 76, 77, 78, 79, 80, 81, 82,185,251,252,249,250,255,
+ 92,247, 83, 84, 85, 86, 87, 88, 89, 90,178,212,214,210,211,213,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,179,219,220,217,218,159
+ );
+ # The following regular expression does not address the
+ # mappings for: ('.' => '%2E', '/' => '%2F', ':' => '%3A')
+ $url =~ s/([\t "#%&\(\),;<=>\?\@\[\\\]^`{|}~])/sprintf("%%%02X",$e2a_1047[ord($1)])/ge;
+
+where a more complete solution would split the URL into components
+and apply a full s/// substitution only to the appropriate parts.
+
+In the remaining examples a @e2a or @a2e array may be employed
+but the assignment will not be shown explicitly. For code page 1047
+you could use the @a2e_1047 or @e2a_1047 arrays just shown.
+
+=head2 uu encoding and decoding
+
+The C<u> template to pack() or unpack() will render EBCDIC data in EBCDIC
+characters equivalent to their ASCII counterparts. For example, the
+following will print "Yes indeed\n" on either an ASCII or EBCDIC computer:
+
+ $all_byte_chrs = '';
+ for (0..255) { $all_byte_chrs .= chr($_); }
+ $uuencode_byte_chrs = pack('u', $all_byte_chrs);
+ ($uu = <<' ENDOFHEREDOC') =~ s/^\s*//gm;
+ M``$"`P0%!@<("0H+#`T.#Q`1$A,4%187&!D:&QP='A\@(2(C)"4F)R@I*BLL
+ M+2XO,#$R,S0U-C<X.3H[/#T^/T!!0D-$149'2$E*2TQ-3D]045)35%565UA9
+ M6EM<75Y?8&%B8V1E9F=H:6IK;&UN;W!Q<G-T=79W>'EZ>WQ]?G^`@8*#A(6&
+ MAXB)BHN,C8Z/D)&2DY25EI>8F9J;G)V>GZ"AHJ.DI::GJ*FJJZRMKJ^PL;*S
+ MM+6VM[BYNKN\O;Z_P,'"P\3%QL?(R<K+S,W.S]#1TM/4U=;7V-G:V]S=WM_@
+ ?X>+CY.7FY^CIZNOL[>[O\/'R\_3U]O?X^?K[_/W^_P``
+ ENDOFHEREDOC
+ if ($uuencode_byte_chrs eq $uu) {
+ print "Yes ";
+ }
+ $uudecode_byte_chrs = unpack('u', $uuencode_byte_chrs);
+ if ($uudecode_byte_chrs eq $all_byte_chrs) {
+ print "indeed\n";
+ }
+
+Here is a very spartan uudecoder that will work on EBCDIC provided
+that the @e2a array is filled in appropriately:
+
+ #!/usr/local/bin/perl
+ @e2a = ( # this must be filled in
+ );
+ $_ = <> until ($mode,$file) = /^begin\s*(\d*)\s*(\S*)/;
+ open(OUT, "> $file") if $file ne "";
+ while(<>) {
+ last if /^end/;
+ next if /[a-z]/;
+ next unless int(((($e2a[ord()] - 32 ) & 077) + 2) / 3) ==
+ int(length() / 4);
+ print OUT unpack("u", $_);
+ }
+ close(OUT);
+ chmod oct($mode), $file;
+
+
+=head2 Quoted-Printable encoding and decoding
+
+On ASCII encoded machines it is possible to strip characters outside of
+the printable set using:
+
+ # This QP encoder works on ASCII only
+ $qp_string =~ s/([=\x00-\x1F\x80-\xFF])/sprintf("=%02X",ord($1))/ge;
+
+Whereas a QP encoder that works on both ASCII and EBCDIC machines
+would look somewhat like the following (where the EBCDIC branch @e2a
+array is omitted for brevity):
+
+ if (ord('A') == 65) { # ASCII
+ $delete = "\x7F"; # ASCII
+ @e2a = (0 .. 255) # ASCII to ASCII identity map
+ }
+ else { # EBCDIC
+ $delete = "\x07"; # EBCDIC
+ @e2a = # EBCDIC to ASCII map (as shown above)
+ }
+ $qp_string =~
+ s/([^ !"\#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~$delete])/sprintf("=%02X",$e2a[ord($1)])/ge;
+
+(although in production code the substitutions might be done
+in the EBCDIC branch with the @e2a array and separately in the
+ASCII branch without the expense of the identity map).
+
+Such QP strings can be decoded with:
+
+ # This QP decoder is limited to ASCII only
+ $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge;
+ $string =~ s/=[\n\r]+$//;
+
+Whereas a QP decoder that works on both ASCII and EBCDIC machines
+would look somewhat like the following (where the @a2e array is
+omitted for brevity):
+
+ $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr $a2e[hex $1]/ge;
+ $string =~ s/=[\n\r]+$//;
+
+=head2 Caesarian cyphers
+
+The practice of shifting an alphabet one or more characters for encipherment
+dates back thousands of years and was explicitly detailed by Gaius Julius
+Caesar in his B<Gallic Wars> text. A single alphabet shift is sometimes
+referred to as a rotation and the shift amount is given as a number $n after
+the string 'rot' or "rot$n". Rot0 and rot26 would designate identity maps
+on the 26 letter English version of the Latin alphabet. Rot13 has the
+interesting property that alternate subsequent invocations are identity maps
+(thus rot13 is its own non-trivial inverse in the group of 26 alphabet
+rotations). Hence the following is a rot13 encoder and decoder that will
+work on ASCII and EBCDIC machines:
+
+ #!/usr/local/bin/perl
+
+ while(<>){
+ tr/n-za-mN-ZA-M/a-zA-Z/;
+ print;
+ }
+
+In one-liner form:
+
+ perl -ne 'tr/n-za-mN-ZA-M/a-zA-Z/;print'
+
+
+=head1 Hashing order and checksums
+
+XXX
+
+=head1 I18N AND L10N
+
+Internationalization(I18N) and localization(L10N) are supported at least
+in principle even on EBCDIC machines. The details are system dependent
+and discussed under the L<perlebcdic/OS ISSUES> section below.
+
+=head1 MULTI OCTET CHARACTER SETS
+
+Multi byte EBCDIC code pages; Unicode, UTF-8, UTF-EBCDIC, XXX.
+
+=head1 OS ISSUES
+
+There may be a few system dependent issues
+of concern to EBCDIC Perl programmers.
+
+=head2 OS/400
+
+The PASE environment.
+
+=over 8
+
+=item IFS access
+
+XXX.
+
+=back
+
+=head2 OS/390
+
+Perl runs under Unix Systems Services or USS.
+
+=over 8
+
+=item chcp
+
+B<chcp> is supported as a shell utility for displaying and changing
+one's code page. See also L<chcp>.
+
+=item dataset access
+
+For sequential data set access try:
+
+ my @ds_records = `cat //DSNAME`;
+
+or:
+
+ my @ds_records = `cat //'HLQ.DSNAME'`;
+
+See also the OS390::Stdio module on CPAN.
+
+=item OS/390 iconv
+
+B<iconv> is supported as both a shell utility and a C RTL routine.
+See also the iconv(1) and iconv(3) manual pages.
+
+=item locales
+
+On OS/390 see L<locale> for information on locales. The L10N files
+are in F</usr/nls/locale>. $Config{d_setlocale} is 'define' on OS/390.
+
+=back
+
+=head2 VM/ESA?
+
+XXX.
+
+=head2 POSIX-BC?
+
+XXX.
+
+=head1 BUGS
+
+This pod document contains literal Latin 1 characters and may encounter
+translation difficulties. In particular one popular nroff implementation
+was known to strip accented characters to their unaccented counterparts
+while attempting to view this document through the B<pod2man> program
+(for example, you may see a plain C<y> rather than one with a diaeresis
+as in E<yuml>). Another nroff truncated the resultant man page at
+the first occurrence of 8 bit characters.
+
+Not all shells will allow multiple C<-e> string arguments to perl to
+be concatenated together properly as recipes 2, 3, and 4 might seem
+to imply.
+
+Perl does not yet work with any Unicode features on EBCDIC platforms.
+
+=head1 SEE ALSO
+
+L<perllocale>, L<perlfunc>.
+
+=head1 REFERENCES
+
+http://anubis.dkuug.dk/i18n/charmaps
+
+http://www.unicode.org/
+
+http://www.unicode.org/unicode/reports/tr16/
+
+http://www.wps.com/texts/codes/
+B<ASCII: American Standard Code for Information Infiltration> Tom Jennings,
+September 1999.
+
+B<The Unicode Standard Version 2.0> The Unicode Consortium,
+ISBN 0-201-48345-9, Addison Wesley Developers Press, July 1996.
+
+B<The Unicode Standard Version 3.0> The Unicode Consortium, Lisa Moore ed.,
+ISBN 0-201-61633-5, Addison Wesley Developers Press, February 2000.
+
+B<CDRA: IBM - Character Data Representation Architecture -
+Reference and Registry>, IBM SC09-2190-00, December 1996.
+
+"Demystifying Character Sets", Andrea Vine, Multilingual Computing
+& Technology, B<#26 Vol. 10 Issue 4>, August/September 1999;
+ISSN 1523-0309; Multilingual Computing Inc. Sandpoint ID, USA.
+
+B<Codes, Ciphers, and Other Cryptic and Clandestine Communication>
+Fred B. Wrixon, ISBN 1-57912-040-7, Black Dog & Leventhal Publishers,
+1998.
+
+=head1 AUTHOR
+
+Peter Prymmer pvhp@best.com wrote this in 1999 and 2000
+with CCSID 0819 and 0037 help from Chris Leach and
+AndrE<eacute> Pirard A.Pirard@ulg.ac.be as well as POSIX-BC
+help from Thomas Dorner Thomas.Dorner@start.de.
+Thanks also to Vickie Cooper, Philip Newton, William Raffloer, and
+Joe Smith. Trademarks, registered trademarks, service marks and
+registered service marks used in this document are the property of
+their respective owners.
+
+
diff --git a/contrib/perl5/pod/perlmodlib.PL b/contrib/perl5/pod/perlmodlib.PL
new file mode 100755
index 0000000000000..0cdadb76c79cf
--- /dev/null
+++ b/contrib/perl5/pod/perlmodlib.PL
@@ -0,0 +1,1383 @@
+#!../miniperl
+
+open (OUT, ">perlmodlib.tmp") or die $!; # output file for the generated POD
+my (@pragma, @mod); # collected =item entries: pragmas and modules
+open (MANIFEST, "../MANIFEST") or die $!;
+
+while (<MANIFEST>) { # one MANIFEST line per iteration
+ my $filename;
+ next unless s|^lib/|| or m|^ext/|; # keep lib/ entries (prefix stripped) and ext/ entries only
+ ($filename) = /(\S+)/; # first whitespace-delimited token on the line
+ $filename =~ s|^[^/]+/|| if $filename =~ s|^ext/||; # for ext/ entries also drop the next path component
+ next unless $filename =~ /\.p(m|od)$/; # only .pm and .pod files are considered
+ next unless open (MOD, "../lib/$filename"); # silently skip anything that cannot be opened under ../lib
+
+ my ($name, $thing);
+ my $foundit=0;
+ {
+ local $/=""; # paragraph mode for this inner block only
+ while (<MOD>) {
+ next unless /^=head1 NAME/; # scan paragraphs until the NAME heading
+ $foundit++;
+ last;
+ }
+ }
+ unless ($foundit) {
+ warn "$filename missing head1\n";
+ next;
+ }
+ my $title = <MOD>; # back in line mode: the line following the NAME paragraph
+ chomp($title);
+ close MOD;
+
+ my $perlname = $filename;
+ $perlname =~ s!\.p(m|od)$!!; # strip the file extension
+ $perlname =~ s!/!::!g; # path separators become package separators
+
+ ($name, $thing) = split / --? /, $title, 2; # split at the first space-dash-space (one or two dashes)
+
+ unless ($name and $thing) {
+ warn "$filename missing name\n" unless $name;
+ warn "$filename missing thing\n" unless $thing;
+ next;
+ }
+
+ $thing =~ s/^perl pragma to //i; # drop the boilerplate lead-in, case-insensitively
+ $thing = ucfirst($thing);
+ $title = "=item $perlname\n\n$thing\n\n"; # reuse $title for the finished POD entry
+
+ # print "$perlname $thing\n";
+
+ if ($filename=~/[A-Z]/) { # any uppercase letter in the path: listed as a module
+ push @mod, $title;
+ } else { # all-lowercase path: listed as a pragma
+ push @pragma, $title;
+ }
+}
+
+print OUT <<'EOF';
+# Generated by perlmodlib.PL DO NOT EDIT!
+
+=head1 NAME
+
+perlmodlib - constructing new Perl modules and finding existing ones
+
+=head1 DESCRIPTION
+
+=head1 THE PERL MODULE LIBRARY
+
+Many modules are included in the Perl distribution. These are described
+below, and all end in F<.pm>. You may discover compiled library
+files (usually ending in F<.so>) or small pieces of modules to be
+autoloaded (ending in F<.al>); these were automatically generated
+by the installation process. You may also discover files in the
+library directory that end in either F<.pl> or F<.ph>. These are
+old libraries supplied so that old programs that use them still
+run. The F<.pl> files will all eventually be converted into standard
+modules, and the F<.ph> files made by B<h2ph> will probably end up
+as extension modules made by B<h2xs>. (Some F<.ph> values may
+already be available through the POSIX, Errno, or Fcntl modules.)
+The B<pl2pm> file in the distribution may help in your conversion,
+but it's just a mechanical process and therefore far from bulletproof.
+
+=head2 Pragmatic Modules
+
+They work somewhat like compiler directives (pragmata) in that they
+tend to affect the compilation of your program, and thus will usually
+work well only when used within a C<use>, or C<no>. Most of these
+are lexically scoped, so an inner BLOCK may countermand them
+by saying:
+
+ no integer;
+ no strict 'refs';
+ no warnings;
+
+which lasts until the end of that BLOCK.
+
+Some pragmas are lexically scoped--typically those that affect the
+C<$^H> hints variable. Others affect the current package instead,
+like C<use vars> and C<use subs>, which allow you to predeclare
+variables or subroutines within a particular I<file> rather than
+just a block. Such declarations are effective for the entire file
+for which they were declared. You cannot rescind them with C<no
+vars> or C<no subs>.
+
+The following pragmas are defined (and have their own documentation).
+
+=over 12
+
+EOF
+
+print OUT $_ for (sort @pragma);
+
+print OUT <<EOF;
+=back
+
+=head2 Standard Modules
+
+Standard, bundled modules are all expected to behave in a well-defined
+manner with respect to namespace pollution because they use the
+Exporter module. See their own documentation for details.
+
+=over 12
+
+EOF
+
+print OUT $_ for (sort @mod);
+
+print OUT <<'EOF';
+=back
+
+To find out I<all> modules installed on your system, including
+those without documentation or outside the standard release,
+just do this:
+
+ % find `perl -e 'print "@INC"'` -name '*.pm' -print
+
+They should all have their own documentation installed and accessible
+via your system man(1) command. If you do not have a B<find>
+program, you can use the Perl B<find2perl> program instead, which
+generates Perl code as output you can run through perl. If you
+have a B<man> program but it doesn't find your modules, you'll have
+to fix your manpath. See L<perl> for details. If you have no
+system B<man> command, you might try the B<perldoc> program.
+
+=head2 Extension Modules
+
+Extension modules are written in C (or a mix of Perl and C). They
+are usually dynamically loaded into Perl if and when you need them,
+but may also be linked in statically. Supported extension modules
+include Socket, Fcntl, and POSIX.
+
+Many popular C extension modules do not come bundled (at least, not
+completely) due to their sizes, volatility, or simply lack of time
+for adequate testing and configuration across the multitude of
+platforms on which Perl was beta-tested. You are encouraged to
+look for them on CPAN (described below), or using web search engines
+like Alta Vista or Deja News.
+
+=head1 CPAN
+
+CPAN stands for Comprehensive Perl Archive Network; it's a globally
+replicated trove of Perl materials, including documentation, style
+guides, tricks and traps, alternate ports to non-Unix systems and
+occasional binary distributions for these. Search engines for
+CPAN can be found at http://cpan.perl.com/ and at
+http://theory.uwinnipeg.ca/mod_perl/cpan-search.pl .
+
+Most importantly, CPAN includes around a thousand unbundled modules,
+some of which require a C compiler to build. Major categories of
+modules are:
+
+=over
+
+=item *
+
+Language Extensions and Documentation Tools
+
+=item *
+
+Development Support
+
+=item *
+
+Operating System Interfaces
+
+=item *
+
+Networking, Device Control (modems) and InterProcess Communication
+
+=item *
+
+Data Types and Data Type Utilities
+
+=item *
+
+Database Interfaces
+
+=item *
+
+User Interfaces
+
+=item *
+
+Interfaces to / Emulations of Other Programming Languages
+
+=item *
+
+File Names, File Systems and File Locking (see also File Handles)
+
+=item *
+
+String Processing, Language Text Processing, Parsing, and Searching
+
+=item *
+
+Option, Argument, Parameter, and Configuration File Processing
+
+=item *
+
+Internationalization and Locale
+
+=item *
+
+Authentication, Security, and Encryption
+
+=item *
+
+World Wide Web, HTML, HTTP, CGI, MIME
+
+=item *
+
+Server and Daemon Utilities
+
+=item *
+
+Archiving and Compression
+
+=item *
+
+Images, Pixmap and Bitmap Manipulation, Drawing, and Graphing
+
+=item *
+
+Mail and Usenet News
+
+=item *
+
+Control Flow Utilities (callbacks and exceptions etc)
+
+=item *
+
+File Handle and Input/Output Stream Utilities
+
+=item *
+
+Miscellaneous Modules
+
+=back
+
+Registered CPAN sites as of this writing include the following.
+You should try to choose one close to you:
+
+=head2 Africa
+
+=over 4
+
+=item *
+
+South Africa
+
+ ftp://ftp.is.co.za/programming/perl/CPAN/
+ ftp://ftp.saix.net/pub/CPAN/
+ ftp://ftpza.co.za/pub/mirrors/cpan/
+ ftp://ftp.sun.ac.za/CPAN/
+
+=back
+
+=head2 Asia
+
+=over 4
+
+=item *
+
+China
+
+ ftp://freesoft.cei.gov.cn/pub/languages/perl/CPAN/
+ http://www2.linuxforum.net/mirror/CPAN/
+ http://cpan.shellhung.org/
+ ftp://ftp.shellhung.org/pub/CPAN
+
+=item *
+
+Hong Kong
+
+ http://CPAN.pacific.net.hk/
+ ftp://ftp.pacific.net.hk/pub/mirror/CPAN/
+
+=item *
+
+Indonesia
+
+ http://piksi.itb.ac.id/CPAN/
+ ftp://mirrors.piksi.itb.ac.id/CPAN/
+ http://CPAN.mweb.co.id/
+ ftp://ftp.mweb.co.id/pub/languages/perl/CPAN/
+
+=item *
+
+Israel
+
+ http://www.iglu.org.il:/pub/CPAN/
+ ftp://ftp.iglu.org.il/pub/CPAN/
+ http://bioinfo.weizmann.ac.il/pub/software/perl/CPAN/
+ ftp://bioinfo.weizmann.ac.il/pub/software/perl/CPAN/
+
+=item *
+
+Japan
+
+ ftp://ftp.u-aizu.ac.jp/pub/lang/perl/CPAN/
+ ftp://ftp.kddlabs.co.jp/CPAN/
+ http://mirror.nucba.ac.jp/mirror/Perl/
+ ftp://mirror.nucba.ac.jp/mirror/Perl/
+ ftp://ftp.meisei-u.ac.jp/pub/CPAN/
+ ftp://ftp.jaist.ac.jp/pub/lang/perl/CPAN/
+ ftp://ftp.dti.ad.jp/pub/lang/CPAN/
+ ftp://ftp.ring.gr.jp/pub/lang/perl/CPAN/
+
+=item *
+
+Saudi Arabia
+
+ ftp://ftp.isu.net.sa/pub/CPAN/
+
+=item *
+
+Singapore
+
+ http://cpan.hjc.edu.sg
+ http://ftp.nus.edu.sg/unix/perl/CPAN/
+ ftp://ftp.nus.edu.sg/pub/unix/perl/CPAN/
+
+=item *
+
+South Korea
+
+ http://CPAN.bora.net/
+ ftp://ftp.bora.net/pub/CPAN/
+ http://ftp.kornet.net/CPAN/
+ ftp://ftp.kornet.net/pub/CPAN/
+ ftp://ftp.nuri.net/pub/CPAN/
+
+=item *
+
+Taiwan
+
+ ftp://coda.nctu.edu.tw/UNIX/perl/CPAN
+ ftp://ftp.ee.ncku.edu.tw/pub/perl/CPAN/
+ ftp://ftp1.sinica.edu.tw/pub1/perl/CPAN/
+
+=item *
+
+Thailand
+
+ http://download.nectec.or.th/CPAN/
+ ftp://ftp.nectec.or.th/pub/languages/CPAN/
+ ftp://ftp.cs.riubon.ac.th/pub/mirrors/CPAN/
+
+=back
+
+=head2 Central America
+
+=over 4
+
+=item *
+
+Costa Rica
+
+ ftp://ftp.linux.co.cr/mirrors/CPAN/
+ http://ftp.ucr.ac.cr/Unix/CPAN/
+ ftp://ftp.ucr.ac.cr/pub/Unix/CPAN/
+
+=back
+
+=head2 Europe
+
+=over 4
+
+=item *
+
+Austria
+
+ ftp://ftp.tuwien.ac.at/pub/languages/perl/CPAN/
+
+=item *
+
+Belgium
+
+ http://ftp.easynet.be/CPAN/
+ ftp://ftp.easynet.be/CPAN/
+ ftp://ftp.kulnet.kuleuven.ac.be/pub/mirror/CPAN/
+
+=item *
+
+Bulgaria
+
+ ftp://ftp.ntrl.net/pub/mirrors/CPAN/
+
+=item *
+
+Croatia
+
+ ftp://ftp.linux.hr/pub/CPAN/
+
+=item *
+
+Czech Republic
+
+ http://www.fi.muni.cz/pub/perl/
+ ftp://ftp.fi.muni.cz/pub/perl/
+ ftp://sunsite.mff.cuni.cz/MIRRORS/ftp.funet.fi/pub/languages/perl/CPAN/
+
+=item *
+
+Denmark
+
+ ftp://sunsite.auc.dk/pub/languages/perl/CPAN/
+ http://www.cpan.dk/CPAN/
+ ftp://www.cpan.dk/ftp.cpan.org/CPAN/
+
+=item *
+
+England
+
+ http://www.mirror.ac.uk/sites/ftp.funet.fi/pub/languages/perl/CPAN
+ ftp://ftp.mirror.ac.uk/sites/ftp.funet.fi/pub/languages/perl/CPAN/
+ ftp://ftp.demon.co.uk/pub/mirrors/perl/CPAN/
+ ftp://ftp.flirble.org/pub/languages/perl/CPAN/
+ ftp://ftp.plig.org/pub/CPAN/
+ ftp://sunsite.doc.ic.ac.uk/packages/CPAN/
+ http://mirror.uklinux.net/CPAN/
+ ftp://mirror.uklinux.net/pub/CPAN/
+ ftp://usit.shef.ac.uk/pub/packages/CPAN/
+
+=item *
+
+Estonia
+
+ ftp://ftp.ut.ee/pub/languages/perl/CPAN/
+
+=item *
+
+Finland
+
+ ftp://ftp.funet.fi/pub/languages/perl/CPAN/
+
+=item *
+
+France
+
+ ftp://cpan.ftp.worldonline.fr/pub/CPAN/
+ ftp://ftp.club-internet.fr/pub/perl/CPAN/
+ ftp://ftp.lip6.fr/pub/perl/CPAN/
+ ftp://ftp.oleane.net/pub/mirrors/CPAN/
+ ftp://ftp.pasteur.fr/pub/computing/CPAN/
+ ftp://cpan.cict.fr/pub/CPAN/
+ ftp://ftp.uvsq.fr/pub/perl/CPAN/
+
+=item *
+
+Germany
+
+ ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN/
+ ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/
+ ftp://ftp.uni-erlangen.de/pub/source/CPAN/
+ ftp://ftp-stud.fht-esslingen.de/pub/Mirrors/CPAN
+ ftp://ftp.gigabell.net/pub/CPAN/
+ http://ftp.gwdg.de/pub/languages/perl/CPAN/
+ ftp://ftp.gwdg.de/pub/languages/perl/CPAN/
+ ftp://ftp.uni-hamburg.de/pub/soft/lang/perl/CPAN/
+ ftp://ftp.leo.org/pub/comp/general/programming/languages/script/perl/CPAN/
+ ftp://ftp.mpi-sb.mpg.de/pub/perl/CPAN/
+ ftp://ftp.gmd.de/mirrors/CPAN/
+
+=item *
+
+Greece
+
+ ftp://ftp.forthnet.gr/pub/languages/perl/CPAN
+ ftp://ftp.ntua.gr/pub/lang/perl/
+
+=item *
+
+Hungary
+
+ http://cpan.artifact.hu/
+ ftp://cpan.artifact.hu/CPAN/
+ ftp://ftp.kfki.hu/pub/packages/perl/CPAN/
+
+=item *
+
+Iceland
+
+ http://cpan.gm.is/
+ ftp://ftp.gm.is/pub/CPAN/
+
+=item *
+
+Ireland
+
+ http://cpan.indigo.ie/
+ ftp://cpan.indigo.ie/pub/CPAN/
+ http://sunsite.compapp.dcu.ie/pub/perl/
+ ftp://sunsite.compapp.dcu.ie/pub/perl/
+
+=item *
+
+Italy
+
+ http://cpan.nettuno.it/
+ http://gusp.dyndns.org/CPAN/
+ ftp://gusp.dyndns.org/pub/CPAN
+ http://softcity.iol.it/cpan
+ ftp://softcity.iol.it/pub/cpan
+ ftp://ftp.unina.it/pub/Other/CPAN/
+ ftp://ftp.unipi.it/pub/mirror/perl/CPAN/
+ ftp://cis.uniRoma2.it/CPAN/
+ ftp://ftp.edisontel.it/pub/CPAN_Mirror/
+ ftp://ftp.flashnet.it/pub/CPAN/
+
+=item *
+
+Latvia
+
+ http://kvin.lv/pub/CPAN/
+
+=item *
+
+Netherlands
+
+ ftp://download.xs4all.nl/pub/mirror/CPAN/
+ ftp://ftp.nl.uu.net/pub/CPAN/
+ ftp://ftp.nluug.nl/pub/languages/perl/CPAN/
+ ftp://ftp.cpan.nl/pub/CPAN/
+ http://www.cs.uu.nl/mirror/CPAN/
+ ftp://ftp.cs.uu.nl/mirror/CPAN/
+
+=item *
+
+Norway
+
+ ftp://sunsite.uio.no/pub/languages/perl/CPAN/
+ ftp://ftp.uit.no/pub/languages/perl/cpan/
+
+=item *
+
+Poland
+
+ ftp://ftp.pk.edu.pl/pub/lang/perl/CPAN/
+ ftp://ftp.mega.net.pl/pub/mirrors/ftp.perl.com/
+ ftp://ftp.man.torun.pl/pub/doc/CPAN/
+ ftp://sunsite.icm.edu.pl/pub/CPAN/
+
+=item *
+
+Portugal
+
+ ftp://ftp.ua.pt/pub/CPAN/
+ ftp://perl.di.uminho.pt/pub/CPAN/
+ ftp://ftp.ist.utl.pt/pub/CPAN/
+ ftp://ftp.netc.pt/pub/CPAN/
+
+=item *
+
+Romania
+
+ ftp://archive.logicnet.ro/mirrors/ftp.cpan.org/CPAN/
+ ftp://ftp.kappa.ro/pub/mirrors/ftp.perl.org/pub/CPAN/
+ ftp://ftp.dntis.ro/pub/cpan/
+ ftp://ftp.opsynet.com/cpan/
+ ftp://ftp.dnttm.ro/pub/CPAN/
+ ftp://ftp.timisoara.roedu.net/mirrors/CPAN/
+
+=item *
+
+Russia
+
+ ftp://ftp.chg.ru/pub/lang/perl/CPAN/
+ http://cpan.rinet.ru/
+ ftp://cpan.rinet.ru/pub/mirror/CPAN/
+ ftp://ftp.aha.ru/pub/CPAN/
+ ftp://ftp.sai.msu.su/pub/lang/perl/CPAN/
+
+=item *
+
+Slovakia
+
+ ftp://ftp.entry.sk/pub/languages/perl/CPAN/
+
+=item *
+
+Slovenia
+
+ ftp://ftp.arnes.si/software/perl/CPAN/
+
+=item *
+
+Spain
+
+ ftp://ftp.rediris.es/mirror/CPAN/
+ ftp://ftp.etse.urv.es/pub/perl/
+
+=item *
+
+Sweden
+
+ http://ftp.du.se/CPAN/
+ ftp://ftp.du.se/pub/CPAN/
+ ftp://ftp.sunet.se/pub/lang/perl/CPAN/
+
+=item *
+
+Switzerland
+
+ ftp://ftp.danyk.ch/CPAN/
+ ftp://sunsite.cnlab-switch.ch/mirror/CPAN/
+
+=item *
+
+Turkey
+
+ ftp://sunsite.bilkent.edu.tr/pub/languages/CPAN/
+
+=back
+
+=head2 North America
+
+=over 4
+
+=item *
+
+Canada
+
+=over 8
+
+=item *
+
+Alberta
+
+ http://sunsite.ualberta.ca/pub/Mirror/CPAN/
+ ftp://sunsite.ualberta.ca/pub/Mirror/CPAN/
+
+=item *
+
+Manitoba
+
+ http://theoryx5.uwinnipeg.ca/pub/CPAN/
+ ftp://theoryx5.uwinnipeg.ca/pub/CPAN/
+
+=item *
+
+Nova Scotia
+
+ ftp://cpan.chebucto.ns.ca/pub/CPAN/
+
+=item *
+
+Ontario
+
+ ftp://ftp.crc.ca/pub/packages/lang/perl/CPAN/
+
+=item *
+
+Mexico
+
+ http://www.msg.com.mx/CPAN/
+ ftp://ftp.msg.com.mx/pub/CPAN/
+
+=back
+
+=item *
+
+United States
+
+=over 8
+
+=item *
+
+Alabama
+
+ http://mirror.hiwaay.net/CPAN/
+ ftp://mirror.hiwaay.net/CPAN/
+
+=item *
+
+California
+
+ http://www.cpan.org/
+ ftp://ftp.cpan.org/CPAN/
+ ftp://cpan.nas.nasa.gov/pub/perl/CPAN/
+ ftp://ftp.digital.com/pub/plan/perl/CPAN/
+ http://www.kernel.org/pub/mirrors/cpan/
+ ftp://ftp.kernel.org/pub/mirrors/cpan/
+ http://www.perl.com/CPAN/
+ http://download.sourceforge.net/mirrors/CPAN/
+
+=item *
+
+Colorado
+
+ ftp://ftp.cs.colorado.edu/pub/perl/CPAN/
+
+=item *
+
+Florida
+
+ ftp://ftp.cise.ufl.edu/pub/perl/CPAN/
+
+=item *
+
+Georgia
+
+ ftp://ftp.twoguys.org/CPAN/
+
+=item *
+
+Illinois
+
+ http://www.neurogames.com/mirrors/CPAN
+ http://uiarchive.uiuc.edu/mirrors/ftp/ftp.cpan.org/pub/CPAN/
+ ftp://uiarchive.uiuc.edu/mirrors/ftp/ftp.cpan.org/pub/CPAN/
+
+=item *
+
+Indiana
+
+ ftp://ftp.uwsg.indiana.edu/pub/perl/CPAN/
+ http://cpan.nitco.com/
+ ftp://cpan.nitco.com/pub/CPAN/
+ ftp://cpan.in-span.net/
+ http://csociety-ftp.ecn.purdue.edu/pub/CPAN
+ ftp://csociety-ftp.ecn.purdue.edu/pub/CPAN
+
+=item *
+
+Kentucky
+
+ http://cpan.uky.edu/
+ ftp://cpan.uky.edu/pub/CPAN/
+
+=item *
+
+Massachusetts
+
+ ftp://ftp.ccs.neu.edu/net/mirrors/ftp.funet.fi/pub/languages/perl/CPAN/
+ ftp://ftp.iguide.com/pub/mirrors/packages/perl/CPAN/
+
+=item *
+
+New Jersey
+
+ ftp://ftp.cpanel.net/pub/CPAN/
+
+=item *
+
+New York
+
+ ftp://ftp.freesoftware.com/pub/perl/CPAN/
+ http://www.deao.net/mirrors/CPAN/
+ ftp://ftp.deao.net/pub/CPAN/
+ ftp://ftp.stealth.net/pub/mirrors/ftp.cpan.org/pub/CPAN/
+ http://mirror.nyc.anidea.com/CPAN/
+ ftp://mirror.nyc.anidea.com/pub/CPAN/
+ http://www.rge.com/pub/languages/perl/
+ ftp://ftp.rge.com/pub/languages/perl/
+ ftp://mirrors.cloud9.net/pub/mirrors/CPAN/
+
+=item *
+
+North Carolina
+
+ ftp://ftp.duke.edu/pub/perl/
+
+=item *
+
+Ohio
+
+ ftp://ftp.loaded.net/pub/CPAN/
+
+=item *
+
+Oklahoma
+
+ ftp://ftp.ou.edu/mirrors/CPAN/
+
+=item *
+
+Oregon
+
+ ftp://ftp.orst.edu/pub/packages/CPAN/
+
+=item *
+
+Pennsylvania
+
+ http://ftp.epix.net/CPAN/
+ ftp://ftp.epix.net/pub/languages/perl/
+ ftp://carroll.cac.psu.edu/pub/CPAN/
+
+=item *
+
+Tennessee
+
+ ftp://ftp.sunsite.utk.edu/pub/CPAN/
+
+=item *
+
+Texas
+
+ http://ftp.sedl.org/pub/mirrors/CPAN/
+ http://jhcloos.com/pub/mirror/CPAN/
+ ftp://jhcloos.com/pub/mirror/CPAN/
+
+=item *
+
+Utah
+
+ ftp://mirror.xmission.com/CPAN/
+
+=item *
+
+Virginia
+
+ http://mirrors.rcn.net/pub/lang/CPAN/
+ ftp://mirrors.rcn.net/pub/lang/CPAN/
+ ftp://ruff.cs.jmu.edu/pub/CPAN/
+ http://perl.Liquidation.com/CPAN/
+
+=item *
+
+Washington
+
+ http://cpan.llarian.net/
+ ftp://cpan.llarian.net/pub/CPAN/
+ ftp://ftp-mirror.internap.com/pub/CPAN/
+ ftp://ftp.spu.edu/pub/CPAN/
+
+=back
+
+=back
+
+=head2 Oceania
+
+=over 4
+
+=item *
+
+Australia
+
+ http://ftp.planetmirror.com/pub/CPAN/
+ ftp://ftp.planetmirror.com/pub/CPAN/
+ ftp://mirror.aarnet.edu.au/pub/perl/CPAN/
+ ftp://cpan.topend.com.au/pub/CPAN/
+
+=item *
+
+New Zealand
+
+ ftp://ftp.auckland.ac.nz/pub/perl/CPAN/
+
+=back
+
+=head2 South America
+
+=over 4
+
+=item *
+
+Argentina
+
+ ftp://mirrors.bannerlandia.com.ar/mirrors/CPAN/
+
+=item *
+
+Brazil
+
+ ftp://cpan.pop-mg.com.br/pub/CPAN/
+ ftp://ftp.matrix.com.br/pub/perl/
+ ftp://cpan.if.usp.br/pub/mirror/CPAN/
+
+=item *
+
+Chile
+
+ ftp://ftp.psinet.cl/pub/programming/perl/CPAN/
+ ftp://sunsite.dcc.uchile.cl/pub/lang/perl/
+
+=back
+
+For an up-to-date listing of CPAN sites,
+see http://www.cpan.org/SITES or ftp://www.cpan.org/SITES .
+
+=head1 Modules: Creation, Use, and Abuse
+
+(The following section is borrowed directly from Tim Bunce's modules
+file, available at your nearest CPAN site.)
+
+Perl implements a class using a package, but the presence of a
+package doesn't imply the presence of a class. A package is just a
+namespace. A class is a package that provides subroutines that can be
+used as methods. A method is just a subroutine that expects, as its
+first argument, either the name of a package (for "static" methods),
+or a reference to something (for "virtual" methods).
+
+A module is a file that (by convention) provides a class of the same
+name (sans the .pm), plus an import method in that class that can be
+called to fetch exported symbols. This module may implement some of
+its methods by loading dynamic C or C++ objects, but that should be
+totally transparent to the user of the module. Likewise, the module
+might set up an AUTOLOAD function to slurp in subroutine definitions on
+demand, but this is also transparent. Only the F<.pm> file is required to
+exist. See L<perlsub>, L<perltoot>, and L<AutoLoader> for details about
+the AUTOLOAD mechanism.
+
+=head2 Guidelines for Module Creation
+
+=over 4
+
+=item *
+
+Do similar modules already exist in some form?
+
+If so, please try to reuse the existing modules either in whole or
+by inheriting useful features into a new class. If this is not
+practical try to get together with the module authors to work on
+extending or enhancing the functionality of the existing modules.
+A perfect example is the plethora of packages in perl4 for dealing
+with command line options.
+
+If you are writing a module to expand an already existing set of
+modules, please coordinate with the author of the package. It
+helps if you follow the same naming scheme and module interaction
+scheme as the original author.
+
+=item *
+
+Try to design the new module to be easy to extend and reuse.
+
+Try to C<use warnings;> (or C<use warnings qw(...);>).
+Remember that you can add C<no warnings qw(...);> to individual blocks
+of code that need fewer warnings.
+
+Use blessed references. Use the two argument form of bless to bless
+into the class name given as the first parameter of the constructor,
+e.g.:
+
+ sub new {
+ my $class = shift;
+ return bless {}, $class;
+ }
+
+or even this if you'd like it to be used as either a static
+or a virtual method.
+
+ sub new {
+ my $self = shift;
+ my $class = ref($self) || $self;
+ return bless {}, $class;
+ }
+
+Pass arrays as references so more parameters can be added later
+(it's also faster). Convert functions into methods where
+appropriate. Split large methods into smaller more flexible ones.
+Inherit methods from other modules if appropriate.
+
+Avoid class name tests like: C<die "Invalid" unless ref $ref eq 'FOO'>.
+Generally you can delete the C<eq 'FOO'> part with no harm at all.
+Let the objects look after themselves! Generally, avoid hard-wired
+class names as far as possible.
+
+Avoid C<< $r->Class::func() >> where using C<@ISA=qw(... Class ...)> and
+C<< $r->func() >> would work (see L<perlbot> for more details).
+
+Use autosplit so little used or newly added functions won't be a
+burden to programs that don't use them. Add test functions to
+the module after __END__ either using AutoSplit or by saying:
+
+ eval join('',<main::DATA>) || die $@ unless caller();
+
+Does your module pass the 'empty subclass' test? If you say
+C<@SUBCLASS::ISA = qw(YOURCLASS);> your applications should be able
+to use SUBCLASS in exactly the same way as YOURCLASS. For example,
+does your application still work if you change: C<$obj = new YOURCLASS;>
+into: C<$obj = new SUBCLASS;> ?
+
+Avoid keeping any state information in your packages. It makes it
+difficult for multiple other packages to use yours. Keep state
+information in objects.
+
+Always use B<-w>.
+
+Try to C<use strict;> (or C<use strict qw(...);>).
+Remember that you can add C<no strict qw(...);> to individual blocks
+of code that need less strictness.
+
+Always use B<-w>.
+
+Follow the guidelines in the perlstyle(1) manual.
+
+Always use B<-w>.
+
+=item *
+
+Some simple style guidelines
+
+The perlstyle manual supplied with Perl has many helpful points.
+
+Coding style is a matter of personal taste. Many people evolve their
+style over several years as they learn what helps them write and
+maintain good code. Here's one set of assorted suggestions that
+seem to be widely used by experienced developers:
+
+Use underscores to separate words. It is generally easier to read
+$var_names_like_this than $VarNamesLikeThis, especially for
+non-native speakers of English. It's also a simple rule that works
+consistently with VAR_NAMES_LIKE_THIS.
+
+Package/Module names are an exception to this rule. Perl informally
+reserves lowercase module names for 'pragma' modules like integer
+and strict. Other modules normally begin with a capital letter and
+use mixed case with no underscores (need to be short and portable).
+
+You may find it helpful to use letter case to indicate the scope
+or nature of a variable. For example:
+
+ $ALL_CAPS_HERE constants only (beware clashes with Perl vars)
+ $Some_Caps_Here package-wide global/static
+ $no_caps_here function scope my() or local() variables
+
+Function and method names seem to work best as all lowercase.
+e.g., C<< $obj->as_string() >>.
+
+You can use a leading underscore to indicate that a variable or
+function should not be used outside the package that defined it.
+
+=item *
+
+Select what to export.
+
+Do NOT export method names!
+
+Do NOT export anything else by default without a good reason!
+
+Exports pollute the namespace of the module user. If you must
+export try to use @EXPORT_OK in preference to @EXPORT and avoid
+short or common names to reduce the risk of name clashes.
+
+Generally anything not exported is still accessible from outside the
+module using the ModuleName::item_name (or C<< $blessed_ref->method >>)
+syntax. By convention you can use a leading underscore on names to
+indicate informally that they are 'internal' and not for public use.
+
+(It is actually possible to get private functions by saying:
+C<my $subref = sub { ... }; &$subref;>. But there's no way to call that
+directly as a method, because a method must have a name in the symbol
+table.)
+
+As a general rule, if the module is trying to be object oriented
+then export nothing. If it's just a collection of functions then
+@EXPORT_OK anything but use @EXPORT with caution.
+
+=item *
+
+Select a name for the module.
+
+This name should be as descriptive, accurate, and complete as
+possible. Avoid any risk of ambiguity. Always try to use two or
+more whole words. Generally the name should reflect what is special
+about what the module does rather than how it does it. Please use
+nested module names to group informally or categorize a module.
+There should be a very good reason for a module not to have a nested name.
+Module names should begin with a capital letter.
+
+Having 57 modules all called Sort will not make life easy for anyone
+(though having 23 called Sort::Quick is only marginally better :-).
+Imagine someone trying to install your module alongside many others.
+If in any doubt ask for suggestions in comp.lang.perl.misc.
+
+If you are developing a suite of related modules/classes it's good
+practice to use nested classes with a common prefix as this will
+avoid namespace clashes. For example: Xyz::Control, Xyz::View,
+Xyz::Model etc. Use the modules in this list as a naming guide.
+
+If adding a new module to a set, follow the original author's
+standards for naming modules and the interface to methods in
+those modules.
+
+If developing modules for private internal or project specific use,
+that will never be released to the public, then you should ensure
+that their names will not clash with any future public module. You
+can do this either by using the reserved Local::* category or by
+using a category name that includes an underscore like Foo_Corp::*.
+
+To be portable each component of a module name should be limited to
+11 characters. If it might be used on MS-DOS then try to ensure each is
+unique in the first 8 characters. Nested modules make this easier.
+
+=item *
+
+Have you got it right?
+
+How do you know that you've made the right decisions? Have you
+picked an interface design that will cause problems later? Have
+you picked the most appropriate name? Do you have any questions?
+
+The best way to know for sure, and pick up many helpful suggestions,
+is to ask someone who knows. Comp.lang.perl.misc is read by just about
+all the people who develop modules and it's the best place to ask.
+
+All you need to do is post a short summary of the module, its
+purpose and interfaces. A few lines on each of the main methods is
+probably enough. (If you post the whole module it might be ignored
+by busy people - generally the very people you want to read it!)
+
+Don't worry about posting if you can't say when the module will be
+ready - just say so in the message. It might be worth inviting
+others to help you, they may be able to complete it for you!
+
+=item *
+
+README and other Additional Files.
+
+It's well known that software developers usually fully document the
+software they write. If, however, the world is in urgent need of
+your software and there is not enough time to write the full
+documentation please at least provide a README file containing:
+
+=over 10
+
+=item *
+
+A description of the module/package/extension etc.
+
+=item *
+
+A copyright notice - see below.
+
+=item *
+
+Prerequisites - what else you may need to have.
+
+=item *
+
+How to build it - possible changes to Makefile.PL etc.
+
+=item *
+
+How to install it.
+
+=item *
+
+Recent changes in this release, especially incompatibilities
+
+=item *
+
+Changes / enhancements you plan to make in the future.
+
+=back
+
+If the README file seems to be getting too large you may wish to
+split out some of the sections into separate files: INSTALL,
+Copying, ToDo etc.
+
+=over 4
+
+=item Adding a Copyright Notice.
+
+
+How you choose to license your work is a personal decision.
+The general mechanism is to assert your Copyright and then make
+a declaration of how others may copy/use/modify your work.
+
+Perl, for example, is supplied with two types of licence: The GNU
+GPL and The Artistic Licence (see the files README, Copying, and
+Artistic). Larry has good reasons for NOT just using the GNU GPL.
+
+My personal recommendation, out of respect for Larry, Perl, and the
+Perl community at large is to state something simply like:
+
+ Copyright (c) 1995 Your Name. All rights reserved.
+ This program is free software; you can redistribute it and/or
+ modify it under the same terms as Perl itself.
+
+This statement should at least appear in the README file. You may
+also wish to include it in a Copying file and your source files.
+Remember to include the other words in addition to the Copyright.
+
+=item *
+
+Give the module a version/issue/release number.
+
+To be fully compatible with the Exporter and MakeMaker modules you
+should store your module's version number in a non-my package
+variable called $VERSION. This should be a floating point
+number with at least two digits after the decimal (i.e., hundredths,
+e.g., C<$VERSION = "0.01">). Don't use a "1.3.2" style version.
+See L<Exporter> for details.
+
+It may be handy to add a function or method to retrieve the number.
+Use the number in announcements and archive file names when
+releasing the module (ModuleName-1.02.tar.Z).
+See perldoc ExtUtils::MakeMaker.pm for details.
+
+=item *
+
+How to release and distribute a module.
+
+It's a good idea to post an announcement of the availability of your
+module (or the module itself if small) to the comp.lang.perl.announce
+Usenet newsgroup. This will at least ensure very wide once-off
+distribution.
+
+If possible, register the module with CPAN. You should
+include details of its location in your announcement.
+
+Some notes about ftp archives: Please use a long descriptive file
+name that includes the version number. Most incoming directories
+will not be readable/listable, i.e., you won't be able to see your
+file after uploading it. Remember to send your email notification
+message as soon as possible after uploading else your file may get
+deleted automatically. Allow time for the file to be processed
+and/or check the file has been processed before announcing its
+location.
+
+FTP Archives for Perl Modules:
+
+Follow the instructions and links on:
+
+ http://www.cpan.org/modules/00modlist.long.html
+ http://www.cpan.org/modules/04pause.html
+
+or upload to one of these sites:
+
+ https://pause.kbx.de/pause/
+ http://pause.perl.org/pause/
+
+and notify <modules@perl.org>.
+
+By using the WWW interface you can ask the Upload Server to mirror
+your modules from your ftp or WWW site into your own directory on
+CPAN!
+
+Please remember to send me an updated entry for the Module list!
+
+=item *
+
+Take care when changing a released module.
+
+Always strive to remain compatible with previous released versions.
+Otherwise try to add a mechanism to revert to the
+old behavior if people rely on it. Document incompatible changes.
+
+=back
+
+=back
+
+=head2 Guidelines for Converting Perl 4 Library Scripts into Modules
+
+=over 4
+
+=item *
+
+There is no requirement to convert anything.
+
+If it ain't broke, don't fix it! Perl 4 library scripts should
+continue to work with no problems. You may need to make some minor
+changes (like escaping non-array @'s in double quoted strings) but
+there is no need to convert a .pl file into a Module for just that.
+
+=item *
+
+Consider the implications.
+
+All Perl applications that make use of the script will need to
+be changed (slightly) if the script is converted into a module. Is
+it worth it unless you plan to make other changes at the same time?
+
+=item *
+
+Make the most of the opportunity.
+
+If you are going to convert the script to a module you can use the
+opportunity to redesign the interface. The guidelines for module
+creation above include many of the issues you should consider.
+
+=item *
+
+The pl2pm utility will get you started.
+
+This utility will read *.pl files (given as parameters) and write
+corresponding *.pm files. The pl2pm utility does the following:
+
+=over 10
+
+=item *
+
+Adds the standard Module prologue lines
+
+=item *
+
+Converts package specifiers from ' to ::
+
+=item *
+
+Converts die(...) to croak(...)
+
+=item *
+
+Several other minor changes
+
+=back
+
+Being a mechanical process pl2pm is not bullet proof. The converted
+code will need careful checking, especially any package statements.
+Don't delete the original .pl file till the new .pm one works!
+
+=back
+
+=head2 Guidelines for Reusing Application Code
+
+=over 4
+
+=item *
+
+Complete applications rarely belong in the Perl Module Library.
+
+=item *
+
+Many applications contain some Perl code that could be reused.
+
+Help save the world! Share your code in a form that makes it easy
+to reuse.
+
+=item *
+
+Break-out the reusable code into one or more separate module files.
+
+=item *
+
+Take the opportunity to reconsider and redesign the interfaces.
+
+=item *
+
+In some cases the 'application' can then be reduced to a small
+fragment of code built on top of the reusable modules. In these cases
+the application could be invoked as:
+
+ % perl -e 'use Module::Name; method(@ARGV)' ...
+or
+ % perl -mModule::Name ... (in perl5.002 or higher)
+
+=back
+
+=head1 NOTE
+
+Perl does not enforce private and public parts of its modules as you may
+have been used to in other languages like C++, Ada, or Modula-17. Perl
+doesn't have an infatuation with enforced privacy. It would prefer
+that you stayed out of its living room because you weren't invited, not
+because it has a shotgun.
+
+The module and its user have a contract, part of which is common law,
+and part of which is "written". Part of the common law contract is
+that a module doesn't pollute any namespace it wasn't asked to. The
+written contract for the module (A.K.A. documentation) may make other
+provisions. But then you know when you C<use RedefineTheWorld> that
+you're redefining the world and willing to take the consequences.
+EOF
+
+close MANIFEST or warn "$0: failed to close MANIFEST (../MANIFEST): $!";
+close OUT or warn "$0: failed to close OUT (perlmodlib.tmp): $!";
+
diff --git a/contrib/perl5/pod/perlnewmod.pod b/contrib/perl5/pod/perlnewmod.pod
new file mode 100644
index 0000000000000..ace8d85130f65
--- /dev/null
+++ b/contrib/perl5/pod/perlnewmod.pod
@@ -0,0 +1,282 @@
+=head1 NAME
+
+perlnewmod - preparing a new module for distribution
+
+=head1 DESCRIPTION
+
+This document gives you some suggestions about how to go about writing
+Perl modules, preparing them for distribution, and making them available
+via CPAN.
+
+One of the things that makes Perl really powerful is the fact that Perl
+hackers tend to want to share the solutions to problems they've faced,
+so you and I don't have to battle with the same problem again.
+
+The main way they do this is by abstracting the solution into a Perl
+module. If you don't know what one of these is, the rest of this
+document isn't going to be much use to you. You're also missing out on
+an awful lot of useful code; consider having a look at L<perlmod>,
+L<perlmodlib> and L<perlmodinstall> before coming back here.
+
+When you've found that there isn't a module available for what you're
+trying to do, and you've had to write the code yourself, consider
+packaging up the solution into a module and uploading it to CPAN so that
+others can benefit.
+
+=head2 Warning
+
+We're going to primarily concentrate on Perl-only modules here, rather
+than XS modules. XS modules serve a rather different purpose, and
+you should consider different things before distributing them - the
+popularity of the library you are gluing, the portability to other
+operating systems, and so on. However, the notes on preparing the Perl
+side of the module and packaging and distributing it will apply equally
+well to an XS module as a pure-Perl one.
+
+=head2 What should I make into a module?
+
+You should make a module out of any code that you think is going to be
+useful to others. Anything that's likely to fill a hole in the communal
+library and which someone else can slot directly into their program. Any
+part of your code which you can isolate and extract and plug into
+something else is a likely candidate.
+
+Let's take an example. Suppose you're reading in data from a local
+format into a hash-of-hashes in Perl, turning that into a tree, walking
+the tree and then piping each node to an Acme Transmogrifier Server.
+
+Now, quite a few people have the Acme Transmogrifier, and you've had to
+write something to talk the protocol from scratch - you'd almost
+certainly want to make that into a module. The level at which you pitch
+it is up to you: you might want protocol-level modules analogous to
+L<Net::SMTP|Net::SMTP> which then talk to higher level modules analogous
+to L<Mail::Send|Mail::Send>. The choice is yours, but you do want to get
+a module out for that server protocol.
+
+Nobody else on the planet is going to talk your local data format, so we
+can ignore that. But what about the thing in the middle? Building tree
+structures from Perl variables and then traversing them is a nice,
+general problem, and if nobody's already written a module that does
+that, you might want to modularise that code too.
+
+So hopefully you've now got a few ideas about what's good to modularise.
+Let's now see how it's done.
+
+=head2 Step-by-step: Preparing the ground
+
+Before we even start scraping out the code, there are a few things we'll
+want to do in advance.
+
+=over 3
+
+=item Look around
+
+Dig into a bunch of modules to see how they're written. I'd suggest
+starting with L<Text::Tabs|Text::Tabs>, since it's in the standard
+library and is nice and simple, and then looking at something like
+L<Time::Zone|Time::Zone>, L<File::Copy|File::Copy> and then some of the
+C<Mail::*> modules if you're planning on writing object oriented code.
+
+These should give you an overall feel for how modules are laid out and
+written.
+
+=item Check it's new
+
+There are a lot of modules on CPAN, and it's easy to miss one that's
+similar to what you're planning on contributing. Have a good plough
+through the modules list and the F<by-module> directories, and make sure
+you're not the one reinventing the wheel!
+
+=item Discuss the need
+
+You might love it. You might feel that everyone else needs it. But there
+might not actually be any real demand for it out there. If you're unsure
+about the demand your module will have, consider sending out feelers
+on the C<comp.lang.perl.modules> newsgroup, or as a last resort, ask the
+modules list at C<modules@perl.org>. Remember that this is a closed list
+with a very long turn-around time - be prepared to wait a good while for
+a response from them.
+
+=item Choose a name
+
+Perl modules included on CPAN have a naming hierarchy you should try to
+fit in with. See L<perlmodlib> for more details on how this works, and
+browse around CPAN and the modules list to get a feel of it. At the very
+least, remember this: modules should be title capitalised (This::Thing),
+fit in with a category, and explain their purpose succinctly.
+
+=item Check again
+
+While you're doing that, make really sure you haven't missed a module
+similar to the one you're about to write.
+
+When you've got your name sorted out and you're sure that your module is
+wanted and not currently available, it's time to start coding.
+
+=back
+
+=head2 Step-by-step: Making the module
+
+=over 3
+
+=item Start with F<h2xs>
+
+Originally a utility to convert C header files into XS modules,
+L<h2xs|h2xs> has become a useful utility for churning out skeletons for
+Perl-only modules as well. If you don't want to use the
+L<AutoLoader|AutoLoader> which splits up big modules into smaller
+subroutine-sized chunks, you'll say something like this:
+
+ h2xs -AX -n Net::Acme
+
+The C<-A> omits the AutoLoader code, C<-X> omits XS elements, and C<-n>
+specifies the name of the module.
+
+=item Use L<strict|strict> and L<warnings|warnings>
+
+A module's code has to be warning and strict-clean, since you can't
+guarantee the conditions that it'll be used under. Besides, you wouldn't
+want to distribute code that wasn't warning or strict-clean anyway,
+right?
+
+=item Use L<Carp|Carp>
+
+The L<Carp|Carp> module allows you to present your error messages from
+the caller's perspective; this gives you a way to signal a problem with
+the caller and not your module. For instance, if you say this:
+
+ warn "No hostname given";
+
+the user will see something like this:
+
+ No hostname given at /usr/local/lib/perl5/site_perl/5.6.0/Net/Acme.pm
+ line 123.
+
+which looks like your module is doing something wrong. Instead, you want
+to put the blame on the user, and say this:
+
+ No hostname given at bad_code, line 10.
+
+You do this by using L<Carp|Carp> and replacing your C<warn>s with
+C<carp>s. If you need to C<die>, say C<croak> instead. However, keep
+C<warn> and C<die> in place for your sanity checks - where it really is
+your module at fault.
+
+=item Use L<Exporter|Exporter> - wisely!
+
+C<h2xs> provides stubs for L<Exporter|Exporter>, which gives you a
+standard way of exporting symbols and subroutines from your module into
+the caller's namespace. For instance, saying C<use Net::Acme qw(&frob)>
+would import the C<frob> subroutine.
+
+The package variable C<@EXPORT> will determine which symbols will get
+exported when the caller simply says C<use Net::Acme> - you will hardly
+ever want to put anything in there. C<@EXPORT_OK>, on the other hand,
+specifies which symbols you're willing to export. If you do want to
+export a bunch of symbols, use the C<%EXPORT_TAGS> and define a standard
+export set - look at L<Exporter> for more details.
+
+=item Use L<plain old documentation|perlpod>
+
+The work isn't over until the paperwork is done, and you're going to
+need to put in some time writing some documentation for your module.
+C<h2xs> will provide a stub for you to fill in; if you're not sure about
+the format, look at L<perlpod> for an introduction. Provide a good
+synopsis of how your module is used in code, a description, and then
+notes on the syntax and function of the individual subroutines or
+methods. Use Perl comments for developer notes and POD for end-user
+notes.
+
+=item Write tests
+
+You're encouraged to create self-tests for your module to ensure it's
+working as intended on the myriad platforms Perl supports; if you upload
+your module to CPAN, a host of testers will build your module and send
+you the results of the tests. Again, C<h2xs> provides a test framework
+which you can extend - you should do something more than just checking
+your module will compile.
+
+=item Write the README
+
+If you're uploading to CPAN, the automated gremlins will extract the
+README file and place that in your CPAN directory. It'll also appear in
+the main F<by-module> and F<by-category> directories if you make it onto
+the modules list. It's a good idea to put here what the module actually
+does in detail, and the user-visible changes since the last release.
+
+=back
+
+=head2 Step-by-step: Distributing your module
+
+=over 3
+
+=item Get a CPAN user ID
+
+Every developer publishing modules on CPAN needs a CPAN ID. See the
+instructions at C<http://www.cpan.org/modules/04pause.html> (or
+equivalent on your nearest mirror) to find out how to do this.
+
+=item C<perl Makefile.PL; make test; make dist>
+
+Once again, C<h2xs> has done all the work for you. It produces the
+standard C<Makefile.PL> you'll have seen when you downloaded and
+installed modules, and this produces a Makefile with a C<dist> target.
+
+Once you've ensured that your module passes its own tests - always a
+good thing to make sure - you can C<make dist>, and the Makefile will
+hopefully produce you a nice tarball of your module, ready for upload.
+
+=item Upload the tarball
+
+The email you got when you received your CPAN ID will tell you how to
+log in to PAUSE, the Perl Authors Upload SErver. From the menus there,
+you can upload your module to CPAN.
+
+=item Announce to the modules list
+
+Once uploaded, it'll sit unnoticed in your author directory. If you want
+it connected to the rest of the CPAN, you'll need to tell the modules
+list about it. The best way to do this is to email them a line in the
+style of the modules list, like this:
+
+ Net::Acme bdpO Interface to Acme Frobnicator servers FOOBAR
+ ^ ^^^^ ^ ^
+ | |||| Module description Your ID
+ | ||||
+ | |||\- Interface: (O)OP, (r)eferences, (h)ybrid, (f)unctions
+ | |||
+ | ||\-- Language: (p)ure Perl, C(+)+, (h)ybrid, (C), (o)ther
+ | ||
+ Module |\--- Support: (d)eveloper, (m)ailing list, (u)senet, (n)one
+ Name |
+ \---- Maturity: (i)dea, (c)onstructions, (a)lpha, (b)eta,
+ (R)eleased, (M)ature, (S)tandard
+
+plus a description of the module and why you think it should be
+included. If you hear nothing back, that means your module will
+probably appear on the modules list at the next update. Don't try
+subscribing to C<modules@perl.org>; it's not another mailing list. Just
+have patience.
+
+=item Announce to clpa
+
+If you have a burning desire to tell the world about your release, post
+an announcement to the moderated C<comp.lang.perl.announce> newsgroup.
+
+=item Fix bugs!
+
+Once you start accumulating users, they'll send you bug reports. If
+you're lucky, they'll even send you patches. Welcome to the joys of
+maintaining a software project...
+
+=back
+
+=head1 AUTHOR
+
+Simon Cozens, C<simon@cpan.org>
+
+=head1 SEE ALSO
+
+L<perlmod>, L<perlmodlib>, L<perlmodinstall>, L<h2xs>, L<strict>,
+L<Carp>, L<Exporter>, L<perlpod>, L<Test>, L<ExtUtils::MakeMaker>,
+http://www.cpan.org/
diff --git a/contrib/perl5/pod/perlrequick.pod b/contrib/perl5/pod/perlrequick.pod
new file mode 100644
index 0000000000000..5b72a35187faf
--- /dev/null
+++ b/contrib/perl5/pod/perlrequick.pod
@@ -0,0 +1,503 @@
+=head1 NAME
+
+perlrequick - Perl regular expressions quick start
+
+=head1 DESCRIPTION
+
+This page covers the very basics of understanding, creating and
+using regular expressions ('regexes') in Perl.
+
+
+=head1 The Guide
+
+=head2 Simple word matching
+
+The simplest regex is simply a word, or more generally, a string of
+characters. A regex consisting of a word matches any string that
+contains that word:
+
+ "Hello World" =~ /World/; # matches
+
+In this statement, C<World> is a regex and the C<//> enclosing
+C</World/> tells perl to search a string for a match. The operator
+C<=~> associates the string with the regex match and produces a true
+value if the regex matched, or false if the regex did not match. In
+our case, C<World> matches the second word in C<"Hello World">, so the
+expression is true. This idea has several variations.
+
+Expressions like this are useful in conditionals:
+
+ print "It matches\n" if "Hello World" =~ /World/;
+
+The sense of the match can be reversed by using the C<!~> operator:
+
+ print "It doesn't match\n" if "Hello World" !~ /World/;
+
+The literal string in the regex can be replaced by a variable:
+
+ $greeting = "World";
+ print "It matches\n" if "Hello World" =~ /$greeting/;
+
+If you're matching against C<$_>, the C<$_ =~> part can be omitted:
+
+ $_ = "Hello World";
+ print "It matches\n" if /World/;
+
+Finally, the C<//> default delimiters for a match can be changed to
+arbitrary delimiters by putting an C<'m'> out front:
+
+ "Hello World" =~ m!World!; # matches, delimited by '!'
+ "Hello World" =~ m{World}; # matches, note the matching '{}'
+ "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin',
+ # '/' becomes an ordinary char
+
+Regexes must match a part of the string I<exactly> in order for the
+statement to be true:
+
+ "Hello World" =~ /world/; # doesn't match, case sensitive
+ "Hello World" =~ /o W/; # matches, ' ' is an ordinary char
+ "Hello World" =~ /World /; # doesn't match, no ' ' at end
+
+perl will always match at the earliest possible point in the string:
+
+ "Hello World" =~ /o/; # matches 'o' in 'Hello'
+ "That hat is red" =~ /hat/; # matches 'hat' in 'That'
+
+Not all characters can be used 'as is' in a match. Some characters,
+called B<metacharacters>, are reserved for use in regex notation.
+The metacharacters are
+
+ {}[]()^$.|*+?\
+
+A metacharacter can be matched by putting a backslash before it:
+
+ "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter
+ "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary +
+ 'C:\WIN32' =~ /C:\\WIN/; # matches
+ "/usr/bin/perl" =~ /\/usr\/local\/bin\/perl/; # matches
+
+In the last regex, the forward slash C<'/'> is also backslashed,
+because it is used to delimit the regex.
+
+Non-printable ASCII characters are represented by B<escape sequences>.
+Common examples are C<\t> for a tab, C<\n> for a newline, and C<\r>
+for a carriage return. Arbitrary bytes are represented by octal
+escape sequences, e.g., C<\033>, or hexadecimal escape sequences,
+e.g., C<\x1B>:
+
+ "1000\t2000" =~ m(0\t2) # matches
+ "cat" =~ /\143\x61\x74/ # matches, but a weird way to spell cat
+
+Regexes are treated mostly as double quoted strings, so variable
+substitution works:
+
+ $foo = 'house';
+ 'cathouse' =~ /cat$foo/; # matches
+ 'housecat' =~ /${foo}cat/; # matches
+
+With all of the regexes above, if the regex matched anywhere in the
+string, it was considered a match. To specify I<where> it should
+match, we would use the B<anchor> metacharacters C<^> and C<$>. The
+anchor C<^> means match at the beginning of the string and the anchor
+C<$> means match at the end of the string, or before a newline at the
+end of the string. Some examples:
+
+ "housekeeper" =~ /keeper/; # matches
+ "housekeeper" =~ /^keeper/; # doesn't match
+ "housekeeper" =~ /keeper$/; # matches
+ "housekeeper\n" =~ /keeper$/; # matches
+ "housekeeper" =~ /^housekeeper$/; # matches
+
+=head2 Using character classes
+
+A B<character class> allows a set of possible characters, rather than
+just a single character, to match at a particular point in a regex.
+Character classes are denoted by brackets C<[...]>, with the set of
+characters to be possibly matched inside. Here are some examples:
+
+ /cat/; # matches 'cat'
+ /[bcr]at/; # matches 'bat', 'cat', or 'rat'
+ "abc" =~ /[cab]/; # matches 'a'
+
+In the last statement, even though C<'c'> is the first character in
+the class, the earliest point at which the regex can match is C<'a'>.
+
+ /[yY][eE][sS]/; # match 'yes' in a case-insensitive way
+ # 'yes', 'Yes', 'YES', etc.
+ /yes/i; # also match 'yes' in a case-insensitive way
+
+The last example shows a match with an C<'i'> B<modifier>, which makes
+the match case-insensitive.
+
+Character classes also have ordinary and special characters, but the
+sets of ordinary and special characters inside a character class are
+different than those outside a character class. The special
+characters for a character class are C<-]\^$> and are matched using an
+escape:
+
+ /[\]c]def/; # matches ']def' or 'cdef'
+ $x = 'bcr';
+ /[$x]at/; # matches 'bat', 'cat', or 'rat'
+ /[\$x]at/; # matches '$at' or 'xat'
+ /[\\$x]at/; # matches '\at', 'bat', 'cat', or 'rat'
+
+The special character C<'-'> acts as a range operator within character
+classes, so that the unwieldy C<[0123456789]> and C<[abc...xyz]>
+become the svelte C<[0-9]> and C<[a-z]>:
+
+ /item[0-9]/; # matches 'item0' or ... or 'item9'
+ /[0-9a-fA-F]/; # matches a hexadecimal digit
+
+If C<'-'> is the first or last character in a character class, it is
+treated as an ordinary character.
+
+The special character C<^> in the first position of a character class
+denotes a B<negated character class>, which matches any character but
+those in the brackets. Both C<[...]> and C<[^...]> must match a
+character, or the match fails. Then
+
+ /[^a]at/; # doesn't match 'aat' or 'at', but matches
+ # all other 'bat', 'cat', '0at', '%at', etc.
+ /[^0-9]/; # matches a non-numeric character
+ /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary
+
+Perl has several abbreviations for common character classes:
+
+=over 4
+
+=item *
+
+\d is a digit and represents [0-9]
+
+=item *
+
+\s is a whitespace character and represents [\ \t\r\n\f]
+
+=item *
+
+\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_]
+
+=item *
+
+\D is a negated \d; it represents any character but a digit [^0-9]
+
+=item *
+
+\S is a negated \s; it represents any non-whitespace character [^\s]
+
+=item *
+
+\W is a negated \w; it represents any non-word character [^\w]
+
+=item *
+
+The period '.' matches any character but "\n"
+
+=back
+
+The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside
+of character classes. Here are some in use:
+
+ /\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format
+ /[\d\s]/; # matches any digit or whitespace character
+ /\w\W\w/; # matches a word char, followed by a
+ # non-word char, followed by a word char
+ /..rt/; # matches any two chars, followed by 'rt'
+ /end\./; # matches 'end.'
+ /end[.]/; # same thing, matches 'end.'
+
+The S<B<word anchor> > C<\b> matches a boundary between a word
+character and a non-word character C<\w\W> or C<\W\w>:
+
+ $x = "Housecat catenates house and cat";
+ $x =~ /\bcat/; # matches cat in 'catenates'
+ $x =~ /cat\b/; # matches cat in 'Housecat'
+ $x =~ /\bcat\b/; # matches 'cat' at end of string
+
+In the last example, the end of the string is considered a word
+boundary.
+
+=head2 Matching this or that
+
+We can match different character strings with the B<alternation>
+metacharacter C<'|'>. To match C<dog> or C<cat>, we form the regex
+C<dog|cat>. As before, perl will try to match the regex at the
+earliest possible point in the string. At each character position,
+perl will first try to match the first alternative, C<dog>. If
+C<dog> doesn't match, perl will then try the next alternative, C<cat>.
+If C<cat> doesn't match either, then the match fails and perl moves to
+the next position in the string. Some examples:
+
+ "cats and dogs" =~ /cat|dog|bird/; # matches "cat"
+ "cats and dogs" =~ /dog|cat|bird/; # matches "cat"
+
+Even though C<dog> is the first alternative in the second regex,
+C<cat> is able to match earlier in the string.
+
+ "cats" =~ /c|ca|cat|cats/; # matches "c"
+ "cats" =~ /cats|cat|ca|c/; # matches "cats"
+
+At a given character position, the first alternative that allows the
+regex match to succeed will be the one that matches. Here, all the
+alternatives match at the first string position, so the first matches.
+
+=head2 Grouping things and hierarchical matching
+
+The B<grouping> metacharacters C<()> allow a part of a regex to be
+treated as a single unit. Parts of a regex are grouped by enclosing
+them in parentheses. The regex C<house(cat|keeper)> means match
+C<house> followed by either C<cat> or C<keeper>. Some more examples
+are
+
+ /(a|b)b/; # matches 'ab' or 'bb'
+ /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere
+
+ /house(cat|)/; # matches either 'housecat' or 'house'
+ /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or
+ # 'house'. Note groups can be nested.
+
+ "20" =~ /(19|20|)\d\d/; # matches the null alternative '()\d\d',
+ # because '20\d\d' can't match
+
+=head2 Extracting matches
+
+The grouping metacharacters C<()> also allow the extraction of the
+parts of a string that matched. For each grouping, the part that
+matched inside goes into the special variables C<$1>, C<$2>, etc.
+They can be used just as ordinary variables:
+
+ # extract hours, minutes, seconds
+ $time =~ /(\d\d):(\d\d):(\d\d)/; # match hh:mm:ss format
+ $hours = $1;
+ $minutes = $2;
+ $seconds = $3;
+
+In list context, a match C</regex/> with groupings will return the
+list of matched values C<($1,$2,...)>. So we could rewrite it as
+
+ ($hours, $minutes, $seconds) = ($time =~ /(\d\d):(\d\d):(\d\d)/);
+
+If the groupings in a regex are nested, C<$1> gets the group with the
+leftmost opening parenthesis, C<$2> the next opening parenthesis,
+etc. For example, here is a complex regex and the matching variables
+indicated below it:
+
+ /(ab(cd|ef)((gi)|j))/;
+  1  2      34
+
+Associated with the matching variables C<$1>, C<$2>, ... are
+the B<backreferences> C<\1>, C<\2>, ... Backreferences are
+matching variables that can be used I<inside> a regex:
+
+ /(\w\w\w)\s\1/; # find sequences like 'the the' in string
+
+C<$1>, C<$2>, ... should only be used outside of a regex, and C<\1>,
+C<\2>, ... only inside a regex.
+
+=head2 Matching repetitions
+
+The B<quantifier> metacharacters C<?>, C<*>, C<+>, and C<{}> allow us
+to determine the number of repeats of a portion of a regex we
+consider to be a match. Quantifiers are put immediately after the
+character, character class, or grouping that we want to specify. They
+have the following meanings:
+
+=over 4
+
+=item *
+
+C<a?> = match 'a' 1 or 0 times
+
+=item *
+
+C<a*> = match 'a' 0 or more times, i.e., any number of times
+
+=item *
+
+C<a+> = match 'a' 1 or more times, i.e., at least once
+
+=item *
+
+C<a{n,m}> = match at least C<n> times, but not more than C<m>
+times.
+
+=item *
+
+C<a{n,}> = match at least C<n> times
+
+=item *
+
+C<a{n}> = match exactly C<n> times
+
+=back
+
+Here are some examples:
+
+ /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and
+ # any number of digits
+ /(\w+)\s+\1/; # match doubled words of arbitrary length
+ $year =~ /\d{2,4}/; # make sure year is at least 2 but not more
+ # than 4 digits
+ $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates
+
+These quantifiers will try to match as much of the string as possible,
+while still allowing the regex to match. So we have
+
+ $x = 'the cat in the hat';
+ $x =~ /^(.*)(at)(.*)$/; # matches,
+ # $1 = 'the cat in the h'
+ # $2 = 'at'
+ # $3 = '' (0 matches)
+
+The first quantifier C<.*> grabs as much of the string as possible
+while still having the regex match. The second quantifier C<.*> has
+no string left to it, so it matches 0 times.
+
+=head2 More matching
+
+There are a few more things you might want to know about matching
+operators. In the code
+
+ $pattern = 'Seuss';
+ while (<>) {
+ print if /$pattern/;
+ }
+
+perl has to re-evaluate C<$pattern> each time through the loop. If
+C<$pattern> won't be changing, use the C<//o> modifier, to only
+perform variable substitutions once. If you don't want any
+substitutions at all, use the special delimiter C<m''>:
+
+ $pattern = 'Seuss';
+ m'$pattern'; # matches '$pattern', not 'Seuss'
+
+The global modifier C<//g> allows the matching operator to match
+within a string as many times as possible. In scalar context,
+successive matches against a string will have C<//g> jump from match
+to match, keeping track of position in the string as it goes along.
+You can get or set the position with the C<pos()> function.
+For example,
+
+ $x = "cat dog house"; # 3 words
+ while ($x =~ /(\w+)/g) {
+ print "Word is $1, ends at position ", pos $x, "\n";
+ }
+
+prints
+
+ Word is cat, ends at position 3
+ Word is dog, ends at position 7
+ Word is house, ends at position 13
+
+A failed match or changing the target string resets the position. If
+you don't want the position reset after failure to match, add the
+C<//c> modifier, as in C</regex/gc>.
+
+In list context, C<//g> returns a list of matched groupings, or if
+there are no groupings, a list of matches to the whole regex. So
+
+ @words = ($x =~ /(\w+)/g); # matches,
+ # $words[0] = 'cat'
+ # $words[1] = 'dog'
+ # $words[2] = 'house'
+
+=head2 Search and replace
+
+Search and replace is performed using C<s/regex/replacement/modifiers>.
+The C<replacement> is a Perl double quoted string that replaces in the
+string whatever is matched with the C<regex>. The operator C<=~> is
+also used here to associate a string with C<s///>. If matching
+against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match,
+C<s///> returns the number of substitutions made, otherwise it returns
+false. Here are a few examples:
+
+ $x = "Time to feed the cat!";
+ $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!"
+ $y = "'quoted words'";
+ $y =~ s/^'(.*)'$/$1/; # strip single quotes,
+ # $y contains "quoted words"
+
+With the C<s///> operator, the matched variables C<$1>, C<$2>, etc.
+are immediately available for use in the replacement expression. With
+the global modifier, C<s///g> will search and replace all occurrences
+of the regex in the string:
+
+ $x = "I batted 4 for 4";
+ $x =~ s/4/four/; # $x contains "I batted four for 4"
+ $x = "I batted 4 for 4";
+ $x =~ s/4/four/g; # $x contains "I batted four for four"
+
+The evaluation modifier C<s///e> wraps an C<eval{...}> around the
+replacement string and the evaluated result is substituted for the
+matched substring. Some examples:
+
+ # reverse all the words in a string
+ $x = "the cat in the hat";
+ $x =~ s/(\w+)/reverse $1/ge; # $x contains "eht tac ni eht tah"
+
+ # convert percentage to decimal
+ $x = "A 39% hit rate";
+ $x =~ s!(\d+)%!$1/100!e; # $x contains "A 0.39 hit rate"
+
+The last example shows that C<s///> can use other delimiters, such as
+C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are used
+C<s'''>, then the regex and replacement are treated as single quoted
+strings.
+
+=head2 The split operator
+
+C<split /regex/, string> splits C<string> into a list of substrings
+and returns that list. The regex determines the character sequence
+that C<string> is split with respect to. For example, to split a
+string into words, use
+
+ $x = "Calvin and Hobbes";
+ @word = split /\s+/, $x; # $word[0] = 'Calvin'
+ # $word[1] = 'and'
+ # $word[2] = 'Hobbes'
+
+To extract a comma-delimited list of numbers, use
+
+ $x = "1.618,2.718, 3.142";
+ @const = split /,\s*/, $x; # $const[0] = '1.618'
+ # $const[1] = '2.718'
+ # $const[2] = '3.142'
+
+If the empty regex C<//> is used, the string is split into individual
+characters. If the regex has groupings, then the list produced contains
+the matched substrings from the groupings as well:
+
+ $x = "/usr/bin";
+ @parts = split m!(/)!, $x; # $parts[0] = ''
+ # $parts[1] = '/'
+ # $parts[2] = 'usr'
+ # $parts[3] = '/'
+ # $parts[4] = 'bin'
+
+Since the first character of $x matched the regex, C<split> prepended
+an empty initial element to the list.
+
+=head1 BUGS
+
+None.
+
+=head1 SEE ALSO
+
+This is just a quick start guide. For a more in-depth tutorial on
+regexes, see L<perlretut> and for the reference page, see L<perlre>.
+
+=head1 AUTHOR AND COPYRIGHT
+
+Copyright (c) 2000 Mark Kvale
+All rights reserved.
+
+This document may be distributed under the same terms as Perl itself.
+
+=head2 Acknowledgments
+
+The author would like to thank Mark-Jason Dominus, Tom Christiansen,
+Ilya Zakharevich, Brad Hughes, and Mike Giroux for all their helpful
+comments.
+
+=cut
+
diff --git a/contrib/perl5/pod/perlretut.pod b/contrib/perl5/pod/perlretut.pod
new file mode 100644
index 0000000000000..fa6479c0c45bc
--- /dev/null
+++ b/contrib/perl5/pod/perlretut.pod
@@ -0,0 +1,2504 @@
+=head1 NAME
+
+perlretut - Perl regular expressions tutorial
+
+=head1 DESCRIPTION
+
+This page provides a basic tutorial on understanding, creating and
+using regular expressions in Perl. It serves as a complement to the
+reference page on regular expressions L<perlre>. Regular expressions
+are an integral part of the C<m//>, C<s///>, C<qr//> and C<split>
+operators and so this tutorial also overlaps with
+L<perlop/"Regexp Quote-Like Operators"> and L<perlfunc/split>.
+
+Perl is widely renowned for excellence in text processing, and regular
+expressions are one of the big factors behind this fame. Perl regular
+expressions display an efficiency and flexibility unknown in most
+other computer languages. Mastering even the basics of regular
+expressions will allow you to manipulate text with surprising ease.
+
+What is a regular expression? A regular expression is simply a string
+that describes a pattern. Patterns are in common use these days;
+examples are the patterns typed into a search engine to find web pages
+and the patterns used to list files in a directory, e.g., C<ls *.txt>
+or C<dir *.*>. In Perl, the patterns described by regular expressions
+are used to search strings, extract desired parts of strings, and to
+do search and replace operations.
+
+Regular expressions have the undeserved reputation of being abstract
+and difficult to understand. Regular expressions are constructed using
+simple concepts like conditionals and loops and are no more difficult
+to understand than the corresponding C<if> conditionals and C<while>
+loops in the Perl language itself. In fact, the main challenge in
+learning regular expressions is just getting used to the terse
+notation used to express these concepts.
+
+This tutorial flattens the learning curve by discussing regular
+expression concepts, along with their notation, one at a time and with
+many examples. The first part of the tutorial will progress from the
+simplest word searches to the basic regular expression concepts. If
+you master the first part, you will have all the tools needed to solve
+about 98% of your needs. The second part of the tutorial is for those
+comfortable with the basics and hungry for more power tools. It
+discusses the more advanced regular expression operators and
+introduces the latest cutting edge innovations in 5.6.0.
+
+A note: to save time, 'regular expression' is often abbreviated as
+regexp or regex. Regexp is a more natural abbreviation than regex, but
+is harder to pronounce. The Perl pod documentation is evenly split on
+regexp vs regex; in Perl, there is more than one way to abbreviate it.
+We'll use regexp in this tutorial.
+
+=head1 Part 1: The basics
+
+=head2 Simple word matching
+
+The simplest regexp is simply a word, or more generally, a string of
+characters. A regexp consisting of a word matches any string that
+contains that word:
+
+ "Hello World" =~ /World/; # matches
+
+What is this perl statement all about? C<"Hello World"> is a simple
+double quoted string. C<World> is the regular expression and the
+C<//> enclosing C</World/> tells perl to search a string for a match.
+The operator C<=~> associates the string with the regexp match and
+produces a true value if the regexp matched, or false if the regexp
+did not match. In our case, C<World> matches the second word in
+C<"Hello World">, so the expression is true. Expressions like this
+are useful in conditionals:
+
+ if ("Hello World" =~ /World/) {
+ print "It matches\n";
+ }
+ else {
+ print "It doesn't match\n";
+ }
+
+There are useful variations on this theme. The sense of the match can
+be reversed by using the C<!~> operator:
+
+ if ("Hello World" !~ /World/) {
+ print "It doesn't match\n";
+ }
+ else {
+ print "It matches\n";
+ }
+
+The literal string in the regexp can be replaced by a variable:
+
+ $greeting = "World";
+ if ("Hello World" =~ /$greeting/) {
+ print "It matches\n";
+ }
+ else {
+ print "It doesn't match\n";
+ }
+
+If you're matching against the special default variable C<$_>, the
+C<$_ =~> part can be omitted:
+
+ $_ = "Hello World";
+ if (/World/) {
+ print "It matches\n";
+ }
+ else {
+ print "It doesn't match\n";
+ }
+
+And finally, the C<//> default delimiters for a match can be changed
+to arbitrary delimiters by putting an C<'m'> out front:
+
+ "Hello World" =~ m!World!; # matches, delimited by '!'
+ "Hello World" =~ m{World}; # matches, note the matching '{}'
+ "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin',
+ # '/' becomes an ordinary char
+
+C</World/>, C<m!World!>, and C<m{World}> all represent the
+same thing. When, e.g., C<""> is used as a delimiter, the forward
+slash C<'/'> becomes an ordinary character and can be used in a regexp
+without trouble.
+
+Let's consider how different regexps would match C<"Hello World">:
+
+ "Hello World" =~ /world/; # doesn't match
+ "Hello World" =~ /o W/; # matches
+ "Hello World" =~ /oW/; # doesn't match
+ "Hello World" =~ /World /; # doesn't match
+
+The first regexp C<world> doesn't match because regexps are
+case-sensitive. The second regexp matches because the substring
+S<C<'o W'> > occurs in the string S<C<"Hello World"> >. The space
+character ' ' is treated like any other character in a regexp and is
+needed to match in this case. The lack of a space character is the
+reason the third regexp C<'oW'> doesn't match. The fourth regexp
+C<'World '> doesn't match because there is a space at the end of the
+regexp, but not at the end of the string. The lesson here is that
+regexps must match a part of the string I<exactly> in order for the
+statement to be true.
+
+If a regexp matches in more than one place in the string, perl will
+always match at the earliest possible point in the string:
+
+ "Hello World" =~ /o/; # matches 'o' in 'Hello'
+ "That hat is red" =~ /hat/; # matches 'hat' in 'That'
+
+With respect to character matching, there are a few more points you
+need to know about. First of all, not all characters can be used 'as
+is' in a match. Some characters, called B<metacharacters>, are reserved
+for use in regexp notation. The metacharacters are
+
+ {}[]()^$.|*+?\
+
+The significance of each of these will be explained
+in the rest of the tutorial, but for now, it is important only to know
+that a metacharacter can be matched by putting a backslash before it:
+
+ "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter
+ "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary +
+ "The interval is [0,1)." =~ /[0,1)./ # is a syntax error!
+ "The interval is [0,1)." =~ /\[0,1\)\./ # matches
+ "/usr/bin/perl" =~ /\/usr\/local\/bin\/perl/; # matches
+
+In the last regexp, the forward slash C<'/'> is also backslashed,
+because it is used to delimit the regexp. This can lead to LTS
+(leaning toothpick syndrome), however, and it is often more readable
+to change delimiters.
+
+
+The backslash character C<'\'> is a metacharacter itself and needs to
+be backslashed:
+
+ 'C:\WIN32' =~ /C:\\WIN/; # matches
+
+In addition to the metacharacters, there are some ASCII characters
+which don't have printable character equivalents and are instead
+represented by B<escape sequences>. Common examples are C<\t> for a
+tab, C<\n> for a newline, C<\r> for a carriage return and C<\a> for a
+bell. If your string is better thought of as a sequence of arbitrary
+bytes, the octal escape sequence, e.g., C<\033>, or hexadecimal escape
+sequence, e.g., C<\x1B> may be a more natural representation for your
+bytes. Here are some examples of escapes:
+
+ "1000\t2000" =~ m(0\t2) # matches
+ "1000\n2000" =~ /0\n20/ # matches
+ "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000"
+ "cat" =~ /\143\x61\x74/ # matches, but a weird way to spell cat
+
+If you've been around Perl a while, all this talk of escape sequences
+may seem familiar. Similar escape sequences are used in double-quoted
+strings and in fact the regexps in Perl are mostly treated as
+double-quoted strings. This means that variables can be used in
+regexps as well. Just like double-quoted strings, the values of the
+variables in the regexp will be substituted in before the regexp is
+evaluated for matching purposes. So we have:
+
+ $foo = 'house';
+ 'housecat' =~ /$foo/; # matches
+ 'cathouse' =~ /cat$foo/; # matches
+ 'housecat' =~ /${foo}cat/; # matches
+
+So far, so good. With the knowledge above you can already perform
+searches with just about any literal string regexp you can dream up.
+Here is a I<very simple> emulation of the Unix grep program:
+
+ % cat > simple_grep
+ #!/usr/bin/perl
+ $regexp = shift;
+ while (<>) {
+ print if /$regexp/;
+ }
+ ^D
+
+ % chmod +x simple_grep
+
+ % simple_grep abba /usr/dict/words
+ Babbage
+ cabbage
+ cabbages
+ sabbath
+ Sabbathize
+ Sabbathizes
+ sabbatical
+ scabbard
+ scabbards
+
+This program is easy to understand. C<#!/usr/bin/perl> is the standard
+way to invoke a perl program from the shell.
+S<C<$regexp = shift;> > saves the first command line argument as the
+regexp to be used, leaving the rest of the command line arguments to
+be treated as files. S<C<< while (<>) >> > loops over all the lines in
+all the files. For each line, S<C<print if /$regexp/;> > prints the
+line if the regexp matches the line. In this line, both C<print> and
+C</$regexp/> use the default variable C<$_> implicitly.
+
+With all of the regexps above, if the regexp matched anywhere in the
+string, it was considered a match. Sometimes, however, we'd like to
+specify I<where> in the string the regexp should try to match. To do
+this, we would use the B<anchor> metacharacters C<^> and C<$>. The
+anchor C<^> means match at the beginning of the string and the anchor
+C<$> means match at the end of the string, or before a newline at the
+end of the string. Here is how they are used:
+
+ "housekeeper" =~ /keeper/; # matches
+ "housekeeper" =~ /^keeper/; # doesn't match
+ "housekeeper" =~ /keeper$/; # matches
+ "housekeeper\n" =~ /keeper$/; # matches
+
+The second regexp doesn't match because C<^> constrains C<keeper> to
+match only at the beginning of the string, but C<"housekeeper"> has
+keeper starting in the middle. The third regexp does match, since the
+C<$> constrains C<keeper> to match only at the end of the string.
+
+When both C<^> and C<$> are used at the same time, the regexp has to
+match both the beginning and the end of the string, i.e., the regexp
+matches the whole string. Consider
+
+ "keeper" =~ /^keep$/; # doesn't match
+ "keeper" =~ /^keeper$/; # matches
+ "" =~ /^$/; # ^$ matches an empty string
+
+The first regexp doesn't match because the string has more to it than
+C<keep>. Since the second regexp is exactly the string, it
+matches. Using both C<^> and C<$> in a regexp forces the complete
+string to match, so it gives you complete control over which strings
+match and which don't. Suppose you are looking for a fellow named
+bert, off in a string by himself:
+
+ "dogbert" =~ /bert/; # matches, but not what you want
+
+ "dilbert" =~ /^bert/; # doesn't match, but ..
+ "bertram" =~ /^bert/; # matches, so still not good enough
+
+ "bertram" =~ /^bert$/; # doesn't match, good
+ "dilbert" =~ /^bert$/; # doesn't match, good
+ "bert" =~ /^bert$/; # matches, perfect
+
+Of course, in the case of a literal string, one could just as easily
+use the string equivalence S<C<$string eq 'bert'> > and it would be
+more efficient. The C<^...$> regexp really becomes useful when we
+add in the more powerful regexp tools below.
+
+=head2 Using character classes
+
+Although one can already do quite a lot with the literal string
+regexps above, we've only scratched the surface of regular expression
+technology. In this and subsequent sections we will introduce regexp
+concepts (and associated metacharacter notations) that will allow a
+regexp to not just represent a single character sequence, but a I<whole
+class> of them.
+
+One such concept is that of a B<character class>. A character class
+allows a set of possible characters, rather than just a single
+character, to match at a particular point in a regexp. Character
+classes are denoted by brackets C<[...]>, with the set of characters
+to be possibly matched inside. Here are some examples:
+
+ /cat/; # matches 'cat'
+ /[bcr]at/; # matches 'bat', 'cat', or 'rat'
+ /item[0123456789]/; # matches 'item0' or ... or 'item9'
+ "abc" =~ /[cab]/; # matches 'a'
+
+In the last statement, even though C<'c'> is the first character in
+the class, C<'a'> matches because the first character position in the
+string is the earliest point at which the regexp can match.
+
+ /[yY][eE][sS]/; # match 'yes' in a case-insensitive way
+ # 'yes', 'Yes', 'YES', etc.
+
+This regexp displays a common task: perform a case-insensitive
+match. Perl provides a way of avoiding all those brackets by simply
+appending an C<'i'> to the end of the match. Then C</[yY][eE][sS]/;>
+can be rewritten as C</yes/i;>. The C<'i'> stands for
+case-insensitive and is an example of a B<modifier> of the matching
+operation. We will meet other modifiers later in the tutorial.
+
+We saw in the section above that there were ordinary characters, which
+represented themselves, and special characters, which needed a
+backslash C<\> to represent themselves. The same is true in a
+character class, but the sets of ordinary and special characters
+inside a character class are different than those outside a character
+class. The special characters for a character class are C<-]\^$>. C<]>
+is special because it denotes the end of a character class. C<$> is
+special because it denotes a scalar variable. C<\> is special because
+it is used in escape sequences, just like above. Here is how the
+special characters C<]$\> are handled:
+
+ /[\]c]def/; # matches ']def' or 'cdef'
+ $x = 'bcr';
+ /[$x]at/; # matches 'bat', 'cat', or 'rat'
+ /[\$x]at/; # matches '$at' or 'xat'
+ /[\\$x]at/; # matches '\at', 'bat', 'cat', or 'rat'
+
+The last two are a little tricky. In C<[\$x]>, the backslash protects
+the dollar sign, so the character class has two members C<$> and C<x>.
+In C<[\\$x]>, the backslash is protected, so C<$x> is treated as a
+variable and substituted in double quote fashion.
+
+The special character C<'-'> acts as a range operator within character
+classes, so that a contiguous set of characters can be written as a
+range. With ranges, the unwieldy C<[0123456789]> and C<[abc...xyz]>
+become the svelte C<[0-9]> and C<[a-z]>. Some examples are
+
+ /item[0-9]/; # matches 'item0' or ... or 'item9'
+ /[0-9bx-z]aa/; # matches '0aa', ..., '9aa',
+ # 'baa', 'xaa', 'yaa', or 'zaa'
+ /[0-9a-fA-F]/; # matches a hexadecimal digit
+ /[0-9a-zA-Z_]/; # matches a "word" character,
+ # like those in a perl variable name
+
+If C<'-'> is the first or last character in a character class, it is
+treated as an ordinary character; C<[-ab]>, C<[ab-]> and C<[a\-b]> are
+all equivalent.
+
+The special character C<^> in the first position of a character class
+denotes a B<negated character class>, which matches any character but
+those in the brackets. Both C<[...]> and C<[^...]> must match a
+character, or the match fails. Then
+
+ /[^a]at/; # doesn't match 'aat' or 'at', but matches
+ # all other 'bat', 'cat', '0at', '%at', etc.
+ /[^0-9]/; # matches a non-numeric character
+ /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary
+
+Now, even C<[0-9]> can be a bother to write multiple times, so in the
+interest of saving keystrokes and making regexps more readable, Perl
+has several abbreviations for common character classes:
+
+=over 4
+
+=item *
+
+\d is a digit and represents [0-9]
+
+=item *
+
+\s is a whitespace character and represents [\ \t\r\n\f]
+
+=item *
+
+\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_]
+
+=item *
+
+\D is a negated \d; it represents any character but a digit [^0-9]
+
+=item *
+
+\S is a negated \s; it represents any non-whitespace character [^\s]
+
+=item *
+
+\W is a negated \w; it represents any non-word character [^\w]
+
+=item *
+
+The period '.' matches any character but "\n"
+
+=back
+
+The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside
+of character classes. Here are some in use:
+
+ /\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format
+ /[\d\s]/; # matches any digit or whitespace character
+ /\w\W\w/; # matches a word char, followed by a
+ # non-word char, followed by a word char
+ /..rt/; # matches any two chars, followed by 'rt'
+ /end\./; # matches 'end.'
+ /end[.]/; # same thing, matches 'end.'
+
+Because a period is a metacharacter, it needs to be escaped to match
+as an ordinary period. Because, for example, C<\d> and C<\w> are sets
+of characters, it is incorrect to think of C<[^\d\w]> as C<[\D\W]>; in
+fact C<[^\d\w]> is the same as C<[^\w]>, which is the same as
+C<[\W]>. Think DeMorgan's laws.
+
+An anchor useful in basic regexps is the S<B<word anchor> >
+C<\b>. This matches a boundary between a word character and a non-word
+character C<\w\W> or C<\W\w>:
+
+ $x = "Housecat catenates house and cat";
+ $x =~ /cat/; # matches cat in 'housecat'
+ $x =~ /\bcat/; # matches cat in 'catenates'
+ $x =~ /cat\b/; # matches cat in 'housecat'
+ $x =~ /\bcat\b/; # matches 'cat' at end of string
+
+Note in the last example, the end of the string is considered a word
+boundary.
+
+You might wonder why C<'.'> matches everything but C<"\n"> - why not
+every character? The reason is that often one is matching against
+lines and would like to ignore the newline characters. For instance,
+while the string C<"\n"> represents one line, we would like to think
+of it as empty. Then
+
+ "" =~ /^$/; # matches
+ "\n" =~ /^$/; # matches, "\n" is ignored
+
+ "" =~ /./; # doesn't match; it needs a char
+ "" =~ /^.$/; # doesn't match; it needs a char
+ "\n" =~ /^.$/; # doesn't match; it needs a char other than "\n"
+ "a" =~ /^.$/; # matches
+ "a\n" =~ /^.$/; # matches, ignores the "\n"
+
+This behavior is convenient, because we usually want to ignore
+newlines when we count and match characters in a line. Sometimes,
+however, we want to keep track of newlines. We might even want C<^>
+and C<$> to anchor at the beginning and end of lines within the
+string, rather than just the beginning and end of the string. Perl
+allows us to choose between ignoring and paying attention to newlines
+by using the C<//s> and C<//m> modifiers. C<//s> and C<//m> stand for
+single line and multi-line and they determine whether a string is to
+be treated as one continuous string, or as a set of lines. The two
+modifiers affect two aspects of how the regexp is interpreted: 1) how
+the C<'.'> character class is defined, and 2) where the anchors C<^>
+and C<$> are able to match. Here are the four possible combinations:
+
+=over 4
+
+=item *
+
+no modifiers (//): Default behavior. C<'.'> matches any character
+except C<"\n">. C<^> matches only at the beginning of the string and
+C<$> matches only at the end or before a newline at the end.
+
+=item *
+
+s modifier (//s): Treat string as a single long line. C<'.'> matches
+any character, even C<"\n">. C<^> matches only at the beginning of
+the string and C<$> matches only at the end or before a newline at the
+end.
+
+=item *
+
+m modifier (//m): Treat string as a set of multiple lines. C<'.'>
+matches any character except C<"\n">. C<^> and C<$> are able to match
+at the start or end of I<any> line within the string.
+
+=item *
+
+both s and m modifiers (//sm): Treat string as a single long line, but
+detect multiple lines. C<'.'> matches any character, even
+C<"\n">. C<^> and C<$>, however, are able to match at the start or end
+of I<any> line within the string.
+
+=back
+
+Here are examples of C<//s> and C<//m> in action:
+
+ $x = "There once was a girl\nWho programmed in Perl\n";
+
+ $x =~ /^Who/; # doesn't match, "Who" not at start of string
+ $x =~ /^Who/s; # doesn't match, "Who" not at start of string
+ $x =~ /^Who/m; # matches, "Who" at start of second line
+ $x =~ /^Who/sm; # matches, "Who" at start of second line
+
+ $x =~ /girl.Who/; # doesn't match, "." doesn't match "\n"
+ $x =~ /girl.Who/s; # matches, "." matches "\n"
+ $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\n"
+ $x =~ /girl.Who/sm; # matches, "." matches "\n"
+
+Most of the time, the default behavior is what is wanted, but C<//s> and
+C<//m> are occasionally very useful. If C<//m> is being used, the start
+of the string can still be matched with C<\A> and the end of string
+can still be matched with the anchors C<\Z> (matches both the end and
+the newline before, like C<$>), and C<\z> (matches only the end):
+
+ $x =~ /^Who/m; # matches, "Who" at start of second line
+ $x =~ /\AWho/m; # doesn't match, "Who" is not at start of string
+
+ $x =~ /girl$/m; # matches, "girl" at end of first line
+ $x =~ /girl\Z/m; # doesn't match, "girl" is not at end of string
+
+ $x =~ /Perl\Z/m; # matches, "Perl" is at newline before end
+ $x =~ /Perl\z/m; # doesn't match, "Perl" is not at end of string
+
+We now know how to create choices among classes of characters in a
+regexp. What about choices among words or character strings? Such
+choices are described in the next section.
+
+=head2 Matching this or that
+
+Sometimes we would like our regexp to be able to match different
+possible words or character strings. This is accomplished by using
+the B<alternation> metacharacter C<|>. To match C<dog> or C<cat>, we
+form the regexp C<dog|cat>. As before, perl will try to match the
+regexp at the earliest possible point in the string. At each
+character position, perl will first try to match the first
+alternative, C<dog>. If C<dog> doesn't match, perl will then try the
+next alternative, C<cat>. If C<cat> doesn't match either, then the
+match fails and perl moves to the next position in the string. Some
+examples:
+
+ "cats and dogs" =~ /cat|dog|bird/; # matches "cat"
+ "cats and dogs" =~ /dog|cat|bird/; # matches "cat"
+
+Even though C<dog> is the first alternative in the second regexp,
+C<cat> is able to match earlier in the string.
+
+ "cats" =~ /c|ca|cat|cats/; # matches "c"
+ "cats" =~ /cats|cat|ca|c/; # matches "cats"
+
+Here, all the alternatives match at the first string position, so the
+first alternative is the one that matches. If some of the
+alternatives are truncations of the others, put the longest ones first
+to give them a chance to match.
+
+ "cab" =~ /a|b|c/ # matches "c"
+ # /a|b|c/ == /[abc]/
+
+The last example points out that character classes are like
+alternations of characters. At a given character position, the first
+alternative that allows the regexp match to succeed will be the one
+that matches.
+
+=head2 Grouping things and hierarchical matching
+
+Alternation allows a regexp to choose among alternatives, but by
+itself it is unsatisfying. The reason is that each alternative is a whole
+regexp, but sometimes we want alternatives for just part of a
+regexp. For instance, suppose we want to search for housecats or
+housekeepers. The regexp C<housecat|housekeeper> fits the bill, but is
+inefficient because we had to type C<house> twice. It would be nice to
+have parts of the regexp be constant, like C<house>, and some
+parts have alternatives, like C<cat|keeper>.
+
+The B<grouping> metacharacters C<()> solve this problem. Grouping
+allows parts of a regexp to be treated as a single unit. Parts of a
+regexp are grouped by enclosing them in parentheses. Thus we could solve
+the C<housecat|housekeeper> problem by forming the regexp as
+C<house(cat|keeper)>. The regexp C<house(cat|keeper)> means match
+C<house> followed by either C<cat> or C<keeper>. Some more examples
+are
+
+ /(a|b)b/; # matches 'ab' or 'bb'
+ /(ac|b)b/; # matches 'acb' or 'bb'
+ /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere
+ /(a|[bc])d/; # matches 'ad', 'bd', or 'cd'
+
+ /house(cat|)/; # matches either 'housecat' or 'house'
+ /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or
+ # 'house'. Note groups can be nested.
+
+ /(19|20|)\d\d/; # match years 19xx, 20xx, or the Y2K problem, xx
+ "20" =~ /(19|20|)\d\d/; # matches the null alternative '()\d\d',
+ # because '20\d\d' can't match
+
+Alternations behave the same way in groups as out of them: at a given
+string position, the leftmost alternative that allows the regexp to
+match is taken. So in the last example at the first string position,
+C<"20"> matches the second alternative, but there is nothing left over
+to match the next two digits C<\d\d>. So perl moves on to the next
+alternative, which is the null alternative and that works, since
+C<"20"> is two digits.
+
+The process of trying one alternative, seeing if it matches, and
+moving on to the next alternative if it doesn't, is called
+B<backtracking>. The term 'backtracking' comes from the idea that
+matching a regexp is like a walk in the woods. Successfully matching
+a regexp is like arriving at a destination. There are many possible
+trailheads, one for each string position, and each one is tried in
+order, left to right. From each trailhead there may be many paths,
+some of which get you there, and some which are dead ends. When you
+walk along a trail and hit a dead end, you have to backtrack along the
+trail to an earlier point to try another trail. If you hit your
+destination, you stop immediately and forget about trying all the
+other trails. You are persistent, and only if you have tried all the
+trails from all the trailheads and not arrived at your destination, do
+you declare failure. To be concrete, here is a step-by-step analysis
+of what perl does when it tries to match the regexp
+
+ "abcde" =~ /(abd|abc)(df|d|de)/;
+
+=over 4
+
+=item 0
+
+Start with the first letter in the string 'a'.
+
+=item 1
+
+Try the first alternative in the first group 'abd'.
+
+=item 2
+
+Match 'a' followed by 'b'. So far so good.
+
+=item 3
+
+'d' in the regexp doesn't match 'c' in the string - a dead
+end. So backtrack two characters and pick the second alternative in
+the first group 'abc'.
+
+=item 4
+
+Match 'a' followed by 'b' followed by 'c'. We are on a roll
+and have satisfied the first group. Set $1 to 'abc'.
+
+=item 5
+
+Move on to the second group and pick the first alternative
+'df'.
+
+=item 6
+
+Match the 'd'.
+
+=item 7
+
+'f' in the regexp doesn't match 'e' in the string, so a dead
+end. Backtrack one character and pick the second alternative in the
+second group 'd'.
+
+=item 8
+
+'d' matches. The second grouping is satisfied, so set $2 to
+'d'.
+
+=item 9
+
+We are at the end of the regexp, so we are done! We have
+matched 'abcd' out of the string "abcde".
+
+=back
+
+There are a couple of things to note about this analysis. First, the
+third alternative in the second group 'de' also allows a match, but we
+stopped before we got to it - at a given character position, leftmost
+wins. Second, we were able to get a match at the first character
+position of the string 'a'. If there were no matches at the first
+position, perl would move to the second character position 'b' and
+attempt the match all over again. Only when all possible paths at all
+possible character positions have been exhausted does perl give
+up and declare S<C<$string =~ /(abd|abc)(df|d|de)/;> > to be false.
+
+Even with all this work, regexp matching happens remarkably fast. To
+speed things up, during the compilation stage, perl compiles the regexp
+into a compact sequence of opcodes that can often fit inside a
+processor cache. When the code is executed, these opcodes can then run
+at full throttle and search very quickly.
+
+=head2 Extracting matches
+
+The grouping metacharacters C<()> also serve another completely
+different function: they allow the extraction of the parts of a string
+that matched. This is very useful to find out what matched and for
+text processing in general. For each grouping, the part that matched
+inside goes into the special variables C<$1>, C<$2>, etc. They can be
+used just as ordinary variables:
+
+ # extract hours, minutes, seconds
+ $time =~ /(\d\d):(\d\d):(\d\d)/; # match hh:mm:ss format
+ $hours = $1;
+ $minutes = $2;
+ $seconds = $3;
+
+Now, we know that in scalar context,
+S<C<$time =~ /(\d\d):(\d\d):(\d\d)/> > returns a true or false
+value. In list context, however, it returns the list of matched values
+C<($1,$2,$3)>. So we could write the code more compactly as
+
+ # extract hours, minutes, seconds
+ ($hours, $minutes, $second) = ($time =~ /(\d\d):(\d\d):(\d\d)/);
+
+If the groupings in a regexp are nested, C<$1> gets the group with the
+leftmost opening parenthesis, C<$2> the next opening parenthesis,
+etc. For example, here is a complex regexp and the matching variables
+indicated below it:
+
+ /(ab(cd|ef)((gi)|j))/;
+ 1 2 34
+
+so that if the regexp matched, e.g., C<$2> would contain 'cd' or 'ef'.
+For convenience, perl sets C<$+> to the highest numbered C<$1>, C<$2>,
+... that got assigned.
+
+Closely associated with the matching variables C<$1>, C<$2>, ... are
+the B<backreferences> C<\1>, C<\2>, ... . Backreferences are simply
+matching variables that can be used I<inside> a regexp. This is a
+really nice feature - what matches later in a regexp can depend on
+what matched earlier in the regexp. Suppose we wanted to look
+for doubled words in text, like 'the the'. The following regexp finds
+all 3-letter doubles with a space in between:
+
+ /(\w\w\w)\s\1/;
+
+The grouping assigns a value to \1, so that the same 3 letter sequence
+is used for both parts. Here are some words with repeated parts:
+
+ % simple_grep '^(\w\w\w\w|\w\w\w|\w\w|\w)\1$' /usr/dict/words
+ beriberi
+ booboo
+ coco
+ mama
+ murmur
+ papa
+
+The regexp has a single grouping which considers 4-letter
+combinations, then 3-letter combinations, etc. and uses C<\1> to look for
+a repeat. Although C<$1> and C<\1> represent the same thing, care should be
+taken to use matched variables C<$1>, C<$2>, ... only outside a regexp
+and backreferences C<\1>, C<\2>, ... only inside a regexp; not doing
+so may lead to surprising and/or undefined results.
+
+In addition to what was matched, Perl 5.6.0 also provides the
+positions of what was matched with the C<@-> and C<@+>
+arrays. C<$-[0]> is the position of the start of the entire match and
+C<$+[0]> is the position of the end. Similarly, C<$-[n]> is the
+position of the start of the C<$n> match and C<$+[n]> is the position
+of the end. If C<$n> is undefined, so are C<$-[n]> and C<$+[n]>. Then
+this code
+
+ $x = "Mmm...donut, thought Homer";
+ $x =~ /^(Mmm|Yech)\.\.\.(donut|peas)/; # matches
+ foreach $expr (1..$#-) {
+ print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\n";
+ }
+
+prints
+
+ Match 1: 'Mmm' at position (0,3)
+ Match 2: 'donut' at position (6,11)
+
+Even if there are no groupings in a regexp, it is still possible to
+find out what exactly matched in a string. If you use them, perl
+will set C<$`> to the part of the string before the match, will set C<$&>
+to the part of the string that matched, and will set C<$'> to the part
+of the string after the match. An example:
+
+ $x = "the cat caught the mouse";
+ $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse'
+ $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse'
+
+In the second match, S<C<$` = ''> > because the regexp matched at the
+first character position in the string and stopped; it never saw the
+second 'the'. It is important to note that using C<$`> and C<$'>
+slows down regexp matching quite a bit, and C< $& > slows it down to a
+lesser extent, because if they are used in one regexp in a program,
+they are generated for I<all> regexps in the program. So if raw
+performance is a goal of your application, they should be avoided.
+If you need them, use C<@-> and C<@+> instead:
+
+ $` is the same as substr( $x, 0, $-[0] )
+ $& is the same as substr( $x, $-[0], $+[0]-$-[0] )
+ $' is the same as substr( $x, $+[0] )
+
+=head2 Matching repetitions
+
+The examples in the previous section display an annoying weakness. We
+were only matching 3-letter words, or syllables of 4 letters or
+less. We'd like to be able to match words or syllables of any length,
+without writing out tedious alternatives like
+C<\w\w\w\w|\w\w\w|\w\w|\w>.
+
+This is exactly the problem the B<quantifier> metacharacters C<?>,
+C<*>, C<+>, and C<{}> were created for. They allow us to determine the
+number of repeats of a portion of a regexp we consider to be a
+match. Quantifiers are put immediately after the character, character
+class, or grouping that we want to specify. They have the following
+meanings:
+
+=over 4
+
+=item *
+
+C<a?> = match 'a' 1 or 0 times
+
+=item *
+
+C<a*> = match 'a' 0 or more times, i.e., any number of times
+
+=item *
+
+C<a+> = match 'a' 1 or more times, i.e., at least once
+
+=item *
+
+C<a{n,m}> = match at least C<n> times, but not more than C<m>
+times.
+
+=item *
+
+C<a{n,}> = match at least C<n> times
+
+=item *
+
+C<a{n}> = match exactly C<n> times
+
+=back
+
+Here are some examples:
+
+ /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and
+ # any number of digits
+ /(\w+)\s+\1/; # match doubled words of arbitrary length
+ /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes'
+ $year =~ /\d{2,4}/; # make sure year is at least 2 but not more
+ # than 4 digits
+ $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates
+ $year =~ /\d{2}(\d{2})?/; # same thing written differently. However,
+ # this produces $1 and the other does not.
+
+ % simple_grep '^(\w+)\1$' /usr/dict/words # isn't this easier?
+ beriberi
+ booboo
+ coco
+ mama
+ murmur
+ papa
+
+For all of these quantifiers, perl will try to match as much of the
+string as possible, while still allowing the regexp to succeed. Thus
+with C</a?.../>, perl will first try to match the regexp with the C<a>
+present; if that fails, perl will try to match the regexp without the
+C<a> present. For the quantifier C<*>, we get the following:
+
+ $x = "the cat in the hat";
+ $x =~ /^(.*)(cat)(.*)$/; # matches,
+ # $1 = 'the '
+ # $2 = 'cat'
+ # $3 = ' in the hat'
+
+Which is what we might expect, the match finds the only C<cat> in the
+string and locks onto it. Consider, however, this regexp:
+
+ $x =~ /^(.*)(at)(.*)$/; # matches,
+ # $1 = 'the cat in the h'
+ # $2 = 'at'
+ # $3 = '' (0 matches)
+
+One might initially guess that perl would find the C<at> in C<cat> and
+stop there, but that wouldn't give the longest possible string to the
+first quantifier C<.*>. Instead, the first quantifier C<.*> grabs as
+much of the string as possible while still having the regexp match. In
+this example, that means having the C<at> sequence with the final C<at>
+in the string. The other important principle illustrated here is that
+when there are two or more elements in a regexp, the I<leftmost>
+quantifier, if there is one, gets to grab as much of the string as
+possible, leaving the rest of the regexp to fight over scraps. Thus in
+our example, the first quantifier C<.*> grabs most of the string, while
+the second quantifier C<.*> gets the empty string. Quantifiers that
+grab as much of the string as possible are called B<maximal match> or
+B<greedy> quantifiers.
+
+When a regexp can match a string in several different ways, we can use
+the principles above to predict which way the regexp will match:
+
+=over 4
+
+=item *
+
+Principle 0: Taken as a whole, any regexp will be matched at the
+earliest possible position in the string.
+
+=item *
+
+Principle 1: In an alternation C<a|b|c...>, the leftmost alternative
+that allows a match for the whole regexp will be the one used.
+
+=item *
+
+Principle 2: The maximal matching quantifiers C<?>, C<*>, C<+> and
+C<{n,m}> will in general match as much of the string as possible while
+still allowing the whole regexp to match.
+
+=item *
+
+Principle 3: If there are two or more elements in a regexp, the
+leftmost greedy quantifier, if any, will match as much of the string
+as possible while still allowing the whole regexp to match. The next
+leftmost greedy quantifier, if any, will try to match as much of the
+string remaining available to it as possible, while still allowing the
+whole regexp to match. And so on, until all the regexp elements are
+satisfied.
+
+=back
+
+As we have seen above, Principle 0 overrides the others - the regexp
+will be matched as early as possible, with the other principles
+determining how the regexp matches at that earliest character
+position.
+
+Here is an example of these principles in action:
+
+ $x = "The programming republic of Perl";
+ $x =~ /^(.+)(e|r)(.*)$/; # matches,
+ # $1 = 'The programming republic of Pe'
+ # $2 = 'r'
+ # $3 = 'l'
+
+This regexp matches at the earliest string position, C<'T'>. One
+might think that C<e>, being leftmost in the alternation, would be
+matched, but C<r> produces the longest string in the first quantifier.
+
+ $x =~ /(m{1,2})(.*)$/; # matches,
+ # $1 = 'mm'
+ # $2 = 'ing republic of Perl'
+
+Here, the earliest possible match is at the first C<'m'> in
+C<programming>. C<m{1,2}> is the first quantifier, so it gets to match
+a maximal C<mm>.
+
+ $x =~ /.*(m{1,2})(.*)$/; # matches,
+ # $1 = 'm'
+ # $2 = 'ing republic of Perl'
+
+Here, the regexp matches at the start of the string. The first
+quantifier C<.*> grabs as much as possible, leaving just a single
+C<'m'> for the second quantifier C<m{1,2}>.
+
+ $x =~ /(.?)(m{1,2})(.*)$/; # matches,
+ # $1 = 'a'
+ # $2 = 'mm'
+ # $3 = 'ing republic of Perl'
+
+Here, C<.?> eats its maximal one character at the earliest possible
+position in the string, C<'a'> in C<programming>, leaving C<m{1,2}>
+the opportunity to match both C<m>'s. Finally,
+
+ "aXXXb" =~ /(X*)/; # matches with $1 = ''
+
+because it can match zero copies of C<'X'> at the beginning of the
+string. If you definitely want to match at least one C<'X'>, use
+C<X+>, not C<X*>.
+
+Sometimes greed is not good. At times, we would like quantifiers to
+match a I<minimal> piece of string, rather than a maximal piece. For
+this purpose, Larry Wall created the S<B<minimal match> > or
+B<non-greedy> quantifiers C<??>, C<*?>, C<+?>, and C<{}?>. These are
+the usual quantifiers with a C<?> appended to them. They have the
+following meanings:
+
+=over 4
+
+=item *
+
+C<a??> = match 'a' 0 or 1 times. Try 0 first, then 1.
+
+=item *
+
+C<a*?> = match 'a' 0 or more times, i.e., any number of times,
+but as few times as possible
+
+=item *
+
+C<a+?> = match 'a' 1 or more times, i.e., at least once, but
+as few times as possible
+
+=item *
+
+C<a{n,m}?> = match at least C<n> times, not more than C<m>
+times, as few times as possible
+
+=item *
+
+C<a{n,}?> = match at least C<n> times, but as few times as
+possible
+
+=item *
+
+C<a{n}?> = match exactly C<n> times. Because we match exactly
+C<n> times, C<a{n}?> is equivalent to C<a{n}> and is just there for
+notational consistency.
+
+=back
+
+Let's look at the example above, but with minimal quantifiers:
+
+ $x = "The programming republic of Perl";
+ $x =~ /^(.+?)(e|r)(.*)$/; # matches,
+ # $1 = 'Th'
+ # $2 = 'e'
+ # $3 = ' programming republic of Perl'
+
+The minimal string that will allow both the start of the string C<^>
+and the alternation to match is C<Th>, with the alternation C<e|r>
+matching C<e>. The second quantifier C<.*> is free to gobble up the
+rest of the string.
+
+ $x =~ /(m{1,2}?)(.*?)$/; # matches,
+ # $1 = 'm'
+ # $2 = 'ming republic of Perl'
+
+The first string position that this regexp can match is at the first
+C<'m'> in C<programming>. At this position, the minimal C<m{1,2}?>
+matches just one C<'m'>. Although the second quantifier C<.*?> would
+prefer to match no characters, it is constrained by the end-of-string
+anchor C<$> to match the rest of the string.
+
+ $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches,
+ # $1 = 'The progra'
+ # $2 = 'm'
+ # $3 = 'ming republic of Perl'
+
+In this regexp, you might expect the first minimal quantifier C<.*?>
+to match the empty string, because it is not constrained by a C<^>
+anchor to match the beginning of the word. Principle 0 applies here,
+however. Because it is possible for the whole regexp to match at the
+start of the string, it I<will> match at the start of the string. Thus
+the first quantifier has to match everything up to the first C<m>. The
+second minimal quantifier matches just one C<m> and the third
+quantifier matches the rest of the string.
+
+ $x =~ /(.??)(m{1,2})(.*)$/; # matches,
+ # $1 = 'a'
+ # $2 = 'mm'
+ # $3 = 'ing republic of Perl'
+
+Just as in the previous regexp, the first quantifier C<.??> can match
+earliest at position C<'a'>, so it does. The second quantifier is
+greedy, so it matches C<mm>, and the third matches the rest of the
+string.
+
+We can modify principle 3 above to take into account non-greedy
+quantifiers:
+
+=over 4
+
+=item *
+
+Principle 3: If there are two or more elements in a regexp, the
+leftmost greedy (non-greedy) quantifier, if any, will match as much
+(little) of the string as possible while still allowing the whole
+regexp to match. The next leftmost greedy (non-greedy) quantifier, if
+any, will try to match as much (little) of the string remaining
+available to it as possible, while still allowing the whole regexp to
+match. And so on, until all the regexp elements are satisfied.
+
+=back
+
+Just like alternation, quantifiers are also susceptible to
+backtracking. Here is a step-by-step analysis of the example
+
+ $x = "the cat in the hat";
+ $x =~ /^(.*)(at)(.*)$/; # matches,
+ # $1 = 'the cat in the h'
+ # $2 = 'at'
+ # $3 = '' (0 matches)
+
+=over 4
+
+=item 0
+
+Start with the first letter in the string 't'.
+
+=item 1
+
+The first quantifier '.*' starts out by matching the whole
+string 'the cat in the hat'.
+
+=item 2
+
+'a' in the regexp element 'at' doesn't match the end of the
+string. Backtrack one character.
+
+=item 3
+
+'a' in the regexp element 'at' still doesn't match the last
+letter of the string 't', so backtrack one more character.
+
+=item 4
+
+Now we can match the 'a' and the 't'.
+
+=item 5
+
+Move on to the third element '.*'. Since we are at the end of
+the string and '.*' can match 0 times, assign it the empty string.
+
+=item 6
+
+We are done!
+
+=back
+
+Most of the time, all this moving forward and backtracking happens
+quickly and searching is fast. There are some pathological regexps,
+however, whose execution time exponentially grows with the size of the
+string. A typical structure that blows up in your face is of the form
+
+ /(a|b+)*/;
+
+The problem is the nested indeterminate quantifiers. There are many
+different ways of partitioning a string of length n between the C<+>
+and C<*>: one repetition with C<b+> of length n, two repetitions with
+the first C<b+> length k and the second with length n-k, m repetitions
+whose bits add up to length n, etc. In fact there are an exponential
+number of ways to partition a string as a function of length. A
+regexp may get lucky and match early in the process, but if there is
+no match, perl will try I<every> possibility before giving up. So be
+careful with nested C<*>'s, C<{n,m}>'s, and C<+>'s. The book
+I<Mastering Regular Expressions> by Jeffrey Friedl gives a wonderful
+discussion of this and other efficiency issues.
+
+=head2 Building a regexp
+
+At this point, we have all the basic regexp concepts covered, so let's
+give a more involved example of a regular expression. We will build a
+regexp that matches numbers.
+
+The first task in building a regexp is to decide what we want to match
+and what we want to exclude. In our case, we want to match both
+integers and floating point numbers and we want to reject any string
+that isn't a number.
+
+The next task is to break the problem down into smaller problems that
+are easily converted into a regexp.
+
+The simplest case is integers. These consist of a sequence of digits,
+with an optional sign in front. The digits we can represent with
+C<\d+> and the sign can be matched with C<[+-]>. Thus the integer
+regexp is
+
+ /[+-]?\d+/; # matches integers
+
+A floating point number potentially has a sign, an integral part, a
+decimal point, a fractional part, and an exponent. One or more of these
+parts is optional, so we need to check out the different
+possibilities. Floating point numbers which are in proper form include
+123., 0.345, .34, -1e6, and 25.4E-72. As with integers, the sign out
+front is completely optional and can be matched by C<[+-]?>. We can
+see that if there is no exponent, floating point numbers must have a
+decimal point, otherwise they are integers. We might be tempted to
+model these with C<\d*\.\d*>, but this would also match just a single
+decimal point, which is not a number. So the three cases of floating
+point number sans exponent are
+
+ /[+-]?\d+\./; # 1., 321., etc.
+ /[+-]?\.\d+/; # .1, .234, etc.
+ /[+-]?\d+\.\d+/; # 1.0, 30.56, etc.
+
+These can be combined into a single regexp with a three-way alternation:
+
+ /[+-]?(\d+\.\d+|\d+\.|\.\d+)/; # floating point, no exponent
+
+In this alternation, it is important to put C<'\d+\.\d+'> before
+C<'\d+\.'>. If C<'\d+\.'> were first, the regexp would happily match that
+and ignore the fractional part of the number.
+
+Now consider floating point numbers with exponents. The key
+observation here is that I<both> integers and numbers with decimal
+points are allowed in front of an exponent. Then exponents, like the
+overall sign, are independent of whether we are matching numbers with
+or without decimal points, and can be 'decoupled' from the
+mantissa. The overall form of the regexp now becomes clear:
+
+ /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/;
+
+The exponent is an C<e> or C<E>, followed by an integer. So the
+exponent regexp is
+
+ /[eE][+-]?\d+/; # exponent
+
+Putting all the parts together, we get a regexp that matches numbers:
+
+ /^[+-]?(\d+\.\d+|\d+\.|\.\d+|\d+)([eE][+-]?\d+)?$/; # Ta da!
+
+Long regexps like this may impress your friends, but can be hard to
+decipher. In complex situations like this, the C<//x> modifier for a
+match is invaluable. It allows one to put nearly arbitrary whitespace
+and comments into a regexp without affecting its meaning. Using it,
+we can rewrite our 'extended' regexp in the more pleasing form
+
+ /^
+ [+-]? # first, match an optional sign
+ ( # then match integers or f.p. mantissas:
+ \d+\.\d+ # mantissa of the form a.b
+ |\d+\. # mantissa of the form a.
+ |\.\d+ # mantissa of the form .b
+ |\d+ # integer of the form a
+ )
+ ([eE][+-]?\d+)? # finally, optionally match an exponent
+ $/x;
+
+If whitespace is mostly irrelevant, how does one include space
+characters in an extended regexp? The answer is to backslash it
+S<C<'\ '> > or put it in a character class S<C<[ ]> >. The same thing
+goes for pound signs: use C<\#> or C<[#]>. For instance, Perl allows
+a space between the sign and the mantissa/integer, and we could add
+this to our regexp as follows:
+
+ /^
+ [+-]?\ * # first, match an optional sign *and space*
+ ( # then match integers or f.p. mantissas:
+ \d+\.\d+ # mantissa of the form a.b
+ |\d+\. # mantissa of the form a.
+ |\.\d+ # mantissa of the form .b
+ |\d+ # integer of the form a
+ )
+ ([eE][+-]?\d+)? # finally, optionally match an exponent
+ $/x;
+
+In this form, it is easier to see a way to simplify the
+alternation. Alternatives 1, 2, and 4 all start with C<\d+>, so it
+could be factored out:
+
+ /^
+ [+-]?\ * # first, match an optional sign
+ ( # then match integers or f.p. mantissas:
+ \d+ # start out with a ...
+ (
+ \.\d* # mantissa of the form a.b or a.
+ )? # ? takes care of integers of the form a
+ |\.\d+ # mantissa of the form .b
+ )
+ ([eE][+-]?\d+)? # finally, optionally match an exponent
+ $/x;
+
+or written in the compact form,
+
+ /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/;
+
+This is our final regexp. To recap, we built a regexp by
+
+=over 4
+
+=item *
+
+specifying the task in detail,
+
+=item *
+
+breaking down the problem into smaller parts,
+
+=item *
+
+translating the small parts into regexps,
+
+=item *
+
+combining the regexps,
+
+=item *
+
+and optimizing the final combined regexp.
+
+=back
+
+These are also the typical steps involved in writing a computer
+program. This makes perfect sense, because regular expressions are
+essentially programs written in a little computer language that specifies
+patterns.
+
+=head2 Using regular expressions in Perl
+
+The last topic of Part 1 briefly covers how regexps are used in Perl
+programs. Where do they fit into Perl syntax?
+
+We have already introduced the matching operator in its default
+C</regexp/> and arbitrary delimiter C<m!regexp!> forms. We have used
+the binding operator C<=~> and its negation C<!~> to test for string
+matches. Associated with the matching operator, we have discussed the
+single line C<//s>, multi-line C<//m>, case-insensitive C<//i> and
+extended C<//x> modifiers.
+
+There are a few more things you might want to know about matching
+operators. First, we pointed out earlier that variables in regexps are
+substituted before the regexp is evaluated:
+
+ $pattern = 'Seuss';
+ while (<>) {
+ print if /$pattern/;
+ }
+
+This will print any lines containing the word C<Seuss>. It is not as
+efficient as it could be, however, because perl has to re-evaluate
+C<$pattern> each time through the loop. If C<$pattern> won't be
+changing over the lifetime of the script, we can add the C<//o>
+modifier, which directs perl to only perform variable substitutions
+once:
+
+ #!/usr/bin/perl
+ # Improved simple_grep
+ $regexp = shift;
+ while (<>) {
+ print if /$regexp/o; # a good deal faster
+ }
+
+If you change C<$pattern> after the first substitution happens, perl
+will ignore it. If you don't want any substitutions at all, use the
+special delimiter C<m''>:
+
+ $pattern = 'Seuss';
+ while (<>) {
+ print if m'$pattern'; # matches '$pattern', not 'Seuss'
+ }
+
+C<m''> acts like single quotes on a regexp; all other C<m> delimiters
+act like double quotes. If the regexp evaluates to the empty string,
+the regexp in the I<last successful match> is used instead. So we have
+
+ "dog" =~ /d/; # 'd' matches
+ "dogbert" =~ //; # this matches the 'd' regexp used before
+
+The final two modifiers C<//g> and C<//c> concern multiple matches.
+The modifier C<//g> stands for global matching and allows the
+matching operator to match within a string as many times as possible.
+In scalar context, successive invocations against a string will have
+C<//g> jump from match to match, keeping track of position in the
+string as it goes along. You can get or set the position with the
+C<pos()> function.
+
+The use of C<//g> is shown in the following example. Suppose we have
+a string that consists of words separated by spaces. If we know how
+many words there are in advance, we could extract the words using
+groupings:
+
+ $x = "cat dog house"; # 3 words
+ $x =~ /^\s*(\w+)\s+(\w+)\s+(\w+)\s*$/; # matches,
+ # $1 = 'cat'
+ # $2 = 'dog'
+ # $3 = 'house'
+
+But what if we had an indeterminate number of words? This is the sort
+of task C<//g> was made for. To extract all words, form the simple
+regexp C<(\w+)> and loop over all matches with C</(\w+)/g>:
+
+ while ($x =~ /(\w+)/g) {
+ print "Word is $1, ends at position ", pos $x, "\n";
+ }
+
+prints
+
+ Word is cat, ends at position 3
+ Word is dog, ends at position 7
+ Word is house, ends at position 13
+
+A failed match or changing the target string resets the position. If
+you don't want the position reset after failure to match, add the
+C<//c>, as in C</regexp/gc>. The current position in the string is
+associated with the string, not the regexp. This means that different
+strings have different positions and their respective positions can be
+set or read independently.
+
+In list context, C<//g> returns a list of matched groupings, or if
+there are no groupings, a list of matches to the whole regexp. So if
+we wanted just the words, we could use
+
+ @words = ($x =~ /(\w+)/g); # matches,
+ # $words[0] = 'cat'
+ # $words[1] = 'dog'
+ # $words[2] = 'house'
+
+Closely associated with the C<//g> modifier is the C<\G> anchor. The
+C<\G> anchor matches at the point where the previous C<//g> match left
+off. C<\G> allows us to easily do context-sensitive matching:
+
+ $metric = 1; # use metric units
+ ...
+ $x = <FILE>; # read in measurement
+ $x =~ /^([+-]?\d+)\s*/g; # get magnitude
+ $weight = $1;
+ if ($metric) { # error checking
+ print "Units error!" unless $x =~ /\Gkg\./g;
+ }
+ else {
+ print "Units error!" unless $x =~ /\Glbs\./g;
+ }
+ $x =~ /\G\s+(widget|sprocket)/g; # continue processing
+
+The combination of C<//g> and C<\G> allows us to process the string a
+bit at a time and use arbitrary Perl logic to decide what to do next.
+
+C<\G> is also invaluable in processing fixed length records with
+regexps. Suppose we have a snippet of coding region DNA, encoded as
+base pair letters C<ATCGTTGAAT...> and we want to find all the stop
+codons C<TGA>. In a coding region, codons are 3-letter sequences, so
+we can think of the DNA snippet as a sequence of 3-letter records. The
+naive regexp
+
+ # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC"
+ $dna = "ATCGTTGAATGCAAATGACATGAC";
+ $dna =~ /TGA/;
+
+doesn't work; it may match a C<TGA>, but there is no guarantee that
+the match is aligned with codon boundaries, e.g., the substring
+S<C<GTT GAA> > gives a match. A better solution is
+
+ while ($dna =~ /(\w\w\w)*?TGA/g) { # note the minimal *?
+ print "Got a TGA stop codon at position ", pos $dna, "\n";
+ }
+
+which prints
+
+ Got a TGA stop codon at position 18
+ Got a TGA stop codon at position 23
+
+Position 18 is good, but position 23 is bogus. What happened?
+
+The answer is that our regexp works well until we get past the last
+real match. Then the regexp will fail to match a synchronized C<TGA>
+and start stepping ahead one character position at a time, not what we
+want. The solution is to use C<\G> to anchor the match to the codon
+alignment:
+
+ while ($dna =~ /\G(\w\w\w)*?TGA/g) {
+ print "Got a TGA stop codon at position ", pos $dna, "\n";
+ }
+
+This prints
+
+ Got a TGA stop codon at position 18
+
+which is the correct answer. This example illustrates that it is
+important not only to match what is desired, but to reject what is not
+desired.
+
+B<search and replace>
+
+Regular expressions also play a big role in B<search and replace>
+operations in Perl. Search and replace is accomplished with the
+C<s///> operator. The general form is
+C<s/regexp/replacement/modifiers>, with everything we know about
+regexps and modifiers applying in this case as well. The
+C<replacement> is a Perl double quoted string that replaces in the
+string whatever is matched with the C<regexp>. The operator C<=~> is
+also used here to associate a string with C<s///>. If matching
+against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match,
+C<s///> returns the number of substitutions made, otherwise it returns
+false. Here are a few examples:
+
+ $x = "Time to feed the cat!";
+ $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!"
+ if ($x =~ s/^(Time.*hacker)!$/$1 now!/) {
+ $more_insistent = 1;
+ }
+ $y = "'quoted words'";
+ $y =~ s/^'(.*)'$/$1/; # strip single quotes,
+ # $y contains "quoted words"
+
+In the last example, the whole string was matched, but only the part
+inside the single quotes was grouped. With the C<s///> operator, the
+matched variables C<$1>, C<$2>, etc. are immediately available for use
+in the replacement expression, so we use C<$1> to replace the quoted
+string with just what was quoted. With the global modifier, C<s///g>
+will search and replace all occurrences of the regexp in the string:
+
+ $x = "I batted 4 for 4";
+ $x =~ s/4/four/; # doesn't do it all:
+ # $x contains "I batted four for 4"
+ $x = "I batted 4 for 4";
+ $x =~ s/4/four/g; # does it all:
+ # $x contains "I batted four for four"
+
+If you prefer 'regex' over 'regexp' in this tutorial, you could use
+the following program to replace it:
+
+ % cat > simple_replace
+ #!/usr/bin/perl
+ $regexp = shift;
+ $replacement = shift;
+ while (<>) {
+ s/$regexp/$replacement/go;
+ print;
+ }
+ ^D
+
+ % simple_replace regexp regex perlretut.pod
+
+In C<simple_replace> we used the C<s///g> modifier to replace all
+occurrences of the regexp on each line and the C<s///o> modifier to
+compile the regexp only once. As with C<simple_grep>, both the
+C<print> and the C<s/$regexp/$replacement/go> use C<$_> implicitly.
+
+A modifier available specifically to search and replace is the
+C<s///e> evaluation modifier. C<s///e> wraps an C<eval{...}> around
+the replacement string and the evaluated result is substituted for the
+matched substring. C<s///e> is useful if you need to do a bit of
+computation in the process of replacing text. This example counts
+character frequencies in a line:
+
+ $x = "Bill the cat";
+ $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself
+ print "frequency of '$_' is $chars{$_}\n"
+ foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars);
+
+This prints
+
+ frequency of ' ' is 2
+ frequency of 't' is 2
+ frequency of 'l' is 2
+ frequency of 'B' is 1
+ frequency of 'c' is 1
+ frequency of 'e' is 1
+ frequency of 'h' is 1
+ frequency of 'i' is 1
+ frequency of 'a' is 1
+
+As with the match C<m//> operator, C<s///> can use other delimiters,
+such as C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are
+used C<s'''>, then the regexp and replacement are treated as single
+quoted strings and there are no substitutions. C<s///> in list context
+returns the same thing as in scalar context, i.e., the number of
+matches.
+
+B<The split operator>
+
+The B<C<split> > function can also optionally use a matching operator
+C<m//> to split a string. C<split /regexp/, string, limit> splits
+C<string> into a list of substrings and returns that list. The regexp
+is used to match the character sequence that the C<string> is split
+with respect to. The C<limit>, if present, constrains splitting into
+no more than C<limit> number of strings. For example, to split a
+string into words, use
+
+ $x = "Calvin and Hobbes";
+ @words = split /\s+/, $x; # $words[0] = 'Calvin'
+ # $words[1] = 'and'
+ # $words[2] = 'Hobbes'
+
+If the empty regexp C<//> is used, the regexp always matches and
+the string is split into individual characters. If the regexp has
+groupings, then the list produced contains the matched substrings from the
+groupings as well. For instance,
+
+ $x = "/usr/bin/perl";
+ @dirs = split m!/!, $x; # $dirs[0] = ''
+ # $dirs[1] = 'usr'
+ # $dirs[2] = 'bin'
+ # $dirs[3] = 'perl'
+ @parts = split m!(/)!, $x; # $parts[0] = ''
+ # $parts[1] = '/'
+ # $parts[2] = 'usr'
+ # $parts[3] = '/'
+ # $parts[4] = 'bin'
+ # $parts[5] = '/'
+ # $parts[6] = 'perl'
+
+Since the first character of $x matched the regexp, C<split> prepended
+an empty initial element to the list.
+
+If you have read this far, congratulations! You now have all the basic
+tools needed to use regular expressions to solve a wide range of text
+processing problems. If this is your first time through the tutorial,
+why not stop here and play around with regexps a while... S<Part 2>
+concerns the more esoteric aspects of regular expressions and those
+concepts certainly aren't needed right at the start.
+
+=head1 Part 2: Power tools
+
+OK, you know the basics of regexps and you want to know more. If
+matching regular expressions is analogous to a walk in the woods, then
+the tools discussed in Part 1 are analogous to topo maps and a
+compass, basic tools we use all the time. Most of the tools in part 2
+are analogous to flare guns and satellite phones. They aren't used
+too often on a hike, but when we are stuck, they can be invaluable.
+
+What follows are the more advanced, less used, or sometimes esoteric
+capabilities of perl regexps. In Part 2, we will assume you are
+comfortable with the basics and concentrate on the new features.
+
+=head2 More on characters, strings, and character classes
+
+There are a number of escape sequences and character classes that we
+haven't covered yet.
+
+There are several escape sequences that convert characters or strings
+between upper and lower case. C<\l> and C<\u> convert the next
+character to lower or upper case, respectively:
+
+ $x = "perl";
+ $string =~ /\u$x/; # matches 'Perl' in $string
+ $x = "M(rs?|s)\\."; # note the double backslash
+ $string =~ /\l$x/; # matches 'mr.', 'mrs.', and 'ms.',
+
+C<\L> and C<\U> convert a whole substring, delimited by C<\L> or
+C<\U> and C<\E>, to lower or upper case:
+
+ $x = "This word is in lower case:\L SHOUT\E";
+ $x =~ /shout/; # matches
+ $x = "I STILL KEYPUNCH CARDS FOR MY 360";
+ $x =~ /\Ukeypunch/; # matches punch card string
+
+If there is no C<\E>, case is converted until the end of the
+string. The regexps C<\L\u$word> or C<\u\L$word> convert the first
+character of C<$word> to uppercase and the rest of the characters to
+lowercase.
+
+Control characters can be escaped with C<\c>, so that a control-Z
+character would be matched with C<\cZ>. The escape sequence
+C<\Q>...C<\E> quotes, or protects most non-alphabetic characters. For
+instance,
+
+ $x = "\QThat !^*&%~& cat!";
+ $x =~ /\Q!^*&%~&\E/; # check for rough language
+
+It does not protect C<$> or C<@>, so that variables can still be
+substituted.
+
+With the advent of 5.6.0, perl regexps can handle more than just the
+standard ASCII character set. Perl now supports B<Unicode>, a standard
+for encoding the character sets from many of the world's written
+languages. Unicode does this by allowing characters to be more than
+one byte wide. Perl uses the UTF-8 encoding, in which ASCII characters
+are still encoded as one byte, but characters greater than C<chr(127)>
+may be stored as two or more bytes.
+
+What does this mean for regexps? Well, regexp users don't need to know
+much about perl's internal representation of strings. But they do need
+to know 1) how to represent Unicode characters in a regexp and 2) when
+a matching operation will treat the string to be searched as a
+sequence of bytes (the old way) or as a sequence of Unicode characters
+(the new way). The answer to 1) is that Unicode characters greater
+than C<chr(127)> may be represented using the C<\x{hex}> notation,
+with C<hex> a hexadecimal integer:
+
+ use utf8; # We will be doing Unicode processing
+ /\x{263a}/; # match a Unicode smiley face :)
+
+Unicode characters in the range of 128-255 use two hexadecimal digits
+with braces: C<\x{ab}>. Note that this is different than C<\xab>,
+which is just a hexadecimal byte with no Unicode
+significance.
+
+Figuring out the hexadecimal sequence of a Unicode character you want
+or deciphering someone else's hexadecimal Unicode regexp is about as
+much fun as programming in machine code. So another way to specify
+Unicode characters is to use the S<B<named character> > escape
+sequence C<\N{name}>. C<name> is a name for the Unicode character, as
+specified in the Unicode standard. For instance, if we wanted to
+represent or match the astrological sign for the planet Mercury, we
+could use
+
+ use utf8; # We will be doing Unicode processing
+ use charnames ":full"; # use named chars with Unicode full names
+ $x = "abc\N{MERCURY}def";
+ $x =~ /\N{MERCURY}/; # matches
+
+One can also use short names or restrict names to a certain alphabet:
+
+ use utf8; # We will be doing Unicode processing
+
+ use charnames ':full';
+ print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
+
+ use charnames ":short";
+ print "\N{greek:Sigma} is an upper-case sigma.\n";
+
+ use charnames qw(greek);
+ print "\N{sigma} is Greek sigma\n";
+
+A list of full names is found in the file Names.txt in the
+lib/perl5/5.6.0/unicode directory.
+
+The answer to requirement 2), as of 5.6.0, is that if a regexp
+contains Unicode characters, the string is searched as a sequence of
+Unicode characters. Otherwise, the string is searched as a sequence of
+bytes. If the string is being searched as a sequence of Unicode
+characters, but matching a single byte is required, we can use the C<\C>
+escape sequence. C<\C> is a character class akin to C<.> except that
+it matches I<any> byte 0-255. So
+
+ use utf8; # We will be doing Unicode processing
+ use charnames ":full"; # use named chars with Unicode full names
+ $x = "a";
+ $x =~ /\C/; # matches 'a', eats one byte
+ $x = "";
+ $x =~ /\C/; # doesn't match, no bytes to match
+ $x = "\N{MERCURY}"; # two-byte Unicode character
+ $x =~ /\C/; # matches, but dangerous!
+
+The last regexp matches, but is dangerous because the string
+I<character> position is no longer synchronized to the string I<byte>
+position. This generates the warning 'Malformed UTF-8
+character'. C<\C> is best used for matching the binary data in strings
+with binary data intermixed with Unicode characters.
+
+Let us now discuss the rest of the character classes. Just as with
+Unicode characters, there are named Unicode character classes
+represented by the C<\p{name}> escape sequence. Closely associated is
+the C<\P{name}> character class, which is the negation of the
+C<\p{name}> class. For example, to match lower and uppercase
+characters,
+
+ use utf8; # We will be doing Unicode processing
+ use charnames ":full"; # use named chars with Unicode full names
+ $x = "BOB";
+ $x =~ /^\p{IsUpper}/; # matches, uppercase char class
+ $x =~ /^\P{IsUpper}/; # doesn't match, char class sans uppercase
+ $x =~ /^\p{IsLower}/; # doesn't match, lowercase char class
+ $x =~ /^\P{IsLower}/; # matches, char class sans lowercase
+
+Here is the association between some Perl named classes and the
+traditional Unicode classes:
+
+ Perl class name Unicode class name or regular expression
+
+ IsAlpha /^[LM]/
+ IsAlnum /^[LMN]/
+ IsASCII $code <= 127
+ IsCntrl /^C/
+ IsBlank $code =~ /^(0020|0009)$/ || /^Z[^lp]/
+ IsDigit Nd
+ IsGraph /^([LMNPS]|Co)/
+ IsLower Ll
+ IsPrint /^([LMNPS]|Co|Zs)/
+ IsPunct /^P/
+ IsSpace /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/
+ IsSpacePerl /^Z/ || ($code =~ /^(0009|000A|000C|000D)$/
+ IsUpper /^L[ut]/
+ IsWord /^[LMN]/ || $code eq "005F"
+ IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/
+
+You can also use the official Unicode class names with the C<\p> and
+C<\P>, like C<\p{L}> for Unicode 'letters', or C<\p{Lu}> for uppercase
+letters, or C<\P{Nd}> for non-digits. If a C<name> is just one
+letter, the braces can be dropped. For instance, C<\pM> is the
+character class of Unicode 'marks'.
+
+C<\X> is an abbreviation for a character class sequence that includes
+the Unicode 'combining character sequences'. A 'combining character
+sequence' is a base character followed by any number of combining
+characters. An example of a combining character is an accent. Using
+the Unicode full names, e.g., S<C<A + COMBINING RING> > is a combining
+character sequence with base character C<A> and combining character
+S<C<COMBINING RING> >, which translates in Danish to A with the circle
+atop it, as in the word Angstrom. C<\X> is equivalent to C<\PM\pM*>,
+i.e., a non-mark followed by zero or more marks.
+
+As if all those classes weren't enough, Perl also defines POSIX style
+character classes. These have the form C<[:name:]>, with C<name> the
+name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>,
+C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>,
+C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl
+extension to match C<\w>), and C<blank> (a GNU extension). If C<utf8>
+is being used, then these classes are defined the same as their
+corresponding perl Unicode classes: C<[:upper:]> is the same as
+C<\p{IsUpper}>, etc. The POSIX character classes, however, don't
+require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and
+C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s>
+character classes. To negate a POSIX class, put a C<^> in front of
+the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
+C<utf8>, C<\P{IsDigit}>. The Unicode and POSIX character classes can
+be used just like C<\d>, with the exception that POSIX character
+classes can only be used inside of a character class:
+
+ /\s+[abc[:digit:]xyz]\s*/; # match a,b,c,x,y,z, or a digit
+ /^=item\s[[:digit:]]/; # match '=item',
+ # followed by a space and a digit
+ use utf8;
+ use charnames ":full";
+ /\s+[abc\p{IsDigit}xyz]\s+/; # match a,b,c,x,y,z, or a digit
+ /^=item\s\p{IsDigit}/; # match '=item',
+ # followed by a space and a digit
+
+Whew! That is all the rest of the characters and character classes.
+
+=head2 Compiling and saving regular expressions
+
+In Part 1 we discussed the C<//o> modifier, which compiles a regexp
+just once. This suggests that a compiled regexp is some data structure
+that can be stored once and used again and again. The regexp quote
+C<qr//> does exactly that: C<qr/string/> compiles the C<string> as a
+regexp and transforms the result into a form that can be assigned to a
+variable:
+
+ $reg = qr/foo+bar?/; # reg contains a compiled regexp
+
+Then C<$reg> can be used as a regexp:
+
+ $x = "fooooba";
+ $x =~ $reg; # matches, just like /foo+bar?/
+ $x =~ /$reg/; # same thing, alternate form
+
+C<$reg> can also be interpolated into a larger regexp:
+
+ $x =~ /(abc)?$reg/; # still matches
+
+As with the matching operator, the regexp quote can use different
+delimiters, e.g., C<qr!!>, C<qr{}> and C<qr~~>. The single quote
+delimiters C<qr''> prevent any interpolation from taking place.
+
+Pre-compiled regexps are useful for creating dynamic matches that
+don't need to be recompiled each time they are encountered. Using
+pre-compiled regexps, the C<simple_grep> program can be expanded into a
+program that matches multiple patterns:
+
+ % cat > multi_grep
+ #!/usr/bin/perl
+ # multi_grep - match any of <number> regexps
+ # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ...
+
+ $number = shift;
+ $regexp[$_] = shift foreach (0..$number-1);
+ @compiled = map qr/$_/, @regexp;
+ while ($line = <>) {
+ foreach $pattern (@compiled) {
+ if ($line =~ /$pattern/) {
+ print $line;
+ last; # we matched, so move onto the next line
+ }
+ }
+ }
+ ^D
+
+ % multi_grep 2 last for multi_grep
+ $regexp[$_] = shift foreach (0..$number-1);
+ foreach $pattern (@compiled) {
+ last;
+
+Storing pre-compiled regexps in an array C<@compiled> allows us to
+simply loop through the regexps without any recompilation, thus gaining
+flexibility without sacrificing speed.
+
+=head2 Embedding comments and modifiers in a regular expression
+
+Starting with this section, we will be discussing Perl's set of
+B<extended patterns>. These are extensions to the traditional regular
+expression syntax that provide powerful new tools for pattern
+matching. We have already seen extensions in the form of the minimal
+matching constructs C<??>, C<*?>, C<+?>, C<{n,m}?>, and C<{n,}?>. The
+rest of the extensions below have the form C<(?char...)>, where the
+C<char> is a character that determines the type of extension.
+
+The first extension is an embedded comment C<(?#text)>. This embeds a
+comment into the regular expression without affecting its meaning. The
+comment should not have any closing parentheses in the text. An
+example is
+
+ /(?# Match an integer:)[+-]?\d+/;
+
+This style of commenting has been largely superseded by the raw,
+freeform commenting that is allowed with the C<//x> modifier.
+
+The modifiers C<//i>, C<//m>, C<//s>, and C<//x> can also be embedded in
+a regexp using C<(?i)>, C<(?m)>, C<(?s)>, and C<(?x)>. For instance,
+
+ /(?i)yes/; # match 'yes' case insensitively
+ /yes/i; # same thing
+ /(?x)( # freeform version of an integer regexp
+ [+-]? # match an optional sign
+ \d+ # match a sequence of digits
+ )
+ /x;
+
+Embedded modifiers can have two important advantages over the usual
+modifiers. Embedded modifiers allow a custom set of modifiers to
+I<each> regexp pattern. This is great for matching an array of regexps
+that must have different modifiers:
+
+ $pattern[0] = '(?i)doctor';
+ $pattern[1] = 'Johnson';
+ ...
+ while (<>) {
+ foreach $patt (@pattern) {
+ print if /$patt/;
+ }
+ }
+
+The second advantage is that embedded modifiers only affect the regexp
+inside the group the embedded modifier is contained in. So grouping
+can be used to localize the modifier's effects:
+
+ /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc.
+
+Embedded modifiers can also turn off any modifiers already present
+by using, e.g., C<(?-i)>. Modifiers can also be combined into
+a single expression, e.g., C<(?s-i)> turns on single line mode and
+turns off case insensitivity.
+
+=head2 Non-capturing groupings
+
+We noted in Part 1 that groupings C<()> had two distinct functions: 1)
+group regexp elements together as a single unit, and 2) extract, or
+capture, substrings that matched the regexp in the
+grouping. Non-capturing groupings, denoted by C<(?:regexp)>, allow the
+regexp to be treated as a single unit, but don't extract substrings or
+set matching variables C<$1>, etc. Both capturing and non-capturing
+groupings are allowed to co-exist in the same regexp. Because there is
+no extraction, non-capturing groupings are faster than capturing
+groupings. Non-capturing groupings are also handy for choosing exactly
+which parts of a regexp are to be extracted to matching variables:
+
+ # match a number, $1-$4 are set, but we only want $1
+ /([+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)/;
+
+ # match a number faster, only $1 is set
+ /([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)/;
+
+ # match a number, get $1 = whole number, $2 = exponent
+ /([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE]([+-]?\d+))?)/;
+
+Non-capturing groupings are also useful for removing nuisance
+elements gathered from a split operation:
+
+ $x = '12a34b5';
+ @num = split /(a|b)/, $x; # @num = ('12','a','34','b','5')
+ @num = split /(?:a|b)/, $x; # @num = ('12','34','5')
+
+Non-capturing groupings may also have embedded modifiers:
+C<(?i-m:regexp)> is a non-capturing grouping that matches C<regexp>
+case insensitively and turns off multi-line mode.
+
+=head2 Looking ahead and looking behind
+
+This section concerns the lookahead and lookbehind assertions. First,
+a little background.
+
+In Perl regular expressions, most regexp elements 'eat up' a certain
+amount of string when they match. For instance, the regexp element
+C<[abc]> eats up one character of the string when it matches, in the
+sense that perl moves to the next character position in the string
+after the match. There are some elements, however, that don't eat up
+characters (advance the character position) if they match. The examples
+we have seen so far are the anchors. The anchor C<^> matches the
+beginning of the line, but doesn't eat any characters. Similarly, the
+word boundary anchor C<\b> matches, e.g., if the character to the left
+is a word character and the character to the right is a non-word
+character, but it doesn't eat up any characters itself. Anchors are
+examples of 'zero-width assertions'. Zero-width, because they consume
+no characters, and assertions, because they test some property of the
+string. In the context of our walk in the woods analogy to regexp
+matching, most regexp elements move us along a trail, but anchors have
+us stop a moment and check our surroundings. If the local environment
+checks out, we can proceed forward. But if the local environment
+doesn't satisfy us, we must backtrack.
+
+Checking the environment entails either looking ahead on the trail,
+looking behind, or both. C<^> looks behind, to see that there are no
+characters before. C<$> looks ahead, to see that there are no
+characters after. C<\b> looks both ahead and behind, to see if the
+characters on either side differ in their 'word'-ness.
+
+The lookahead and lookbehind assertions are generalizations of the
+anchor concept. Lookahead and lookbehind are zero-width assertions
+that let us specify which characters we want to test for. The
+lookahead assertion is denoted by C<(?=regexp)> and the lookbehind
+assertion is denoted by C<< (?<=fixed-regexp) >>. Some examples are
+
+ $x = "I catch the housecat 'Tom-cat' with catnip";
+ $x =~ /cat(?=\s+)/; # matches 'cat' in 'housecat'
+ @catwords = ($x =~ /(?<=\s)cat\w+/g); # matches,
+ # $catwords[0] = 'catch'
+ # $catwords[1] = 'catnip'
+ $x =~ /\bcat\b/; # matches 'cat' in 'Tom-cat'
+ $x =~ /(?<=\s)cat(?=\s)/; # doesn't match; no isolated 'cat' in
+ # middle of $x
+
+Note that the parentheses in C<(?=regexp)> and C<< (?<=regexp) >> are
+non-capturing, since these are zero-width assertions. Thus in the
+second regexp, the substrings captured are those of the whole regexp
+itself. Lookahead C<(?=regexp)> can match arbitrary regexps, but
+lookbehind C<< (?<=fixed-regexp) >> only works for regexps of fixed
+width, i.e., a fixed number of characters long. Thus
+C<< (?<=(ab|bc)) >> is fine, but C<< (?<=(ab)*) >> is not. The
+negated versions of the lookahead and lookbehind assertions are
+denoted by C<(?!regexp)> and C<< (?<!fixed-regexp) >> respectively.
+They evaluate true if the regexps do I<not> match:
+
+ $x = "foobar";
+ $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo'
+ $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo'
+ $x =~ /(?<!\s)foo/; # matches, there is no \s before 'foo'
+
+=head2 Using independent subexpressions to prevent backtracking
+
+The last few extended patterns in this tutorial are experimental as of
+5.6.0. Play with them, use them in some code, but don't rely on them
+just yet for production code.
+
+S<B<Independent subexpressions> > are regular expressions, in the
+context of a larger regular expression, that function independently of
+the larger regular expression. That is, they consume as much or as
+little of the string as they wish without regard for the ability of
+the larger regexp to match. Independent subexpressions are represented
+by C<< (?>regexp) >>. We can illustrate their behavior by first
+considering an ordinary regexp:
+
+ $x = "ab";
+ $x =~ /a*ab/; # matches
+
+This obviously matches, but in the process of matching, the
+subexpression C<a*> first grabbed the C<a>. Doing so, however,
+wouldn't allow the whole regexp to match, so after backtracking, C<a*>
+eventually gave back the C<a> and matched the empty string. Here, what
+C<a*> matched was I<dependent> on what the rest of the regexp matched.
+
+Contrast that with an independent subexpression:
+
+ $x =~ /(?>a*)ab/; # doesn't match!
+
+The independent subexpression C<< (?>a*) >> doesn't care about the rest
+of the regexp, so it sees an C<a> and grabs it. Then the rest of the
+regexp C<ab> cannot match. Because C<< (?>a*) >> is independent, there
+is no backtracking and the independent subexpression does not give
+up its C<a>. Thus the match of the regexp as a whole fails. A similar
+behavior occurs with completely independent regexps:
+
+ $x = "ab";
+ $x =~ /a*/g; # matches, eats an 'a'
+ $x =~ /\Gab/g; # doesn't match, no 'a' available
+
+Here C<//g> and C<\G> create a 'tag team' handoff of the string from
+one regexp to the other. Regexps with an independent subexpression are
+much like this, with a handoff of the string to the independent
+subexpression, and a handoff of the string back to the enclosing
+regexp.
+
+The ability of an independent subexpression to prevent backtracking
+can be quite useful. Suppose we want to match a non-empty string
+enclosed in parentheses up to two levels deep. Then the following
+regexp matches:
+
+ $x = "abc(de(fg)h"; # unbalanced parentheses
+ $x =~ /\( ( [^()]+ | \([^()]*\) )+ \)/x;
+
+The regexp matches an open parenthesis, one or more copies of an
+alternation, and a close parenthesis. The alternation is two-way, with
+the first alternative C<[^()]+> matching a substring with no
+parentheses and the second alternative C<\([^()]*\)> matching a
+substring delimited by parentheses. The problem with this regexp is
+that it is pathological: it has nested indeterminate quantifiers
+of the form C<(a+|b)+>. We discussed in Part 1 how nested quantifiers
+like this could take an exponentially long time to execute if there
+was no match possible. To prevent the exponential blowup, we need to
+prevent useless backtracking at some point. This can be done by
+enclosing the inner quantifier as an independent subexpression:
+
+ $x =~ /\( ( (?>[^()]+) | \([^()]*\) )+ \)/x;
+
+Here, C<< (?>[^()]+) >> breaks the degeneracy of string partitioning
+by gobbling up as much of the string as possible and keeping it. Then
+match failures fail much more quickly.
+
+=head2 Conditional expressions
+
+A S<B<conditional expression> > is a form of if-then-else statement
+that allows one to choose which patterns are to be matched, based on
+some condition. There are two types of conditional expression:
+C<(?(condition)yes-regexp)> and
+C<(?(condition)yes-regexp|no-regexp)>. C<(?(condition)yes-regexp)> is
+like an S<C<'if () {}'> > statement in Perl. If the C<condition> is true,
+the C<yes-regexp> will be matched. If the C<condition> is false, the
+C<yes-regexp> will be skipped and perl will move on to the next regexp
+element. The second form is like an S<C<'if () {} else {}'> > statement
+in Perl. If the C<condition> is true, the C<yes-regexp> will be
+matched, otherwise the C<no-regexp> will be matched.
+
+The C<condition> can have two forms. The first form is simply an
+integer in parentheses C<(integer)>. It is true if the corresponding
+backreference C<\integer> matched earlier in the regexp. The second
+form is a bare zero width assertion C<(?...)>, either a
+lookahead, a lookbehind, or a code assertion (discussed in the next
+section).
+
+The integer form of the C<condition> allows us to choose, with more
+flexibility, what to match based on what matched earlier in the
+regexp. This searches for words of the form C<"$x$x"> or
+C<"$x$y$y$x">:
+
+ % simple_grep '^(\w+)(\w+)?(?(2)\2\1|\1)$' /usr/dict/words
+ beriberi
+ coco
+ couscous
+ deed
+ ...
+ toot
+ toto
+ tutu
+
+The lookbehind C<condition> allows, along with backreferences,
+an earlier part of the match to influence a later part of the
+match. For instance,
+
+ /[ATGC]+(?(?<=AA)G|C)$/;
+
+matches a DNA sequence such that it either ends in C<AAG>, or some
+other base pair combination and C<C>. Note that the form is
+C<< (?(?<=AA)G|C) >> and not C<< (?((?<=AA))G|C) >>; for the
+lookahead, lookbehind or code assertions, the parentheses around the
+conditional are not needed.
+
+=head2 A bit of magic: executing Perl code in a regular expression
+
+Normally, regexps are a part of Perl expressions.
+S<B<Code evaluation> > expressions turn that around by allowing
+arbitrary Perl code to be a part of a regexp. A code evaluation
+expression is denoted C<(?{code})>, with C<code> a string of Perl
+statements.
+
+Code expressions are zero-width assertions, and the value they return
+depends on their environment. There are two possibilities: either the
+code expression is used as a conditional in a conditional expression
+C<(?(condition)...)>, or it is not. If the code expression is a
+conditional, the code is evaluated and the result (i.e., the result of
+the last statement) is used to determine truth or falsehood. If the
+code expression is not used as a conditional, the assertion always
+evaluates true and the result is put into the special variable
+C<$^R>. The variable C<$^R> can then be used in code expressions later
+in the regexp. Here are some silly examples:
+
+ $x = "abcdef";
+ $x =~ /abc(?{print "Hi Mom!";})def/; # matches,
+ # prints 'Hi Mom!'
+ $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match,
+ # no 'Hi Mom!'
+
+Pay careful attention to the next example:
+
+ $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match,
+ # no 'Hi Mom!'
+ # but why not?
+
+At first glance, you'd think that it shouldn't print, because obviously
+the C<ddd> isn't going to match the target string. But look at this
+example:
+
+ $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match,
+ # but _does_ print
+
+Hmm. What happened here? If you've been following along, you know that
+the above pattern should be effectively the same as the last one --
+enclosing the d in a character class isn't going to change what it
+matches. So why does the first not print while the second one does?
+
+The answer lies in the optimizations the REx engine makes. In the first
+case, all the engine sees are plain old characters (aside from the
+C<?{}> construct). It's smart enough to realize that the string 'ddd'
+doesn't occur in our target string before actually running the pattern
+through. But in the second case, we've tricked it into thinking that our
+pattern is more complicated than it is. It takes a look, sees our
+character class, and decides that it will have to actually run the
+pattern to determine whether or not it matches, and in the process of
+running it hits the print statement before it discovers that we don't
+have a match.
+
+To take a closer look at how the engine does optimizations, see the
+section L<"Pragmas and debugging"> below.
+
+More fun with C<?{}>:
+
+ $x =~ /(?{print "Hi Mom!";})/; # matches,
+ # prints 'Hi Mom!'
+ $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches,
+ # prints '1'
+ $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches,
+ # prints '1'
+
+The bit of magic mentioned in the section title occurs when the regexp
+backtracks in the process of searching for a match. If the regexp
+backtracks over a code expression and if the variables used within are
+localized using C<local>, the changes in the variables produced by the
+code expression are undone! Thus, if we wanted to count how many times
+a character got matched inside a group, we could use, e.g.,
+
+ $x = "aaaa";
+ $count = 0; # initialize 'a' count
+ $c = "bob"; # test if $c gets clobbered
+ $x =~ /(?{local $c = 0;}) # initialize count
+ ( a # match 'a'
+ (?{local $c = $c + 1;}) # increment count
+ )* # do this any number of times,
+ aa # but match 'aa' at the end
+ (?{$count = $c;}) # copy local $c var into $count
+ /x;
+ print "'a' count is $count, \$c variable is '$c'\n";
+
+This prints
+
+ 'a' count is 2, $c variable is 'bob'
+
+If we replace the S<C< (?{local $c = $c + 1;})> > with
+S<C< (?{$c = $c + 1;})> >, the variable changes are I<not> undone
+during backtracking, and we get
+
+ 'a' count is 4, $c variable is 'bob'
+
+Note that only localized variable changes are undone. Other side
+effects of code expression execution are permanent. Thus
+
+ $x = "aaaa";
+ $x =~ /(a(?{print "Yow\n";}))*aa/;
+
+produces
+
+ Yow
+ Yow
+ Yow
+ Yow
+
+The result C<$^R> is automatically localized, so that it will behave
+properly in the presence of backtracking.
+
+This example uses a code expression in a conditional to match the
+article 'the' in either English or German:
+
+ $lang = 'DE'; # use German
+ ...
+ $text = "das";
+ print "matched\n"
+ if $text =~ /(?(?{
+ $lang eq 'EN'; # is the language English?
+ })
+ the | # if so, then match 'the'
+ (die|das|der) # else, match 'die|das|der'
+ )
+ /xi;
+
+Note that the syntax here is C<(?(?{...})yes-regexp|no-regexp)>, not
+C<(?((?{...}))yes-regexp|no-regexp)>. In other words, in the case of a
+code expression, we don't need the extra parentheses around the
+conditional.
+
+If you try to use code expressions with interpolating variables, perl
+may surprise you:
+
+ $bar = 5;
+ $pat = '(?{ 1 })';
+ /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated
+ /foo(?{ 1 })$bar/; # compile error!
+ /foo${pat}bar/; # compile error!
+
+ $pat = qr/(?{ $foo = 1 })/; # precompile code regexp
+ /foo${pat}bar/; # compiles ok
+
+If a regexp has (1) code expressions and interpolating variables, or
+(2) a variable that interpolates a code expression, perl treats the
+regexp as an error. If the code expression is precompiled into a
+variable, however, interpolating is ok. The question is, why is this
+an error?
+
+The reason is that variable interpolation and code expressions
+together pose a security risk. The combination is dangerous because
+many programmers who write search engines often take user input and
+plug it directly into a regexp:
+
+ $regexp = <>; # read user-supplied regexp
+ chomp $regexp; # get rid of possible newline
+ $text =~ /$regexp/; # search $text for the $regexp
+
+If the C<$regexp> variable contains a code expression, the user could
+then execute arbitrary Perl code. For instance, some joker could
+search for S<C<system('rm -rf *');> > to erase your files. In this
+sense, the combination of interpolation and code expressions B<taints>
+your regexp. So by default, using both interpolation and code
+expressions in the same regexp is not allowed. If you're not
+concerned about malicious users, it is possible to bypass this
+security check by invoking S<C<use re 'eval'> >:
+
+ use re 'eval'; # throw caution out the door
+ $bar = 5;
+ $pat = '(?{ 1 })';
+ /foo(?{ 1 })$bar/; # compiles ok
+ /foo${pat}bar/; # compiles ok
+
+Another form of code expression is the S<B<pattern code expression> >.
+The pattern code expression is like a regular code expression, except
+that the result of the code evaluation is treated as a regular
+expression and matched immediately. A simple example is
+
+ $length = 5;
+ $char = 'a';
+ $x = 'aaaaabb';
+ $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a'
+
+
+This final example contains both ordinary and pattern code
+expressions. It detects if a binary string C<1101010010001...> has a
+Fibonacci spacing 0,1,1,2,3,5,... of the C<1>'s:
+
+ $s0 = 0; $s1 = 1; # initial conditions
+ $x = "1101010010001000001";
+ print "It is a Fibonacci sequence\n"
+ if $x =~ /^1 # match an initial '1'
+ (
+ (??{'0' x $s0}) # match $s0 of '0'
+ 1 # and then a '1'
+ (?{
+ $largest = $s0; # largest seq so far
+ $s2 = $s1 + $s0; # compute next term
+ $s0 = $s1; # in Fibonacci sequence
+ $s1 = $s2;
+ })
+ )+ # repeat as needed
+ $ # that is all there is
+ /x;
+ print "Largest sequence matched was $largest\n";
+
+This prints
+
+ It is a Fibonacci sequence
+ Largest sequence matched was 5
+
+Ha! Try that with your garden variety regexp package...
+
+Note that the variables C<$s0> and C<$s1> are not substituted when the
+regexp is compiled, as happens for ordinary variables outside a code
+expression. Rather, the code expressions are evaluated when perl
+encounters them during the search for a match.
+
+The regexp without the C<//x> modifier is
+
+ /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/;
+
+and is a great start on an Obfuscated Perl entry :-) When working with
+code and conditional expressions, the extended form of regexps is
+almost necessary in creating and debugging regexps.
+
+=head2 Pragmas and debugging
+
+Speaking of debugging, there are several pragmas available to control
+and debug regexps in Perl. We have already encountered one pragma in
+the previous section, S<C<use re 'eval';> >, that allows variable
+interpolation and code expressions to coexist in a regexp. The other
+pragmas are
+
+ use re 'taint';
+ $tainted = <>;
+ @parts = ($tainted =~ /(\w+)\s+(\w+)/); # @parts is now tainted
+
+The C<taint> pragma causes any substrings from a match with a tainted
+variable to be tainted as well. This is not normally the case, as
+regexps are often used to extract the safe bits from a tainted
+variable. Use C<taint> when you are not extracting safe bits, but are
+performing some other processing. Both C<taint> and C<eval> pragmas
+are lexically scoped, which means they are in effect only until
+the end of the block enclosing the pragmas.
+
+ use re 'debug';
+ /^(.*)$/s; # output debugging info
+
+ use re 'debugcolor';
+ /^(.*)$/s; # output debugging info in living color
+
+The global C<debug> and C<debugcolor> pragmas allow one to get
+detailed debugging info about regexp compilation and
+execution. C<debugcolor> is the same as debug, except the debugging
+information is displayed in color on terminals that can display
+termcap color sequences. Here is example output:
+
+ % perl -e 'use re "debug"; "abc" =~ /a*b+c/;'
+ Compiling REx `a*b+c'
+ size 9 first at 1
+ 1: STAR(4)
+ 2: EXACT <a>(0)
+ 4: PLUS(7)
+ 5: EXACT <b>(0)
+ 7: EXACT <c>(9)
+ 9: END(0)
+ floating `bc' at 0..2147483647 (checking floating) minlen 2
+ Guessing start of match, REx `a*b+c' against `abc'...
+ Found floating substr `bc' at offset 1...
+ Guessed: match at offset 0
+ Matching REx `a*b+c' against `abc'
+ Setting an EVAL scope, savestack=3
+ 0 <> <abc> | 1: STAR
+ EXACT <a> can match 1 times out of 32767...
+ Setting an EVAL scope, savestack=3
+ 1 <a> <bc> | 4: PLUS
+ EXACT <b> can match 1 times out of 32767...
+ Setting an EVAL scope, savestack=3
+ 2 <ab> <c> | 7: EXACT <c>
+ 3 <abc> <> | 9: END
+ Match successful!
+ Freeing REx: `a*b+c'
+
+If you have gotten this far into the tutorial, you can probably guess
+what the different parts of the debugging output tell you. The first
+part
+
+ Compiling REx `a*b+c'
+ size 9 first at 1
+ 1: STAR(4)
+ 2: EXACT <a>(0)
+ 4: PLUS(7)
+ 5: EXACT <b>(0)
+ 7: EXACT <c>(9)
+ 9: END(0)
+
+describes the compilation stage. C<STAR(4)> means that there is a
+starred object, in this case C<'a'>, and if it matches, goto line 4,
+i.e., C<PLUS(7)>. The middle lines describe some heuristics and
+optimizations performed before a match:
+
+ floating `bc' at 0..2147483647 (checking floating) minlen 2
+ Guessing start of match, REx `a*b+c' against `abc'...
+ Found floating substr `bc' at offset 1...
+ Guessed: match at offset 0
+
+Then the match is executed and the remaining lines describe the
+process:
+
+ Matching REx `a*b+c' against `abc'
+ Setting an EVAL scope, savestack=3
+ 0 <> <abc> | 1: STAR
+ EXACT <a> can match 1 times out of 32767...
+ Setting an EVAL scope, savestack=3
+ 1 <a> <bc> | 4: PLUS
+ EXACT <b> can match 1 times out of 32767...
+ Setting an EVAL scope, savestack=3
+ 2 <ab> <c> | 7: EXACT <c>
+ 3 <abc> <> | 9: END
+ Match successful!
+ Freeing REx: `a*b+c'
+
+Each step is of the form S<C<< n <x> <y> >> >, with C<< <x> >> the
+part of the string matched and C<< <y> >> the part not yet
+matched. The S<C<< | 1: STAR >> > says that perl is at line number 1
+in the compilation list above. See
+L<perldebguts/"Debugging regular expressions"> for much more detail.
+
+An alternative method of debugging regexps is to embed C<print>
+statements within the regexp. This provides a blow-by-blow account of
+the backtracking in an alternation:
+
+ "that this" =~ m@(?{print "Start at position ", pos, "\n";})
+ t(?{print "t1\n";})
+ h(?{print "h1\n";})
+ i(?{print "i1\n";})
+ s(?{print "s1\n";})
+ |
+ t(?{print "t2\n";})
+ h(?{print "h2\n";})
+ a(?{print "a2\n";})
+ t(?{print "t2\n";})
+ (?{print "Done at position ", pos, "\n";})
+ @x;
+
+prints
+
+ Start at position 0
+ t1
+ h1
+ t2
+ h2
+ a2
+ t2
+ Done at position 4
+
+=head1 BUGS
+
+Code expressions, conditional expressions, and independent expressions
+are B<experimental>. Don't use them in production code. Yet.
+
+=head1 SEE ALSO
+
+This is just a tutorial. For the full story on perl regular
+expressions, see the L<perlre> regular expressions reference page.
+
+For more information on the matching C<m//> and substitution C<s///>
+operators, see L<perlop/"Regexp Quote-Like Operators">. For
+information on the C<split> operation, see L<perlfunc/split>.
+
+For an excellent all-around resource on the care and feeding of
+regular expressions, see the book I<Mastering Regular Expressions> by
+Jeffrey Friedl (published by O'Reilly, ISBN 1-56592-257-3).
+
+=head1 AUTHOR AND COPYRIGHT
+
+Copyright (c) 2000 Mark Kvale
+All rights reserved.
+
+This document may be distributed under the same terms as Perl itself.
+
+=head2 Acknowledgments
+
+The inspiration for the stop codon DNA example came from the ZIP
+code example in chapter 7 of I<Mastering Regular Expressions>.
+
+The author would like to thank Jeff Pinyan, Andrew Johnson, Peter
+Haworth, Ronald J Kimball, and Joe Smith for all their helpful
+comments.
+
+=cut
+
diff --git a/contrib/perl5/pod/perlutil.pod b/contrib/perl5/pod/perlutil.pod
new file mode 100644
index 0000000000000..be7a345f79678
--- /dev/null
+++ b/contrib/perl5/pod/perlutil.pod
@@ -0,0 +1,185 @@
+=head1 NAME
+
+perlutil - utilities packaged with the Perl distribution
+
+=head1 DESCRIPTION
+
+Along with the Perl interpreter itself, the Perl distribution installs a
+range of utilities on your system. There are also several utilities
+which are used by the Perl distribution itself as part of the install
+process. This document exists to list all of these utilities, explain
+what they are for and provide pointers to each module's documentation,
+if appropriate.
+
+=head2 DOCUMENTATION
+
+=over 3
+
+=item L<perldoc|perldoc>
+
+The main interface to Perl's documentation is C<perldoc>, although
+if you're reading this, it's more than likely that you've already found
+it. F<perldoc> will extract and format the documentation from any file
+in the current directory, any Perl module installed on the system, or
+any of the standard documentation pages, such as this one. Use
+C<perldoc E<lt>nameE<gt>> to get information on any of the utilities
+described in this document.
+
+=item L<pod2man|pod2man> and L<pod2text|pod2text>
+
+If it's run from a terminal, F<perldoc> will usually call F<pod2man> to
+translate POD (Plain Old Documentation - see L<perlpod> for an
+explanation) into a man page, and then run F<man> to display it; if
+F<man> isn't available, F<pod2text> will be used instead and the output
+piped through your favourite pager.
+
+=item L<pod2html|pod2html> and L<pod2latex|pod2latex>
+
+As well as these two, there are two other converters: F<pod2html> will
+produce HTML pages from POD, and F<pod2latex> will produce LaTeX
+files.
+
+=item L<pod2usage|pod2usage>
+
+If you just want to know how to use the utilities described here,
+F<pod2usage> will just extract the "USAGE" section; some of
+the utilities will automatically call F<pod2usage> on themselves when
+you call them with C<-help>.
+
+=item L<podselect|podselect>
+
+F<pod2usage> is a special case of F<podselect>, a utility to extract
+named sections from documents written in POD. For instance, while
+utilities have "USAGE" sections, Perl modules usually have "SYNOPSIS"
+sections: C<podselect -s "SYNOPSIS" ...> will extract this section for
+a given file.
+
+=item L<podchecker|podchecker>
+
+If you're writing your own documentation in POD, the F<podchecker>
+utility will look for errors in your markup.
+
+=item L<splain|splain>
+
+F<splain> is an interface to L<perldiag> - paste in your error message
+to it, and it'll explain it for you.
+
+=item L<roffitall|roffitall>
+
+The C<roffitall> utility is not installed on your system but lives in
+the F<pod/> directory of your Perl source kit; it converts all the
+documentation from the distribution to F<*roff> format, and produces a
+typeset PostScript or text file of the whole lot.
+
+=back
+
+=head2 CONVERTORS
+
+To help you convert legacy programs to Perl, we've included three
+conversion filters:
+
+=over 3
+
+=item L<a2p|a2p>
+
+F<a2p> converts F<awk> scripts to Perl programs; for example, C<a2p -F:>
+on the simple F<awk> script C<{print $2}> will produce a Perl program
+based around this code:
+
+ while (<>) {
+ ($Fld1,$Fld2) = split(/[:\n]/, $_, 9999);
+ print $Fld2;
+ }
+
+=item L<s2p|s2p>
+
+Similarly, F<s2p> converts F<sed> scripts to Perl programs. F<s2p> run
+on C<s/foo/bar> will produce a Perl program based around this:
+
+ while (<>) {
+ chomp;
+ s/foo/bar/g;
+ print if $printit;
+ }
+
+=item L<find2perl|find2perl>
+
+Finally, F<find2perl> translates C<find> commands to Perl equivalents which
+use the L<File::Find|File::Find> module. As an example,
+C<find2perl . -user root -perm 4000 -print> produces the following callback
+subroutine for C<File::Find>:
+
+ sub wanted {
+ my ($dev,$ino,$mode,$nlink,$uid,$gid);
+ (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
+ ($uid == $uid{'root'}) &&
+ (($mode & 0777) == 04000);
+ print("$name\n");
+ }
+
+=back
+
+As well as these filters for converting other languages, the
+L<pl2pm|pl2pm> utility will help you convert old-style Perl 4 libraries to
+new-style Perl5 modules.
+
+=head2 Development
+
+There is a set of utilities which help you in developing Perl programs,
+and in particular, extending Perl with C.
+
+=over 3
+
+=item L<perlbug|perlbug>
+
+F<perlbug> is the recommended way to report bugs in the perl interpreter
+itself or any of the standard library modules back to the developers;
+please read through the documentation for F<perlbug> thoroughly before
+using it to submit a bug report.
+
+=item L<h2ph|h2ph>
+
+Back before Perl had the XS system for connecting with C libraries,
+programmers used to get library constants by reading through the C
+header files. You may still see C<require 'syscall.ph'> or similar
+around - the F<.ph> file should be created by running F<h2ph> on the
+corresponding F<.h> file. See the F<h2ph> documentation for more on how
+to convert a whole bunch of header files at once.
+
+=item L<c2ph|c2ph> and L<pstruct|pstruct>
+
+F<c2ph> and F<pstruct>, which are actually the same program but behave
+differently depending on how they are called, provide another way of
+getting at C with Perl - they'll convert C structures and union declarations
+to Perl code. This is deprecated in favour of F<h2xs> these days.
+
+=item L<h2xs|h2xs>
+
+F<h2xs> converts C header files into XS modules, and will try and write
+as much glue between C libraries and Perl modules as it can. It's also
+very useful for creating skeletons of pure Perl modules.
+
+=item L<dprofpp|dprofpp>
+
+Perl comes with a profiler, the F<Devel::DProf> module. The
+F<dprofpp> utility analyzes the output of this profiler and tells you
+which subroutines are taking up the most run time. See L<Devel::DProf>
+for more information.
+
+=item L<perlcc|perlcc>
+
+F<perlcc> is the interface to the experimental Perl compiler suite.
+
+=back
+
+=head2 SEE ALSO
+
+L<perldoc|perldoc>, L<pod2man|pod2man>, L<perlpod>,
+L<pod2html|pod2html>, L<pod2usage|pod2usage>, L<podselect|podselect>,
+L<podchecker|podchecker>, L<splain|splain>, L<perldiag>,
+L<roffitall|roffitall>, L<a2p|a2p>, L<s2p|s2p>, L<find2perl|find2perl>,
+L<File::Find|File::Find>, L<pl2pm|pl2pm>, L<perlbug|perlbug>,
+L<h2ph|h2ph>, L<c2ph|c2ph>, L<h2xs|h2xs>, L<dprofpp|dprofpp>,
+L<Devel::DProf>, L<perlcc|perlcc>
+
+=cut