diff options
Diffstat (limited to 'contrib/perl5/pod')
-rw-r--r-- | contrib/perl5/pod/Makefile.SH | 167 | ||||
-rwxr-xr-x | contrib/perl5/pod/buildtoc.PL | 492 | ||||
-rw-r--r-- | contrib/perl5/pod/perlclib.pod | 197 | ||||
-rw-r--r-- | contrib/perl5/pod/perldebtut.pod | 721 | ||||
-rw-r--r-- | contrib/perl5/pod/perlebcdic.pod | 1235 | ||||
-rwxr-xr-x | contrib/perl5/pod/perlmodlib.PL | 1383 | ||||
-rw-r--r-- | contrib/perl5/pod/perlnewmod.pod | 282 | ||||
-rw-r--r-- | contrib/perl5/pod/perlrequick.pod | 503 | ||||
-rw-r--r-- | contrib/perl5/pod/perlretut.pod | 2504 | ||||
-rw-r--r-- | contrib/perl5/pod/perlutil.pod | 185 |
10 files changed, 7669 insertions, 0 deletions
diff --git a/contrib/perl5/pod/Makefile.SH b/contrib/perl5/pod/Makefile.SH new file mode 100644 index 0000000000000..b8c8c8f24c730 --- /dev/null +++ b/contrib/perl5/pod/Makefile.SH @@ -0,0 +1,167 @@ +case $CONFIG in +'') + if test -f config.sh; then TOP=.; + elif test -f ../config.sh; then TOP=..; + elif test -f ../../config.sh; then TOP=../..; + elif test -f ../../../config.sh; then TOP=../../..; + elif test -f ../../../../config.sh; then TOP=../../../..; + else + echo "Can't find config.sh."; exit 1 + fi + . $TOP/config.sh + ;; +esac +: This forces SH files to create target in same directory as SH file. +: This is so that make depend always knows where to find SH derivatives. +case "$0" in +*/*) cd `expr X$0 : 'X\(.*\)/'` ;; +esac + +if test -d pod; then + cd pod || exit 1 +fi +POD=`echo *.pod` +MAN=`echo $POD|sed 's/\.pod/\.man/g'` +HTML=`echo $POD|sed 's/perltoc.pod//'|sed 's/\.pod/\.html/g'` +TEX=`echo $POD|sed 's/\.pod/\.tex/g'` + +echo "Extracting pod/Makefile (with variable substitutions)" +: This section of the file will have variable substitutions done on it. +: Move anything that needs config subs from !NO!SUBS! section to !GROK!THIS!. +: Protect any dollar signs and backticks that you do not want interpreted +: by putting a backslash in front. You may delete these comments. + +$spitshell >Makefile <<!GROK!THIS! +# pod/Makefile +# This file is derived from pod/Makefile.SH. Any changes made here will +# be lost the next time you run Configure. + +POD = $POD + +MAN = $MAN + +# no perltoc.html +HTML = $HTML + +TEX = $TEX + +!GROK!THIS! + +## In the following dollars and backticks do not need the extra backslash. +$spitshell >>Makefile <<'!NO!SUBS!' + +CONVERTERS = pod2html pod2latex pod2man pod2text checkpods \ + pod2usage podchecker podselect + +HTMLROOT = / # Change this to fix cross-references in HTML +POD2HTML = pod2html \ + --htmlroot=$(HTMLROOT) \ + --podroot=.. 
--podpath=pod:lib:ext:vms \ + --libpods=perlfunc:perlguts:perlvar:perlrun:perlop + +PERL = ../miniperl +PERLILIB = $(PERL) -I../lib +REALPERL = ../perl + +all: $(CONVERTERS) man + +converters: $(CONVERTERS) + +regen_pods: perlmodlib.pod toc + +buildtoc: buildtoc.PL perl.pod ../MANIFEST + $(PERLILIB) buildtoc.PL + +perltoc.pod: buildtoc + +man: pod2man $(MAN) + +html: pod2html $(HTML) + +tex: pod2latex $(TEX) + +toc: buildtoc + $(PERLILIB) buildtoc + +.SUFFIXES: .pm .pod + +.SUFFIXES: .man + +.pm.man: pod2man + $(PERL) -I../lib pod2man $*.pm >$*.man + +.pod.man: pod2man + $(PERL) -I../lib pod2man $*.pod >$*.man + +.SUFFIXES: .html + +.pm.html: pod2html + $(PERL) -I../lib $(POD2HTML) --infile=$*.pm --outfile=$*.html + +.pod.html: pod2html + $(PERL) -I../lib $(POD2HTML) --infile=$*.pod --outfile=$*.html + +.SUFFIXES: .tex + +.pm.tex: pod2latex + $(PERL) -I../lib pod2latex $*.pm + +.pod.tex: pod2latex + $(PERL) -I../lib pod2latex $*.pod + +clean: + rm -f $(MAN) + rm -f $(HTML) + rm -f $(TEX) + rm -f pod2html-*cache + rm -f *.aux *.log *.exe + +realclean: clean + rm -f $(CONVERTERS) + +distclean: realclean + +veryclean: distclean + -rm -f *~ *.orig + +check: checkpods + @echo "checking..."; \ + $(PERL) -I../lib checkpods $(POD) + +# Dependencies. 
+pod2latex: pod2latex.PL ../lib/Config.pm + $(PERL) -I../lib pod2latex.PL + +pod2html: pod2html.PL ../lib/Config.pm + $(PERL) -I ../lib pod2html.PL + +pod2man: pod2man.PL ../lib/Config.pm + $(PERL) -I ../lib pod2man.PL + +pod2text: pod2text.PL ../lib/Config.pm + $(PERL) -I ../lib pod2text.PL + +checkpods: checkpods.PL ../lib/Config.pm + $(PERL) -I ../lib checkpods.PL + +pod2usage: pod2usage.PL ../lib/Config.pm + $(PERL) -I ../lib pod2usage.PL + +podchecker: podchecker.PL ../lib/Config.pm + $(PERL) -I ../lib podchecker.PL + +podselect: podselect.PL ../lib/Config.pm + $(PERL) -I ../lib podselect.PL + +perlmodlib.pod: $(PERL) perlmodlib.PL ../mv-if-diff + rm -f perlmodlib.tmp + $(PERL) -I ../lib perlmodlib.PL + sh ../mv-if-diff perlmodlib.tmp perlmodlib.pod + +compile: all + $(REALPERL) -I../lib ../utils/perlcc -o pod2latex.exe pod2latex -log ../compilelog + $(REALPERL) -I../lib ../utils/perlcc -o pod2man.exe pod2man -log ../compilelog + $(REALPERL) -I../lib ../utils/perlcc -o pod2text.exe pod2text -log ../compilelog + $(REALPERL) -I../lib ../utils/perlcc -o checkpods.exe checkpods -log ../compilelog + +!NO!SUBS! diff --git a/contrib/perl5/pod/buildtoc.PL b/contrib/perl5/pod/buildtoc.PL new file mode 100755 index 0000000000000..7c5a45018e8e5 --- /dev/null +++ b/contrib/perl5/pod/buildtoc.PL @@ -0,0 +1,492 @@ +#!/usr/local/bin/perl + +use Config; +use File::Basename qw(&basename &dirname); +use Cwd; + +# List explicitly here the variables you want Configure to +# generate. Metaconfig only looks for shell variables, so you +# have to mention them as if they were shell variables, not +# %Config entries. Thus you write +# $startperl +# to ensure Configure will look for $Config{startperl}. + +# This forces PL files to create target in same directory as PL file. +# This is so that make depend always knows where to find PL derivatives. 
+$origdir = cwd; +chdir(dirname($0)); +($file = basename($0)) =~ s/\.PL$//; +$file =~ s/\.pl$// if ($^O eq 'os2' or $^O eq 'dos'); # "case-forgiving" +$file =~ s/\.pl$/.com/ if ($^O eq 'VMS'); # "case-forgiving" + +open OUT,">$file" or die "Can't create $file: $!"; + +print "Extracting $file (with variable substitutions)\n"; + +# In this section, perl variables will be expanded during extraction. +# You can use $Config{...} to use Configure variables. + +print OUT <<"!GROK!THIS!"; +$Config{'startperl'} + eval 'exec perl -S \$0 "\$@"' + if 0; +!GROK!THIS! + +# In the following, perl variables are not expanded during extraction. + +print OUT <<'!NO!SUBS!'; + +# +# buildtoc +# +# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! +# This file is autogenerated by buildtoc.PL. +# Edit that file and run it to effect changes. +# +# Builds perltoc.pod and sanity checks the list of pods against all +# of the MANIFEST, perl.pod, and ourselves. +# + +use File::Find; +use Cwd; +use Text::Wrap; + +@PODS = glob("*.pod"); + +sub output ($); + +if (-d "pod") { + die "$0: failed to chdir('pod'): $!\n" unless chdir("pod"); +} + +@pods = qw( + perl + perlfaq + perltoc + perlbook + + perlsyn + perldata + perlop + perlsub + perlfunc + perlreftut + perldsc + perlrequick + perlpod + perlstyle + perltrap + + perlrun + perldiag + perllexwarn + perldebtut + perldebug + + perlvar + perllol + perlopentut + perlretut + + perlre + perlref + + perlform + + perlboot + perltoot + perltootc + perlobj + perlbot + perltie + + perlipc + perlfork + perlnumber + perlthrtut + + perlport + perllocale + perlunicode + perlebcdic + + perlsec + + perlmod + perlmodlib + perlmodinstall + perlnewmod + + perlfaq1 + perlfaq2 + perlfaq3 + perlfaq4 + perlfaq5 + perlfaq6 + perlfaq7 + perlfaq8 + perlfaq9 + + perlcompile + + perlembed + perldebguts + perlxstut + perlxs + perlclib + perlguts + perlcall + perlutil + perlfilter + perldbmfilter + perlapi + perlintern + perlapio + perltodo + perlhack + + perlhist + perldelta + 
perl5005delta + perl5004delta + + perlaix + perlamiga + perlbs2000 + perlcygwin + perldos + perlepoc + perlhpux + perlmachten + perlmacos + perlmpeix + perlos2 + perlos390 + perlsolaris + perlvmesa + perlvms + perlvos + perlwin32 + ); + +@ARCHPODS = qw( + perlaix + perlamiga + perlbs2000 + perlcygwin + perldos + perlepoc + perlhpux + perlmachten + perlmacos + perlmpeix + perlos2 + perlos390 + perlsolaris + perlvmesa + perlvms + perlvos + perlwin32 + ); +for (@ARCHPODS) { s/$/.pod/ } +@ARCHPODS{@ARCHPODS} = (); + +for (@pods) { s/$/.pod/ } +@pods{@pods} = (); +@PODS{@PODS} = (); + +open(MANI, "../MANIFEST") || die "$0: opening ../MANIFEST failed: $!"; +while (<MANI>) { + if (m!^pod/([^.]+\.pod)\s+!i) { + push @MANIPODS, $1; + } +} +close(MANI); +@MANIPODS{@MANIPODS} = (); + +open(PERLPOD, "perl.pod") || die "$0: opening perl.pod failed: $!\n"; +while (<PERLPOD>) { + if (/^For ease of access, /../^\(If you're intending /) { + if (/^\s+(perl\S*)\s+\w/) { + push @PERLPODS, "$1.pod"; + } + } +} +close(PERLPOD); +die "$0: could not find the pod listing of perl.pod\n" + unless @PERLPODS; +@PERLPODS{@PERLPODS} = (); + +# Cross-check against ourselves +# Cross-check against the MANIFEST +# Cross-check against the perl.pod + +foreach my $i (sort keys %PODS) { + warn "$0: $i exists but is unknown by buildtoc\n" + unless exists $pods{$i}; + warn "$0: $i exists but is unknown by ../MANIFEST\n" + if !exists $MANIPODS{$i} && !exists $ARCHPODS{$i}; + warn "$0: $i exists but is unknown by perl.pod\n" + unless exists $PERLPODS{$i}; +} +foreach my $i (sort keys %pods) { + warn "$0: $i is known by buildtoc but does not exist\n" + unless exists $PODS{$i}; +} +foreach my $i (sort keys %MANIPODS) { + warn "$0: $i is known by ../MANIFEST but does not exist\n" + unless exists $PODS{$i}; +} +foreach my $i (sort keys %PERLPODS) { + warn "$0: $i is known by perl.pod but does not exist\n" + unless exists $PODS{$i}; +} + +# We are ready to rock. 
+open(OUT, ">perltoc.pod") || die "$0: creating perltoc.pod failed: $!"; + +$/ = ''; +@ARGV = @pods; + +($_= <<EOPOD2B) =~ s/^\t//gm && output($_); + + =head1 NAME + + perltoc - perl documentation table of contents + + =head1 DESCRIPTION + + This page provides a brief table of contents for the rest of the Perl + documentation set. It is meant to be scanned quickly or grepped + through to locate the proper section you're looking for. + + =head1 BASIC DOCUMENTATION + +EOPOD2B +#' make emacs happy + +podset(@pods); + +find \&getpods => qw(../lib ../ext); + +sub getpods { + if (/\.p(od|m)$/) { + # Skip .pm files that have corresponding .pod files, and Functions.pm. + return if /(.*)\.pm$/ && -f "$1.pod"; + my $file = $File::Find::name; + return if $file eq '../lib/Pod/Functions.pm'; # Used only by pod itself + + die "tut $name" if $file =~ /TUT/; + unless (open (F, "< $_\0")) { + warn "bogus <$file>: $!"; + system "ls", "-l", $file; + } + else { + my $line; + while ($line = <F>) { + if ($line =~ /^=head1\s+NAME\b/) { + push @modpods, $file; + #warn "GOOD $file\n"; + return; + } + } + warn "$0: $file: cannot find =head1 NAME\n"; + } + } +} + +die "no pods" unless @modpods; + +for (@modpods) { + #($name) = /(\w+)\.p(m|od)$/; + $name = path2modname($_); + if ($name =~ /^[a-z]/) { + push @pragmata, $_; + } else { + if ($done{$name}++) { + # warn "already did $_\n"; + next; + } + push @modules, $_; + push @modname, $name; + } +} + +($_= <<EOPOD2B) =~ s/^\t//gm && output($_); + + + + =head1 PRAGMA DOCUMENTATION + +EOPOD2B + +podset(sort @pragmata); + +($_= <<EOPOD2B) =~ s/^\t//gm && output($_); + + + + =head1 MODULE DOCUMENTATION + +EOPOD2B + +podset( @modules[ sort { $modname[$a] cmp $modname[$b] } 0 .. 
$#modules ] ); + +($_= <<EOPOD2B) =~ s/^\t//gm; + + + =head1 AUXILIARY DOCUMENTATION + + Here should be listed all the extra programs' documentation, but they + don't all have manual pages yet: + + =over 4 + + =item a2p + + =item s2p + + =item find2perl + + =item h2ph + + =item c2ph + + =item h2xs + + =item xsubpp + + =item pod2man + + =item wrapsuid + + =back + + =head1 AUTHOR + + Larry Wall <F<larry\@wall.org>>, with the help of oodles + of other folks. + + +EOPOD2B +output $_; +output "\n"; # flush $LINE +exit; + +sub podset { + local @ARGV = @_; + + while(<>) { + if (s/^=head1 (NAME)\s*/=head2 /) { + $pod = path2modname($ARGV); + unhead1(); + output "\n \n\n=head2 "; + $_ = <>; + if ( /^\s*$pod\b/ ) { + s/$pod\.pm/$pod/; # '.pm' in NAME !? + output $_; + } else { + s/^/$pod, /; + output $_; + } + next; + } + if (s/^=head1 (.*)/=item $1/) { + unhead2(); + output "=over 4\n\n" unless $inhead1; + $inhead1 = 1; + output $_; nl(); next; + } + if (s/^=head2 (.*)/=item $1/) { + unitem(); + output "=over 4\n\n" unless $inhead2; + $inhead2 = 1; + output $_; nl(); next; + } + if (s/^=item ([^=].*)/$1/) { + next if $pod eq 'perldiag'; + s/^\s*\*\s*$// && next; + s/^\s*\*\s*//; + s/\n/ /g; + s/\s+$//; + next if /^[\d.]+$/; + next if $pod eq 'perlmodlib' && /^ftp:/; + ##print "=over 4\n\n" unless $initem; + output ", " if $initem; + $initem = 1; + s/\.$//; + s/^-X\b/-I<X>/; + output $_; next; + } + if (s/^=cut\s*\n//) { + unhead1(); + next; + } + } +} + +sub path2modname { + local $_ = shift; + s/\.p(m|od)$//; + s-.*?/(lib|ext)/--; + s-/-::-g; + s/(\w+)::\1/$1/; + return $_; +} + +sub unhead1 { + unhead2(); + if ($inhead1) { + output "\n\n=back\n\n"; + } + $inhead1 = 0; +} + +sub unhead2 { + unitem(); + if ($inhead2) { + output "\n\n=back\n\n"; + } + $inhead2 = 0; +} + +sub unitem { + if ($initem) { + output "\n\n"; + ##print "\n\n=back\n\n"; + } + $initem = 0; +} + +sub nl { + output "\n"; +} + +my $NEWLINE; # how many newlines have we seen recently +my $LINE; # what 
remains to be printed + +sub output ($) { + for (split /(\n)/, shift) { + if ($_ eq "\n") { + if ($LINE) { + print OUT wrap('', '', $LINE); + $LINE = ''; + } + if ($NEWLINE < 2) { + print OUT; + $NEWLINE++; + } + } + elsif (/\S/ && length) { + $LINE .= $_; + $NEWLINE = 0; + } + } +} + +!NO!SUBS! + +close OUT or die "Can't close $file: $!"; +chmod 0755, $file or die "Can't reset permissions for $file: $!\n"; +exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':'; +chdir $origdir; diff --git a/contrib/perl5/pod/perlclib.pod b/contrib/perl5/pod/perlclib.pod new file mode 100644 index 0000000000000..a0f4a80eecd70 --- /dev/null +++ b/contrib/perl5/pod/perlclib.pod @@ -0,0 +1,197 @@ +=head1 NAME + +perlclib - Internal replacements for standard C library functions + +=head1 DESCRIPTION + +One thing Perl porters should note is that F<perl> doesn't tend to use that +much of the C standard library internally; you'll see very little use of, +for example, the F<ctype.h> functions in there. This is because Perl +tends to reimplement or abstract standard library functions, so that we +know exactly how they're going to operate. + +This is a reference card for people who are familiar with the C library +and who want to do things the Perl way; to tell them which functions +they ought to use instead of the more normal C functions. + +=head2 Conventions + +In the following tables: + +=over 3 + +=item C<t> + +is a type. + +=item C<p> + +is a pointer. + +=item C<n> + +is a number. + +=item C<s> + +is a string. + +=back + +C<sv>, C<av>, C<hv>, etc. represent variables of their respective types. + +=head2 File Operations + +Instead of the F<stdio.h> functions, you should use the Perl abstraction +layer. Instead of C<FILE*> types, you need to be handling C<PerlIO*> +types. Don't forget that with the new PerlIO layered I/O abstraction +C<FILE*> types may not even be available. 
See also the C<perlapio> +documentation for more information about the following functions: + + Instead Of: Use: + + stdin PerlIO_stdin() + stdout PerlIO_stdout() + stderr PerlIO_stderr() + + fopen(fn, mode) PerlIO_open(fn, mode) + freopen(fn, mode, stream) PerlIO_reopen(fn, mode, perlio) (Deprecated) + fflush(stream) PerlIO_flush(perlio) + fclose(stream) PerlIO_close(perlio) + +=head2 File Input and Output + + Instead Of: Use: + + fprintf(stream, fmt, ...) PerlIO_printf(perlio, fmt, ...) + + [f]getc(stream) PerlIO_getc(perlio) + [f]putc(stream, n) PerlIO_putc(perlio, n) + ungetc(n, stream) PerlIO_ungetc(perlio, n) + +Note that the PerlIO equivalents of C<fread> and C<fwrite> are slightly +different from their C library counterparts: + + fread(p, size, n, stream) PerlIO_read(perlio, buf, numbytes) + fwrite(p, size, n, stream) PerlIO_write(perlio, buf, numbytes) + + fputs(s, stream) PerlIO_puts(perlio, s) + +There is no equivalent to C<fgets>; one should use C<sv_gets> instead: + + fgets(s, n, stream) sv_gets(sv, perlio, append) + +=head2 File Positioning + + Instead Of: Use: + + feof(stream) PerlIO_eof(perlio) + fseek(stream, n, whence) PerlIO_seek(perlio, n, whence) + rewind(stream) PerlIO_rewind(perlio) + + fgetpos(stream, p) PerlIO_getpos(perlio, sv) + fsetpos(stream, p) PerlIO_setpos(perlio, sv) + + ferror(stream) PerlIO_error(perlio) + clearerr(stream) PerlIO_clearerr(perlio) + +=head2 Memory Management and String Handling + + Instead Of: Use: + + t* p = malloc(n) New(id, p, n, t) + t* p = calloc(n, s) Newz(id, p, n, t) + p = realloc(p, n) Renew(p, n, t) + memcpy(dst, src, n) Copy(src, dst, n, t) + memmove(dst, src, n) Move(src, dst, n, t) + memcpy/*(struct foo *) StructCopy(src, dst, t) + free(p) Safefree(p) + + strdup(p) savepv(p) + strndup(p, n) savepvn(p, n) (Hey, strndup doesn't exist!) 
+ + strstr(big, little) instr(big, little) + strcmp(s1, s2) strLE(s1, s2) / strEQ(s1, s2) / strGT(s1,s2) + strncmp(s1, s2, n) strnNE(s1, s2, n) / strnEQ(s1, s2, n) + +Notice the different order of arguments to C<Copy> and C<Move> than used +in C<memcpy> and C<memmove>. + +Most of the time, though, you'll want to be dealing with SVs internally +instead of raw C<char *> strings: + + strlen(s) sv_len(sv) + strcpy(dt, src) sv_setpv(sv, s) + strncpy(dt, src, n) sv_setpvn(sv, s, n) + strcat(dt, src) sv_catpv(sv, s) + strncat(dt, src, n) sv_catpvn(sv, s, n) + sprintf(s, fmt, ...) sv_setpvf(sv, fmt, ...) + +Note also the existence of C<sv_catpvf> and C<sv_vcatpvfn>, combining +concatenation with formatting. + +=head2 Character Class Tests + +There are two types of character class tests that Perl implements: one +type deals in C<char>s and is thus B<not> Unicode aware (and hence +deprecated unless you B<know> you should use them) and the other type +deals in C<UV>s and knows about Unicode properties. In the following +table, C<c> is a C<char>, and C<u> is a Unicode codepoint. + + Instead Of: Use: But better use: + + isalnum(c) isALNUM(c) isALNUM_uni(u) + isalpha(c) isALPHA(c) isALPHA_uni(u) + iscntrl(c) isCNTRL(c) isCNTRL_uni(u) + isdigit(c) isDIGIT(c) isDIGIT_uni(u) + isgraph(c) isGRAPH(c) isGRAPH_uni(u) + islower(c) isLOWER(c) isLOWER_uni(u) + isprint(c) isPRINT(c) isPRINT_uni(u) + ispunct(c) isPUNCT(c) isPUNCT_uni(u) + isspace(c) isSPACE(c) isSPACE_uni(u) + isupper(c) isUPPER(c) isUPPER_uni(u) + isxdigit(c) isXDIGIT(c) isXDIGIT_uni(u) + + tolower(c) toLOWER(c) toLOWER_uni(u) + toupper(c) toUPPER(c) toUPPER_uni(u) + +=head2 F<stdlib.h> functions + + Instead Of: Use: + + atof(s) Atof(s) + atol(s) Atol(s) + strtod(s, *p) Nothing. Just don't use it. 
+ strtol(s, *p, n) Strtol(s, *p, n) + strtoul(s, *p, n) Strtoul(s, *p, n) + +Notice also the C<scan_bin>, C<scan_hex>, and C<scan_oct> functions in +F<util.c> for converting strings representing numbers in the respective +bases into C<NV>s. + +In theory C<Strtol> and C<Strtoul> may not be defined if the machine perl is +built on doesn't actually have strtol and strtoul. But as those 2 +functions are part of the 1989 ANSI C spec we suspect you'll find them +everywhere by now. + + int rand() double Drand01() + srand(n) { seedDrand01((Rand_seed_t)n); + PL_srand_called = TRUE; } + + exit(n) my_exit(n) + system(s) Don't. Look at pp_system or use my_popen + + getenv(s) PerlEnv_getenv(s) + setenv(s, val) my_putenv(s, val) + +=head2 Miscellaneous functions + +You should not even B<want> to use F<setjmp.h> functions, but if you +think you do, use the C<JMPENV> stack in F<scope.h> instead. + +For C<signal>/C<sigaction>, use C<rsignal(signo, handler)>. + +=head1 SEE ALSO + +C<perlapi>, C<perlapio>, C<perlguts> + diff --git a/contrib/perl5/pod/perldebtut.pod b/contrib/perl5/pod/perldebtut.pod new file mode 100644 index 0000000000000..e11102e5676ef --- /dev/null +++ b/contrib/perl5/pod/perldebtut.pod @@ -0,0 +1,721 @@ +=head1 NAME + +perldebtut - Perl debugging tutorial + +=head1 DESCRIPTION + +A (very) lightweight introduction in the use of the perl debugger, and a +pointer to existing, deeper sources of information on the subject of debugging +perl programs. + +There's an extraordinary number of people out there who don't appear to know +anything about using the perl debugger, though they use the language every +day. +This is for them. + + +=head1 use strict + +First of all, there's a few things you can do to make your life a lot more +straightforward when it comes to debugging perl programs, without using the +debugger at all. 
To demonstrate, here's a simple script with a problem: + + #!/usr/bin/perl + + $var1 = 'Hello World'; # always wanted to do that :-) + $var2 = "$varl\n"; + + print $var2; + exit; + +While this compiles and runs happily, it probably won't do what's expected, +namely it doesn't print "Hello World\n" at all; it will on the other hand do +exactly what it was told to do, computers being a bit that way inclined. That +is, it will print out a newline character, and you'll get what looks like a +blank line. It looks like there's 2 variables when (because of the typo) +there's really 3: + + $var1 = 'Hello World' + $varl = undef + $var2 = "\n" + +To catch this kind of problem, we can force each variable to be declared +before use by pulling in the strict module, by putting 'use strict;' after the +first line of the script. + +Now when you run it, perl complains about the 3 undeclared variables and we +get four error messages because one variable is referenced twice: + + Global symbol "$var1" requires explicit package name at ./t1 line 4. + Global symbol "$var2" requires explicit package name at ./t1 line 5. + Global symbol "$varl" requires explicit package name at ./t1 line 5. + Global symbol "$var2" requires explicit package name at ./t1 line 7. + Execution of ./hello aborted due to compilation errors. + +Luvverly! and to fix this we declare all variables explicitly and now our +script looks like this: + + #!/usr/bin/perl + use strict; + + my $var1 = 'Hello World'; + my $varl = ''; + my $var2 = "$varl\n"; + + print $var2; + exit; + +We then do (always a good idea) a syntax check before we try to run it again: + + > perl -c hello + hello syntax OK + +And now when we run it, we get "\n" still, but at least we know why. Just +getting this script to compile has exposed the '$varl' (with the letter 'l') +variable, and simply changing $varl to $var1 solves the problem. 
+ + +=head1 Looking at data and -w and w + +Ok, but how about when you want to really see your data, what's in that +dynamic variable, just before using it? + + #!/usr/bin/perl + use strict; + + my $key = 'welcome'; + my %data = ( + 'this' => qw(that), + 'tom' => qw(and jerry), + 'welcome' => q(Hello World), + 'zip' => q(welcome), + ); + my @data = keys %data; + + print "$data{$key}\n"; + exit; + +Looks OK, after it's been through the syntax check (perl -c scriptname), we +run it and all we get is a blank line again! Hmmmm. + +One common debugging approach here, would be to liberally sprinkle a few print +statements, to add a check just before we print out our data, and another just +after: + + print "All OK\n" if grep($key, keys %data); + print "$data{$key}\n"; + print "done: '$data{$key}'\n"; + +And try again: + + > perl data + All OK + + done: '' + +After much staring at the same piece of code and not seeing the wood for the +trees for some time, we get a cup of coffee and try another approach. That +is, we bring in the cavalry by giving perl the 'B<-d>' switch on the command +line: + + > perl -d data + Default die handler restored. + + Loading DB routines from perl5db.pl version 1.07 + Editor support available. + + Enter h or `h h' for help, or `man perldebug' for more help. + + main::(./data:4): my $key = 'welcome'; + +Now, what we've done here is to launch the built-in perl debugger on our +script. It's stopped at the first line of executable code and is waiting for +input. + +Before we go any further, you'll want to know how to quit the debugger: use +just the letter 'B<q>', not the words 'quit' or 'exit': + + DB<1> q + > + +That's it, you're back on home turf again. + + +=head1 help + +Fire the debugger up again on your script and we'll look at the help menu. 
+There's a couple of ways of calling help: a simple 'B<h>' will get you a long +scrolled list of help, 'B<|h>' (pipe-h) will pipe the help through your pager +('more' or 'less' probably), and finally, 'B<h h>' (h-space-h) will give you a +helpful mini-screen snapshot: + + DB<1> h h + List/search source lines: Control script execution: + l [ln|sub] List source code T Stack trace + - or . List previous/current line s [expr] Single step [in expr] + w [line] List around line n [expr] Next, steps over subs + f filename View source in file <CR/Enter> Repeat last n or s + /pattern/ ?patt? Search forw/backw r Return from subroutine + v Show versions of modules c [ln|sub] Continue until position + Debugger controls: L List +break/watch/actions + O [...] Set debugger options t [expr] Toggle trace [trace expr] + <[<]|{[{]|>[>] [cmd] Do pre/post-prompt b [ln|event|sub] [cnd] Set breakpoint + ! [N|pat] Redo a previous command d [ln] or D Delete a/all breakpoints + H [-num] Display last num commands a [ln] cmd Do cmd before line + = [a val] Define/list an alias W expr Add a watch expression + h [db_cmd] Get help on command A or W Delete all actions/watch + |[|]db_cmd Send output to pager ![!] syscmd Run cmd in a subprocess + q or ^D Quit R Attempt a restart + Data Examination: expr Execute perl code, also see: s,n,t expr + x|m expr Evals expr in list context, dumps the result or lists methods. + p expr Print expression (uses script's current package). + S [[!]pat] List subroutine names [not] matching pattern + V [Pk [Vars]] List Variables in Package. Vars can be ~pattern or !pattern. + X [Vars] Same as "V current_package [Vars]". + For more help, type h cmd_letter, or run man perldebug for all docs. + +More confusing options than you can shake a big stick at! It's not as bad as +it looks and it's very useful to know more about all of it, and fun too! + +There's a couple of useful ones to know about straight away. 
You wouldn't +think we're using any libraries at all at the moment, but 'B<v>' will show +which modules are currently loaded, by the debugger as well as your script. +'B<V>' and 'B<X>' show variables in the program by package scope and can be +constrained by pattern. 'B<m>' shows methods and 'B<S>' shows all subroutines +(by pattern): + + DB<2>S str + dumpvar::stringify + strict::bits + strict::import + strict::unimport + +Using 'X' and cousins requires you not to use the type identifiers ($@%), just +the 'name': + + DB<3>X ~err + FileHandle(stderr) => fileno(2) + +Remember we're in our tiny program with a problem, we should have a look at +where we are, and what our data looks like. First of all let's have a window +on our present position (the first line of code in this case), via the letter +'B<w>': + + DB<4> w + 1 #!/usr/bin/perl + 2: use strict; + 3 + 4==> my $key = 'welcome'; + 5: my %data = ( + 6 'this' => qw(that), + 7 'tom' => qw(and jerry), + 8 'welcome' => q(Hello World), + 9 'zip' => q(welcome), + 10 ); + +At line number 4 is a helpful pointer, that tells you where you are now. To +see more code, type 'w' again: + + DB<4> w + 8 'welcome' => q(Hello World), + 9 'zip' => q(welcome), + 10 ); + 11: my @data = keys %data; + 12: print "All OK\n" if grep($key, keys %data); + 13: print "$data{$key}\n"; + 14: print "done: '$data{$key}'\n"; + 15: exit; + +And if you wanted to list line 5 again, type 'l 5', (note the space): + + DB<4> l 5 + 5: my %data = ( + +In this case, there's not much to see, but of course normally there's pages of +stuff to wade through, and 'l' can be very useful. To reset your view to the +line we're about to execute, type a lone period '.': + + DB<5> . + main::(./data_a:4): my $key = 'welcome'; + +The line shown is the one that is about to be executed B<next>, it hasn't +happened yet. So while we can print a variable with the letter 'B<p>', at +this point all we'd get is an empty (undefined) value back. 
What we need to +do is to step through the next executable statement with an 'B<s>': + + DB<6> s + main::(./data_a:5): my %data = ( + main::(./data_a:6): 'this' => qw(that), + main::(./data_a:7): 'tom' => qw(and jerry), + main::(./data_a:8): 'welcome' => q(Hello World), + main::(./data_a:9): 'zip' => q(welcome), + main::(./data_a:10): ); + +Now we can have a look at that first ($key) variable: + + DB<7> p $key + welcome + +line 13 is where the action is, so let's continue down to there via the letter +'B<c>', which by the way, inserts a 'one-time-only' breakpoint at the given +line or sub routine: + + DB<8> c 13 + All OK + main::(./data_a:13): print "$data{$key}\n"; + +We've gone past our check (where 'All OK' was printed) and have stopped just +before the meat of our task. We could try to print out a couple of variables +to see what is happening: + + DB<9> p $data{$key} + +Not much in there, lets have a look at our hash: + + DB<10> p %data + Hello Worldziptomandwelcomejerrywelcomethisthat + + DB<11> p keys %data + Hello Worldtomwelcomejerrythis + +Well, this isn't very easy to read, and using the helpful manual (B<h h>), the +'B<x>' command looks promising: + + DB<12> x %data + 0 'Hello World' + 1 'zip' + 2 'tom' + 3 'and' + 4 'welcome' + 5 undef + 6 'jerry' + 7 'welcome' + 8 'this' + 9 'that' + +That's not much help, a couple of welcomes in there, but no indication of +which are keys, and which are values, it's just a listed array dump and, in +this case, not particularly helpful. The trick here, is to use a B<reference> +to the data structure: + + DB<13> x \%data + 0 HASH(0x8194bc4) + 'Hello World' => 'zip' + 'jerry' => 'welcome' + 'this' => 'that' + 'tom' => 'and' + 'welcome' => undef + +The reference is truly dumped and we can finally see what we're dealing with. +Our quoting was perfectly valid but wrong for our purposes, with 'and jerry' +being treated as 2 separate words rather than a phrase, thus throwing the +evenly paired hash structure out of alignment. 
+ +The 'B<-w>' switch would have told us about this, had we used it at the start, +and saved us a lot of trouble: + + > perl -w data + Odd number of elements in hash assignment at ./data line 5. + +We fix our quoting: 'tom' => q(and jerry), and run it again, this time we get +our expected output: + + > perl -w data + Hello World + + +While we're here, take a closer look at the 'B<x>' command, it's really useful +and will merrily dump out nested references, complete objects, partial objects +- just about whatever you throw at it: + +Let's make a quick object and x-plode it, first we'll start the debugger: +it wants some form of input from STDIN, so we give it something non-committal, +a zero: + + > perl -de 0 + Default die handler restored. + + Loading DB routines from perl5db.pl version 1.07 + Editor support available. + + Enter h or `h h' for help, or `man perldebug' for more help. + + main::(-e:1): 0 + +Now build an on-the-fly object over a couple of lines (note the backslash): + + DB<1> $obj = bless({'unique_id'=>'123', 'attr'=> \ + cont: {'col' => 'black', 'things' => [qw(this that etc)]}}, 'MY_class') + +And let's have a look at it: + + DB<2> x $obj + 0 MY_class=HASH(0x828ad98) + 'attr' => HASH(0x828ad68) + 'col' => 'black' + 'things' => ARRAY(0x828abb8) + 0 'this' + 1 'that' + 2 'etc' + 'unique_id' => 123 + DB<3> + +Useful, huh? 
You can eval nearly anything in there, and experiment with bits +of code or regexes until the cows come home: + + DB<3> @data = qw(this that the other atheism leather theory scythe) + + DB<4> p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data)) + atheism + leather + other + scythe + the + theory + saw -> 6 + +If you want to see the command History, type an 'B<H>': + + DB<5> H + 4: p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data)) + 3: @data = qw(this that the other atheism leather theory scythe) + 2: x $obj + 1: $obj = bless({'unique_id'=>'123', 'attr'=> + {'col' => 'black', 'things' => [qw(this that etc)]}}, 'MY_class') + DB<5> + +And if you want to repeat any previous command, use the exclamation: 'B<!>': + + DB<5> !4 + p 'saw -> '.($cnt += map { print "$_\n" } grep(/the/, sort @data)) + atheism + leather + other + scythe + the + theory + saw -> 12 + +For more on references see L<perlref> and L<perlreftut> + + +=head1 Stepping through code + +Here's a simple program which converts between Celsius and Fahrenheit, it too +has a problem: + + #!/usr/bin/perl -w + use strict; + + my $arg = $ARGV[0] || '-c20'; + + if ($arg =~ /^\-(c|f)((\-|\+)*\d+(\.\d+)*)$/) { + my ($deg, $num) = ($1, $2); + my ($in, $out) = ($num, $num); + if ($deg eq 'c') { + $deg = 'f'; + $out = &c2f($num); + } else { + $deg = 'c'; + $out = &f2c($num); + } + $out = sprintf('%0.2f', $out); + $out =~ s/^((\-|\+)*\d+)\.0+$/$1/; + print "$out $deg\n"; + } else { + print "Usage: $0 -[c|f] num\n"; + } + exit; + + sub f2c { + my $f = shift; + my $c = 5 * $f - 32 / 9; + return $c; + } + + sub c2f { + my $c = shift; + my $f = 9 * $c / 5 + 32; + return $f; + } + + +For some reason, the Fahrenheit to Celsius conversion fails to return the +expected output. This is what it does: + + > temp -c0.72 + 33.30 f + + > temp -f33.3 + 162.94 c + +Not very consistent! We'll set a breakpoint in the code manually and run it +under the debugger to see what's going on. 
A breakpoint is a flag to which +the debugger will run without interruption; when it reaches the breakpoint, it +will stop execution and offer a prompt for further interaction. In normal +use, these debugger commands are completely ignored, and they are safe - if a +little messy - to leave in production code. + + my ($in, $out) = ($num, $num); + $DB::single=2; # insert at line 9! + if ($deg eq 'c') + ... + + > perl -d temp -f33.3 + Default die handler restored. + + Loading DB routines from perl5db.pl version 1.07 + Editor support available. + + Enter h or `h h' for help, or `man perldebug' for more help. + + main::(temp:4): my $arg = $ARGV[0] || '-c20'; + +We'll simply continue down to our pre-set breakpoint with a 'B<c>': + + DB<1> c + main::(temp:10): if ($deg eq 'c') { + +Followed by a window command to see where we are: + + DB<1> w + 7: my ($deg, $num) = ($1, $2); + 8: my ($in, $out) = ($num, $num); + 9: $DB::single=2; + 10==> if ($deg eq 'c') { + 11: $deg = 'f'; + 12: $out = &c2f($num); + 13 } else { + 14: $deg = 'c'; + 15: $out = &f2c($num); + 16 } + +And a print to show what values we're currently using: + + DB<1> p $deg, $num + f33.3 + +We can put another breakpoint on any line beginning with a colon, we'll use +line 17 as that's just as we come out of the subroutine, and we'd like to +pause there later on: + + DB<2> b 17 + +There's no feedback from this, but you can see what breakpoints are set by +using the list 'L' command: + + DB<3> L + temp: + 17: print "$out $deg\n"; + break if (1) + +Note that to delete a breakpoint you use 'd' or 'D'.
+ +Now we'll continue down into our subroutine, this time rather than by line +number, we'll use the subroutine name, followed by the now familiar 'w': + + DB<3> c f2c + main::f2c(temp:30): my $f = shift; + + DB<4> w + 24: exit; + 25 + 26 sub f2c { + 27==> my $f = shift; + 28: my $c = 5 * $f - 32 / 9; + 29: return $c; + 30 } + 31 + 32 sub c2f { + 33: my $c = shift; + + +Note that if there was a subroutine call between us and line 29, and we wanted +to B<single-step> through it, we could use the 'B<s>' command, and to step +over it we would use 'B<n>' which would execute the sub, but not descend into +it for inspection. In this case though, we simply continue down to line 29: + + DB<4> c 29 + main::f2c(temp:29): return $c; + +And have a look at the return value: + + DB<5> p $c + 162.944444444444 + +This is not the right answer at all, but the sum looks correct. I wonder if +it's anything to do with operator precedence? We'll try a couple of other +possibilities with our sum: + + DB<6> p (5 * $f - 32 / 9) + 162.944444444444 + + DB<7> p 5 * $f - (32 / 9) + 162.944444444444 + + DB<8> p (5 * $f) - 32 / 9 + 162.944444444444 + + DB<9> p 5 * ($f - 32) / 9 + 0.722222222222221 + +:-) that's more like it! Ok, now we can set our return variable and we'll +return out of the sub with an 'r': + + DB<10> $c = 5 * ($f - 32) / 9 + + DB<11> r + scalar context return from main::f2c: 0.722222222222221 + +Looks good, let's just continue off the end of the script: + + DB<12> c + 0.72 c + Debugged program terminated. Use q to quit or R to restart, + use O inhibit_exit to avoid stopping after program termination, + h q, h R or h O to get additional info. + +A quick fix to the offending line (insert the missing parentheses) in the +actual program and we're finished. + + +=head1 Placeholder for a, w, t, T + +Actions, watch variables, stack traces etc.: on the TODO list. + + a + + W + + t + + T + + +=head1 REGULAR EXPRESSIONS + +Ever wanted to know what a regex looked like? 
You'll need perl compiled with +the DEBUGGING flag for this one: + + > perl -Dr -e '/^pe(a)*rl$/i' + Compiling REx `^pe(a)*rl$' + size 17 first at 2 + rarest char + at 0 + 1: BOL(2) + 2: EXACTF <pe>(4) + 4: CURLYN[1] {0,32767}(14) + 6: NOTHING(8) + 8: EXACTF <a>(0) + 12: WHILEM(0) + 13: NOTHING(14) + 14: EXACTF <rl>(16) + 16: EOL(17) + 17: END(0) + floating `'$ at 4..2147483647 (checking floating) stclass `EXACTF <pe>' +anchored(BOL) minlen 4 + Omitting $` $& $' support. + + EXECUTING... + + Freeing REx: `^pe(a)*rl$' + +Did you really want to know? :-) +For more gory details on getting regular expressions to work, have a look at +L<perlre>, L<perlretut>, and to decode the mysterious labels (BOL and CURLYN, +etc. above), see L<perldebguts>. + + +=head1 OUTPUT TIPS + +To get all the output from your error log, and not miss any messages via +helpful operating system buffering, insert a line like this, at the start of +your script: + + $|=1; + +To watch the tail of a dynamically growing logfile, (from the command line): + + tail -f $error_log + +Wrapping all die calls in a handler routine can be useful to see how, and from +where, they're being called, L<perlvar> has more information: + + BEGIN { $SIG{__DIE__} = sub { require Carp; Carp::confess(@_) } } + +Various useful techniques for the redirection of STDOUT and STDERR filehandles +are explained in L<perlopentut> and L<perlfaq8>. + + +=head1 CGI + +Just a quick hint here for all those CGI programmers who can't figure out how +on earth to get past that 'waiting for input' prompt, when running their CGI +script from the command-line, try something like this: + + > perl -d my_cgi.pl -nodebug + +Of course L<CGI> and L<perlfaq9> will tell you more. + + +=head1 GUIs + +The command line interface is tightly integrated with an B<emacs> extension +and there's a B<vi> interface too. + +You don't have to do this all on the command line, though, there are a few GUI +options out there. 
The nice thing about these is you can wave a mouse over a +variable and a dump of its data will appear in an appropriate window, or in a +popup balloon, no more tiresome typing of 'x $varname' :-) + +In particular have a hunt around for the following: + +B<ptkdb> perlTK based wrapper for the built-in debugger + +B<ddd> data display debugger + +B<PerlDevKit> and B<PerlBuilder> are NT specific + +NB. (more info on these and others would be appreciated). + + +=head1 SUMMARY + +We've seen how to encourage good coding practices with B<use strict> and +B<-w>. You can run the perl debugger B<perl -d scriptname> to inspect your +data from within it with the B<p> and B<x> commands. You can +walk through your code, set breakpoints with B<b> and step through that code +with B<s> or B<n>, continue with B<c> and return from a sub with B<r>. Fairly +intuitive stuff when you get down to it. + +There is of course lots more to find out about, this has just scratched the +surface. The best way to learn more is to use perldoc to find out more about +the language, to read the on-line help (L<perldebug> is probably the next +place to go), and of course, experiment. + + +=head1 SEE ALSO + +L<perldebug>, +L<perldebguts>, +L<perldiag>, +L<dprofpp>, +L<perlrun> + + +=head1 AUTHOR + +Richard Foley <richard@rfi.net> Copyright (c) 2000 + + +=head1 CONTRIBUTORS + +Various people have made helpful suggestions and contributions, in particular: + +Ronald J Kimball <rjk@linguist.dartmouth.edu> + +Hugo van der Sanden <hv@crypt0.demon.co.uk> + +Peter Scott <Peter@PSDT.com> + diff --git a/contrib/perl5/pod/perlebcdic.pod b/contrib/perl5/pod/perlebcdic.pod new file mode 100644 index 0000000000000..12ea2f3ef4b16 --- /dev/null +++ b/contrib/perl5/pod/perlebcdic.pod @@ -0,0 +1,1235 @@ +=head1 NAME + +perlebcdic - Considerations for running Perl on EBCDIC platforms + +=head1 DESCRIPTION + +An exploration of some of the issues facing Perl programmers +on EBCDIC based computers.
We do not cover localization, +internationalization, or multi byte character set issues (yet). + +Portions that are still incomplete are marked with XXX. + +=head1 COMMON CHARACTER CODE SETS + +=head2 ASCII + +The American Standard Code for Information Interchange is a set of +integers running from 0 to 127 (decimal) that imply character +interpretation by the display and other system(s) of computers. +The range 0..127 can be covered by setting the bits in a 7-bit binary +number, hence the set is sometimes referred to as "7-bit ASCII". +ASCII was described by the American National Standards Institute +document ANSI X3.4-1986. It was also described by ISO 646:1991 +(with localization for currency symbols). The full ASCII set is +given in the table below as the first 128 elements. Languages that +can be written adequately with the characters in ASCII include +English, Hawaiian, Indonesian, Swahili and some Native American +languages. + +There are many character sets that extend the range of integers +from 0..2**7-1 up to 2**8-1, or 8 bit bytes (octets if you prefer). +One common one is the ISO 8859-1 character set. + +=head2 ISO 8859 + +The ISO 8859-$n are a collection of character code sets from the +International Organization for Standardization (ISO) each of which +adds characters to the ASCII set that are typically found in European +languages many of which are based on the Roman, or Latin, alphabet. + +=head2 Latin 1 (ISO 8859-1) + +A particular 8-bit extension to ASCII that includes grave and acute +accented Latin characters. Languages that can employ ISO 8859-1 +include all the languages covered by ASCII as well as Afrikaans, +Albanian, Basque, Catalan, Danish, Faroese, Finnish, Norwegian, +Portuguese, Spanish, and Swedish. Dutch is covered albeit without +the ij ligature. French is covered too but without the oe ligature. +German can use ISO 8859-1 but must do so without German-style +quotation marks.
This set is based on Western European extensions +to ASCII and is commonly encountered in world wide web work. +In IBM character code set identification terminology ISO 8859-1 is +also known as CCSID 819 (or sometimes 0819 or even 00819). + +=head2 EBCDIC + +The Extended Binary Coded Decimal Interchange Code refers to a +large collection of slightly different single and multi byte +coded character sets that are different from ASCII or ISO 8859-1 +and typically run on host computers. The EBCDIC encodings derive +from 8 bit byte extensions of Hollerith punched card encodings. +The layout on the cards was such that high bits were set for the +upper and lower case alphabet characters [a-z] and [A-Z], but there +were gaps within each latin alphabet range. + +Some IBM EBCDIC character sets may be known by character code set +identification numbers (CCSID numbers) or code page numbers. Leading +zero digits in CCSID numbers within this document are insignificant. +E.g. CCSID 0037 may be referred to as 37 in places. + +=head2 13 variant characters + +Among IBM EBCDIC character code sets there are 13 characters that +are often mapped to different integer values. Those characters +are known as the 13 "variant" characters and are: + + \ [ ] { } ^ ~ ! # | $ @ ` + +=head2 0037 + +Character code set ID 0037 is a mapping of the ASCII plus Latin-1 +characters (i.e. ISO 8859-1) to an EBCDIC set. 0037 is used +in North American English locales on the OS/400 operating system +that runs on AS/400 computers. CCSID 37 differs from ISO 8859-1 +in 237 places, in other words they agree on only 19 code point values. + +=head2 1047 + +Character code set ID 1047 is also a mapping of the ASCII plus +Latin-1 characters (i.e. ISO 8859-1) to an EBCDIC set. 1047 is +used under Unix System Services for OS/390, and OpenEdition for VM/ESA. +CCSID 1047 differs from CCSID 0037 in eight places. + +=head2 POSIX-BC + +The EBCDIC code page in use on Siemens' BS2000 system is distinct from +1047 and 0037. 
It is identified below as the POSIX-BC set. + +=head1 SINGLE OCTET TABLES + +The following tables list the ASCII and Latin 1 ordered sets including +the subsets: C0 controls (0x00..0x1f), ASCII graphics (0x20..0x7e), delete (0x7f), +C1 controls (0x80..0x9f), and Latin-1 (a.k.a. ISO 8859-1) (0xa0..0xff). In the +table non-printing control character names as well as the Latin 1 +extensions to ASCII have been labelled with character names roughly +corresponding to I<The Unicode Standard, Version 2.0> albeit with +substitutions such as s/LATIN// and s/VULGAR// in all cases, +s/CAPITAL LETTER// in some cases, and s/SMALL LETTER ([A-Z])/\l$1/ +in some other cases (the C<charnames> pragma names unfortunately do +not list explicit names for the C0 or C1 control characters). The +"names" of the C1 control set (128..159 in ISO 8859-1) listed here are +somewhat arbitrary. The differences between the 0037 and 1047 sets are +flagged with ***. The differences between the 1047 and POSIX-BC sets +are flagged with ###. All ord() numbers listed are decimal.
If you +would rather see this table listing octal values then run the table +(that is, the pod version of this document since this recipe may not +work with a pod2_other_format translation) through: + +=over 4 + +=item recipe 0 + +=back + + perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ + -e '{printf("%s%-9o%-9o%-9o%-9o\n",$1,$2,$3,$4,$5)}' perlebcdic.pod + +If you would rather see this table listing hexadecimal values then +run the table through: + +=over 4 + +=item recipe 1 + +=back + + perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ + -e '{printf("%s%-9X%-9X%-9X%-9X\n",$1,$2,$3,$4,$5)}' perlebcdic.pod + + + 8859-1 + chr 0819 0037 1047 POSIX-BC + ---------------------------------------------------------------- + <NULL> 0 0 0 0 + <START OF HEADING> 1 1 1 1 + <START OF TEXT> 2 2 2 2 + <END OF TEXT> 3 3 3 3 + <END OF TRANSMISSION> 4 55 55 55 + <ENQUIRY> 5 45 45 45 + <ACKNOWLEDGE> 6 46 46 46 + <BELL> 7 47 47 47 + <BACKSPACE> 8 22 22 22 + <HORIZONTAL TABULATION> 9 5 5 5 + <LINE FEED> 10 37 21 21 *** + <VERTICAL TABULATION> 11 11 11 11 + <FORM FEED> 12 12 12 12 + <CARRIAGE RETURN> 13 13 13 13 + <SHIFT OUT> 14 14 14 14 + <SHIFT IN> 15 15 15 15 + <DATA LINK ESCAPE> 16 16 16 16 + <DEVICE CONTROL ONE> 17 17 17 17 + <DEVICE CONTROL TWO> 18 18 18 18 + <DEVICE CONTROL THREE> 19 19 19 19 + <DEVICE CONTROL FOUR> 20 60 60 60 + <NEGATIVE ACKNOWLEDGE> 21 61 61 61 + <SYNCHRONOUS IDLE> 22 50 50 50 + <END OF TRANSMISSION BLOCK> 23 38 38 38 + <CANCEL> 24 24 24 24 + <END OF MEDIUM> 25 25 25 25 + <SUBSTITUTE> 26 63 63 63 + <ESCAPE> 27 39 39 39 + <FILE SEPARATOR> 28 28 28 28 + <GROUP SEPARATOR> 29 29 29 29 + <RECORD SEPARATOR> 30 30 30 30 + <UNIT SEPARATOR> 31 31 31 31 + <SPACE> 32 64 64 64 + ! 33 90 90 90 + " 34 127 127 127 + # 35 123 123 123 + $ 36 91 91 91 + % 37 108 108 108 + & 38 80 80 80 + ' 39 125 125 125 + ( 40 77 77 77 + ) 41 93 93 93 + * 42 92 92 92 + + 43 78 78 78 + , 44 107 107 107 + - 45 96 96 96 + . 
46 75 75 75 + / 47 97 97 97 + 0 48 240 240 240 + 1 49 241 241 241 + 2 50 242 242 242 + 3 51 243 243 243 + 4 52 244 244 244 + 5 53 245 245 245 + 6 54 246 246 246 + 7 55 247 247 247 + 8 56 248 248 248 + 9 57 249 249 249 + : 58 122 122 122 + ; 59 94 94 94 + < 60 76 76 76 + = 61 126 126 126 + > 62 110 110 110 + ? 63 111 111 111 + @ 64 124 124 124 + A 65 193 193 193 + B 66 194 194 194 + C 67 195 195 195 + D 68 196 196 196 + E 69 197 197 197 + F 70 198 198 198 + G 71 199 199 199 + H 72 200 200 200 + I 73 201 201 201 + J 74 209 209 209 + K 75 210 210 210 + L 76 211 211 211 + M 77 212 212 212 + N 78 213 213 213 + O 79 214 214 214 + P 80 215 215 215 + Q 81 216 216 216 + R 82 217 217 217 + S 83 226 226 226 + T 84 227 227 227 + U 85 228 228 228 + V 86 229 229 229 + W 87 230 230 230 + X 88 231 231 231 + Y 89 232 232 232 + Z 90 233 233 233 + [ 91 186 173 187 *** ### + \ 92 224 224 188 ### + ] 93 187 189 189 *** + ^ 94 176 95 106 *** ### + _ 95 109 109 109 + ` 96 121 121 74 ### + a 97 129 129 129 + b 98 130 130 130 + c 99 131 131 131 + d 100 132 132 132 + e 101 133 133 133 + f 102 134 134 134 + g 103 135 135 135 + h 104 136 136 136 + i 105 137 137 137 + j 106 145 145 145 + k 107 146 146 146 + l 108 147 147 147 + m 109 148 148 148 + n 110 149 149 149 + o 111 150 150 150 + p 112 151 151 151 + q 113 152 152 152 + r 114 153 153 153 + s 115 162 162 162 + t 116 163 163 163 + u 117 164 164 164 + v 118 165 165 165 + w 119 166 166 166 + x 120 167 167 167 + y 121 168 168 168 + z 122 169 169 169 + { 123 192 192 251 ### + | 124 79 79 79 + } 125 208 208 253 ### + ~ 126 161 161 255 ### + <DELETE> 127 7 7 7 + <C1 0> 128 32 32 32 + <C1 1> 129 33 33 33 + <C1 2> 130 34 34 34 + <C1 3> 131 35 35 35 + <C1 4> 132 36 36 36 + <C1 5> 133 21 37 37 *** + <C1 6> 134 6 6 6 + <C1 7> 135 23 23 23 + <C1 8> 136 40 40 40 + <C1 9> 137 41 41 41 + <C1 10> 138 42 42 42 + <C1 11> 139 43 43 43 + <C1 12> 140 44 44 44 + <C1 13> 141 9 9 9 + <C1 14> 142 10 10 10 + <C1 15> 143 27 27 27 + <C1 16> 144 48 48 48 + <C1 17> 145 
49 49 49 + <C1 18> 146 26 26 26 + <C1 19> 147 51 51 51 + <C1 20> 148 52 52 52 + <C1 21> 149 53 53 53 + <C1 22> 150 54 54 54 + <C1 23> 151 8 8 8 + <C1 24> 152 56 56 56 + <C1 25> 153 57 57 57 + <C1 26> 154 58 58 58 + <C1 27> 155 59 59 59 + <C1 28> 156 4 4 4 + <C1 29> 157 20 20 20 + <C1 30> 158 62 62 62 + <C1 31> 159 255 255 95 ### + <NON-BREAKING SPACE> 160 65 65 65 + <INVERTED EXCLAMATION MARK> 161 170 170 170 + <CENT SIGN> 162 74 74 176 ### + <POUND SIGN> 163 177 177 177 + <CURRENCY SIGN> 164 159 159 159 + <YEN SIGN> 165 178 178 178 + <BROKEN BAR> 166 106 106 208 ### + <SECTION SIGN> 167 181 181 181 + <DIAERESIS> 168 189 187 121 *** ### + <COPYRIGHT SIGN> 169 180 180 180 + <FEMININE ORDINAL INDICATOR> 170 154 154 154 + <LEFT POINTING GUILLEMET> 171 138 138 138 + <NOT SIGN> 172 95 176 186 *** ### + <SOFT HYPHEN> 173 202 202 202 + <REGISTERED TRADE MARK SIGN> 174 175 175 175 + <MACRON> 175 188 188 161 ### + <DEGREE SIGN> 176 144 144 144 + <PLUS-OR-MINUS SIGN> 177 143 143 143 + <SUPERSCRIPT TWO> 178 234 234 234 + <SUPERSCRIPT THREE> 179 250 250 250 + <ACUTE ACCENT> 180 190 190 190 + <MICRO SIGN> 181 160 160 160 + <PARAGRAPH SIGN> 182 182 182 182 + <MIDDLE DOT> 183 179 179 179 + <CEDILLA> 184 157 157 157 + <SUPERSCRIPT ONE> 185 218 218 218 + <MASC. 
ORDINAL INDICATOR> 186 155 155 155 + <RIGHT POINTING GUILLEMET> 187 139 139 139 + <FRACTION ONE QUARTER> 188 183 183 183 + <FRACTION ONE HALF> 189 184 184 184 + <FRACTION THREE QUARTERS> 190 185 185 185 + <INVERTED QUESTION MARK> 191 171 171 171 + <A WITH GRAVE> 192 100 100 100 + <A WITH ACUTE> 193 101 101 101 + <A WITH CIRCUMFLEX> 194 98 98 98 + <A WITH TILDE> 195 102 102 102 + <A WITH DIAERESIS> 196 99 99 99 + <A WITH RING ABOVE> 197 103 103 103 + <CAPITAL LIGATURE AE> 198 158 158 158 + <C WITH CEDILLA> 199 104 104 104 + <E WITH GRAVE> 200 116 116 116 + <E WITH ACUTE> 201 113 113 113 + <E WITH CIRCUMFLEX> 202 114 114 114 + <E WITH DIAERESIS> 203 115 115 115 + <I WITH GRAVE> 204 120 120 120 + <I WITH ACUTE> 205 117 117 117 + <I WITH CIRCUMFLEX> 206 118 118 118 + <I WITH DIAERESIS> 207 119 119 119 + <CAPITAL LETTER ETH> 208 172 172 172 + <N WITH TILDE> 209 105 105 105 + <O WITH GRAVE> 210 237 237 237 + <O WITH ACUTE> 211 238 238 238 + <O WITH CIRCUMFLEX> 212 235 235 235 + <O WITH TILDE> 213 239 239 239 + <O WITH DIAERESIS> 214 236 236 236 + <MULTIPLICATION SIGN> 215 191 191 191 + <O WITH STROKE> 216 128 128 128 + <U WITH GRAVE> 217 253 253 224 ### + <U WITH ACUTE> 218 254 254 254 + <U WITH CIRCUMFLEX> 219 251 251 221 ### + <U WITH DIAERESIS> 220 252 252 252 + <Y WITH ACUTE> 221 173 186 173 *** ### + <CAPITAL LETTER THORN> 222 174 174 174 + <SMALL LETTER SHARP S> 223 89 89 89 + <a WITH GRAVE> 224 68 68 68 + <a WITH ACUTE> 225 69 69 69 + <a WITH CIRCUMFLEX> 226 66 66 66 + <a WITH TILDE> 227 70 70 70 + <a WITH DIAERESIS> 228 67 67 67 + <a WITH RING ABOVE> 229 71 71 71 + <SMALL LIGATURE ae> 230 156 156 156 + <c WITH CEDILLA> 231 72 72 72 + <e WITH GRAVE> 232 84 84 84 + <e WITH ACUTE> 233 81 81 81 + <e WITH CIRCUMFLEX> 234 82 82 82 + <e WITH DIAERESIS> 235 83 83 83 + <i WITH GRAVE> 236 88 88 88 + <i WITH ACUTE> 237 85 85 85 + <i WITH CIRCUMFLEX> 238 86 86 86 + <i WITH DIAERESIS> 239 87 87 87 + <SMALL LETTER eth> 240 140 140 140 + <n WITH TILDE> 241 73 73 73 + <o WITH 
GRAVE> 242 205 205 205 + <o WITH ACUTE> 243 206 206 206 + <o WITH CIRCUMFLEX> 244 203 203 203 + <o WITH TILDE> 245 207 207 207 + <o WITH DIAERESIS> 246 204 204 204 + <DIVISION SIGN> 247 225 225 225 + <o WITH STROKE> 248 112 112 112 + <u WITH GRAVE> 249 221 221 192 ### + <u WITH ACUTE> 250 222 222 222 + <u WITH CIRCUMFLEX> 251 219 219 219 + <u WITH DIAERESIS> 252 220 220 220 + <y WITH ACUTE> 253 141 141 141 + <SMALL LETTER thorn> 254 142 142 142 + <y WITH DIAERESIS> 255 223 223 223 + +If you would rather see the above table in CCSID 0037 order rather than +ASCII + Latin-1 order then run the table through: + +=over 4 + +=item recipe 2 + +=back + + perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,42,3)]}@l;}' perlebcdic.pod + +If you would rather see it in CCSID 1047 order then change the digit +42 in the last line to 51, like this: + +=over 4 + +=item recipe 3 + +=back + + perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,51,3)]}@l;}' perlebcdic.pod + +If you would rather see it in POSIX-BC order then change the digit +51 in the last line to 60, like this: + +=over 4 + +=item recipe 4 + +=back + + perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,60,3)]}@l;}' perlebcdic.pod + + +=head1 IDENTIFYING CHARACTER CODE SETS + +To determine the character set you are running under from perl one +could use the return value of ord() or chr() to test one or more +character values. 
For example: + + $is_ascii = "A" eq chr(65); + $is_ebcdic = "A" eq chr(193); + +Also, "\t" is a C<HORIZONTAL TABULATION> character so that: + + $is_ascii = ord("\t") == 9; + $is_ebcdic = ord("\t") == 5; + +To distinguish EBCDIC code pages try looking at one or more of +the characters that differ between them. For example: + + $is_ebcdic_37 = "\n" eq chr(37); + $is_ebcdic_1047 = "\n" eq chr(21); + +Or better still choose a character that is uniquely encoded in any +of the code sets, e.g.: + + $is_ascii = ord('[') == 91; + $is_ebcdic_37 = ord('[') == 186; + $is_ebcdic_1047 = ord('[') == 173; + $is_ebcdic_POSIX_BC = ord('[') == 187; + +However, it would be unwise to write tests such as: + + $is_ascii = "\r" ne chr(13); # WRONG + $is_ascii = "\n" ne chr(10); # ILL ADVISED + +Obviously the first of these will fail to distinguish most ASCII machines +from either a CCSID 0037, a 1047, or a POSIX-BC EBCDIC machine since "\r" eq +chr(13) under all of those coded character sets. But note too that +because "\n" is chr(13) and "\r" is chr(10) on the Macintosh (which is an +ASCII machine) the second C<$is_ascii> test will lead to trouble there. + +To determine whether or not perl was built under an EBCDIC +code page you can use the Config module like so: + + use Config; + $is_ebcdic = $Config{'ebcdic'} eq 'define'; + +=head1 CONVERSIONS + +=head2 tr/// + +In order to convert a string of characters from one character set to +another a simple list of numbers, such as in the right columns in the +above table, along with perl's tr/// operator is all that is needed. +The data in the table are in ASCII order hence the EBCDIC columns +provide easy to use ASCII to EBCDIC operations that are also easily +reversed. + +For example, to convert ASCII to code page 037 take the output of the second +column from the output of recipe 0 (modified to add \\ characters) and use +it in tr/// like so: + + $cp_037 = + '\000\001\002\003\234\011\206\177\227\215\216\013\014\015\016\017' .
+ '\020\021\022\023\235\205\010\207\030\031\222\217\034\035\036\037' . + '\200\201\202\203\204\012\027\033\210\211\212\213\214\005\006\007' . + '\220\221\026\223\224\225\226\004\230\231\232\233\024\025\236\032' . + '\040\240\342\344\340\341\343\345\347\361\242\056\074\050\053\174' . + '\046\351\352\353\350\355\356\357\354\337\041\044\052\051\073\254' . + '\055\057\302\304\300\301\303\305\307\321\246\054\045\137\076\077' . + '\370\311\312\313\310\315\316\317\314\140\072\043\100\047\075\042' . + '\330\141\142\143\144\145\146\147\150\151\253\273\360\375\376\261' . + '\260\152\153\154\155\156\157\160\161\162\252\272\346\270\306\244' . + '\265\176\163\164\165\166\167\170\171\172\241\277\320\335\336\256' . + '\136\243\245\267\251\247\266\274\275\276\133\135\257\250\264\327' . + '\173\101\102\103\104\105\106\107\110\111\255\364\366\362\363\365' . + '\175\112\113\114\115\116\117\120\121\122\271\373\374\371\372\377' . + '\134\367\123\124\125\126\127\130\131\132\262\324\326\322\323\325' . + '\060\061\062\063\064\065\066\067\070\071\263\333\334\331\332\237' ; + + my $ebcdic_string = $ascii_string; + eval '$ebcdic_string =~ tr/\000-\377/' . $cp_037 . '/'; + +To convert from EBCDIC 037 to ASCII just reverse the order of the tr/// +arguments like so: + + my $ascii_string = $ebcdic_string; + eval '$ascii_string =~ tr/' . $cp_037 . '/\000-\377/'; + +Similarly one could take the output of the third column from recipe 0 to +obtain a C<$cp_1047> table. The fourth column of the output from recipe +0 could provide a C<$cp_posix_bc> table suitable for transcoding as well. + +=head2 iconv + +XPG operability often implies the presence of an I<iconv> utility +available from the shell or from the C library. Consult your system's +documentation for information on iconv. + +On OS/390 see the iconv(1) man page.
One way to invoke the iconv +shell utility from within perl would be to: + + # OS/390 example + $ascii_data = `echo '$ebcdic_data'| iconv -f IBM-1047 -t ISO8859-1` + +or the inverse map: + + # OS/390 example + $ebcdic_data = `echo '$ascii_data'| iconv -f ISO8859-1 -t IBM-1047` + +For other perl based conversion options see the Convert::* modules on CPAN. + +=head2 C RTL + +The OS/390 C run time library provides _atoe() and _etoa() functions. + +=head1 OPERATOR DIFFERENCES + +The C<..> range operator treats certain character ranges with +care on EBCDIC machines. For example the following array +will have twenty six elements on either an EBCDIC machine +or an ASCII machine: + + @alphabet = ('A'..'Z'); # $#alphabet == 25 + +The bitwise operators such as & ^ | may return different results +when operating on string or character data in a perl program running +on an EBCDIC machine than when run on an ASCII machine. Here is +an example adapted from the one in L<perlop>: + + # EBCDIC-based examples + print "j p \n" ^ " a h"; # prints "JAPH\n" + print "JA" | " ph\n"; # prints "japh\n" + print "JAPH\nJunk" & "\277\277\277\277\277"; # prints "japh\n"; + print 'p N$' ^ " E<H\n"; # prints "Perl\n"; + +An interesting property of the 32 C0 control characters +in the ASCII table is that they can "literally" be constructed +as control characters in perl, e.g. C<(chr(0) eq "\c@")> +C<(chr(1) eq "\cA")>, and so on. Perl on EBCDIC machines has been +ported to take "\c@" to chr(0) and "\cA" to chr(1) as well, but the +thirty three characters that result depend on which code page you are +using. The table below uses the character names from the previous table +but with substitutions such as s/START OF/S.O./; s/END OF /E.O./; +s/TRANSMISSION/TRANS./; s/TABULATION/TAB./; s/VERTICAL/VERT./; +s/HORIZONTAL/HORIZ./; s/DEVICE CONTROL/D.C./; s/SEPARATOR/SEP./; +s/NEGATIVE ACKNOWLEDGE/NEG. ACK./;. 
The POSIX-BC and 1047 sets are +identical throughout this range and differ from the 0037 set at only +one spot (21 decimal). Note that the C<LINE FEED> character +may be generated by "\cJ" on ASCII machines but by "\cU" on 1047 or POSIX-BC +machines and cannot be generated as a C<"\c.letter."> control character on +0037 machines. Note also that "\c\\" maps to two characters +not one. + + chr ord 8859-1 0037 1047 && POSIX-BC + ------------------------------------------------------------------------ + "\c?" 127 <DELETE> " " ***>< + "\c@" 0 <NULL> <NULL> <NULL> ***>< + "\cA" 1 <S.O. HEADING> <S.O. HEADING> <S.O. HEADING> + "\cB" 2 <S.O. TEXT> <S.O. TEXT> <S.O. TEXT> + "\cC" 3 <E.O. TEXT> <E.O. TEXT> <E.O. TEXT> + "\cD" 4 <E.O. TRANS.> <C1 28> <C1 28> + "\cE" 5 <ENQUIRY> <HORIZ. TAB.> <HORIZ. TAB.> + "\cF" 6 <ACKNOWLEDGE> <C1 6> <C1 6> + "\cG" 7 <BELL> <DELETE> <DELETE> + "\cH" 8 <BACKSPACE> <C1 23> <C1 23> + "\cI" 9 <HORIZ. TAB.> <C1 13> <C1 13> + "\cJ" 10 <LINE FEED> <C1 14> <C1 14> + "\cK" 11 <VERT. TAB.> <VERT. TAB.> <VERT. TAB.> + "\cL" 12 <FORM FEED> <FORM FEED> <FORM FEED> + "\cM" 13 <CARRIAGE RETURN> <CARRIAGE RETURN> <CARRIAGE RETURN> + "\cN" 14 <SHIFT OUT> <SHIFT OUT> <SHIFT OUT> + "\cO" 15 <SHIFT IN> <SHIFT IN> <SHIFT IN> + "\cP" 16 <DATA LINK ESCAPE> <DATA LINK ESCAPE> <DATA LINK ESCAPE> + "\cQ" 17 <D.C. ONE> <D.C. ONE> <D.C. ONE> + "\cR" 18 <D.C. TWO> <D.C. TWO> <D.C. TWO> + "\cS" 19 <D.C. THREE> <D.C. THREE> <D.C. THREE> + "\cT" 20 <D.C. FOUR> <C1 29> <C1 29> + "\cU" 21 <NEG. ACK.> <C1 5> <LINE FEED> *** + "\cV" 22 <SYNCHRONOUS IDLE> <BACKSPACE> <BACKSPACE> + "\cW" 23 <E.O. TRANS. BLOCK> <C1 7> <C1 7> + "\cX" 24 <CANCEL> <CANCEL> <CANCEL> + "\cY" 25 <E.O. MEDIUM> <E.O. MEDIUM> <E.O. 
MEDIUM> + "\cZ" 26 <SUBSTITUTE> <C1 18> <C1 18> + "\c[" 27 <ESCAPE> <C1 15> <C1 15> + "\c\\" 28 <FILE SEP.>\ <FILE SEP.>\ <FILE SEP.>\ + "\c]" 29 <GROUP SEP.> <GROUP SEP.> <GROUP SEP.> + "\c^" 30 <RECORD SEP.> <RECORD SEP.> <RECORD SEP.> ***>< + "\c_" 31 <UNIT SEP.> <UNIT SEP.> <UNIT SEP.> ***>< + + +=head1 FUNCTION DIFFERENCES + +=over 8 + +=item chr() + +chr() must be given an EBCDIC code number argument to yield a desired +character return value on an EBCDIC machine. For example: + + $CAPITAL_LETTER_A = chr(193); + +=item ord() + +ord() will return EBCDIC code number values on an EBCDIC machine. +For example: + + $the_number_193 = ord("A"); + +=item pack() + +The c and C templates for pack() are dependent upon character set +encoding. Examples of usage on EBCDIC include: + + $foo = pack("CCCC",193,194,195,196); + # $foo eq "ABCD" + $foo = pack("C4",193,194,195,196); + # same thing + + $foo = pack("ccxxcc",193,194,195,196); + # $foo eq "AB\0\0CD" + +=item print() + +One must be careful with scalars and strings that are passed to +print that contain ASCII encodings. One common place +for this to occur is in the output of the MIME type header for +CGI script writing. For example, many perl programming guides +recommend something similar to: + + print "Content-type:\ttext/html\015\012\015\012"; + # this may be wrong on EBCDIC + +Under the IBM OS/390 USS Web Server for example you should instead +write that as: + + print "Content-type:\ttext/html\r\n\r\n"; # OK for DGW et alia + +That is because the translation from EBCDIC to ASCII is done +by the web server in this case (such code will not be appropriate for +the Macintosh however). Consult your web server's documentation for +further details. + +=item printf() + +The formats that can convert characters to numbers and vice versa +will be different from their ASCII counterparts when executed +on an EBCDIC machine. 
Examples include: + + printf("%c%c%c",193,194,195); # prints ABC + +=item sort() + +EBCDIC sort results may differ from ASCII sort results especially for +mixed case strings. This is discussed in more detail below. + +=item sprintf() + +See the discussion of printf() above. An example of the use +of sprintf would be: + + $CAPITAL_LETTER_A = sprintf("%c",193); + +=item unpack() + +See the discussion of pack() above. + +=back + +=head1 REGULAR EXPRESSION DIFFERENCES + +As of perl 5.005_03 letter range regular expressions such as +[A-Z] and [a-z] have been specially coded to not pick up gap +characters. For example, characters such as E<ocirc> C<o WITH CIRCUMFLEX> +that lie between I and J would not be matched by the +regular expression range C</[H-K]/>. + +If you do want to match the alphabet gap characters in a single octet +regular expression try matching the hex or octal code such +as C</\313/> on EBCDIC or C</\364/> on ASCII machines to +have your regular expression match C<o WITH CIRCUMFLEX>. + +Another construct to be wary of is the inappropriate use of hex or +octal constants in regular expressions. Consider the following +set of subs: + + sub is_c0 { + my $char = substr(shift,0,1); + $char =~ /[\000-\037]/; + } + + sub is_print_ascii { + my $char = substr(shift,0,1); + $char =~ /[\040-\176]/; + } + + sub is_delete { + my $char = substr(shift,0,1); + $char eq "\177"; + } + + sub is_c1 { + my $char = substr(shift,0,1); + $char =~ /[\200-\237]/; + } + + sub is_latin_1 { + my $char = substr(shift,0,1); + $char =~ /[\240-\377]/; + } + +The above would be adequate if the concern was only with numeric code points. +However, the concern may be with characters rather than code points +and on an EBCDIC machine it may be desirable for constructs such as +C<if (is_print_ascii("A")) {print "A is a printable character\n";}> to print +out the expected message.
One way to represent the above collection +of character classification subs that is capable of working across the +four coded character sets discussed in this document is as follows: + + sub Is_c0 { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char =~ /[\000-\037]/; + } + if (ord('^')==176) { # 37 + return $char =~ /[\000-\003\067\055-\057\026\005\045\013-\023\074\075\062\046\030\031\077\047\034-\037]/; + } + if (ord('^')==95 || ord('^')==106) { # 1047 || posix-bc + return $char =~ /[\000-\003\067\055-\057\026\005\025\013-\023\074\075\062\046\030\031\077\047\034-\037]/; + } + } + + sub Is_print_ascii { + my $char = substr(shift,0,1); + $char =~ /[ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/; + } + + sub Is_delete { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char eq "\177"; + } + else { # ebcdic + return $char eq "\007"; + } + } + + sub Is_c1 { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char =~ /[\200-\237]/; + } + if (ord('^')==176) { # 37 + return $char =~ /[\040-\044\025\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\377]/; + } + if (ord('^')==95) { # 1047 + return $char =~ /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\377]/; + } + if (ord('^')==106) { # posix-bc + return $char =~ + /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\137]/; + } + } + + sub Is_latin_1 { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char =~ /[\240-\377]/; + } + if (ord('^')==176) { # 37 + return $char =~ + 
/[\101\252\112\261\237\262\152\265\275\264\232\212\137\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/; + } + if (ord('^')==95) { # 1047 + return $char =~ + /[\101\252\112\261\237\262\152\265\273\264\232\212\260\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\272\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/; + } + if (ord('^')==106) { # posix-bc + return $char =~ + /[\101\252\260\261\237\262\320\265\171\264\232\212\272\312\257\241\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\340\376\335\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\300\336\333\334\215\216\337]/; + } + } + +Note however that only the C<Is_ascii_print()> sub is really independent +of coded character set. Another way to write C<Is_latin_1()> would be +to use the characters in the range explicitly: + + sub Is_latin_1 { + my $char = substr(shift,0,1); + $char =~ /[ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]/; + } + +Although that form may run into trouble in network transit (due to the +presence of 8 bit characters) or on non ISO-Latin character sets. + +=head1 SOCKETS + +Most socket programming assumes ASCII character encodings in network +byte order. Exceptions can include CGI script writing under a +host web server where the server may take care of translation for you. 
+Most host web servers convert EBCDIC data to ISO-8859-1 or Unicode on +output. + +=head1 SORTING + +One big difference between ASCII based character sets and EBCDIC ones +is the relative positions of upper and lower case letters and the +letters compared to the digits. If sorted on an ASCII based machine the +two letter abbreviation for a physician comes before the two letter +abbreviation for drive, that is: + + @sorted = sort(qw(Dr. dr.)); # @sorted holds ('Dr.','dr.') on ASCII, + # but ('dr.','Dr.') on EBCDIC + +The property of lower case before uppercase letters in EBCDIC is +even carried to the Latin 1 EBCDIC pages such as 0037 and 1047. +An example would be that E<Euml> C<E WITH DIAERESIS> (203) comes +before E<euml> C<e WITH DIAERESIS> (235) on an ASCII machine, but +the latter (83) comes before the former (115) on an EBCDIC machine. +(Astute readers will note that the upper case version of E<szlig> +C<SMALL LETTER SHARP S> is simply "SS" and that the upper case version of +E<yuml> C<y WITH DIAERESIS> is not in the 0..255 range but it is +at U+0178 in Unicode, or C<"\x{178}"> in a Unicode enabled Perl). + +The sort order will cause differences between results obtained on +ASCII machines versus EBCDIC machines. What follows are some suggestions +on how to deal with these differences. + +=head2 Ignore ASCII vs. EBCDIC sort differences. + +This is the least computationally expensive strategy. It may require +some user education. + +=head2 MONO CASE then sort data. + +In order to minimize the expense of mono casing mixed text try to +C<tr///> towards the character set case most employed within the data. +If the data are primarily UPPERCASE non Latin 1 then apply tr/[a-z]/[A-Z]/ +then sort(). If the data are primarily lowercase non Latin 1 then +apply tr/[A-Z]/[a-z]/ before sorting. 
If the data are primarily UPPERCASE +and include Latin-1 characters then apply: + + tr/[a-z]/[A-Z]/; + tr/[àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ]/[ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ]/; + s/ß/SS/g; + +then sort(). Do note however that such Latin-1 manipulation does not +address the E<yuml> C<y WITH DIAERESIS> character that will remain at +code point 255 on ASCII machines, but 223 on most EBCDIC machines +where it will sort to a place less than the EBCDIC numerals. With a +Unicode enabled Perl you might try: + + tr/^?/\x{178}/; + +The strategy of mono casing data before sorting does not preserve the case +of the data and may not be acceptable for that reason. + +=head2 Convert, sort data, then re convert. + +This is the most expensive proposition that does not employ a network +connection. + +=head2 Perform sorting on one type of machine only. + +This strategy can employ a network connection. As such +it would be computationally expensive. + +=head1 TRANSFORMATION FORMATS + +There are a variety of ways of transforming data with an intra character set +mapping that serve a variety of purposes. Sorting was discussed in the +previous section and a few of the other more popular mapping techniques are +discussed next. + +=head2 URL decoding and encoding + +Note that some URLs have hexadecimal ASCII code points in them in an +attempt to overcome character or protocol limitation issues. For example +the tilde character is not on every keyboard hence a URL of the form: + + http://www.pvhp.com/~pvhp/ + +may also be expressed as either of: + + http://www.pvhp.com/%7Epvhp/ + + http://www.pvhp.com/%7epvhp/ + +where 7E is the hexadecimal ASCII code point for '~'. 
Here is an example +of decoding such a URL under CCSID 1047: + + $url = 'http://www.pvhp.com/%7Epvhp/'; + # this array assumes code page 1047 + my @a2e_1047 = ( + 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 21, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 60, 61, 50, 38, 24, 25, 63, 39, 28, 29, 30, 31, + 64, 90,127,123, 91,108, 80,125, 77, 93, 92, 78,107, 96, 75, 97, + 240,241,242,243,244,245,246,247,248,249,122, 94, 76,126,110,111, + 124,193,194,195,196,197,198,199,200,201,209,210,211,212,213,214, + 215,216,217,226,227,228,229,230,231,232,233,173,224,189, 95,109, + 121,129,130,131,132,133,134,135,136,137,145,146,147,148,149,150, + 151,152,153,162,163,164,165,166,167,168,169,192, 79,208,161, 7, + 32, 33, 34, 35, 36, 37, 6, 23, 40, 41, 42, 43, 44, 9, 10, 27, + 48, 49, 26, 51, 52, 53, 54, 8, 56, 57, 58, 59, 4, 20, 62,255, + 65,170, 74,177,159,178,106,181,187,180,154,138,176,202,175,188, + 144,143,234,250,190,160,182,179,157,218,155,139,183,184,185,171, + 100,101, 98,102, 99,103,158,104,116,113,114,115,120,117,118,119, + 172,105,237,238,235,239,236,191,128,253,254,251,252,186,174, 89, + 68, 69, 66, 70, 67, 71,156, 72, 84, 81, 82, 83, 88, 85, 86, 87, + 140, 73,205,206,203,207,204,225,112,221,222,219,220,141,142,223 + ); + $url =~ s/%([0-9a-fA-F]{2})/pack("c",$a2e_1047[hex($1)])/ge; + +Conversely, here is a partial solution for the task of encoding such +a URL under the 1047 code page: + + $url = 'http://www.pvhp.com/~pvhp/'; + # this array assumes code page 1047 + my @e2a_1047 = ( + 0, 1, 2, 3,156, 9,134,127,151,141,142, 11, 12, 13, 14, 15, + 16, 17, 18, 19,157, 10, 8,135, 24, 25,146,143, 28, 29, 30, 31, + 128,129,130,131,132,133, 23, 27,136,137,138,139,140, 5, 6, 7, + 144,145, 22,147,148,149,150, 4,152,153,154,155, 20, 21,158, 26, + 32,160,226,228,224,225,227,229,231,241,162, 46, 60, 40, 43,124, + 38,233,234,235,232,237,238,239,236,223, 33, 36, 42, 41, 59, 94, + 45, 47,194,196,192,193,195,197,199,209,166, 44, 37, 95, 62, 63, + 248,201,202,203,200,205,206,207,204, 96, 58, 35, 64, 39, 
61, 34, + 216, 97, 98, 99,100,101,102,103,104,105,171,187,240,253,254,177, + 176,106,107,108,109,110,111,112,113,114,170,186,230,184,198,164, + 181,126,115,116,117,118,119,120,121,122,161,191,208, 91,222,174, + 172,163,165,183,169,167,182,188,189,190,221,168,175, 93,180,215, + 123, 65, 66, 67, 68, 69, 70, 71, 72, 73,173,244,246,242,243,245, + 125, 74, 75, 76, 77, 78, 79, 80, 81, 82,185,251,252,249,250,255, + 92,247, 83, 84, 85, 86, 87, 88, 89, 90,178,212,214,210,211,213, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,179,219,220,217,218,159 + ); + # The following regular expression does not address the + # mappings for: ('.' => '%2E', '/' => '%2F', ':' => '%3A') + $url =~ s/([\t "#%&\(\),;<=>\?\@\[\\\]^`{|}~])/sprintf("%%%02X",$e2a_1047[ord($1)])/ge; + +where a more complete solution would split the URL into components +and apply a full s/// substitution only to the appropriate parts. + +In the remaining examples a @e2a or @a2e array may be employed +but the assignment will not be shown explicitly. For code page 1047 +you could use the @a2e_1047 or @e2a_1047 arrays just shown. + +=head2 uu encoding and decoding + +The C<u> template to pack() or unpack() will render EBCDIC data in EBCDIC +characters equivalent to their ASCII counterparts. 
For example, the +following will print "Yes indeed\n" on either an ASCII or EBCDIC computer: + + $all_byte_chrs = ''; + for (0..255) { $all_byte_chrs .= chr($_); } + $uuencode_byte_chrs = pack('u', $all_byte_chrs); + ($uu = <<' ENDOFHEREDOC') =~ s/^\s*//gm; + M``$"`P0%!@<("0H+#`T.#Q`1$A,4%187&!D:&QP='A\@(2(C)"4F)R@I*BLL + M+2XO,#$R,S0U-C<X.3H[/#T^/T!!0D-$149'2$E*2TQ-3D]045)35%565UA9 + M6EM<75Y?8&%B8V1E9F=H:6IK;&UN;W!Q<G-T=79W>'EZ>WQ]?G^`@8*#A(6& + MAXB)BHN,C8Z/D)&2DY25EI>8F9J;G)V>GZ"AHJ.DI::GJ*FJJZRMKJ^PL;*S + MM+6VM[BYNKN\O;Z_P,'"P\3%QL?(R<K+S,W.S]#1TM/4U=;7V-G:V]S=WM_@ + ?X>+CY.7FY^CIZNOL[>[O\/'R\_3U]O?X^?K[_/W^_P`` + ENDOFHEREDOC + if ($uuencode_byte_chrs eq $uu) { + print "Yes "; + } + $uudecode_byte_chrs = unpack('u', $uuencode_byte_chrs); + if ($uudecode_byte_chrs eq $all_byte_chrs) { + print "indeed\n"; + } + +Here is a very spartan uudecoder that will work on EBCDIC provided +that the @e2a array is filled in appropriately: + + #!/usr/local/bin/perl + @e2a = ( # this must be filled in + ); + $_ = <> until ($mode,$file) = /^begin\s*(\d*)\s*(\S*)/; + open(OUT, "> $file") if $file ne ""; + while(<>) { + last if /^end/; + next if /[a-z]/; + next unless int(((($e2a[ord()] - 32 ) & 077) + 2) / 3) == + int(length() / 4); + print OUT unpack("u", $_); + } + close(OUT); + chmod oct($mode), $file; + + +=head2 Quoted-Printable encoding and decoding + +On ASCII encoded machines it is possible to strip characters outside of +the printable set using: + + # This QP encoder works on ASCII only + $qp_string =~ s/([=\x00-\x1F\x80-\xFF])/sprintf("=%02X",ord($1))/ge; + +Whereas a QP encoder that works on both ASCII and EBCDIC machines +would look somewhat like the following (where the EBCDIC branch @e2a +array is omitted for brevity): + + if (ord('A') == 65) { # ASCII + $delete = "\x7F"; # ASCII + @e2a = (0 .. 
255) # ASCII to ASCII identity map + } + else { # EBCDIC + $delete = "\x07"; # EBCDIC + @e2a = # EBCDIC to ASCII map (as shown above) + } + $qp_string =~ + s/([^ !"\#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~$delete])/sprintf("=%02X",$e2a[ord($1)])/ge; + +(although in production code the substitutions might be done +in the EBCDIC branch with the @e2a array and separately in the +ASCII branch without the expense of the identity map). + +Such QP strings can be decoded with: + + # This QP decoder is limited to ASCII only + $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge; + $string =~ s/=[\n\r]+$//; + +Whereas a QP decoder that works on both ASCII and EBCDIC machines +would look somewhat like the following (where the @a2e array is +omitted for brevity): + + $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr $a2e[hex $1]/ge; + $string =~ s/=[\n\r]+$//; + +=head2 Caesarian cyphers + +The practice of shifting an alphabet one or more characters for encipherment +dates back thousands of years and was explicitly detailed by Gaius Julius +Caesar in his B<Gallic Wars> text. A single alphabet shift is sometimes +referred to as a rotation and the shift amount is given as a number $n after +the string 'rot' or "rot$n". Rot0 and rot26 would designate identity maps +on the 26 letter English version of the Latin alphabet. Rot13 has the +interesting property that alternate subsequent invocations are identity maps +(thus rot13 is its own non-trivial inverse in the group of 26 alphabet +rotations). Hence the following is a rot13 encoder and decoder that will +work on ASCII and EBCDIC machines: + + #!/usr/local/bin/perl + + while(<>){ + tr/n-za-mN-ZA-M/a-zA-Z/; + print; + } + +In one-liner form: + + perl -ne 'tr/n-za-mN-ZA-M/a-zA-Z/;print' + + +=head1 Hashing order and checksums + +XXX + +=head1 I18N AND L10N + +Internationalization(I18N) and localization(L10N) are supported at least +in principle even on EBCDIC machines. 
The details are system dependent +and discussed under the L<perlebcdic/OS ISSUES> section below. + +=head1 MULTI OCTET CHARACTER SETS + +Multi byte EBCDIC code pages; Unicode, UTF-8, UTF-EBCDIC, XXX. + +=head1 OS ISSUES + +There may be a few system dependent issues +of concern to EBCDIC Perl programmers. + +=head2 OS/400 + +The PASE environment. + +=over 8 + +=item IFS access + +XXX. + +=back + +=head2 OS/390 + +Perl runs under UNIX System Services or USS. + +=over 8 + +=item chcp + +B<chcp> is supported as a shell utility for displaying and changing +one's code page. See also L<chcp>. + +=item dataset access + +For sequential data set access try: + + my @ds_records = `cat //DSNAME`; + +or: + + my @ds_records = `cat //'HLQ.DSNAME'`; + +See also the OS390::Stdio module on CPAN. + +=item OS/390 iconv + +B<iconv> is supported as both a shell utility and a C RTL routine. +See also the iconv(1) and iconv(3) manual pages. + +=item locales + +On OS/390 see L<locale> for information on locales. The L10N files +are in F</usr/nls/locale>. $Config{d_setlocale} is 'define' on OS/390. + +=back + +=head2 VM/ESA? + +XXX. + +=head2 POSIX-BC? + +XXX. + +=head1 BUGS + +This pod document contains literal Latin 1 characters and may encounter +translation difficulties. In particular one popular nroff implementation +was known to strip accented characters to their unaccented counterparts +while attempting to view this document through the B<pod2man> program +(for example, you may see a plain C<y> rather than one with a diaeresis +as in E<yuml>). Another nroff truncated the resultant man page at +the first occurrence of 8 bit characters. + +Not all shells will allow multiple C<-e> string arguments to perl to +be concatenated together properly as recipes 2, 3, and 4 might seem +to imply. + +Perl does not yet work with any Unicode features on EBCDIC platforms. + +=head1 SEE ALSO + +L<perllocale>, L<perlfunc>. 
+ +=head1 REFERENCES + +http://anubis.dkuug.dk/i18n/charmaps + +http://www.unicode.org/ + +http://www.unicode.org/unicode/reports/tr16/ + +http://www.wps.com/texts/codes/ +B<ASCII: American Standard Code for Information Infiltration> Tom Jennings, +September 1999. + +B<The Unicode Standard Version 2.0> The Unicode Consortium, +ISBN 0-201-48345-9, Addison Wesley Developers Press, July 1996. + +B<The Unicode Standard Version 3.0> The Unicode Consortium, Lisa Moore ed., +ISBN 0-201-61633-5, Addison Wesley Developers Press, February 2000. + +B<CDRA: IBM - Character Data Representation Architecture - +Reference and Registry>, IBM SC09-2190-00, December 1996. + +"Demystifying Character Sets", Andrea Vine, Multilingual Computing +& Technology, B<#26 Vol. 10 Issue 4>, August/September 1999; +ISSN 1523-0309; Multilingual Computing Inc. Sandpoint ID, USA. + +B<Codes, Ciphers, and Other Cryptic and Clandestine Communication> +Fred B. Wrixon, ISBN 1-57912-040-7, Black Dog & Leventhal Publishers, +1998. + +=head1 AUTHOR + +Peter Prymmer pvhp@best.com wrote this in 1999 and 2000 +with CCSID 0819 and 0037 help from Chris Leach and +AndrE<eacute> Pirard A.Pirard@ulg.ac.be as well as POSIX-BC +help from Thomas Dorner Thomas.Dorner@start.de. +Thanks also to Vickie Cooper, Philip Newton, William Raffloer, and +Joe Smith. Trademarks, registered trademarks, service marks and +registered service marks used in this document are the property of +their respective owners. 
+ + diff --git a/contrib/perl5/pod/perlmodlib.PL b/contrib/perl5/pod/perlmodlib.PL new file mode 100755 index 0000000000000..0cdadb76c79cf --- /dev/null +++ b/contrib/perl5/pod/perlmodlib.PL @@ -0,0 +1,1383 @@ +#!../miniperl + +open (OUT, ">perlmodlib.tmp") or die $!; +my (@pragma, @mod); +open (MANIFEST, "../MANIFEST") or die $!; + +while (<MANIFEST>) { + my $filename; + next unless s|^lib/|| or m|^ext/|; + ($filename) = /(\S+)/; + $filename =~ s|^[^/]+/|| if $filename =~ s|^ext/||; + next unless $filename =~ /\.p(m|od)$/; + next unless open (MOD, "../lib/$filename"); + + my ($name, $thing); + my $foundit=0; + { + local $/=""; + while (<MOD>) { + next unless /^=head1 NAME/; + $foundit++; + last; + } + } + unless ($foundit) { + warn "$filename missing head1\n"; + next; + } + my $title = <MOD>; + chomp($title); + close MOD; + + my $perlname = $filename; + $perlname =~ s!\.p(m|od)$!!; + $perlname =~ s!/!::!g; + + ($name, $thing) = split / --? /, $title, 2; + + unless ($name and $thing) { + warn "$filename missing name\n" unless $name; + warn "$filename missing thing\n" unless $thing; + next; + } + + $thing =~ s/^perl pragma to //i; + $thing = ucfirst($thing); + $title = "=item $perlname\n\n$thing\n\n"; + + # print "$perlname $thing\n"; + + if ($filename=~/[A-Z]/) { + push @mod, $title; + } else { + push @pragma, $title; + } +} + +print OUT <<'EOF'; +# Generated by perlmodlib.PL DO NOT EDIT! + +=head1 NAME + +perlmodlib - constructing new Perl modules and finding existing ones + +=head1 DESCRIPTION + +=head1 THE PERL MODULE LIBRARY + +Many modules are included the Perl distribution. These are described +below, and all end in F<.pm>. You may discover compiled library +file (usually ending in F<.so>) or small pieces of modules to be +autoloaded (ending in F<.al>); these were automatically generated +by the installation process. You may also discover files in the +library directory that end in either F<.pl> or F<.ph>. 
These are +old libraries supplied so that old programs that use them still +run. The F<.pl> files will all eventually be converted into standard +modules, and the F<.ph> files made by B<h2ph> will probably end up +as extension modules made by B<h2xs>. (Some F<.ph> values may +already be available through the POSIX, Errno, or Fcntl modules.) +The B<pl2pm> file in the distribution may help in your conversion, +but it's just a mechanical process and therefore far from bulletproof. + +=head2 Pragmatic Modules + +They work somewhat like compiler directives (pragmata) in that they +tend to affect the compilation of your program, and thus will usually +work well only when used within a C<use>, or C<no>. Most of these +are lexically scoped, so an inner BLOCK may countermand them +by saying: + + no integer; + no strict 'refs'; + no warnings; + +which lasts until the end of that BLOCK. + +Some pragmas are lexically scoped--typically those that affect the +C<$^H> hints variable. Others affect the current package instead, +like C<use vars> and C<use subs>, which allow you to predeclare +variables or subroutines within a particular I<file> rather than +just a block. Such declarations are effective for the entire file +for which they were declared. You cannot rescind them with C<no +vars> or C<no subs>. + +The following pragmas are defined (and have their own documentation). + +=over 12 + +EOF + +print OUT $_ for (sort @pragma); + +print OUT <<EOF; +=back + +=head2 Standard Modules + +Standard, bundled modules are all expected to behave in a well-defined +manner with respect to namespace pollution because they use the +Exporter module. See their own documentation for details. 
+ +=over 12 + +EOF + +print OUT $_ for (sort @mod); + +print OUT <<'EOF'; +=back + +To find out I<all> modules installed on your system, including +those without documentation or outside the standard release, +just do this: + + % find `perl -e 'print "@INC"'` -name '*.pm' -print + +They should all have their own documentation installed and accessible +via your system man(1) command. If you do not have a B<find> +program, you can use the Perl B<find2perl> program instead, which +generates Perl code as output you can run through perl. If you +have a B<man> program but it doesn't find your modules, you'll have +to fix your manpath. See L<perl> for details. If you have no +system B<man> command, you might try the B<perldoc> program. + +=head2 Extension Modules + +Extension modules are written in C (or a mix of Perl and C). They +are usually dynamically loaded into Perl if and when you need them, +but may also be linked in statically. Supported extension modules +include Socket, Fcntl, and POSIX. + +Many popular C extension modules do not come bundled (at least, not +completely) due to their sizes, volatility, or simply lack of time +for adequate testing and configuration across the multitude of +platforms on which Perl was beta-tested. You are encouraged to +look for them on CPAN (described below), or using web search engines +like Alta Vista or Deja News. + +=head1 CPAN + +CPAN stands for Comprehensive Perl Archive Network; it's a globally +replicated trove of Perl materials, including documentation, style +guides, tricks and traps, alternate ports to non-Unix systems and +occasional binary distributions for these. Search engines for +CPAN can be found at http://cpan.perl.com/ and at +http://theory.uwinnipeg.ca/mod_perl/cpan-search.pl . + +Most importantly, CPAN includes around a thousand unbundled modules, +some of which require a C compiler to build. 
Major categories of +modules are: + +=over + +=item * + +Language Extensions and Documentation Tools + +=item * + +Development Support + +=item * + +Operating System Interfaces + +=item * + +Networking, Device Control (modems) and InterProcess Communication + +=item * + +Data Types and Data Type Utilities + +=item * + +Database Interfaces + +=item * + +User Interfaces + +=item * + +Interfaces to / Emulations of Other Programming Languages + +=item * + +File Names, File Systems and File Locking (see also File Handles) + +=item * + +String Processing, Language Text Processing, Parsing, and Searching + +=item * + +Option, Argument, Parameter, and Configuration File Processing + +=item * + +Internationalization and Locale + +=item * + +Authentication, Security, and Encryption + +=item * + +World Wide Web, HTML, HTTP, CGI, MIME + +=item * + +Server and Daemon Utilities + +=item * + +Archiving and Compression + +=item * + +Images, Pixmap and Bitmap Manipulation, Drawing, and Graphing + +=item * + +Mail and Usenet News + +=item * + +Control Flow Utilities (callbacks and exceptions etc) + +=item * + +File Handle and Input/Output Stream Utilities + +=item * + +Miscellaneous Modules + +=back + +Registered CPAN sites as of this writing include the following. 
+You should try to choose one close to you: + +=head2 Africa + +=over 4 + +=item * + +South Africa + + ftp://ftp.is.co.za/programming/perl/CPAN/ + ftp://ftp.saix.net/pub/CPAN/ + ftp://ftpza.co.za/pub/mirrors/cpan/ + ftp://ftp.sun.ac.za/CPAN/ + +=back + +=head2 Asia + +=over 4 + +=item * + +China + + ftp://freesoft.cei.gov.cn/pub/languages/perl/CPAN/ + http://www2.linuxforum.net/mirror/CPAN/ + http://cpan.shellhung.org/ + ftp://ftp.shellhung.org/pub/CPAN + +=item * + +Hong Kong + + http://CPAN.pacific.net.hk/ + ftp://ftp.pacific.net.hk/pub/mirror/CPAN/ + +=item * + +Indonesia + + http://piksi.itb.ac.id/CPAN/ + ftp://mirrors.piksi.itb.ac.id/CPAN/ + http://CPAN.mweb.co.id/ + ftp://ftp.mweb.co.id/pub/languages/perl/CPAN/ + +=item * + +Israel + + http://www.iglu.org.il:/pub/CPAN/ + ftp://ftp.iglu.org.il/pub/CPAN/ + http://bioinfo.weizmann.ac.il/pub/software/perl/CPAN/ + ftp://bioinfo.weizmann.ac.il/pub/software/perl/CPAN/ + +=item * + +Japan + + ftp://ftp.u-aizu.ac.jp/pub/lang/perl/CPAN/ + ftp://ftp.kddlabs.co.jp/CPAN/ + http://mirror.nucba.ac.jp/mirror/Perl/ + ftp://mirror.nucba.ac.jp/mirror/Perl/ + ftp://ftp.meisei-u.ac.jp/pub/CPAN/ + ftp://ftp.jaist.ac.jp/pub/lang/perl/CPAN/ + ftp://ftp.dti.ad.jp/pub/lang/CPAN/ + ftp://ftp.ring.gr.jp/pub/lang/perl/CPAN/ + +=item * + +Saudi Arabia + + ftp://ftp.isu.net.sa/pub/CPAN/ + +=item * + +Singapore + + http://cpan.hjc.edu.sg + http://ftp.nus.edu.sg/unix/perl/CPAN/ + ftp://ftp.nus.edu.sg/pub/unix/perl/CPAN/ + +=item * + +South Korea + + http://CPAN.bora.net/ + ftp://ftp.bora.net/pub/CPAN/ + http://ftp.kornet.net/CPAN/ + ftp://ftp.kornet.net/pub/CPAN/ + ftp://ftp.nuri.net/pub/CPAN/ + +=item * + +Taiwan + + ftp://coda.nctu.edu.tw/UNIX/perl/CPAN + ftp://ftp.ee.ncku.edu.tw/pub/perl/CPAN/ + ftp://ftp1.sinica.edu.tw/pub1/perl/CPAN/ + +=item * + +Thailand + + http://download.nectec.or.th/CPAN/ + ftp://ftp.nectec.or.th/pub/languages/CPAN/ + ftp://ftp.cs.riubon.ac.th/pub/mirrors/CPAN/ + +=back + +=head2 Central America + +=over 4 + 
+=item * + +Costa Rica + + ftp://ftp.linux.co.cr/mirrors/CPAN/ + http://ftp.ucr.ac.cr/Unix/CPAN/ + ftp://ftp.ucr.ac.cr/pub/Unix/CPAN/ + +=back + +=head2 Europe + +=over 4 + +=item * + +Austria + + ftp://ftp.tuwien.ac.at/pub/languages/perl/CPAN/ + +=item * + +Belgium + + http://ftp.easynet.be/CPAN/ + ftp://ftp.easynet.be/CPAN/ + ftp://ftp.kulnet.kuleuven.ac.be/pub/mirror/CPAN/ + +=item * + +Bulgaria + + ftp://ftp.ntrl.net/pub/mirrors/CPAN/ + +=item * + +Croatia + + ftp://ftp.linux.hr/pub/CPAN/ + +=item * + +Czech Republic + + http://www.fi.muni.cz/pub/perl/ + ftp://ftp.fi.muni.cz/pub/perl/ + ftp://sunsite.mff.cuni.cz/MIRRORS/ftp.funet.fi/pub/languages/perl/CPAN/ + +=item * + +Denmark + + ftp://sunsite.auc.dk/pub/languages/perl/CPAN/ + http://www.cpan.dk/CPAN/ + ftp://www.cpan.dk/ftp.cpan.org/CPAN/ + +=item * + +England + + http://www.mirror.ac.uk/sites/ftp.funet.fi/pub/languages/perl/CPAN + ftp://ftp.mirror.ac.uk/sites/ftp.funet.fi/pub/languages/perl/CPAN/ + ftp://ftp.demon.co.uk/pub/mirrors/perl/CPAN/ + ftp://ftp.flirble.org/pub/languages/perl/CPAN/ + ftp://ftp.plig.org/pub/CPAN/ + ftp://sunsite.doc.ic.ac.uk/packages/CPAN/ + http://mirror.uklinux.net/CPAN/ + ftp://mirror.uklinux.net/pub/CPAN/ + ftp://usit.shef.ac.uk/pub/packages/CPAN/ + +=item * + +Estonia + + ftp://ftp.ut.ee/pub/languages/perl/CPAN/ + +=item * + +Finland + + ftp://ftp.funet.fi/pub/languages/perl/CPAN/ + +=item * + +France + + ftp://cpan.ftp.worldonline.fr/pub/CPAN/ + ftp://ftp.club-internet.fr/pub/perl/CPAN/ + ftp://ftp.lip6.fr/pub/perl/CPAN/ + ftp://ftp.oleane.net/pub/mirrors/CPAN/ + ftp://ftp.pasteur.fr/pub/computing/CPAN/ + ftp://cpan.cict.fr/pub/CPAN/ + ftp://ftp.uvsq.fr/pub/perl/CPAN/ + +=item * + +Germany + + ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN/ + ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/ + ftp://ftp.uni-erlangen.de/pub/source/CPAN/ + ftp://ftp-stud.fht-esslingen.de/pub/Mirrors/CPAN + ftp://ftp.gigabell.net/pub/CPAN/ + http://ftp.gwdg.de/pub/languages/perl/CPAN/ + 
ftp://ftp.gwdg.de/pub/languages/perl/CPAN/ + ftp://ftp.uni-hamburg.de/pub/soft/lang/perl/CPAN/ + ftp://ftp.leo.org/pub/comp/general/programming/languages/script/perl/CPAN/ + ftp://ftp.mpi-sb.mpg.de/pub/perl/CPAN/ + ftp://ftp.gmd.de/mirrors/CPAN/ + +=item * + +Greece + + ftp://ftp.forthnet.gr/pub/languages/perl/CPAN + ftp://ftp.ntua.gr/pub/lang/perl/ + +=item * + +Hungary + + http://cpan.artifact.hu/ + ftp://cpan.artifact.hu/CPAN/ + ftp://ftp.kfki.hu/pub/packages/perl/CPAN/ + +=item * + +Iceland + + http://cpan.gm.is/ + ftp://ftp.gm.is/pub/CPAN/ + +=item * + +Ireland + + http://cpan.indigo.ie/ + ftp://cpan.indigo.ie/pub/CPAN/ + http://sunsite.compapp.dcu.ie/pub/perl/ + ftp://sunsite.compapp.dcu.ie/pub/perl/ + +=item * + +Italy + + http://cpan.nettuno.it/ + http://gusp.dyndns.org/CPAN/ + ftp://gusp.dyndns.org/pub/CPAN + http://softcity.iol.it/cpan + ftp://softcity.iol.it/pub/cpan + ftp://ftp.unina.it/pub/Other/CPAN/ + ftp://ftp.unipi.it/pub/mirror/perl/CPAN/ + ftp://cis.uniRoma2.it/CPAN/ + ftp://ftp.edisontel.it/pub/CPAN_Mirror/ + ftp://ftp.flashnet.it/pub/CPAN/ + +=item * + +Latvia + + http://kvin.lv/pub/CPAN/ + +=item * + +Netherlands + + ftp://download.xs4all.nl/pub/mirror/CPAN/ + ftp://ftp.nl.uu.net/pub/CPAN/ + ftp://ftp.nluug.nl/pub/languages/perl/CPAN/ + ftp://ftp.cpan.nl/pub/CPAN/ + http://www.cs.uu.nl/mirror/CPAN/ + ftp://ftp.cs.uu.nl/mirror/CPAN/ + +=item * + +Norway + + ftp://sunsite.uio.no/pub/languages/perl/CPAN/ + ftp://ftp.uit.no/pub/languages/perl/cpan/ + +=item * + +Poland + + ftp://ftp.pk.edu.pl/pub/lang/perl/CPAN/ + ftp://ftp.mega.net.pl/pub/mirrors/ftp.perl.com/ + ftp://ftp.man.torun.pl/pub/doc/CPAN/ + ftp://sunsite.icm.edu.pl/pub/CPAN/ + +=item * + +Portugal + + ftp://ftp.ua.pt/pub/CPAN/ + ftp://perl.di.uminho.pt/pub/CPAN/ + ftp://ftp.ist.utl.pt/pub/CPAN/ + ftp://ftp.netc.pt/pub/CPAN/ + +=item * + +Romania + + ftp://archive.logicnet.ro/mirrors/ftp.cpan.org/CPAN/ + ftp://ftp.kappa.ro/pub/mirrors/ftp.perl.org/pub/CPAN/ + ftp://ftp.dntis.ro/pub/cpan/ 
+ ftp://ftp.opsynet.com/cpan/ + ftp://ftp.dnttm.ro/pub/CPAN/ + ftp://ftp.timisoara.roedu.net/mirrors/CPAN/ + +=item * + +Russia + + ftp://ftp.chg.ru/pub/lang/perl/CPAN/ + http://cpan.rinet.ru/ + ftp://cpan.rinet.ru/pub/mirror/CPAN/ + ftp://ftp.aha.ru/pub/CPAN/ + ftp://ftp.sai.msu.su/pub/lang/perl/CPAN/ + +=item * + +Slovakia + + ftp://ftp.entry.sk/pub/languages/perl/CPAN/ + +=item * + +Slovenia + + ftp://ftp.arnes.si/software/perl/CPAN/ + +=item * + +Spain + + ftp://ftp.rediris.es/mirror/CPAN/ + ftp://ftp.etse.urv.es/pub/perl/ + +=item * + +Sweden + + http://ftp.du.se/CPAN/ + ftp://ftp.du.se/pub/CPAN/ + ftp://ftp.sunet.se/pub/lang/perl/CPAN/ + +=item * + +Switzerland + + ftp://ftp.danyk.ch/CPAN/ + ftp://sunsite.cnlab-switch.ch/mirror/CPAN/ + +=item * + +Turkey + + ftp://sunsite.bilkent.edu.tr/pub/languages/CPAN/ + +=back + +=head2 North America + +=over 4 + +=item * + +Canada + +=over 8 + +=item * + +Alberta + + http://sunsite.ualberta.ca/pub/Mirror/CPAN/ + ftp://sunsite.ualberta.ca/pub/Mirror/CPAN/ + +=item * + +Manitoba + + http://theoryx5.uwinnipeg.ca/pub/CPAN/ + ftp://theoryx5.uwinnipeg.ca/pub/CPAN/ + +=item * + +Nova Scotia + + ftp://cpan.chebucto.ns.ca/pub/CPAN/ + +=item * + +Ontario + + ftp://ftp.crc.ca/pub/packages/lang/perl/CPAN/ + +=item * + +Mexico + + http://www.msg.com.mx/CPAN/ + ftp://ftp.msg.com.mx/pub/CPAN/ + +=back + +=item * + +United States + +=over 8 + +=item * + +Alabama + + http://mirror.hiwaay.net/CPAN/ + ftp://mirror.hiwaay.net/CPAN/ + +=item * + +California + + http://www.cpan.org/ + ftp://ftp.cpan.org/CPAN/ + ftp://cpan.nas.nasa.gov/pub/perl/CPAN/ + ftp://ftp.digital.com/pub/plan/perl/CPAN/ + http://www.kernel.org/pub/mirrors/cpan/ + ftp://ftp.kernel.org/pub/mirrors/cpan/ + http://www.perl.com/CPAN/ + http://download.sourceforge.net/mirrors/CPAN/ + +=item * + +Colorado + + ftp://ftp.cs.colorado.edu/pub/perl/CPAN/ + +=item * + +Florida + + ftp://ftp.cise.ufl.edu/pub/perl/CPAN/ + +=item * + +Georgia + + ftp://ftp.twoguys.org/CPAN/ + +=item * 
+ +Illinois + + http://www.neurogames.com/mirrors/CPAN + http://uiarchive.uiuc.edu/mirrors/ftp/ftp.cpan.org/pub/CPAN/ + ftp://uiarchive.uiuc.edu/mirrors/ftp/ftp.cpan.org/pub/CPAN/ + +=item * + +Indiana + + ftp://ftp.uwsg.indiana.edu/pub/perl/CPAN/ + http://cpan.nitco.com/ + ftp://cpan.nitco.com/pub/CPAN/ + ftp://cpan.in-span.net/ + http://csociety-ftp.ecn.purdue.edu/pub/CPAN + ftp://csociety-ftp.ecn.purdue.edu/pub/CPAN + +=item * + +Kentucky + + http://cpan.uky.edu/ + ftp://cpan.uky.edu/pub/CPAN/ + +=item * + +Massachusetts + + ftp://ftp.ccs.neu.edu/net/mirrors/ftp.funet.fi/pub/languages/perl/CPAN/ + ftp://ftp.iguide.com/pub/mirrors/packages/perl/CPAN/ + +=item * + +New Jersey + + ftp://ftp.cpanel.net/pub/CPAN/ + +=item * + +New York + + ftp://ftp.freesoftware.com/pub/perl/CPAN/ + http://www.deao.net/mirrors/CPAN/ + ftp://ftp.deao.net/pub/CPAN/ + ftp://ftp.stealth.net/pub/mirrors/ftp.cpan.org/pub/CPAN/ + http://mirror.nyc.anidea.com/CPAN/ + ftp://mirror.nyc.anidea.com/pub/CPAN/ + http://www.rge.com/pub/languages/perl/ + ftp://ftp.rge.com/pub/languages/perl/ + ftp://mirrors.cloud9.net/pub/mirrors/CPAN/ + +=item * + +North Carolina + + ftp://ftp.duke.edu/pub/perl/ + +=item * + +Ohio + + ftp://ftp.loaded.net/pub/CPAN/ + +=item * + +Oklahoma + + ftp://ftp.ou.edu/mirrors/CPAN/ + +=item * + +Oregon + + ftp://ftp.orst.edu/pub/packages/CPAN/ + +=item * + +Pennsylvania + + http://ftp.epix.net/CPAN/ + ftp://ftp.epix.net/pub/languages/perl/ + ftp://carroll.cac.psu.edu/pub/CPAN/ + +=item * + +Tennessee + + ftp://ftp.sunsite.utk.edu/pub/CPAN/ + +=item * + +Texas + + http://ftp.sedl.org/pub/mirrors/CPAN/ + http://jhcloos.com/pub/mirror/CPAN/ + ftp://jhcloos.com/pub/mirror/CPAN/ + +=item * + +Utah + + ftp://mirror.xmission.com/CPAN/ + +=item * + +Virginia + + http://mirrors.rcn.net/pub/lang/CPAN/ + ftp://mirrors.rcn.net/pub/lang/CPAN/ + ftp://ruff.cs.jmu.edu/pub/CPAN/ + http://perl.Liquidation.com/CPAN/ + +=item * + +Washington + + http://cpan.llarian.net/ + 
ftp://cpan.llarian.net/pub/CPAN/ + ftp://ftp-mirror.internap.com/pub/CPAN/ + ftp://ftp.spu.edu/pub/CPAN/ + +=back + +=back + +=head2 Oceania + +=over 4 + +=item * + +Australia + + http://ftp.planetmirror.com/pub/CPAN/ + ftp://ftp.planetmirror.com/pub/CPAN/ + ftp://mirror.aarnet.edu.au/pub/perl/CPAN/ + ftp://cpan.topend.com.au/pub/CPAN/ + +=item * + +New Zealand + + ftp://ftp.auckland.ac.nz/pub/perl/CPAN/ + +=back + +=head2 South America + +=over 4 + +=item * + +Argentina + + ftp://mirrors.bannerlandia.com.ar/mirrors/CPAN/ + +=item * + +Brazil + + ftp://cpan.pop-mg.com.br/pub/CPAN/ + ftp://ftp.matrix.com.br/pub/perl/ + ftp://cpan.if.usp.br/pub/mirror/CPAN/ + +=item * + +Chile + + ftp://ftp.psinet.cl/pub/programming/perl/CPAN/ + ftp://sunsite.dcc.uchile.cl/pub/lang/perl/ + +=back + +For an up-to-date listing of CPAN sites, +see http://www.cpan.org/SITES or ftp://www.cpan.org/SITES . + +=head1 Modules: Creation, Use, and Abuse + +(The following section is borrowed directly from Tim Bunce's modules +file, available at your nearest CPAN site.) + +Perl implements a class using a package, but the presence of a +package doesn't imply the presence of a class. A package is just a +namespace. A class is a package that provides subroutines that can be +used as methods. A method is just a subroutine that expects, as its +first argument, either the name of a package (for "static" methods), +or a reference to something (for "virtual" methods). + +A module is a file that (by convention) provides a class of the same +name (sans the .pm), plus an import method in that class that can be +called to fetch exported symbols. This module may implement some of +its methods by loading dynamic C or C++ objects, but that should be +totally transparent to the user of the module. Likewise, the module +might set up an AUTOLOAD function to slurp in subroutine definitions on +demand, but this is also transparent. Only the F<.pm> file is required to +exist. 
See L<perlsub>, L<perltoot>, and L<AutoLoader> for details about +the AUTOLOAD mechanism. + +=head2 Guidelines for Module Creation + +=over 4 + +=item * + +Do similar modules already exist in some form? + +If so, please try to reuse the existing modules either in whole or +by inheriting useful features into a new class. If this is not +practical try to get together with the module authors to work on +extending or enhancing the functionality of the existing modules. +A perfect example is the plethora of packages in perl4 for dealing +with command line options. + +If you are writing a module to expand an already existing set of +modules, please coordinate with the author of the package. It +helps if you follow the same naming scheme and module interaction +scheme as the original author. + +=item * + +Try to design the new module to be easy to extend and reuse. + +Try to C<use warnings;> (or C<use warnings qw(...);>). +Remember that you can add C<no warnings qw(...);> to individual blocks +of code that need less warnings. + +Use blessed references. Use the two argument form of bless to bless +into the class name given as the first parameter of the constructor, +e.g.,: + + sub new { + my $class = shift; + return bless {}, $class; + } + +or even this if you'd like it to be used as either a static +or a virtual method. + + sub new { + my $self = shift; + my $class = ref($self) || $self; + return bless {}, $class; + } + +Pass arrays as references so more parameters can be added later +(it's also faster). Convert functions into methods where +appropriate. Split large methods into smaller more flexible ones. +Inherit methods from other modules if appropriate. + +Avoid class name tests like: C<die "Invalid" unless ref $ref eq 'FOO'>. +Generally you can delete the C<eq 'FOO'> part with no harm at all. +Let the objects look after themselves! Generally, avoid hard-wired +class names as far as possible. + +Avoid C<< $r->Class::func() >> where using C<@ISA=qw(... 
Class ...)> and +C<< $r->func() >> would work (see L<perlbot> for more details). + +Use autosplit so little used or newly added functions won't be a +burden to programs that don't use them. Add test functions to +the module after __END__ either using AutoSplit or by saying: + + eval join('',<main::DATA>) || die $@ unless caller(); + +Does your module pass the 'empty subclass' test? If you say +C<@SUBCLASS::ISA = qw(YOURCLASS);> your applications should be able +to use SUBCLASS in exactly the same way as YOURCLASS. For example, +does your application still work if you change: C<$obj = new YOURCLASS;> +into: C<$obj = new SUBCLASS;> ? + +Avoid keeping any state information in your packages. It makes it +difficult for multiple other packages to use yours. Keep state +information in objects. + +Always use B<-w>. + +Try to C<use strict;> (or C<use strict qw(...);>). +Remember that you can add C<no strict qw(...);> to individual blocks +of code that need less strictness. + +Always use B<-w>. + +Follow the guidelines in the perlstyle(1) manual. + +Always use B<-w>. + +=item * + +Some simple style guidelines + +The perlstyle manual supplied with Perl has many helpful points. + +Coding style is a matter of personal taste. Many people evolve their +style over several years as they learn what helps them write and +maintain good code. Here's one set of assorted suggestions that +seem to be widely used by experienced developers: + +Use underscores to separate words. It is generally easier to read +$var_names_like_this than $VarNamesLikeThis, especially for +non-native speakers of English. It's also a simple rule that works +consistently with VAR_NAMES_LIKE_THIS. + +Package/Module names are an exception to this rule. Perl informally +reserves lowercase module names for 'pragma' modules like integer +and strict. Other modules normally begin with a capital letter and +use mixed case with no underscores (need to be short and portable). 
+ +You may find it helpful to use letter case to indicate the scope +or nature of a variable. For example: + + $ALL_CAPS_HERE constants only (beware clashes with Perl vars) + $Some_Caps_Here package-wide global/static + $no_caps_here function scope my() or local() variables + +Function and method names seem to work best as all lowercase. +e.g., C<< $obj->as_string() >>. + +You can use a leading underscore to indicate that a variable or +function should not be used outside the package that defined it. + +=item * + +Select what to export. + +Do NOT export method names! + +Do NOT export anything else by default without a good reason! + +Exports pollute the namespace of the module user. If you must +export try to use @EXPORT_OK in preference to @EXPORT and avoid +short or common names to reduce the risk of name clashes. + +Generally anything not exported is still accessible from outside the +module using the ModuleName::item_name (or C<< $blessed_ref->method >>) +syntax. By convention you can use a leading underscore on names to +indicate informally that they are 'internal' and not for public use. + +(It is actually possible to get private functions by saying: +C<my $subref = sub { ... }; &$subref;>. But there's no way to call that +directly as a method, because a method must have a name in the symbol +table.) + +As a general rule, if the module is trying to be object oriented +then export nothing. If it's just a collection of functions then +@EXPORT_OK anything but use @EXPORT with caution. + +=item * + +Select a name for the module. + +This name should be as descriptive, accurate, and complete as +possible. Avoid any risk of ambiguity. Always try to use two or +more whole words. Generally the name should reflect what is special +about what the module does rather than how it does it. Please use +nested module names to group informally or categorize a module. +There should be a very good reason for a module not to have a nested name. 
+Module names should begin with a capital letter. + +Having 57 modules all called Sort will not make life easy for anyone +(though having 23 called Sort::Quick is only marginally better :-). +Imagine someone trying to install your module alongside many others. +If in any doubt ask for suggestions in comp.lang.perl.misc. + +If you are developing a suite of related modules/classes it's good +practice to use nested classes with a common prefix as this will +avoid namespace clashes. For example: Xyz::Control, Xyz::View, +Xyz::Model etc. Use the modules in this list as a naming guide. + +If adding a new module to a set, follow the original author's +standards for naming modules and the interface to methods in +those modules. + +If developing modules for private internal or project specific use, +that will never be released to the public, then you should ensure +that their names will not clash with any future public module. You +can do this either by using the reserved Local::* category or by +using a category name that includes an underscore like Foo_Corp::*. + +To be portable each component of a module name should be limited to +11 characters. If it might be used on MS-DOS then try to ensure each is +unique in the first 8 characters. Nested modules make this easier. + +=item * + +Have you got it right? + +How do you know that you've made the right decisions? Have you +picked an interface design that will cause problems later? Have +you picked the most appropriate name? Do you have any questions? + +The best way to know for sure, and pick up many helpful suggestions, +is to ask someone who knows. Comp.lang.perl.misc is read by just about +all the people who develop modules and it's the best place to ask. + +All you need to do is post a short summary of the module, its +purpose and interfaces. A few lines on each of the main methods is +probably enough. (If you post the whole module it might be ignored +by busy people - generally the very people you want to read it!) 
+ +Don't worry about posting if you can't say when the module will be +ready - just say so in the message. It might be worth inviting +others to help you, they may be able to complete it for you! + +=item * + +README and other Additional Files. + +It's well known that software developers usually fully document the +software they write. If, however, the world is in urgent need of +your software and there is not enough time to write the full +documentation please at least provide a README file containing: + +=over 10 + +=item * + +A description of the module/package/extension etc. + +=item * + +A copyright notice - see below. + +=item * + +Prerequisites - what else you may need to have. + +=item * + +How to build it - possible changes to Makefile.PL etc. + +=item * + +How to install it. + +=item * + +Recent changes in this release, especially incompatibilities + +=item * + +Changes / enhancements you plan to make in the future. + +=back + +If the README file seems to be getting too large you may wish to +split out some of the sections into separate files: INSTALL, +Copying, ToDo etc. + +=over 4 + +=item Adding a Copyright Notice. + + +How you choose to license your work is a personal decision. +The general mechanism is to assert your Copyright and then make +a declaration of how others may copy/use/modify your work. + +Perl, for example, is supplied with two types of licence: The GNU +GPL and The Artistic Licence (see the files README, Copying, and +Artistic). Larry has good reasons for NOT just using the GNU GPL. + +My personal recommendation, out of respect for Larry, Perl, and the +Perl community at large is to state something simply like: + + Copyright (c) 1995 Your Name. All rights reserved. + This program is free software; you can redistribute it and/or + modify it under the same terms as Perl itself. + +This statement should at least appear in the README file. You may +also wish to include it in a Copying file and your source files. 
+Remember to include the other words in addition to the Copyright. + +=item * + +Give the module a version/issue/release number. + +To be fully compatible with the Exporter and MakeMaker modules you +should store your module's version number in a non-my package +variable called $VERSION. This should be a floating point +number with at least two digits after the decimal (i.e., hundredths, +e.g., C<$VERSION = "0.01">). Don't use a "1.3.2" style version. +See L<Exporter> for details. + +It may be handy to add a function or method to retrieve the number. +Use the number in announcements and archive file names when +releasing the module (ModuleName-1.02.tar.Z). +See perldoc ExtUtils::MakeMaker.pm for details. + +=item * + +How to release and distribute a module. + +It's a good idea to post an announcement of the availability of your +module (or the module itself if small) to the comp.lang.perl.announce +Usenet newsgroup. This will at least ensure very wide once-off +distribution. + +If possible, register the module with CPAN. You should +include details of its location in your announcement. + +Some notes about ftp archives: Please use a long descriptive file +name that includes the version number. Most incoming directories +will not be readable/listable, i.e., you won't be able to see your +file after uploading it. Remember to send your email notification +message as soon as possible after uploading else your file may get +deleted automatically. Allow time for the file to be processed +and/or check the file has been processed before announcing its +location. + +FTP Archives for Perl Modules: + +Follow the instructions and links on: + + http://www.cpan.org/modules/00modlist.long.html + http://www.cpan.org/modules/04pause.html + +or upload to one of these sites: + + https://pause.kbx.de/pause/ + http://pause.perl.org/pause/ + +and notify <modules@perl.org>.
+ +By using the WWW interface you can ask the Upload Server to mirror +your modules from your ftp or WWW site into your own directory on +CPAN! + +Please remember to send me an updated entry for the Module list! + +=item * + +Take care when changing a released module. + +Always strive to remain compatible with previously released versions. +Otherwise try to add a mechanism to revert to the +old behavior if people rely on it. Document incompatible changes. + +=back + +=back + +=head2 Guidelines for Converting Perl 4 Library Scripts into Modules + +=over 4 + +=item * + +There is no requirement to convert anything. + +If it ain't broke, don't fix it! Perl 4 library scripts should +continue to work with no problems. You may need to make some minor +changes (like escaping non-array @'s in double quoted strings) but +there is no need to convert a .pl file into a Module for just that. + +=item * + +Consider the implications. + +All Perl applications that make use of the script will need to +be changed (slightly) if the script is converted into a module. Is +it worth it unless you plan to make other changes at the same time? + +=item * + +Make the most of the opportunity. + +If you are going to convert the script to a module you can use the +opportunity to redesign the interface. The guidelines for module +creation above include many of the issues you should consider. + +=item * + +The pl2pm utility will get you started. + +This utility will read *.pl files (given as parameters) and write +corresponding *.pm files. The pl2pm utility does the following: + +=over 10 + +=item * + +Adds the standard Module prologue lines + +=item * + +Converts package specifiers from ' to :: + +=item * + +Converts die(...) to croak(...) + +=item * + +Several other minor changes + +=back + +Being a mechanical process pl2pm is not bullet proof. The converted +code will need careful checking, especially any package statements. +Don't delete the original .pl file till the new .pm one works!
+ +=back + +=head2 Guidelines for Reusing Application Code + +=over 4 + +=item * + +Complete applications rarely belong in the Perl Module Library. + +=item * + +Many applications contain some Perl code that could be reused. + +Help save the world! Share your code in a form that makes it easy +to reuse. + +=item * + +Break-out the reusable code into one or more separate module files. + +=item * + +Take the opportunity to reconsider and redesign the interfaces. + +=item * + +In some cases the 'application' can then be reduced to a small + +fragment of code built on top of the reusable modules. In these cases +the application could be invoked as: + + % perl -e 'use Module::Name; method(@ARGV)' ... +or + % perl -mModule::Name ... (in perl5.002 or higher) + +=back + +=head1 NOTE + +Perl does not enforce private and public parts of its modules as you may +have been used to in other languages like C++, Ada, or Modula-17. Perl +doesn't have an infatuation with enforced privacy. It would prefer +that you stayed out of its living room because you weren't invited, not +because it has a shotgun. + +The module and its user have a contract, part of which is common law, +and part of which is "written". Part of the common law contract is +that a module doesn't pollute any namespace it wasn't asked to. The +written contract for the module (A.K.A. documentation) may make other +provisions. But then you know when you C<use RedefineTheWorld> that +you're redefining the world and willing to take the consequences.
+EOF + +close MANIFEST or warn "$0: failed to close MANIFEST (../MANIFEST): $!"; +close OUT or warn "$0: failed to close OUT (perlmodlib.tmp): $!"; + diff --git a/contrib/perl5/pod/perlnewmod.pod b/contrib/perl5/pod/perlnewmod.pod new file mode 100644 index 0000000000000..ace8d85130f65 --- /dev/null +++ b/contrib/perl5/pod/perlnewmod.pod @@ -0,0 +1,282 @@ +=head1 NAME + +perlnewmod - preparing a new module for distribution + +=head1 DESCRIPTION + +This document gives you some suggestions about how to go about writing +Perl modules, preparing them for distribution, and making them available +via CPAN. + +One of the things that makes Perl really powerful is the fact that Perl +hackers tend to want to share the solutions to problems they've faced, +so you and I don't have to battle with the same problem again. + +The main way they do this is by abstracting the solution into a Perl +module. If you don't know what one of these is, the rest of this +document isn't going to be much use to you. You're also missing out on +an awful lot of useful code; consider having a look at L<perlmod>, +L<perlmodlib> and L<perlmodinstall> before coming back here. + +When you've found that there isn't a module available for what you're +trying to do, and you've had to write the code yourself, consider +packaging up the solution into a module and uploading it to CPAN so that +others can benefit. + +=head2 Warning + +We're going to primarily concentrate on Perl-only modules here, rather +than XS modules. XS modules serve a rather different purpose, and +you should consider different things before distributing them - the +popularity of the library you are gluing, the portability to other +operating systems, and so on. However, the notes on preparing the Perl +side of the module and packaging and distributing it will apply equally +well to an XS module as a pure-Perl one. + +=head2 What should I make into a module? 
+ +You should make a module out of any code that you think is going to be +useful to others. Anything that's likely to fill a hole in the communal +library and which someone else can slot directly into their program. Any +part of your code which you can isolate and extract and plug into +something else is a likely candidate. + +Let's take an example. Suppose you're reading in data from a local +format into a hash-of-hashes in Perl, turning that into a tree, walking +the tree and then piping each node to an Acme Transmogrifier Server. + +Now, quite a few people have the Acme Transmogrifier, and you've had to +write something to talk the protocol from scratch - you'd almost +certainly want to make that into a module. The level at which you pitch +it is up to you: you might want protocol-level modules analogous to +L<Net::SMTP|Net::SMTP> which then talk to higher level modules analogous +to L<Mail::Send|Mail::Send>. The choice is yours, but you do want to get +a module out for that server protocol. + +Nobody else on the planet is going to talk your local data format, so we +can ignore that. But what about the thing in the middle? Building tree +structures from Perl variables and then traversing them is a nice, +general problem, and if nobody's already written a module that does +that, you might want to modularise that code too. + +So hopefully you've now got a few ideas about what's good to modularise. +Let's now see how it's done. + +=head2 Step-by-step: Preparing the ground + +Before we even start scraping out the code, there are a few things we'll +want to do in advance. + +=over 3 + +=item Look around + +Dig into a bunch of modules to see how they're written. I'd suggest +starting with L<Text::Tabs|Text::Tabs>, since it's in the standard +library and is nice and simple, and then looking at something like +L<Time::Zone|Time::Zone>, L<File::Copy|File::Copy> and then some of the +C<Mail::*> modules if you're planning on writing object oriented code. 
+ +These should give you an overall feel for how modules are laid out and +written. + +=item Check it's new + +There are a lot of modules on CPAN, and it's easy to miss one that's +similar to what you're planning on contributing. Have a good plough +through the modules list and the F<by-module> directories, and make sure +you're not the one reinventing the wheel! + +=item Discuss the need + +You might love it. You might feel that everyone else needs it. But there +might not actually be any real demand for it out there. If you're unsure +about the demand your module will have, consider sending out feelers +on the C<comp.lang.perl.modules> newsgroup, or as a last resort, ask the +modules list at C<modules@perl.org>. Remember that this is a closed list +with a very long turn-around time - be prepared to wait a good while for +a response from them. + +=item Choose a name + +Perl modules included on CPAN have a naming hierarchy you should try to +fit in with. See L<perlmodlib> for more details on how this works, and +browse around CPAN and the modules list to get a feel of it. At the very +least, remember this: modules should be title capitalised, (This::Thing) +fit in with a category, and explain their purpose succinctly. + +=item Check again + +While you're doing that, make really sure you haven't missed a module +similar to the one you're about to write. + +When you've got your name sorted out and you're sure that your module is +wanted and not currently available, it's time to start coding. + +=back + +=head2 Step-by-step: Making the module + +=over 3 + +=item Start with F<h2xs> + +Originally a utility to convert C header files into XS modules, +L<h2xs|h2xs> has become a useful utility for churning out skeletons for +Perl-only modules as well.
If you don't want to use the +L<Autoloader|Autoloader> which splits up big modules into smaller +subroutine-sized chunks, you'll say something like this: + + h2xs -AX -n Net::Acme + +The C<-A> omits the Autoloader code, C<-X> omits XS elements, and C<-n> +specifies the name of the module. + +=item Use L<strict|strict> and L<warnings|warnings> + +A module's code has to be warning and strict-clean, since you can't +guarantee the conditions that it'll be used under. Besides, you wouldn't +want to distribute code that wasn't warning or strict-clean anyway, +right? + +=item Use L<Carp|Carp> + +The L<Carp|Carp> module allows you to present your error messages from +the caller's perspective; this gives you a way to signal a problem with +the caller and not your module. For instance, if you say this: + + warn "No hostname given"; + +the user will see something like this: + + No hostname given at /usr/local/lib/perl5/site_perl/5.6.0/Net/Acme.pm + line 123. + +which looks like your module is doing something wrong. Instead, you want +to put the blame on the user, and say this: + + No hostname given at bad_code, line 10. + +You do this by using L<Carp|Carp> and replacing your C<warn>s with +C<carp>s. If you need to C<die>, say C<croak> instead. However, keep +C<warn> and C<die> in place for your sanity checks - where it really is +your module at fault. + +=item Use L<Exporter|Exporter> - wisely! + +C<h2xs> provides stubs for L<Exporter|Exporter>, which gives you a +standard way of exporting symbols and subroutines from your module into +the caller's namespace. For instance, saying C<use Net::Acme qw(&frob)> +would import the C<frob> subroutine. + +The package variable C<@EXPORT> will determine which symbols will get +exported when the caller simply says C<use Net::Acme> - you will hardly +ever want to put anything in there. C<@EXPORT_OK>, on the other hand, +specifies which symbols you're willing to export. 
If you do want to +export a bunch of symbols, use the C<%EXPORT_TAGS> and define a standard +export set - look at L<Exporter> for more details. + +=item Use L<plain old documentation|perlpod> + +The work isn't over until the paperwork is done, and you're going to +need to put in some time writing some documentation for your module. +C<h2xs> will provide a stub for you to fill in; if you're not sure about +the format, look at L<perlpod> for an introduction. Provide a good +synopsis of how your module is used in code, a description, and then +notes on the syntax and function of the individual subroutines or +methods. Use Perl comments for developer notes and POD for end-user +notes. + +=item Write tests + +You're encouraged to create self-tests for your module to ensure it's +working as intended on the myriad platforms Perl supports; if you upload +your module to CPAN, a host of testers will build your module and send +you the results of the tests. Again, C<h2xs> provides a test framework +which you can extend - you should do something more than just checking +your module will compile. + +=item Write the README + +If you're uploading to CPAN, the automated gremlins will extract the +README file and place that in your CPAN directory. It'll also appear in +the main F<by-module> and F<by-category> directories if you make it onto +the modules list. It's a good idea to put here what the module actually +does in detail, and the user-visible changes since the last release. + +=back + +=head2 Step-by-step: Distributing your module + +=over 3 + +=item Get a CPAN user ID + +Every developer publishing modules on CPAN needs a CPAN ID. See the +instructions at C<http://www.cpan.org/modules/04pause.html> (or +equivalent on your nearest mirror) to find out how to do this. + +=item C<perl Makefile.PL; make test; make dist> + +Once again, C<h2xs> has done all the work for you. 
It produces the +standard C<Makefile.PL> you'll have seen when you downloaded and +installed modules, and this produces a Makefile with a C<dist> target. + +Once you've ensured that your module passes its own tests - always a +good thing to make sure - you can C<make dist>, and the Makefile will +hopefully produce you a nice tarball of your module, ready for upload. + +=item Upload the tarball + +The email you got when you received your CPAN ID will tell you how to +log in to PAUSE, the Perl Authors Upload SErver. From the menus there, +you can upload your module to CPAN. + +=item Announce to the modules list + +Once uploaded, it'll sit unnoticed in your author directory. If you want +it connected to the rest of the CPAN, you'll need to tell the modules +list about it. The best way to do this is to email them a line in the +style of the modules list, like this: + + Net::Acme bdpO Interface to Acme Frobnicator servers FOOBAR + ^ ^^^^ ^ ^ + | |||| Module description Your ID + | |||| + | |||\- Interface: (O)OP, (r)eferences, (h)ybrid, (f)unctions + | ||| + | ||\-- Language: (p)ure Perl, C(+)+, (h)ybrid, (C), (o)ther + | || + Module |\--- Support: (d)eveloper, (m)ailing list, (u)senet, (n)one + Name | + \---- Maturity: (i)dea, (c)onstructions, (a)lpha, (b)eta, + (R)eleased, (M)ature, (S)tandard + +plus a description of the module and why you think it should be +included. If you hear nothing back, that means your module will +probably appear on the modules list at the next update. Don't try +subscribing to C<modules@perl.org>; it's not another mailing list. Just +have patience. + +=item Announce to clpa + +If you have a burning desire to tell the world about your release, post +an announcement to the moderated C<comp.lang.perl.announce> newsgroup. + +=item Fix bugs! + +Once you start accumulating users, they'll send you bug reports. If +you're lucky, they'll even send you patches. Welcome to the joys of +maintaining a software project...
+ +=back + +=head1 AUTHOR + +Simon Cozens, C<simon@cpan.org> + +=head1 SEE ALSO + +L<perlmod>, L<perlmodlib>, L<perlmodinstall>, L<h2xs>, L<strict>, +L<Carp>, L<Exporter>, L<perlpod>, L<Test>, L<ExtUtils::MakeMaker>, +http://www.cpan.org/ diff --git a/contrib/perl5/pod/perlrequick.pod b/contrib/perl5/pod/perlrequick.pod new file mode 100644 index 0000000000000..5b72a35187faf --- /dev/null +++ b/contrib/perl5/pod/perlrequick.pod @@ -0,0 +1,503 @@ +=head1 NAME + +perlrequick - Perl regular expressions quick start + +=head1 DESCRIPTION + +This page covers the very basics of understanding, creating and +using regular expressions ('regexes') in Perl. + + +=head1 The Guide + +=head2 Simple word matching + +The simplest regex is simply a word, or more generally, a string of +characters. A regex consisting of a word matches any string that +contains that word: + + "Hello World" =~ /World/; # matches + +In this statement, C<World> is a regex and the C<//> enclosing +C</World/> tells perl to search a string for a match. The operator +C<=~> associates the string with the regex match and produces a true +value if the regex matched, or false if the regex did not match. In +our case, C<World> matches the second word in C<"Hello World">, so the +expression is true. This idea has several variations. + +Expressions like this are useful in conditionals: + + print "It matches\n" if "Hello World" =~ /World/; + +The sense of the match can be reversed by using C<!~> operator: + + print "It doesn't match\n" if "Hello World" !~ /World/; + +The literal string in the regex can be replaced by a variable: + + $greeting = "World"; + print "It matches\n" if "Hello World" =~ /$greeting/; + +If you're matching against C<$_>, the C<$_ =~> part can be omitted: + + $_ = "Hello World"; + print "It matches\n" if /World/; + +Finally, the C<//> default delimiters for a match can be changed to +arbitrary delimiters by putting an C<'m'> out front: + + "Hello World" =~ m!World!; # matches, delimited by '!' 
+ "Hello World" =~ m{World}; # matches, note the matching '{}' + "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin', + # '/' becomes an ordinary char + +Regexes must match a part of the string I<exactly> in order for the +statement to be true: + + "Hello World" =~ /world/; # doesn't match, case sensitive + "Hello World" =~ /o W/; # matches, ' ' is an ordinary char + "Hello World" =~ /World /; # doesn't match, no ' ' at end + +perl will always match at the earliest possible point in the string: + + "Hello World" =~ /o/; # matches 'o' in 'Hello' + "That hat is red" =~ /hat/; # matches 'hat' in 'That' + +Not all characters can be used 'as is' in a match. Some characters, +called B<metacharacters>, are reserved for use in regex notation. +The metacharacters are + + {}[]()^$.|*+?\ + +A metacharacter can be matched by putting a backslash before it: + + "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter + "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary + + 'C:\WIN32' =~ /C:\\WIN/; # matches + "/usr/bin/perl" =~ /\/usr\/bin\/perl/; # matches + +In the last regex, the forward slash C<'/'> is also backslashed, +because it is used to delimit the regex. + +Non-printable ASCII characters are represented by B<escape sequences>. +Common examples are C<\t> for a tab, C<\n> for a newline, and C<\r> +for a carriage return. Arbitrary bytes are represented by octal +escape sequences, e.g., C<\033>, or hexadecimal escape sequences, +e.g., C<\x1B>: + + "1000\t2000" =~ m(0\t2) # matches + "cat" =~ /\143\x61\x74/ # matches, but a weird way to spell cat + +Regexes are treated mostly as double quoted strings, so variable +substitution works: + + $foo = 'house'; + 'cathouse' =~ /cat$foo/; # matches + 'housecat' =~ /${foo}cat/; # matches + +With all of the regexes above, if the regex matched anywhere in the +string, it was considered a match. To specify I<where> it should +match, we would use the B<anchor> metacharacters C<^> and C<$>.
The +anchor C<^> means match at the beginning of the string and the anchor +C<$> means match at the end of the string, or before a newline at the +end of the string. Some examples: + + "housekeeper" =~ /keeper/; # matches + "housekeeper" =~ /^keeper/; # doesn't match + "housekeeper" =~ /keeper$/; # matches + "housekeeper\n" =~ /keeper$/; # matches + "housekeeper" =~ /^housekeeper$/; # matches + +=head2 Using character classes + +A B<character class> allows a set of possible characters, rather than +just a single character, to match at a particular point in a regex. +Character classes are denoted by brackets C<[...]>, with the set of +characters to be possibly matched inside. Here are some examples: + + /cat/; # matches 'cat' + /[bcr]at/; # matches 'bat', 'cat', or 'rat' + "abc" =~ /[cab]/; # matches 'a' + +In the last statement, even though C<'c'> is the first character in +the class, the earliest point at which the regex can match is C<'a'>. + + /[yY][eE][sS]/; # match 'yes' in a case-insensitive way + # 'yes', 'Yes', 'YES', etc. + /yes/i; # also match 'yes' in a case-insensitive way + +The last example shows a match with an C<'i'> B<modifier>, which makes +the match case-insensitive. + +Character classes also have ordinary and special characters, but the +sets of ordinary and special characters inside a character class are +different than those outside a character class. The special +characters for a character class are C<-]\^$> and are matched using an +escape: + + /[\]c]def/; # matches ']def' or 'cdef' + $x = 'bcr'; + /[$x]at/; # matches 'bat', 'cat', or 'rat' + /[\$x]at/; # matches '$at' or 'xat' + /[\\$x]at/; # matches '\at', 'bat', 'cat', or 'rat' + +The special character C<'-'> acts as a range operator within character +classes, so that the unwieldy C<[0123456789]> and C<[abc...xyz]> +become the svelte C<[0-9]> and C<[a-z]>: + + /item[0-9]/; # matches 'item0' or ...
or 'item9' + /[0-9a-fA-F]/; # matches a hexadecimal digit + +If C<'-'> is the first or last character in a character class, it is +treated as an ordinary character. + +The special character C<^> in the first position of a character class +denotes a B<negated character class>, which matches any character but +those in the brackets. Both C<[...]> and C<[^...]> must match a +character, or the match fails. Then + + /[^a]at/; # doesn't match 'aat' or 'at', but matches + # all other 'bat', 'cat', '0at', '%at', etc. + /[^0-9]/; # matches a non-numeric character + /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary + +Perl has several abbreviations for common character classes: + +=over 4 + +=item * + +\d is a digit and represents [0-9] + +=item * + +\s is a whitespace character and represents [\ \t\r\n\f] + +=item * + +\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_] + +=item * + +\D is a negated \d; it represents any character but a digit [^0-9] + +=item * + +\S is a negated \s; it represents any non-whitespace character [^\s] + +=item * + +\W is a negated \w; it represents any non-word character [^\w] + +=item * + +The period '.' matches any character but "\n" + +=back + +The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside +of character classes. Here are some in use: + + /\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format + /[\d\s]/; # matches any digit or whitespace character + /\w\W\w/; # matches a word char, followed by a + # non-word char, followed by a word char + /..rt/; # matches any two chars, followed by 'rt' + /end\./; # matches 'end.' + /end[.]/; # same thing, matches 'end.'
+ +The S<B<word anchor> > C<\b> matches a boundary between a word +character and a non-word character C<\w\W> or C<\W\w>: + + $x = "Housecat catenates house and cat"; + $x =~ /\bcat/; # matches cat in 'catenates' + $x =~ /cat\b/; # matches cat in 'housecat' + $x =~ /\bcat\b/; # matches 'cat' at end of string + +In the last example, the end of the string is considered a word +boundary. + +=head2 Matching this or that + +We can match different character strings with the B<alternation> +metacharacter C<'|'>. To match C<dog> or C<cat>, we form the regex +C<dog|cat>. As before, perl will try to match the regex at the +earliest possible point in the string. At each character position, +perl will first try to match the first alternative, C<dog>. If +C<dog> doesn't match, perl will then try the next alternative, C<cat>. +If C<cat> doesn't match either, then the match fails and perl moves to +the next position in the string. Some examples: + + "cats and dogs" =~ /cat|dog|bird/; # matches "cat" + "cats and dogs" =~ /dog|cat|bird/; # matches "cat" + +Even though C<dog> is the first alternative in the second regex, +C<cat> is able to match earlier in the string. + + "cats" =~ /c|ca|cat|cats/; # matches "c" + "cats" =~ /cats|cat|ca|c/; # matches "cats" + +At a given character position, the first alternative that allows the +regex match to succeed will be the one that matches. Here, all the +alternatives match at the first string position, so the first matches. + +=head2 Grouping things and hierarchical matching + +The B<grouping> metacharacters C<()> allow a part of a regex to be +treated as a single unit. Parts of a regex are grouped by enclosing +them in parentheses. The regex C<house(cat|keeper)> means match +C<house> followed by either C<cat> or C<keeper>.
Some more examples +are + + /(a|b)b/; # matches 'ab' or 'bb' + /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere + + /house(cat|)/; # matches either 'housecat' or 'house' + /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or + # 'house'. Note groups can be nested. + + "20" =~ /(19|20|)\d\d/; # matches the null alternative '()\d\d', + # because '20\d\d' can't match + +=head2 Extracting matches + +The grouping metacharacters C<()> also allow the extraction of the +parts of a string that matched. For each grouping, the part that +matched inside goes into the special variables C<$1>, C<$2>, etc. +They can be used just as ordinary variables: + + # extract hours, minutes, seconds + $time =~ /(\d\d):(\d\d):(\d\d)/; # match hh:mm:ss format + $hours = $1; + $minutes = $2; + $seconds = $3; + +In list context, a match C</regex/> with groupings will return the +list of matched values C<($1,$2,...)>. So we could rewrite it as + + ($hours, $minutes, $second) = ($time =~ /(\d\d):(\d\d):(\d\d)/); + +If the groupings in a regex are nested, C<$1> gets the group with the +leftmost opening parenthesis, C<$2> the next opening parenthesis, +etc. For example, here is a complex regex and the matching variables +indicated below it: + + /(ab(cd|ef)((gi)|j))/; + 1 2 34 + +Associated with the matching variables C<$1>, C<$2>, ... are +the B<backreferences> C<\1>, C<\2>, ... Backreferences are +matching variables that can be used I<inside> a regex: + + /(\w\w\w)\s\1/; # find sequences like 'the the' in string + +C<$1>, C<$2>, ... should only be used outside of a regex, and C<\1>, +C<\2>, ... only inside a regex. + +=head2 Matching repetitions + +The B<quantifier> metacharacters C<?>, C<*>, C<+>, and C<{}> allow us +to determine the number of repeats of a portion of a regex we +consider to be a match. Quantifiers are put immediately after the +character, character class, or grouping that we want to specify. 
They +have the following meanings: + +=over 4 + +=item * + +C<a?> = match 'a' 1 or 0 times + +=item * + +C<a*> = match 'a' 0 or more times, i.e., any number of times + +=item * + +C<a+> = match 'a' 1 or more times, i.e., at least once + +=item * + +C<a{n,m}> = match at least C<n> times, but not more than C<m> +times. + +=item * + +C<a{n,}> = match at least C<n> or more times + +=item * + +C<a{n}> = match exactly C<n> times + +=back + +Here are some examples: + + /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and + # any number of digits + /(\w+)\s+\1/; # match doubled words of arbitrary length + $year =~ /\d{2,4}/; # make sure year is at least 2 but not more + # than 4 digits + $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates + +These quantifiers will try to match as much of the string as possible, +while still allowing the regex to match. So we have + + $x = 'the cat in the hat'; + $x =~ /^(.*)(at)(.*)$/; # matches, + # $1 = 'the cat in the h' + # $2 = 'at' + # $3 = '' (0 matches) + +The first quantifier C<.*> grabs as much of the string as possible +while still having the regex match. The second quantifier C<.*> has +no string left to it, so it matches 0 times. + +=head2 More matching + +There are a few more things you might want to know about matching +operators. In the code + + $pattern = 'Seuss'; + while (<>) { + print if /$pattern/; + } + +perl has to re-evaluate C<$pattern> each time through the loop. If +C<$pattern> won't be changing, use the C<//o> modifier, to only +perform variable substitutions once. If you don't want any +substitutions at all, use the special delimiter C<m''>: + + $pattern = 'Seuss'; + m'$pattern'; # matches '$pattern', not 'Seuss' + +The global modifier C<//g> allows the matching operator to match +within a string as many times as possible. In scalar context, +successive matches against a string will have C<//g> jump from match +to match, keeping track of position in the string as it goes along. 
+You can get or set the position with the C<pos()> function. +For example, + + $x = "cat dog house"; # 3 words + while ($x =~ /(\w+)/g) { + print "Word is $1, ends at position ", pos $x, "\n"; + } + +prints + + Word is cat, ends at position 3 + Word is dog, ends at position 7 + Word is house, ends at position 13 + +A failed match or changing the target string resets the position. If +you don't want the position reset after failure to match, add the +C<//c> modifier, as in C</regex/gc>. + +In list context, C<//g> returns a list of matched groupings, or if +there are no groupings, a list of matches to the whole regex. So + + @words = ($x =~ /(\w+)/g); # matches, + # $words[0] = 'cat' + # $words[1] = 'dog' + # $words[2] = 'house' + +=head2 Search and replace + +Search and replace is performed using C<s/regex/replacement/modifiers>. +The C<replacement> is a Perl double quoted string that replaces in the +string whatever is matched with the C<regex>. The operator C<=~> is +also used here to associate a string with C<s///>. If matching +against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match, +C<s///> returns the number of substitutions made, otherwise it returns +false. Here are a few examples: + + $x = "Time to feed the cat!"; + $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!" + $y = "'quoted words'"; + $y =~ s/^'(.*)'$/$1/; # strip single quotes, + # $y contains "quoted words" + +With the C<s///> operator, the matched variables C<$1>, C<$2>, etc. +are immediately available for use in the replacement expression. With +the global modifier, C<s///g> will search and replace all occurrences +of the regex in the string: + + $x = "I batted 4 for 4"; + $x =~ s/4/four/; # $x contains "I batted four for 4" + $x = "I batted 4 for 4"; + $x =~ s/4/four/g; # $x contains "I batted four for four" + +The evaluation modifier C<s///e> wraps an C<eval{...}> around the +replacement string and the evaluated result is substituted for the +matched substring.
Some examples: + + # reverse all the words in a string + $x = "the cat in the hat"; + $x =~ s/(\w+)/reverse $1/ge; # $x contains "eht tac ni eht tah" + + # convert percentage to decimal + $x = "A 39% hit rate"; + $x =~ s!(\d+)%!$1/100!e; # $x contains "A 0.39 hit rate" + +The last example shows that C<s///> can use other delimiters, such as +C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are used +C<s'''>, then the regex and replacement are treated as single quoted +strings. + +=head2 The split operator + +C<split /regex/, string> splits C<string> into a list of substrings +and returns that list. The regex determines the character sequence +that C<string> is split with respect to. For example, to split a +string into words, use + + $x = "Calvin and Hobbes"; + @word = split /\s+/, $x; # $word[0] = 'Calvin' + # $word[1] = 'and' + # $word[2] = 'Hobbes' + +To extract a comma-delimited list of numbers, use + + $x = "1.618,2.718, 3.142"; + @const = split /,\s*/, $x; # $const[0] = '1.618' + # $const[1] = '2.718' + # $const[2] = '3.142' + +If the empty regex C<//> is used, the string is split into individual +characters. If the regex has groupings, then the list produced contains +the matched substrings from the groupings as well: + + $x = "/usr/bin"; + @parts = split m!(/)!, $x; # $parts[0] = '' + # $parts[1] = '/' + # $parts[2] = 'usr' + # $parts[3] = '/' + # $parts[4] = 'bin' + +Since the first character of $x matched the regex, C<split> prepended +an empty initial element to the list. + +=head1 BUGS + +None. + +=head1 SEE ALSO + +This is just a quick start guide. For a more in-depth tutorial on +regexes, see L<perlretut> and for the reference page, see L<perlre>. + +=head1 AUTHOR AND COPYRIGHT + +Copyright (c) 2000 Mark Kvale +All rights reserved. + +This document may be distributed under the same terms as Perl itself.
+ +=head2 Acknowledgments + +The author would like to thank Mark-Jason Dominus, Tom Christiansen, +Ilya Zakharevich, Brad Hughes, and Mike Giroux for all their helpful +comments. + +=cut + diff --git a/contrib/perl5/pod/perlretut.pod b/contrib/perl5/pod/perlretut.pod new file mode 100644 index 0000000000000..fa6479c0c45bc --- /dev/null +++ b/contrib/perl5/pod/perlretut.pod @@ -0,0 +1,2504 @@ +=head1 NAME + +perlretut - Perl regular expressions tutorial + +=head1 DESCRIPTION + +This page provides a basic tutorial on understanding, creating and +using regular expressions in Perl. It serves as a complement to the +reference page on regular expressions L<perlre>. Regular expressions +are an integral part of the C<m//>, C<s///>, C<qr//> and C<split> +operators and so this tutorial also overlaps with +L<perlop/"Regexp Quote-Like Operators"> and L<perlfunc/split>. + +Perl is widely renowned for excellence in text processing, and regular +expressions are one of the big factors behind this fame. Perl regular +expressions display an efficiency and flexibility unknown in most +other computer languages. Mastering even the basics of regular +expressions will allow you to manipulate text with surprising ease. + +What is a regular expression? A regular expression is simply a string +that describes a pattern. Patterns are in common use these days; +examples are the patterns typed into a search engine to find web pages +and the patterns used to list files in a directory, e.g., C<ls *.txt> +or C<dir *.*>. In Perl, the patterns described by regular expressions +are used to search strings, extract desired parts of strings, and to +do search and replace operations. + +Regular expressions have the undeserved reputation of being abstract +and difficult to understand. Regular expressions are constructed using +simple concepts like conditionals and loops and are no more difficult +to understand than the corresponding C<if> conditionals and C<while> +loops in the Perl language itself. 
In fact, the main challenge in +learning regular expressions is just getting used to the terse +notation used to express these concepts. + +This tutorial flattens the learning curve by discussing regular +expression concepts, along with their notation, one at a time and with +many examples. The first part of the tutorial will progress from the +simplest word searches to the basic regular expression concepts. If +you master the first part, you will have all the tools needed to solve +about 98% of your needs. The second part of the tutorial is for those +comfortable with the basics and hungry for more power tools. It +discusses the more advanced regular expression operators and +introduces the latest cutting edge innovations in 5.6.0. + +A note: to save time, 'regular expression' is often abbreviated as +regexp or regex. Regexp is a more natural abbreviation than regex, but +is harder to pronounce. The Perl pod documentation is evenly split on +regexp vs regex; in Perl, there is more than one way to abbreviate it. +We'll use regexp in this tutorial. + +=head1 Part 1: The basics + +=head2 Simple word matching + +The simplest regexp is simply a word, or more generally, a string of +characters. A regexp consisting of a word matches any string that +contains that word: + + "Hello World" =~ /World/; # matches + +What is this perl statement all about? C<"Hello World"> is a simple +double quoted string. C<World> is the regular expression and the +C<//> enclosing C</World/> tells perl to search a string for a match. +The operator C<=~> associates the string with the regexp match and +produces a true value if the regexp matched, or false if the regexp +did not match. In our case, C<World> matches the second word in +C<"Hello World">, so the expression is true. Expressions like this +are useful in conditionals: + + if ("Hello World" =~ /World/) { + print "It matches\n"; + } + else { + print "It doesn't match\n"; + } + +There are useful variations on this theme. 
The sense of the match can +be reversed by using C<!~> operator: + + if ("Hello World" !~ /World/) { + print "It doesn't match\n"; + } + else { + print "It matches\n"; + } + +The literal string in the regexp can be replaced by a variable: + + $greeting = "World"; + if ("Hello World" =~ /$greeting/) { + print "It matches\n"; + } + else { + print "It doesn't match\n"; + } + +If you're matching against the special default variable C<$_>, the +C<$_ =~> part can be omitted: + + $_ = "Hello World"; + if (/World/) { + print "It matches\n"; + } + else { + print "It doesn't match\n"; + } + +And finally, the C<//> default delimiters for a match can be changed +to arbitrary delimiters by putting an C<'m'> out front: + + "Hello World" =~ m!World!; # matches, delimited by '!' + "Hello World" =~ m{World}; # matches, note the matching '{}' + "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin', + # '/' becomes an ordinary char + +C</World/>, C<m!World!>, and C<m{World}> all represent the +same thing. When, e.g., C<""> is used as a delimiter, the forward +slash C<'/'> becomes an ordinary character and can be used in a regexp +without trouble. + +Let's consider how different regexps would match C<"Hello World">: + + "Hello World" =~ /world/; # doesn't match + "Hello World" =~ /o W/; # matches + "Hello World" =~ /oW/; # doesn't match + "Hello World" =~ /World /; # doesn't match + +The first regexp C<world> doesn't match because regexps are +case-sensitive. The second regexp matches because the substring +S<C<'o W'> > occurs in the string S<C<"Hello World"> >. The space +character ' ' is treated like any other character in a regexp and is +needed to match in this case. The lack of a space character is the +reason the third regexp C<'oW'> doesn't match. The fourth regexp +C<'World '> doesn't match because there is a space at the end of the +regexp, but not at the end of the string. 
The lesson here is that +regexps must match a part of the string I<exactly> in order for the +statement to be true. + +If a regexp matches in more than one place in the string, perl will +always match at the earliest possible point in the string: + + "Hello World" =~ /o/; # matches 'o' in 'Hello' + "That hat is red" =~ /hat/; # matches 'hat' in 'That' + +With respect to character matching, there are a few more points you +need to know about. First of all, not all characters can be used 'as +is' in a match. Some characters, called B<metacharacters>, are reserved +for use in regexp notation. The metacharacters are + + {}[]()^$.|*+?\ + +The significance of each of these will be explained +in the rest of the tutorial, but for now, it is important only to know +that a metacharacter can be matched by putting a backslash before it: + + "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter + "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary + + "The interval is [0,1)." =~ /[0,1)./ # is a syntax error! + "The interval is [0,1)." =~ /\[0,1\)\./ # matches + "/usr/bin/perl" =~ /\/usr\/bin\/perl/; # matches + +In the last regexp, the forward slash C<'/'> is also backslashed, +because it is used to delimit the regexp. This can lead to LTS +(leaning toothpick syndrome), however, and it is often more readable +to change delimiters. + + +The backslash character C<'\'> is a metacharacter itself and needs to +be backslashed: + + 'C:\WIN32' =~ /C:\\WIN/; # matches + +In addition to the metacharacters, there are some ASCII characters +which don't have printable character equivalents and are instead +represented by B<escape sequences>. Common examples are C<\t> for a +tab, C<\n> for a newline, C<\r> for a carriage return and C<\a> for a +bell. If your string is better thought of as a sequence of arbitrary +bytes, the octal escape sequence, e.g., C<\033>, or hexadecimal escape +sequence, e.g., C<\x1B> may be a more natural representation for your +bytes.
Here are some examples of escapes: + + "1000\t2000" =~ m(0\t2) # matches + "1000\n2000" =~ /0\n20/ # matches + "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000" + "cat" =~ /\143\x61\x74/ # matches, but a weird way to spell cat + +If you've been around Perl a while, all this talk of escape sequences +may seem familiar. Similar escape sequences are used in double-quoted +strings and in fact the regexps in Perl are mostly treated as +double-quoted strings. This means that variables can be used in +regexps as well. Just like double-quoted strings, the values of the +variables in the regexp will be substituted in before the regexp is +evaluated for matching purposes. So we have: + + $foo = 'house'; + 'housecat' =~ /$foo/; # matches + 'cathouse' =~ /cat$foo/; # matches + 'housecat' =~ /${foo}cat/; # matches + +So far, so good. With the knowledge above you can already perform +searches with just about any literal string regexp you can dream up. +Here is a I<very simple> emulation of the Unix grep program: + + % cat > simple_grep + #!/usr/bin/perl + $regexp = shift; + while (<>) { + print if /$regexp/; + } + ^D + + % chmod +x simple_grep + + % simple_grep abba /usr/dict/words + Babbage + cabbage + cabbages + sabbath + Sabbathize + Sabbathizes + sabbatical + scabbard + scabbards + +This program is easy to understand. C<#!/usr/bin/perl> is the standard +way to invoke a perl program from the shell. +S<C<$regexp = shift;> > saves the first command line argument as the +regexp to be used, leaving the rest of the command line arguments to +be treated as files. S<C<< while (<>) >> > loops over all the lines in +all the files. For each line, S<C<print if /$regexp/;> > prints the +line if the regexp matches the line. In this line, both C<print> and +C</$regexp/> use the default variable C<$_> implicitly. + +With all of the regexps above, if the regexp matched anywhere in the +string, it was considered a match. 
Sometimes, however, we'd like to +specify I<where> in the string the regexp should try to match. To do +this, we would use the B<anchor> metacharacters C<^> and C<$>. The +anchor C<^> means match at the beginning of the string and the anchor +C<$> means match at the end of the string, or before a newline at the +end of the string. Here is how they are used: + + "housekeeper" =~ /keeper/; # matches + "housekeeper" =~ /^keeper/; # doesn't match + "housekeeper" =~ /keeper$/; # matches + "housekeeper\n" =~ /keeper$/; # matches + +The second regexp doesn't match because C<^> constrains C<keeper> to +match only at the beginning of the string, but C<"housekeeper"> has +keeper starting in the middle. The third regexp does match, since the +C<$> constrains C<keeper> to match only at the end of the string. + +When both C<^> and C<$> are used at the same time, the regexp has to +match both the beginning and the end of the string, i.e., the regexp +matches the whole string. Consider + + "keeper" =~ /^keep$/; # doesn't match + "keeper" =~ /^keeper$/; # matches + "" =~ /^$/; # ^$ matches an empty string + +The first regexp doesn't match because the string has more to it than +C<keep>. Since the second regexp is exactly the string, it +matches. Using both C<^> and C<$> in a regexp forces the complete +string to match, so it gives you complete control over which strings +match and which don't. Suppose you are looking for a fellow named +bert, off in a string by himself: + + "dogbert" =~ /bert/; # matches, but not what you want + + "dilbert" =~ /^bert/; # doesn't match, but .. + "bertram" =~ /^bert/; # matches, so still not good enough + + "bertram" =~ /^bert$/; # doesn't match, good + "dilbert" =~ /^bert$/; # doesn't match, good + "bert" =~ /^bert$/; # matches, perfect + +Of course, in the case of a literal string, one could just as easily +use the string equivalence S<C<$string eq 'bert'> > and it would be +more efficient. 
The C<^...$> regexp really becomes useful when we +add in the more powerful regexp tools below. + +=head2 Using character classes + +Although one can already do quite a lot with the literal string +regexps above, we've only scratched the surface of regular expression +technology. In this and subsequent sections we will introduce regexp +concepts (and associated metacharacter notations) that will allow a +regexp to not just represent a single character sequence, but a I<whole +class> of them. + +One such concept is that of a B<character class>. A character class +allows a set of possible characters, rather than just a single +character, to match at a particular point in a regexp. Character +classes are denoted by brackets C<[...]>, with the set of characters +to be possibly matched inside. Here are some examples: + + /cat/; # matches 'cat' + /[bcr]at/; # matches 'bat', 'cat', or 'rat' + /item[0123456789]/; # matches 'item0' or ... or 'item9' + "abc" =~ /[cab]/; # matches 'a' + +In the last statement, even though C<'c'> is the first character in +the class, C<'a'> matches because the first character position in the +string is the earliest point at which the regexp can match. + + /[yY][eE][sS]/; # match 'yes' in a case-insensitive way + # 'yes', 'Yes', 'YES', etc. + +This regexp displays a common task: perform a case-insensitive +match. Perl provides a way of avoiding all those brackets by simply +appending an C<'i'> to the end of the match. Then C</[yY][eE][sS]/;> +can be rewritten as C</yes/i;>. The C<'i'> stands for +case-insensitive and is an example of a B<modifier> of the matching +operation. We will meet other modifiers later in the tutorial. + +We saw in the section above that there were ordinary characters, which +represented themselves, and special characters, which needed a +backslash C<\> to represent themselves.
The same is true in a +character class, but the sets of ordinary and special characters +inside a character class are different than those outside a character +class. The special characters for a character class are C<-]\^$>. C<]> +is special because it denotes the end of a character class. C<$> is +special because it denotes a scalar variable. C<\> is special because +it is used in escape sequences, just like above. Here is how the +special characters C<]$\> are handled: + + /[\]c]def/; # matches ']def' or 'cdef' + $x = 'bcr'; + /[$x]at/; # matches 'bat', 'cat', or 'rat' + /[\$x]at/; # matches '$at' or 'xat' + /[\\$x]at/; # matches '\at', 'bat', 'cat', or 'rat' + +The last two are a little tricky. In C<[\$x]>, the backslash protects +the dollar sign, so the character class has two members C<$> and C<x>. +In C<[\\$x]>, the backslash is protected, so C<$x> is treated as a +variable and substituted in double quote fashion. + +The special character C<'-'> acts as a range operator within character +classes, so that a contiguous set of characters can be written as a +range. With ranges, the unwieldy C<[0123456789]> and C<[abc...xyz]> +become the svelte C<[0-9]> and C<[a-z]>. Some examples are + + /item[0-9]/; # matches 'item0' or ... or 'item9' + /[0-9bx-z]aa/; # matches '0aa', ..., '9aa', + # 'baa', 'xaa', 'yaa', or 'zaa' + /[0-9a-fA-F]/; # matches a hexadecimal digit + /[0-9a-zA-Z_]/; # matches a "word" character, + # like those in a perl variable name + +If C<'-'> is the first or last character in a character class, it is +treated as an ordinary character; C<[-ab]>, C<[ab-]> and C<[a\-b]> are +all equivalent. + +The special character C<^> in the first position of a character class +denotes a B<negated character class>, which matches any character but +those in the brackets. Both C<[...]> and C<[^...]> must match a +character, or the match fails. Then + + /[^a]at/; # doesn't match 'aat' or 'at', but matches + # all other 'bat', 'cat', '0at', '%at', etc.
+ /[^0-9]/; # matches a non-numeric character + /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary + +Now, even C<[0-9]> can be a bother to write multiple times, so in the +interest of saving keystrokes and making regexps more readable, Perl +has several abbreviations for common character classes: + +=over 4 + +=item * + +\d is a digit and represents [0-9] + +=item * + +\s is a whitespace character and represents [\ \t\r\n\f] + +=item * + +\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_] + +=item * + +\D is a negated \d; it represents any character but a digit [^0-9] + +=item * + +\S is a negated \s; it represents any non-whitespace character [^\s] + +=item * + +\W is a negated \w; it represents any non-word character [^\w] + +=item * + +The period '.' matches any character but "\n" + +=back + +The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside +of character classes. Here are some in use: + + /\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format + /[\d\s]/; # matches any digit or whitespace character + /\w\W\w/; # matches a word char, followed by a + # non-word char, followed by a word char + /..rt/; # matches any two chars, followed by 'rt' + /end\./; # matches 'end.' + /end[.]/; # same thing, matches 'end.' + +Because a period is a metacharacter, it needs to be escaped to match +as an ordinary period. Because, for example, C<\d> and C<\w> are sets +of characters, it is incorrect to think of C<[^\d\w]> as C<[\D\W]>; in +fact C<[^\d\w]> is the same as C<[^\w]>, which is the same as +C<[\W]>. Think DeMorgan's laws. + +An anchor useful in basic regexps is the S<B<word anchor> > +C<\b>.
This matches a boundary between a word character and a non-word +character C<\w\W> or C<\W\w>: + + $x = "Housecat catenates house and cat"; + $x =~ /cat/; # matches cat in 'housecat' + $x =~ /\bcat/; # matches cat in 'catenates' + $x =~ /cat\b/; # matches cat in 'housecat' + $x =~ /\bcat\b/; # matches 'cat' at end of string + +Note in the last example, the end of the string is considered a word +boundary. + +You might wonder why C<'.'> matches everything but C<"\n"> - why not +every character? The reason is that often one is matching against +lines and would like to ignore the newline characters. For instance, +while the string C<"\n"> represents one line, we would like to think +of it as empty. Then + + "" =~ /^$/; # matches + "\n" =~ /^$/; # matches, "\n" is ignored + + "" =~ /./; # doesn't match; it needs a char + "" =~ /^.$/; # doesn't match; it needs a char + "\n" =~ /^.$/; # doesn't match; it needs a char other than "\n" + "a" =~ /^.$/; # matches + "a\n" =~ /^.$/; # matches, ignores the "\n" + +This behavior is convenient, because we usually want to ignore +newlines when we count and match characters in a line. Sometimes, +however, we want to keep track of newlines. We might even want C<^> +and C<$> to anchor at the beginning and end of lines within the +string, rather than just the beginning and end of the string. Perl +allows us to choose between ignoring and paying attention to newlines +by using the C<//s> and C<//m> modifiers. C<//s> and C<//m> stand for +single line and multi-line and they determine whether a string is to +be treated as one continuous string, or as a set of lines. The two +modifiers affect two aspects of how the regexp is interpreted: 1) how +the C<'.'> character class is defined, and 2) where the anchors C<^> +and C<$> are able to match. Here are the four possible combinations: + +=over 4 + +=item * + +no modifiers (//): Default behavior. C<'.'> matches any character +except C<"\n">.
C<^> matches only at the beginning of the string and +C<$> matches only at the end or before a newline at the end. + +=item * + +s modifier (//s): Treat string as a single long line. C<'.'> matches +any character, even C<"\n">. C<^> matches only at the beginning of +the string and C<$> matches only at the end or before a newline at the +end. + +=item * + +m modifier (//m): Treat string as a set of multiple lines. C<'.'> +matches any character except C<"\n">. C<^> and C<$> are able to match +at the start or end of I<any> line within the string. + +=item * + +both s and m modifiers (//sm): Treat string as a single long line, but +detect multiple lines. C<'.'> matches any character, even +C<"\n">. C<^> and C<$>, however, are able to match at the start or end +of I<any> line within the string. + +=back + +Here are examples of C<//s> and C<//m> in action: + + $x = "There once was a girl\nWho programmed in Perl\n"; + + $x =~ /^Who/; # doesn't match, "Who" not at start of string + $x =~ /^Who/s; # doesn't match, "Who" not at start of string + $x =~ /^Who/m; # matches, "Who" at start of second line + $x =~ /^Who/sm; # matches, "Who" at start of second line + + $x =~ /girl.Who/; # doesn't match, "." doesn't match "\n" + $x =~ /girl.Who/s; # matches, "." matches "\n" + $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\n" + $x =~ /girl.Who/sm; # matches, "." matches "\n" + +Most of the time, the default behavior is what is wanted, but C<//s> and +C<//m> are occasionally very useful.
If C<//m> is being used, the start +of the string can still be matched with C<\A> and the end of string +can still be matched with the anchors C<\Z> (matches both the end and +the newline before, like C<$>), and C<\z> (matches only the end): + + $x =~ /^Who/m; # matches, "Who" at start of second line + $x =~ /\AWho/m; # doesn't match, "Who" is not at start of string + + $x =~ /girl$/m; # matches, "girl" at end of first line + $x =~ /girl\Z/m; # doesn't match, "girl" is not at end of string + + $x =~ /Perl\Z/m; # matches, "Perl" is at newline before end + $x =~ /Perl\z/m; # doesn't match, "Perl" is not at end of string + +We now know how to create choices among classes of characters in a +regexp. What about choices among words or character strings? Such +choices are described in the next section. + +=head2 Matching this or that + +Sometimes we would like our regexp to be able to match different +possible words or character strings. This is accomplished by using +the B<alternation> metacharacter C<|>. To match C<dog> or C<cat>, we +form the regexp C<dog|cat>. As before, perl will try to match the +regexp at the earliest possible point in the string. At each +character position, perl will first try to match the first +alternative, C<dog>. If C<dog> doesn't match, perl will then try the +next alternative, C<cat>. If C<cat> doesn't match either, then the +match fails and perl moves to the next position in the string. Some +examples: + + "cats and dogs" =~ /cat|dog|bird/; # matches "cat" + "cats and dogs" =~ /dog|cat|bird/; # matches "cat" + +Even though C<dog> is the first alternative in the second regexp, +C<cat> is able to match earlier in the string. + + "cats" =~ /c|ca|cat|cats/; # matches "c" + "cats" =~ /cats|cat|ca|c/; # matches "cats" + +Here, all the alternatives match at the first string position, so the +first alternative is the one that matches.
If some of the +alternatives are truncations of the others, put the longest ones first +to give them a chance to match. + + "cab" =~ /a|b|c/ # matches "c" + # /a|b|c/ == /[abc]/ + +The last example points out that character classes are like +alternations of characters. At a given character position, the first +alternative that allows the regexp match to succeed will be the one +that matches. + +=head2 Grouping things and hierarchical matching + +Alternation allows a regexp to choose among alternatives, but by +itself it is unsatisfying. The reason is that each alternative is a whole +regexp, but sometimes we want alternatives for just part of a +regexp. For instance, suppose we want to search for housecats or +housekeepers. The regexp C<housecat|housekeeper> fits the bill, but is +inefficient because we had to type C<house> twice. It would be nice to +have parts of the regexp be constant, like C<house>, and some +parts have alternatives, like C<cat|keeper>. + +The B<grouping> metacharacters C<()> solve this problem. Grouping +allows parts of a regexp to be treated as a single unit. Parts of a +regexp are grouped by enclosing them in parentheses. Thus we could solve +the C<housecat|housekeeper> problem by forming the regexp as +C<house(cat|keeper)>. The regexp C<house(cat|keeper)> means match +C<house> followed by either C<cat> or C<keeper>. Some more examples +are + + /(a|b)b/; # matches 'ab' or 'bb' + /(ac|b)b/; # matches 'acb' or 'bb' + /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere + /(a|[bc])d/; # matches 'ad', 'bd', or 'cd' + + /house(cat|)/; # matches either 'housecat' or 'house' + /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or + # 'house'. Note groups can be nested.
+ + /(19|20|)\d\d/; # match years 19xx, 20xx, or the Y2K problem, xx + "20" =~ /(19|20|)\d\d/; # matches the null alternative '()\d\d', + # because '20\d\d' can't match + +Alternations behave the same way in groups as out of them: at a given +string position, the leftmost alternative that allows the regexp to +match is taken. So in the last example at the first string position, +C<"20"> matches the second alternative, but there is nothing left over +to match the next two digits C<\d\d>. So perl moves on to the next +alternative, which is the null alternative and that works, since +C<"20"> is two digits. + +The process of trying one alternative, seeing if it matches, and +moving on to the next alternative if it doesn't, is called +B<backtracking>. The term 'backtracking' comes from the idea that +matching a regexp is like a walk in the woods. Successfully matching +a regexp is like arriving at a destination. There are many possible +trailheads, one for each string position, and each one is tried in +order, left to right. From each trailhead there may be many paths, +some of which get you there, and some which are dead ends. When you +walk along a trail and hit a dead end, you have to backtrack along the +trail to an earlier point to try another trail. If you hit your +destination, you stop immediately and forget about trying all the +other trails. You are persistent, and only if you have tried all the +trails from all the trailheads and not arrived at your destination, do +you declare failure. To be concrete, here is a step-by-step analysis +of what perl does when it tries to match the regexp + + "abcde" =~ /(abd|abc)(df|d|de)/; + +=over 4 + +=item 0 + +Start with the first letter in the string 'a'. + +=item 1 + +Try the first alternative in the first group 'abd'. + +=item 2 + +Match 'a' followed by 'b'. So far so good. + +=item 3 + +'d' in the regexp doesn't match 'c' in the string - a dead +end.
So backtrack two characters and pick the second alternative in +the first group 'abc'. + +=item 4 + +Match 'a' followed by 'b' followed by 'c'. We are on a roll +and have satisfied the first group. Set $1 to 'abc'. + +=item 5 + +Move on to the second group and pick the first alternative +'df'. + +=item 6 + +Match the 'd'. + +=item 7 + +'f' in the regexp doesn't match 'e' in the string, so a dead +end. Backtrack one character and pick the second alternative in the +second group 'd'. + +=item 8 + +'d' matches. The second grouping is satisfied, so set $2 to +'d'. + +=item 9 + +We are at the end of the regexp, so we are done! We have +matched 'abcd' out of the string "abcde". + +=back + +There are a couple of things to note about this analysis. First, the +third alternative in the second group 'de' also allows a match, but we +stopped before we got to it - at a given character position, leftmost +wins. Second, we were able to get a match at the first character +position of the string 'a'. If there were no matches at the first +position, perl would move to the second character position 'b' and +attempt the match all over again. Only when all possible paths at all +possible character positions have been exhausted does perl give +up and declare S<C<$string =~ /(abd|abc)(df|d|de)/;> > to be false. + +Even with all this work, regexp matching happens remarkably fast. To +speed things up, during the compilation stage, perl compiles the regexp +into a compact sequence of opcodes that can often fit inside a +processor cache. When the code is executed, these opcodes can then run +at full throttle and search very quickly. + +=head2 Extracting matches + +The grouping metacharacters C<()> also serve another completely +different function: they allow the extraction of the parts of a string +that matched. This is very useful to find out what matched and for +text processing in general.
For each grouping, the part that matched +inside goes into the special variables C<$1>, C<$2>, etc. They can be +used just as ordinary variables: + + # extract hours, minutes, seconds + $time =~ /(\d\d):(\d\d):(\d\d)/; # match hh:mm:ss format + $hours = $1; + $minutes = $2; + $seconds = $3; + +Now, we know that in scalar context, +S<C<$time =~ /(\d\d):(\d\d):(\d\d)/> > returns a true or false +value. In list context, however, it returns the list of matched values +C<($1,$2,$3)>. So we could write the code more compactly as + + # extract hours, minutes, seconds + ($hours, $minutes, $second) = ($time =~ /(\d\d):(\d\d):(\d\d)/); + +If the groupings in a regexp are nested, C<$1> gets the group with the +leftmost opening parenthesis, C<$2> the next opening parenthesis, +etc. For example, here is a complex regexp and the matching variables +indicated below it: + + /(ab(cd|ef)((gi)|j))/; + 1 2 34 + +so that if the regexp matched, e.g., C<$2> would contain 'cd' or 'ef'. +For convenience, perl sets C<$+> to the highest numbered C<$1>, C<$2>, +... that got assigned. + +Closely associated with the matching variables C<$1>, C<$2>, ... are +the B<backreferences> C<\1>, C<\2>, ... . Backreferences are simply +matching variables that can be used I<inside> a regexp. This is a +really nice feature - what matches later in a regexp can depend on +what matched earlier in the regexp. Suppose we wanted to look +for doubled words in text, like 'the the'. The following regexp finds +all 3-letter doubles with a space in between: + + /(\w\w\w)\s\1/; + +The grouping assigns a value to \1, so that the same 3 letter sequence +is used for both parts. Here are some words with repeated parts: + + % simple_grep '^(\w\w\w\w|\w\w\w|\w\w|\w)\1$' /usr/dict/words + beriberi + booboo + coco + mama + murmur + papa + +The regexp has a single grouping which considers 4-letter +combinations, then 3-letter combinations, etc. and uses C<\1> to look for +a repeat. 
Although C<$1> and C<\1> represent the same thing, care should be +taken to use matched variables C<$1>, C<$2>, ... only outside a regexp +and backreferences C<\1>, C<\2>, ... only inside a regexp; not doing +so may lead to surprising and/or undefined results. + +In addition to what was matched, Perl 5.6.0 also provides the +positions of what was matched with the C<@-> and C<@+> +arrays. C<$-[0]> is the position of the start of the entire match and +C<$+[0]> is the position of the end. Similarly, C<$-[n]> is the +position of the start of the C<$n> match and C<$+[n]> is the position +of the end. If C<$n> is undefined, so are C<$-[n]> and C<$+[n]>. Then +this code + + $x = "Mmm...donut, thought Homer"; + $x =~ /^(Mmm|Yech)\.\.\.(donut|peas)/; # matches + foreach $expr (1..$#-) { + print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\n"; + } + +prints + + Match 1: 'Mmm' at position (0,3) + Match 2: 'donut' at position (6,11) + +Even if there are no groupings in a regexp, it is still possible to +find out what exactly matched in a string. If you use them, perl +will set C<$`> to the part of the string before the match, will set C<$&> +to the part of the string that matched, and will set C<$'> to the part +of the string after the match. An example: + + $x = "the cat caught the mouse"; + $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse' + $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse' + +In the second match, S<C<$` = ''> > because the regexp matched at the +first character position in the string and stopped; it never saw the +second 'the'. It is important to note that using C<$`> and C<$'> +slows down regexp matching quite a bit, and C< $& > slows it down to a +lesser extent, because if they are used in one regexp in a program, +they are generated for I<all> regexps in the program. So if raw +performance is a goal of your application, they should be avoided.
+If you need them, use C<@-> and C<@+> instead: + + $` is the same as substr( $x, 0, $-[0] ) + $& is the same as substr( $x, $-[0], $+[0]-$-[0] ) + $' is the same as substr( $x, $+[0] ) + +=head2 Matching repetitions + +The examples in the previous section display an annoying weakness. We +were only matching 3-letter words, or syllables of 4 letters or +less. We'd like to be able to match words or syllables of any length, +without writing out tedious alternatives like +C<\w\w\w\w|\w\w\w|\w\w|\w>. + +This is exactly the problem the B<quantifier> metacharacters C<?>, +C<*>, C<+>, and C<{}> were created for. They allow us to determine the +number of repeats of a portion of a regexp we consider to be a +match. Quantifiers are put immediately after the character, character +class, or grouping that we want to specify. They have the following +meanings: + +=over 4 + +=item * + +C<a?> = match 'a' 1 or 0 times + +=item * + +C<a*> = match 'a' 0 or more times, i.e., any number of times + +=item * + +C<a+> = match 'a' 1 or more times, i.e., at least once + +=item * + +C<a{n,m}> = match at least C<n> times, but not more than C<m> +times. + +=item * + +C<a{n,}> = match at least C<n> or more times + +=item * + +C<a{n}> = match exactly C<n> times + +=back + +Here are some examples: + + /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and + # any number of digits + /(\w+)\s+\1/; # match doubled words of arbitrary length + /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes' + $year =~ /\d{2,4}/; # make sure year is at least 2 but not more + # than 4 digits + $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates + $year =~ /\d{2}(\d{2})?/; # same thing written differently. However, + # this produces $1 and the other does not. + + % simple_grep '^(\w+)\1$' /usr/dict/words # isn't this easier? 
+ beriberi + booboo + coco + mama + murmur + papa + +For all of these quantifiers, perl will try to match as much of the +string as possible, while still allowing the regexp to succeed. Thus +with C</a?.../>, perl will first try to match the regexp with the C<a> +present; if that fails, perl will try to match the regexp without the +C<a> present. For the quantifier C<*>, we get the following: + + $x = "the cat in the hat"; + $x =~ /^(.*)(cat)(.*)$/; # matches, + # $1 = 'the ' + # $2 = 'cat' + # $3 = ' in the hat' + +Which is what we might expect, the match finds the only C<cat> in the +string and locks onto it. Consider, however, this regexp: + + $x =~ /^(.*)(at)(.*)$/; # matches, + # $1 = 'the cat in the h' + # $2 = 'at' + # $3 = '' (0 matches) + +One might initially guess that perl would find the C<at> in C<cat> and +stop there, but that wouldn't give the longest possible string to the +first quantifier C<.*>. Instead, the first quantifier C<.*> grabs as +much of the string as possible while still having the regexp match. In +this example, that means having the C<at> sequence with the final C<at> +in the string. The other important principle illustrated here is that +when there are two or more elements in a regexp, the I<leftmost> +quantifier, if there is one, gets to grab as much of the string as +possible, leaving the rest of the regexp to fight over scraps. Thus in +our example, the first quantifier C<.*> grabs most of the string, while +the second quantifier C<.*> gets the empty string. Quantifiers that +grab as much of the string as possible are called B<maximal match> or +B<greedy> quantifiers. + +When a regexp can match a string in several different ways, we can use +the principles above to predict which way the regexp will match: + +=over 4 + +=item * + +Principle 0: Taken as a whole, any regexp will be matched at the +earliest possible position in the string.
+ +=item * + +Principle 1: In an alternation C<a|b|c...>, the leftmost alternative +that allows a match for the whole regexp will be the one used. + +=item * + +Principle 2: The maximal matching quantifiers C<?>, C<*>, C<+> and +C<{n,m}> will in general match as much of the string as possible while +still allowing the whole regexp to match. + +=item * + +Principle 3: If there are two or more elements in a regexp, the +leftmost greedy quantifier, if any, will match as much of the string +as possible while still allowing the whole regexp to match. The next +leftmost greedy quantifier, if any, will try to match as much of the +string remaining available to it as possible, while still allowing the +whole regexp to match. And so on, until all the regexp elements are +satisfied. + +=back + +As we have seen above, Principle 0 overrides the others - the regexp +will be matched as early as possible, with the other principles +determining how the regexp matches at that earliest character +position. + +Here is an example of these principles in action: + + $x = "The programming republic of Perl"; + $x =~ /^(.+)(e|r)(.*)$/; # matches, + # $1 = 'The programming republic of Pe' + # $2 = 'r' + # $3 = 'l' + +This regexp matches at the earliest string position, C<'T'>. One +might think that C<e>, being leftmost in the alternation, would be +matched, but C<r> produces the longest string in the first quantifier. + + $x =~ /(m{1,2})(.*)$/; # matches, + # $1 = 'mm' + # $2 = 'ing republic of Perl' + +Here, the earliest possible match is at the first C<'m'> in +C<programming>. C<m{1,2}> is the first quantifier, so it gets to match +a maximal C<mm>. + + $x =~ /.*(m{1,2})(.*)$/; # matches, + # $1 = 'm' + # $2 = 'ing republic of Perl' + +Here, the regexp matches at the start of the string. The first +quantifier C<.*> grabs as much as possible, leaving just a single +C<'m'> for the second quantifier C<m{1,2}>.
+ + $x =~ /(.?)(m{1,2})(.*)$/; # matches, + # $1 = 'a' + # $2 = 'mm' + # $3 = 'ing republic of Perl' + +Here, C<.?> eats its maximal one character at the earliest possible +position in the string, C<'a'> in C<programming>, leaving C<m{1,2}> +the opportunity to match both C<m>'s. Finally, + + "aXXXb" =~ /(X*)/; # matches with $1 = '' + +because it can match zero copies of C<'X'> at the beginning of the +string. If you definitely want to match at least one C<'X'>, use +C<X+>, not C<X*>. + +Sometimes greed is not good. At times, we would like quantifiers to +match a I<minimal> piece of string, rather than a maximal piece. For +this purpose, Larry Wall created the S<B<minimal match> > or +B<non-greedy> quantifiers C<??>,C<*?>, C<+?>, and C<{}?>. These are +the usual quantifiers with a C<?> appended to them. They have the +following meanings: + +=over 4 + +=item * + +C<a??> = match 'a' 0 or 1 times. Try 0 first, then 1. + +=item * + +C<a*?> = match 'a' 0 or more times, i.e., any number of times, +but as few times as possible + +=item * + +C<a+?> = match 'a' 1 or more times, i.e., at least once, but +as few times as possible + +=item * + +C<a{n,m}?> = match at least C<n> times, not more than C<m> +times, as few times as possible + +=item * + +C<a{n,}?> = match at least C<n> times, but as few times as +possible + +=item * + +C<a{n}?> = match exactly C<n> times. Because we match exactly +C<n> times, C<a{n}?> is equivalent to C<a{n}> and is just there for +notational consistency. + +=back + +Let's look at the example above, but with minimal quantifiers: + + $x = "The programming republic of Perl"; + $x =~ /^(.+?)(e|r)(.*)$/; # matches, + # $1 = 'Th' + # $2 = 'e' + # $3 = ' programming republic of Perl' + +The minimal string that will allow both the start of the string C<^> +and the alternation to match is C<Th>, with the alternation C<e|r> +matching C<e>. The second quantifier C<.*> is free to gobble up the +rest of the string. 
+ + $x =~ /(m{1,2}?)(.*?)$/; # matches, + # $1 = 'm' + # $2 = 'ming republic of Perl' + +The first string position that this regexp can match is at the first +C<'m'> in C<programming>. At this position, the minimal C<m{1,2}?> +matches just one C<'m'>. Although the second quantifier C<.*?> would +prefer to match no characters, it is constrained by the end-of-string +anchor C<$> to match the rest of the string. + + $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches, + # $1 = 'The progra' + # $2 = 'm' + # $3 = 'ming republic of Perl' + +In this regexp, you might expect the first minimal quantifier C<.*?> +to match the empty string, because it is not constrained by a C<^> +anchor to match the beginning of the word. Principle 0 applies here, +however. Because it is possible for the whole regexp to match at the +start of the string, it I<will> match at the start of the string. Thus +the first quantifier has to match everything up to the first C<m>. The +second minimal quantifier matches just one C<m> and the third +quantifier matches the rest of the string. + + $x =~ /(.??)(m{1,2})(.*)$/; # matches, + # $1 = 'a' + # $2 = 'mm' + # $3 = 'ing republic of Perl' + +Just as in the previous regexp, the first quantifier C<.??> can match +earliest at position C<'a'>, so it does. The second quantifier is +greedy, so it matches C<mm>, and the third matches the rest of the +string. + +We can modify principle 3 above to take into account non-greedy +quantifiers: + +=over 4 + +=item * + +Principle 3: If there are two or more elements in a regexp, the +leftmost greedy (non-greedy) quantifier, if any, will match as much +(little) of the string as possible while still allowing the whole +regexp to match. The next leftmost greedy (non-greedy) quantifier, if +any, will try to match as much (little) of the string remaining +available to it as possible, while still allowing the whole regexp to +match. And so on, until all the regexp elements are satisfied. 
+ +=back + +Just like alternation, quantifiers are also susceptible to +backtracking. Here is a step-by-step analysis of the example + + $x = "the cat in the hat"; + $x =~ /^(.*)(at)(.*)$/; # matches, + # $1 = 'the cat in the h' + # $2 = 'at' + # $3 = '' (0 matches) + +=over 4 + +=item 0 + +Start with the first letter in the string 't'. + +=item 1 + +The first quantifier '.*' starts out by matching the whole +string 'the cat in the hat'. + +=item 2 + +'a' in the regexp element 'at' doesn't match the end of the +string. Backtrack one character. + +=item 3 + +'a' in the regexp element 'at' still doesn't match the last +letter of the string 't', so backtrack one more character. + +=item 4 + +Now we can match the 'a' and the 't'. + +=item 5 + +Move on to the third element '.*'. Since we are at the end of +the string and '.*' can match 0 times, assign it the empty string. + +=item 6 + +We are done! + +=back + +Most of the time, all this moving forward and backtracking happens +quickly and searching is fast. There are some pathological regexps, +however, whose execution time exponentially grows with the size of the +string. A typical structure that blows up in your face is of the form + + /(a|b+)*/; + +The problem is the nested indeterminate quantifiers. There are many +different ways of partitioning a string of length n between the C<+> +and C<*>: one repetition with C<b+> of length n, two repetitions with +the first C<b+> length k and the second with length n-k, m repetitions +whose bits add up to length n, etc. In fact there are an exponential +number of ways to partition a string as a function of length. A +regexp may get lucky and match early in the process, but if there is +no match, perl will try I<every> possibility before giving up. So be +careful with nested C<*>'s, C<{n,m}>'s, and C<+>'s. The book +I<Mastering regular expressions> by Jeffrey Friedl gives a wonderful +discussion of this and other efficiency issues. 
+ +=head2 Building a regexp + +At this point, we have all the basic regexp concepts covered, so let's +give a more involved example of a regular expression. We will build a +regexp that matches numbers. + +The first task in building a regexp is to decide what we want to match +and what we want to exclude. In our case, we want to match both +integers and floating point numbers and we want to reject any string +that isn't a number. + +The next task is to break the problem down into smaller problems that +are easily converted into a regexp. + +The simplest case is integers. These consist of a sequence of digits, +with an optional sign in front. The digits we can represent with +C<\d+> and the sign can be matched with C<[+-]>. Thus the integer +regexp is + + /[+-]?\d+/; # matches integers + +A floating point number potentially has a sign, an integral part, a +decimal point, a fractional part, and an exponent. One or more of these +parts is optional, so we need to check out the different +possibilities. Floating point numbers which are in proper form include +123., 0.345, .34, -1e6, and 25.4E-72. As with integers, the sign out +front is completely optional and can be matched by C<[+-]?>. We can +see that if there is no exponent, floating point numbers must have a +decimal point, otherwise they are integers. We might be tempted to +model these with C<\d*\.\d*>, but this would also match just a single +decimal point, which is not a number. So the three cases of floating +point number sans exponent are + + /[+-]?\d+\./; # 1., 321., etc. + /[+-]?\.\d+/; # .1, .234, etc. + /[+-]?\d+\.\d+/; # 1.0, 30.56, etc. + +These can be combined into a single regexp with a three-way alternation: + + /[+-]?(\d+\.\d+|\d+\.|\.\d+)/; # floating point, no exponent + +In this alternation, it is important to put C<'\d+\.\d+'> before +C<'\d+\.'>. If C<'\d+\.'> were first, the regexp would happily match that +and ignore the fractional part of the number. 
+ +Now consider floating point numbers with exponents. The key +observation here is that I<both> integers and numbers with decimal +points are allowed in front of an exponent. Then exponents, like the +overall sign, are independent of whether we are matching numbers with +or without decimal points, and can be 'decoupled' from the +mantissa. The overall form of the regexp now becomes clear: + + /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/; + +The exponent is an C<e> or C<E>, followed by an integer. So the +exponent regexp is + + /[eE][+-]?\d+/; # exponent + +Putting all the parts together, we get a regexp that matches numbers: + + /^[+-]?(\d+\.\d+|\d+\.|\.\d+|\d+)([eE][+-]?\d+)?$/; # Ta da! + +Long regexps like this may impress your friends, but can be hard to +decipher. In complex situations like this, the C<//x> modifier for a +match is invaluable. It allows one to put nearly arbitrary whitespace +and comments into a regexp without affecting their meaning. Using it, +we can rewrite our 'extended' regexp in the more pleasing form + + /^ + [+-]? # first, match an optional sign + ( # then match integers or f.p. mantissas: + \d+\.\d+ # mantissa of the form a.b + |\d+\. # mantissa of the form a. + |\.\d+ # mantissa of the form .b + |\d+ # integer of the form a + ) + ([eE][+-]?\d+)? # finally, optionally match an exponent + $/x; + +If whitespace is mostly irrelevant, how does one include space +characters in an extended regexp? The answer is to backslash it +S<C<'\ '> > or put it in a character class S<C<[ ]> >. The same thing +goes for pound signs, use C<\#> or C<[#]>. For instance, Perl allows +a space between the sign and the mantissa/integer, and we could add +this to our regexp as follows: + + /^ + [+-]?\ * # first, match an optional sign *and space* + ( # then match integers or f.p. mantissas: + \d+\.\d+ # mantissa of the form a.b + |\d+\. # mantissa of the form a. 
+ |\.\d+ # mantissa of the form .b + |\d+ # integer of the form a + ) + ([eE][+-]?\d+)? # finally, optionally match an exponent + $/x; + +In this form, it is easier to see a way to simplify the +alternation. Alternatives 1, 2, and 4 all start with C<\d+>, so it +could be factored out: + + /^ + [+-]?\ * # first, match an optional sign + ( # then match integers or f.p. mantissas: + \d+ # start out with a ... + ( + \.\d* # mantissa of the form a.b or a. + )? # ? takes care of integers of the form a + |\.\d+ # mantissa of the form .b + ) + ([eE][+-]?\d+)? # finally, optionally match an exponent + $/x; + +or written in the compact form, + + /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; + +This is our final regexp. To recap, we built a regexp by + +=over 4 + +=item * + +specifying the task in detail, + +=item * + +breaking down the problem into smaller parts, + +=item * + +translating the small parts into regexps, + +=item * + +combining the regexps, + +=item * + +and optimizing the final combined regexp. + +=back + +These are also the typical steps involved in writing a computer +program. This makes perfect sense, because regular expressions are +essentially programs written in a little computer language that specifies +patterns. + +=head2 Using regular expressions in Perl + +The last topic of Part 1 briefly covers how regexps are used in Perl +programs. Where do they fit into Perl syntax? + +We have already introduced the matching operator in its default +C</regexp/> and arbitrary delimiter C<m!regexp!> forms. We have used +the binding operator C<=~> and its negation C<!~> to test for string +matches. Associated with the matching operator, we have discussed the +single line C<//s>, multi-line C<//m>, case-insensitive C<//i> and +extended C<//x> modifiers. + +There are a few more things you might want to know about matching +operators.
First, we pointed out earlier that variables in regexps are +substituted before the regexp is evaluated: + + $pattern = 'Seuss'; + while (<>) { + print if /$pattern/; + } + +This will print any lines containing the word C<Seuss>. It is not as +efficient as it could be, however, because perl has to re-evaluate +C<$pattern> each time through the loop. If C<$pattern> won't be +changing over the lifetime of the script, we can add the C<//o> +modifier, which directs perl to only perform variable substitutions +once: + + #!/usr/bin/perl + # Improved simple_grep + $regexp = shift; + while (<>) { + print if /$regexp/o; # a good deal faster + } + +If you change C<$pattern> after the first substitution happens, perl +will ignore it. If you don't want any substitutions at all, use the +special delimiter C<m''>: + + $pattern = 'Seuss'; + while (<>) { + print if m'$pattern'; # matches '$pattern', not 'Seuss' + } + +C<m''> acts like single quotes on a regexp; all other C<m> delimiters +act like double quotes. If the regexp evaluates to the empty string, +the regexp in the I<last successful match> is used instead. So we have + + "dog" =~ /d/; # 'd' matches + "dogbert" =~ //; # this matches the 'd' regexp used before + +The final two modifiers C<//g> and C<//c> concern multiple matches. +The modifier C<//g> stands for global matching and allows the +matching operator to match within a string as many times as possible. +In scalar context, successive invocations against a string will have +C<//g> jump from match to match, keeping track of position in the +string as it goes along. You can get or set the position with the +C<pos()> function. + +The use of C<//g> is shown in the following example. Suppose we have +a string that consists of words separated by spaces. 
If we know how +many words there are in advance, we could extract the words using +groupings: + + $x = "cat dog house"; # 3 words + $x =~ /^\s*(\w+)\s+(\w+)\s+(\w+)\s*$/; # matches, + # $1 = 'cat' + # $2 = 'dog' + # $3 = 'house' + +But what if we had an indeterminate number of words? This is the sort +of task C<//g> was made for. To extract all words, form the simple +regexp C<(\w+)> and loop over all matches with C</(\w+)/g>: + + while ($x =~ /(\w+)/g) { + print "Word is $1, ends at position ", pos $x, "\n"; + } + +prints + + Word is cat, ends at position 3 + Word is dog, ends at position 7 + Word is house, ends at position 13 + +A failed match or changing the target string resets the position. If +you don't want the position reset after failure to match, add the +C<//c>, as in C</regexp/gc>. The current position in the string is +associated with the string, not the regexp. This means that different +strings have different positions and their respective positions can be +set or read independently. + +In list context, C<//g> returns a list of matched groupings, or if +there are no groupings, a list of matches to the whole regexp. So if +we wanted just the words, we could use + + @words = ($x =~ /(\w+)/g); # matches, + # $words[0] = 'cat' + # $words[1] = 'dog' + # $words[2] = 'house' + +Closely associated with the C<//g> modifier is the C<\G> anchor. The +C<\G> anchor matches at the point where the previous C<//g> match left +off. C<\G> allows us to easily do context-sensitive matching: + + $metric = 1; # use metric units + ... + $x = <FILE>; # read in measurement + $x =~ /^([+-]?\d+)\s*/g; # get magnitude + $weight = $1; + if ($metric) { # error checking + print "Units error!" unless $x =~ /\Gkg\./g; + } + else { + print "Units error!" unless $x =~ /\Glbs\./g; + } + $x =~ /\G\s+(widget|sprocket)/g; # continue processing + +The combination of C<//g> and C<\G> allows us to process the string a +bit at a time and use arbitrary Perl logic to decide what to do next. 
+ +C<\G> is also invaluable in processing fixed length records with +regexps. Suppose we have a snippet of coding region DNA, encoded as +base pair letters C<ATCGTTGAAT...> and we want to find all the stop +codons C<TGA>. In a coding region, codons are 3-letter sequences, so +we can think of the DNA snippet as a sequence of 3-letter records. The +naive regexp + + # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC" + $dna = "ATCGTTGAATGCAAATGACATGAC"; + $dna =~ /TGA/; + +doesn't work; it may match an C<TGA>, but there is no guarantee that +the match is aligned with codon boundaries, e.g., the substring +S<C<GTT GAA> > gives a match. A better solution is + + while ($dna =~ /(\w\w\w)*?TGA/g) { # note the minimal *? + print "Got a TGA stop codon at position ", pos $dna, "\n"; + } + +which prints + + Got a TGA stop codon at position 18 + Got a TGA stop codon at position 23 + +Position 18 is good, but position 23 is bogus. What happened? + +The answer is that our regexp works well until we get past the last +real match. Then the regexp will fail to match a synchronized C<TGA> +and start stepping ahead one character position at a time, not what we +want. The solution is to use C<\G> to anchor the match to the codon +alignment: + + while ($dna =~ /\G(\w\w\w)*?TGA/g) { + print "Got a TGA stop codon at position ", pos $dna, "\n"; + } + +This prints + + Got a TGA stop codon at position 18 + +which is the correct answer. This example illustrates that it is +important not only to match what is desired, but to reject what is not +desired. + +B<search and replace> + +Regular expressions also play a big role in B<search and replace> +operations in Perl. Search and replace is accomplished with the +C<s///> operator. The general form is +C<s/regexp/replacement/modifiers>, with everything we know about +regexps and modifiers applying in this case as well. The +C<replacement> is a Perl double quoted string that replaces in the +string whatever is matched with the C<regexp>. 
The operator C<=~> is +also used here to associate a string with C<s///>. If matching +against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match, +C<s///> returns the number of substitutions made, otherwise it returns +false. Here are a few examples: + + $x = "Time to feed the cat!"; + $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!" + if ($x =~ s/^(Time.*hacker)!$/$1 now!/) { + $more_insistent = 1; + } + $y = "'quoted words'"; + $y =~ s/^'(.*)'$/$1/; # strip single quotes, + # $y contains "quoted words" + +In the last example, the whole string was matched, but only the part +inside the single quotes was grouped. With the C<s///> operator, the +matched variables C<$1>, C<$2>, etc. are immediately available for use +in the replacement expression, so we use C<$1> to replace the quoted +string with just what was quoted. With the global modifier, C<s///g> +will search and replace all occurrences of the regexp in the string: + + $x = "I batted 4 for 4"; + $x =~ s/4/four/; # doesn't do it all: + # $x contains "I batted four for 4" + $x = "I batted 4 for 4"; + $x =~ s/4/four/g; # does it all: + # $x contains "I batted four for four" + +If you prefer 'regex' over 'regexp' in this tutorial, you could use +the following program to replace it: + + % cat > simple_replace + #!/usr/bin/perl + $regexp = shift; + $replacement = shift; + while (<>) { + s/$regexp/$replacement/go; + print; + } + ^D + + % simple_replace regexp regex perlretut.pod + +In C<simple_replace> we used the C<s///g> modifier to replace all +occurrences of the regexp on each line and the C<s///o> modifier to +compile the regexp only once. As with C<simple_grep>, both the +C<print> and the C<s/$regexp/$replacement/go> use C<$_> implicitly. + +A modifier available specifically to search and replace is the +C<s///e> evaluation modifier. C<s///e> wraps an C<eval{...}> around +the replacement string and the evaluated result is substituted for the +matched substring. 
C<s///e> is useful if you need to do a bit of +computation in the process of replacing text. This example counts +character frequencies in a line: + + $x = "Bill the cat"; + $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself + print "frequency of '$_' is $chars{$_}\n" + foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars); + +This prints + + frequency of ' ' is 2 + frequency of 't' is 2 + frequency of 'l' is 2 + frequency of 'B' is 1 + frequency of 'c' is 1 + frequency of 'e' is 1 + frequency of 'h' is 1 + frequency of 'i' is 1 + frequency of 'a' is 1 + +As with the match C<m//> operator, C<s///> can use other delimiters, +such as C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are +used C<s'''>, then the regexp and replacement are treated as single +quoted strings and there are no substitutions. C<s///> in list context +returns the same thing as in scalar context, i.e., the number of +matches. + +B<The split operator> + +The B<C<split> > function can also optionally use a matching operator +C<m//> to split a string. C<split /regexp/, string, limit> splits +C<string> into a list of substrings and returns that list. The regexp +is used to match the character sequence that the C<string> is split +with respect to. The C<limit>, if present, constrains splitting into +no more than C<limit> number of strings. For example, to split a +string into words, use + + $x = "Calvin and Hobbes"; + @words = split /\s+/, $x; # $words[0] = 'Calvin' + # $words[1] = 'and' + # $words[2] = 'Hobbes' + +If the empty regexp C<//> is used, the regexp always matches and +the string is split into individual characters. If the regexp has +groupings, then the list produced contains the matched substrings from the +groupings as well. 
For instance, + + $x = "/usr/bin/perl"; + @dirs = split m!/!, $x; # $dirs[0] = '' + # $dirs[1] = 'usr' + # $dirs[2] = 'bin' + # $dirs[3] = 'perl' + @parts = split m!(/)!, $x; # $parts[0] = '' + # $parts[1] = '/' + # $parts[2] = 'usr' + # $parts[3] = '/' + # $parts[4] = 'bin' + # $parts[5] = '/' + # $parts[6] = 'perl' + +Since the first character of $x matched the regexp, C<split> prepended +an empty initial element to the list. + +If you have read this far, congratulations! You now have all the basic +tools needed to use regular expressions to solve a wide range of text +processing problems. If this is your first time through the tutorial, +why not stop here and play around with regexps a while... S<Part 2> +concerns the more esoteric aspects of regular expressions and those +concepts certainly aren't needed right at the start. + +=head1 Part 2: Power tools + +OK, you know the basics of regexps and you want to know more. If +matching regular expressions is analogous to a walk in the woods, then +the tools discussed in Part 1 are analogous to topo maps and a +compass, basic tools we use all the time. Most of the tools in part 2 +are analogous to flare guns and satellite phones. They aren't used +too often on a hike, but when we are stuck, they can be invaluable. + +What follows are the more advanced, less used, or sometimes esoteric +capabilities of perl regexps. In Part 2, we will assume you are +comfortable with the basics and concentrate on the new features. + +=head2 More on characters, strings, and character classes + +There are a number of escape sequences and character classes that we +haven't covered yet. + +There are several escape sequences that convert characters or strings +between upper and lower case. 
C<\l> and C<\u> convert the next +character to lower or upper case, respectively: + + $x = "perl"; + $string =~ /\u$x/; # matches 'Perl' in $string + $x = "M(rs?|s)\\."; # note the double backslash + $string =~ /\l$x/; # matches 'mr.', 'mrs.', and 'ms.', + +C<\L> and C<\U> convert a whole substring, delimited by C<\L> or +C<\U> and C<\E>, to lower or upper case: + + $x = "This word is in lower case:\L SHOUT\E"; + $x =~ /shout/; # matches + $x = "I STILL KEYPUNCH CARDS FOR MY 360"; + $x =~ /\Ukeypunch/; # matches punch card string + +If there is no C<\E>, case is converted until the end of the +string. The regexps C<\L\u$word> or C<\u\L$word> convert the first +character of C<$word> to uppercase and the rest of the characters to +lowercase. + +Control characters can be escaped with C<\c>, so that a control-Z +character would be matched with C<\cZ>. The escape sequence +C<\Q>...C<\E> quotes, or protects most non-alphabetic characters. For +instance, + + $x = "\QThat !^*&%~& cat!"; + $x =~ /\Q!^*&%~&\E/; # check for rough language + +It does not protect C<$> or C<@>, so that variables can still be +substituted. + +With the advent of 5.6.0, perl regexps can handle more than just the +standard ASCII character set. Perl now supports B<Unicode>, a standard +for encoding the character sets from many of the world's written +languages. Unicode does this by allowing characters to be more than +one byte wide. Perl uses the UTF-8 encoding, in which ASCII characters +are still encoded as one byte, but characters greater than C<chr(127)> +may be stored as two or more bytes. + +What does this mean for regexps? Well, regexp users don't need to know +much about perl's internal representation of strings. But they do need +to know 1) how to represent Unicode characters in a regexp and 2) when +a matching operation will treat the string to be searched as a +sequence of bytes (the old way) or as a sequence of Unicode characters +(the new way). 
The answer to 1) is that Unicode characters greater +than C<chr(127)> may be represented using the C<\x{hex}> notation, +with C<hex> a hexadecimal integer: + + use utf8; # We will be doing Unicode processing + /\x{263a}/; # match a Unicode smiley face :) + +Unicode characters in the range of 128-255 use two hexadecimal digits +with braces: C<\x{ab}>. Note that this is different than C<\xab>, +which is just a hexadecimal byte with no Unicode +significance. + +Figuring out the hexadecimal sequence of a Unicode character you want +or deciphering someone else's hexadecimal Unicode regexp is about as +much fun as programming in machine code. So another way to specify +Unicode characters is to use the S<B<named character> > escape +sequence C<\N{name}>. C<name> is a name for the Unicode character, as +specified in the Unicode standard. For instance, if we wanted to +represent or match the astrological sign for the planet Mercury, we +could use + + use utf8; # We will be doing Unicode processing + use charnames ":full"; # use named chars with Unicode full names + $x = "abc\N{MERCURY}def"; + $x =~ /\N{MERCURY}/; # matches + +One can also use short names or restrict names to a certain alphabet: + + use utf8; # We will be doing Unicode processing + + use charnames ':full'; + print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; + + use charnames ":short"; + print "\N{greek:Sigma} is an upper-case sigma.\n"; + + use charnames qw(greek); + print "\N{sigma} is Greek sigma\n"; + +A list of full names is found in the file Names.txt in the +lib/perl5/5.6.0/unicode directory. + +The answer to requirement 2), as of 5.6.0, is that if a regexp +contains Unicode characters, the string is searched as a sequence of +Unicode characters. Otherwise, the string is searched as a sequence of +bytes. If the string is being searched as a sequence of Unicode +characters, but matching a single byte is required, we can use the C<\C> +escape sequence. 
C<\C> is a character class akin to C<.> except that +it matches I<any> byte 0-255. So + + use utf8; # We will be doing Unicode processing + use charnames ":full"; # use named chars with Unicode full names + $x = "a"; + $x =~ /\C/; # matches 'a', eats one byte + $x = ""; + $x =~ /\C/; # doesn't match, no bytes to match + $x = "\N{MERCURY}"; # two-byte Unicode character + $x =~ /\C/; # matches, but dangerous! + +The last regexp matches, but is dangerous because the string +I<character> position is no longer synchronized to the string I<byte> +position. This generates the warning 'Malformed UTF-8 +character'. C<\C> is best used for matching the binary data in strings +with binary data intermixed with Unicode characters. + +Let us now discuss the rest of the character classes. Just as with +Unicode characters, there are named Unicode character classes +represented by the C<\p{name}> escape sequence. Closely associated is +the C<\P{name}> character class, which is the negation of the +C<\p{name}> class. 
For example, to match lower and uppercase +characters, + + use utf8; # We will be doing Unicode processing + use charnames ":full"; # use named chars with Unicode full names + $x = "BOB"; + $x =~ /^\p{IsUpper}/; # matches, uppercase char class + $x =~ /^\P{IsUpper}/; # doesn't match, char class sans uppercase + $x =~ /^\p{IsLower}/; # doesn't match, lowercase char class + $x =~ /^\P{IsLower}/; # matches, char class sans lowercase + +Here is the association between some Perl named classes and the +traditional Unicode classes: + + Perl class name Unicode class name or regular expression + + IsAlpha /^[LM]/ + IsAlnum /^[LMN]/ + IsASCII $code <= 127 + IsCntrl /^C/ + IsBlank $code =~ /^(0020|0009)$/ || /^Z[^lp]/ + IsDigit Nd + IsGraph /^([LMNPS]|Co)/ + IsLower Ll + IsPrint /^([LMNPS]|Co|Zs)/ + IsPunct /^P/ + IsSpace /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/) + IsSpacePerl /^Z/ || ($code =~ /^(0009|000A|000C|000D)$/) + IsUpper /^L[ut]/ + IsWord /^[LMN]/ || $code eq "005F" + IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/ + +You can also use the official Unicode class names with the C<\p> and +C<\P>, like C<\p{L}> for Unicode 'letters', or C<\p{Lu}> for uppercase +letters, or C<\P{Nd}> for non-digits. If a C<name> is just one +letter, the braces can be dropped. For instance, C<\pM> is the +character class of Unicode 'marks'. + +C<\X> is an abbreviation for a character class sequence that includes +the Unicode 'combining character sequences'. A 'combining character +sequence' is a base character followed by any number of combining +characters. An example of a combining character is an accent. Using +the Unicode full names, e.g., S<C<A + COMBINING RING> > is a combining +character sequence with base character C<A> and combining character +S<C<COMBINING RING> >, which translates in Danish to A with the circle +atop it, as in the word Angstrom. C<\X> is equivalent to C<\PM\pM*>, +i.e., a non-mark followed by zero or more marks. 
+ +As if all those classes weren't enough, Perl also defines POSIX style +character classes. These have the form C<[:name:]>, with C<name> the +name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>, +C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>, +C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl +extension to match C<\w>), and C<blank> (a GNU extension). If C<utf8> +is being used, then these classes are defined the same as their +corresponding perl Unicode classes: C<[:upper:]> is the same as +C<\p{IsUpper}>, etc. The POSIX character classes, however, don't +require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and +C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s> +character classes. To negate a POSIX class, put a C<^> in front of +the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under +C<utf8>, C<\P{IsDigit}>. The Unicode and POSIX character classes can +be used just like C<\d>, both inside and outside of character classes: + + /\s+[abc[:digit:]xyz]\s*/; # match a,b,c,x,y,z, or a digit + /^=item\s[:digit:]/; # match '=item', + # followed by a space and a digit + use utf8; + use charnames ":full"; + /\s+[abc\p{IsDigit}xyz]\s+/; # match a,b,c,x,y,z, or a digit + /^=item\s\p{IsDigit}/; # match '=item', + # followed by a space and a digit + +Whew! That is all the rest of the characters and character classes. + +=head2 Compiling and saving regular expressions + +In Part 1 we discussed the C<//o> modifier, which compiles a regexp +just once. This suggests that a compiled regexp is some data structure +that can be stored once and used again and again. 
The regexp quote +C<qr//> does exactly that: C<qr/string/> compiles the C<string> as a +regexp and transforms the result into a form that can be assigned to a +variable: + + $reg = qr/foo+bar?/; # reg contains a compiled regexp + +Then C<$reg> can be used as a regexp: + + $x = "fooooba"; + $x =~ $reg; # matches, just like /foo+bar?/ + $x =~ /$reg/; # same thing, alternate form + +C<$reg> can also be interpolated into a larger regexp: + + $x =~ /(abc)?$reg/; # still matches + +As with the matching operator, the regexp quote can use different +delimiters, e.g., C<qr!!>, C<qr{}> and C<qr~~>. The single quote +delimiters C<qr''> prevent any interpolation from taking place. + +Pre-compiled regexps are useful for creating dynamic matches that +don't need to be recompiled each time they are encountered. Using +pre-compiled regexps, C<simple_grep> program can be expanded into a +program that matches multiple patterns: + + % cat > multi_grep + #!/usr/bin/perl + # multi_grep - match any of <number> regexps + # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ... + + $number = shift; + $regexp[$_] = shift foreach (0..$number-1); + @compiled = map qr/$_/, @regexp; + while ($line = <>) { + foreach $pattern (@compiled) { + if ($line =~ /$pattern/) { + print $line; + last; # we matched, so move onto the next line + } + } + } + ^D + + % multi_grep 2 last for multi_grep + $regexp[$_] = shift foreach (0..$number-1); + foreach $pattern (@compiled) { + last; + +Storing pre-compiled regexps in an array C<@compiled> allows us to +simply loop through the regexps without any recompilation, thus gaining +flexibility without sacrificing speed. + +=head2 Embedding comments and modifiers in a regular expression + +Starting with this section, we will be discussing Perl's set of +B<extended patterns>. These are extensions to the traditional regular +expression syntax that provide powerful new tools for pattern +matching. 
We have already seen extensions in the form of the minimal +matching constructs C<??>, C<*?>, C<+?>, C<{n,m}?>, and C<{n,}?>. The +rest of the extensions below have the form C<(?char...)>, where the +C<char> is a character that determines the type of extension. + +The first extension is an embedded comment C<(?#text)>. This embeds a +comment into the regular expression without affecting its meaning. The +comment should not have any closing parentheses in the text. An +example is + + /(?# Match an integer:)[+-]?\d+/; + +This style of commenting has been largely superseded by the raw, +freeform commenting that is allowed with the C<//x> modifier. + +The modifiers C<//i>, C<//m>, C<//s>, and C<//x> can also be embedded in +a regexp using C<(?i)>, C<(?m)>, C<(?s)>, and C<(?x)>. For instance, + + /(?i)yes/; # match 'yes' case insensitively + /yes/i; # same thing + /(?x)( # freeform version of an integer regexp + [+-]? # match an optional sign + \d+ # match a sequence of digits + ) + /x; + +Embedded modifiers can have two important advantages over the usual +modifiers. Embedded modifiers allow a custom set of modifiers to +I<each> regexp pattern. This is great for matching an array of regexps +that must have different modifiers: + + $pattern[0] = '(?i)doctor'; + $pattern[1] = 'Johnson'; + ... + while (<>) { + foreach $patt (@pattern) { + print if /$patt/; + } + } + +The second advantage is that embedded modifiers only affect the regexp +inside the group the embedded modifier is contained in. So grouping +can be used to localize the modifier's effects: + + /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc. + +Embedded modifiers can also turn off any modifiers already present +by using, e.g., C<(?-i)>. Modifiers can also be combined into +a single expression, e.g., C<(?s-i)> turns on single line mode and +turns off case insensitivity. 
+ +=head2 Non-capturing groupings + +We noted in Part 1 that groupings C<()> had two distinct functions: 1) +group regexp elements together as a single unit, and 2) extract, or +capture, substrings that matched the regexp in the +grouping. Non-capturing groupings, denoted by C<(?:regexp)>, allow the +regexp to be treated as a single unit, but don't extract substrings or +set matching variables C<$1>, etc. Both capturing and non-capturing +groupings are allowed to co-exist in the same regexp. Because there is +no extraction, non-capturing groupings are faster than capturing +groupings. Non-capturing groupings are also handy for choosing exactly +which parts of a regexp are to be extracted to matching variables: + + # match a number, $1-$4 are set, but we only want $1 + /([+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)/; + + # match a number faster , only $1 is set + /([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)/; + + # match a number, get $1 = whole number, $2 = exponent + /([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE]([+-]?\d+))?)/; + +Non-capturing groupings are also useful for removing nuisance +elements gathered from a split operation: + + $x = '12a34b5'; + @num = split /(a|b)/, $x; # @num = ('12','a','34','b','5') + @num = split /(?:a|b)/, $x; # @num = ('12','34','5') + +Non-capturing groupings may also have embedded modifiers: +C<(?i-m:regexp)> is a non-capturing grouping that matches C<regexp> +case insensitively and turns off multi-line mode. + +=head2 Looking ahead and looking behind + +This section concerns the lookahead and lookbehind assertions. First, +a little background. + +In Perl regular expressions, most regexp elements 'eat up' a certain +amount of string when they match. For instance, the regexp element +C<[abc]> eats up one character of the string when it matches, in the +sense that perl moves to the next character position in the string +after the match. 
There are some elements, however, that don't eat up +characters (advance the character position) if they match. The examples +we have seen so far are the anchors. The anchor C<^> matches the +beginning of the line, but doesn't eat any characters. Similarly, the +word boundary anchor C<\b> matches, e.g., if the character to the left +is a word character and the character to the right is a non-word +character, but it doesn't eat up any characters itself. Anchors are +examples of 'zero-width assertions'. Zero-width, because they consume +no characters, and assertions, because they test some property of the +string. In the context of our walk in the woods analogy to regexp +matching, most regexp elements move us along a trail, but anchors have +us stop a moment and check our surroundings. If the local environment +checks out, we can proceed forward. But if the local environment +doesn't satisfy us, we must backtrack. + +Checking the environment entails either looking ahead on the trail, +looking behind, or both. C<^> looks behind, to see that there are no +characters before. C<$> looks ahead, to see that there are no +characters after. C<\b> looks both ahead and behind, to see if the +characters on either side differ in their 'word'-ness. + +The lookahead and lookbehind assertions are generalizations of the +anchor concept. Lookahead and lookbehind are zero-width assertions +that let us specify which characters we want to test for. The +lookahead assertion is denoted by C<(?=regexp)> and the lookbehind +assertion is denoted by C<< (?<=fixed-regexp) >>. 
Some examples are + + $x = "I catch the housecat 'Tom-cat' with catnip"; + $x =~ /cat(?=\s+)/; # matches 'cat' in 'housecat' + @catwords = ($x =~ /(?<=\s)cat\w+/g); # matches, + # $catwords[0] = 'catch' + # $catwords[1] = 'catnip' + $x =~ /\bcat\b/; # matches 'cat' in 'Tom-cat' + $x =~ /(?<=\s)cat(?=\s)/; # doesn't match; no isolated 'cat' in + # middle of $x + +Note that the parentheses in C<(?=regexp)> and C<< (?<=regexp) >> are +non-capturing, since these are zero-width assertions. Thus in the +second regexp, the substrings captured are those of the whole regexp +itself. Lookahead C<(?=regexp)> can match arbitrary regexps, but +lookbehind C<< (?<=fixed-regexp) >> only works for regexps of fixed +width, i.e., a fixed number of characters long. Thus +C<< (?<=(ab|bc)) >> is fine, but C<< (?<=(ab)*) >> is not. The +negated versions of the lookahead and lookbehind assertions are +denoted by C<(?!regexp)> and C<< (?<!fixed-regexp) >> respectively. +They evaluate true if the regexps do I<not> match: + + $x = "foobar"; + $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo' + $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo' + $x =~ /(?<!\s)foo/; # matches, there is no \s before 'foo' + +=head2 Using independent subexpressions to prevent backtracking + +The last few extended patterns in this tutorial are experimental as of +5.6.0. Play with them, use them in some code, but don't rely on them +just yet for production code. + +S<B<Independent subexpressions> > are regular expressions, in the +context of a larger regular expression, that function independently of +the larger regular expression. That is, they consume as much or as +little of the string as they wish without regard for the ability of +the larger regexp to match. Independent subexpressions are represented +by C<< (?>regexp) >>. 
We can illustrate their behavior by first +considering an ordinary regexp: + + $x = "ab"; + $x =~ /a*ab/; # matches + +This obviously matches, but in the process of matching, the +subexpression C<a*> first grabbed the C<a>. Doing so, however, +wouldn't allow the whole regexp to match, so after backtracking, C<a*> +eventually gave back the C<a> and matched the empty string. Here, what +C<a*> matched was I<dependent> on what the rest of the regexp matched. + +Contrast that with an independent subexpression: + + $x =~ /(?>a*)ab/; # doesn't match! + +The independent subexpression C<< (?>a*) >> doesn't care about the rest +of the regexp, so it sees an C<a> and grabs it. Then the rest of the +regexp C<ab> cannot match. Because C<< (?>a*) >> is independent, there +is no backtracking and the independent subexpression does not give +up its C<a>. Thus the match of the regexp as a whole fails. A similar +behavior occurs with completely independent regexps: + + $x = "ab"; + $x =~ /a*/g; # matches, eats an 'a' + $x =~ /\Gab/g; # doesn't match, no 'a' available + +Here C<//g> and C<\G> create a 'tag team' handoff of the string from +one regexp to the other. Regexps with an independent subexpression are +much like this, with a handoff of the string to the independent +subexpression, and a handoff of the string back to the enclosing +regexp. + +The ability of an independent subexpression to prevent backtracking +can be quite useful. Suppose we want to match a non-empty string +enclosed in parentheses up to two levels deep. Then the following +regexp matches: + + $x = "abc(de(fg)h"; # unbalanced parentheses + $x =~ /\( ( [^()]+ | \([^()]*\) )+ \)/x; + +The regexp matches an open parenthesis, one or more copies of an +alternation, and a close parenthesis. The alternation is two-way, with +the first alternative C<[^()]+> matching a substring with no +parentheses and the second alternative C<\([^()]*\)> matching a +substring delimited by parentheses. 
The problem with this regexp is +that it is pathological: it has nested indeterminate quantifiers + of the form C<(a+|b)+>. We discussed in Part 1 how nested quantifiers +like this could take an exponentially long time to execute if there +was no match possible. To prevent the exponential blowup, we need to +prevent useless backtracking at some point. This can be done by +enclosing the inner quantifier as an independent subexpression: + + $x =~ /\( ( (?>[^()]+) | \([^()]*\) )+ \)/x; + +Here, C<< (?>[^()]+) >> breaks the degeneracy of string partitioning +by gobbling up as much of the string as possible and keeping it. Then +match failures fail much more quickly. + +=head2 Conditional expressions + +A S<B<conditional expression> > is a form of if-then-else statement +that allows one to choose which patterns are to be matched, based on +some condition. There are two types of conditional expression: +C<(?(condition)yes-regexp)> and +C<(?(condition)yes-regexp|no-regexp)>. C<(?(condition)yes-regexp)> is +like an S<C<'if () {}'> > statement in Perl. If the C<condition> is true, +the C<yes-regexp> will be matched. If the C<condition> is false, the +C<yes-regexp> will be skipped and perl will move onto the next regexp +element. The second form is like an S<C<'if () {} else {}'> > statement +in Perl. If the C<condition> is true, the C<yes-regexp> will be +matched, otherwise the C<no-regexp> will be matched. + +The C<condition> can have two forms. The first form is simply an +integer in parentheses C<(integer)>. It is true if the corresponding +backreference C<\integer> matched earlier in the regexp. The second +form is a bare zero width assertion C<(?...)>, either a +lookahead, a lookbehind, or a code assertion (discussed in the next +section). + +The integer form of the C<condition> allows us to choose, with more +flexibility, what to match based on what matched earlier in the +regexp. 
This searches for words of the form C<"$x$x"> or +C<"$x$y$y$x">: + + % simple_grep '^(\w+)(\w+)?(?(2)\2\1|\1)$' /usr/dict/words + beriberi + coco + couscous + deed + ... + toot + toto + tutu + +The lookbehind C<condition> allows, along with backreferences, +an earlier part of the match to influence a later part of the +match. For instance, + + /[ATGC]+(?(?<=AA)G|C)$/; + +matches a DNA sequence such that it either ends in C<AAG>, or some +other base pair combination and C<C>. Note that the form is +C<< (?(?<=AA)G|C) >> and not C<< (?((?<=AA))G|C) >>; for the +lookahead, lookbehind or code assertions, the parentheses around the +conditional are not needed. + +=head2 A bit of magic: executing Perl code in a regular expression + +Normally, regexps are a part of Perl expressions. +S<B<Code evaluation> > expressions turn that around by allowing +arbitrary Perl code to be a part of a regexp. A code evaluation +expression is denoted C<(?{code})>, with C<code> a string of Perl +statements. + +Code expressions are zero-width assertions, and the value they return +depends on their environment. There are two possibilities: either the +code expression is used as a conditional in a conditional expression +C<(?(condition)...)>, or it is not. If the code expression is a +conditional, the code is evaluated and the result (i.e., the result of +the last statement) is used to determine truth or falsehood. If the +code expression is not used as a conditional, the assertion always +evaluates true and the result is put into the special variable +C<$^R>. The variable C<$^R> can then be used in code expressions later +in the regexp. Here are some silly examples: + + $x = "abcdef"; + $x =~ /abc(?{print "Hi Mom!";})def/; # matches, + # prints 'Hi Mom!' + $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match, + # no 'Hi Mom!' + +Pay careful attention to the next example: + + $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match, + # no 'Hi Mom!' + # but why not? 
+ +At first glance, you'd think that it shouldn't print, because obviously +the C<ddd> isn't going to match the target string. But look at this +example: + + $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match, + # but _does_ print + +Hmm. What happened here? If you've been following along, you know that +the above pattern should be effectively the same as the last one -- +enclosing the d in a character class isn't going to change what it +matches. So why does the first not print while the second one does? + +The answer lies in the optimizations the REx engine makes. In the first +case, all the engine sees are plain old characters (aside from the +C<?{}> construct). It's smart enough to realize that the string 'ddd' +doesn't occur in our target string before actually running the pattern +through. But in the second case, we've tricked it into thinking that our +pattern is more complicated than it is. It takes a look, sees our +character class, and decides that it will have to actually run the +pattern to determine whether or not it matches, and in the process of +running it hits the print statement before it discovers that we don't +have a match. + +To take a closer look at how the engine does optimizations, see the +section L<"Pragmas and debugging"> below. + +More fun with C<?{}>: + + $x =~ /(?{print "Hi Mom!";})/; # matches, + # prints 'Hi Mom!' + $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches, + # prints '1' + $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches, + # prints '1' + +The bit of magic mentioned in the section title occurs when the regexp +backtracks in the process of searching for a match. If the regexp +backtracks over a code expression and if the variables used within are +localized using C<local>, the changes in the variables produced by the +code expression are undone! 
Thus, if we wanted to count how many times +a character got matched inside a group, we could use, e.g., + + $x = "aaaa"; + $count = 0; # initialize 'a' count + $c = "bob"; # test if $c gets clobbered + $x =~ /(?{local $c = 0;}) # initialize count + ( a # match 'a' + (?{local $c = $c + 1;}) # increment count + )* # do this any number of times, + aa # but match 'aa' at the end + (?{$count = $c;}) # copy local $c var into $count + /x; + print "'a' count is $count, \$c variable is '$c'\n"; + +This prints + + 'a' count is 2, $c variable is 'bob' + +If we replace the S<C< (?{local $c = $c + 1;})> > with +S<C< (?{$c = $c + 1;})> >, the variable changes are I<not> undone +during backtracking, and we get + + 'a' count is 4, $c variable is 'bob' + +Note that only localized variable changes are undone. Other side +effects of code expression execution are permanent. Thus + + $x = "aaaa"; + $x =~ /(a(?{print "Yow\n";}))*aa/; + +produces + + Yow + Yow + Yow + Yow + +The result C<$^R> is automatically localized, so that it will behave +properly in the presence of backtracking. + +This example uses a code expression in a conditional to match the +article 'the' in either English or German: + + $lang = 'DE'; # use German + ... + $text = "das"; + print "matched\n" + if $text =~ /(?(?{ + $lang eq 'EN'; # is the language English? + }) + the | # if so, then match 'the' + (die|das|der) # else, match 'die|das|der' + ) + /xi; + +Note that the syntax here is C<(?(?{...})yes-regexp|no-regexp)>, not +C<(?((?{...}))yes-regexp|no-regexp)>. In other words, in the case of a +code expression, we don't need the extra parentheses around the +conditional. + +If you try to use code expressions with interpolating variables, perl +may surprise you: + + $bar = 5; + $pat = '(?{ 1 })'; + /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated + /foo(?{ 1 })$bar/; # compile error! + /foo${pat}bar/; # compile error! 
+ + $pat = qr/(?{ $foo = 1 })/; # precompile code regexp + /foo${pat}bar/; # compiles ok + +If a regexp has (1) code expressions and interpolating variables, or +(2) a variable that interpolates a code expression, perl treats the +regexp as an error. If the code expression is precompiled into a +variable, however, interpolating is ok. The question is, why is this +an error? + +The reason is that variable interpolation and code expressions +together pose a security risk. The combination is dangerous because +many programmers who write search engines often take user input and +plug it directly into a regexp: + + $regexp = <>; # read user-supplied regexp + chomp $regexp; # get rid of possible newline + $text =~ /$regexp/; # search $text for the $regexp + +If the C<$regexp> variable contains a code expression, the user could +then execute arbitrary Perl code. For instance, some joker could +search for S<C<system('rm -rf *');> > to erase your files. In this +sense, the combination of interpolation and code expressions B<taints> +your regexp. So by default, using both interpolation and code +expressions in the same regexp is not allowed. If you're not +concerned about malicious users, it is possible to bypass this +security check by invoking S<C<use re 'eval'> >: + + use re 'eval'; # throw caution out the door + $bar = 5; + $pat = '(?{ 1 })'; + /foo(?{ 1 })$bar/; # compiles ok + /foo${pat}bar/; # compiles ok + +Another form of code expression is the S<B<pattern code expression> >. +The pattern code expression is like a regular code expression, except +that the result of the code evaluation is treated as a regular +expression and matched immediately. A simple example is + + $length = 5; + $char = 'a'; + $x = 'aaaaabb'; + $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a' + + +This final example contains both ordinary and pattern code +expressions. It detects if a binary string C<1101010010001...> has a +Fibonacci spacing 0,1,1,2,3,5,... 
of the C<1>'s: + + $s0 = 0; $s1 = 1; # initial conditions + $x = "1101010010001000001"; + print "It is a Fibonacci sequence\n" + if $x =~ /^1 # match an initial '1' + ( + (??{'0' x $s0}) # match $s0 of '0' + 1 # and then a '1' + (?{ + $largest = $s0; # largest seq so far + $s2 = $s1 + $s0; # compute next term + $s0 = $s1; # in Fibonacci sequence + $s1 = $s2; + }) + )+ # repeat as needed + $ # that is all there is + /x; + print "Largest sequence matched was $largest\n"; + +This prints + + It is a Fibonacci sequence + Largest sequence matched was 5 + +Ha! Try that with your garden variety regexp package... + +Note that the variables C<$s0> and C<$s1> are not substituted when the +regexp is compiled, as happens for ordinary variables outside a code +expression. Rather, the code expressions are evaluated when perl +encounters them during the search for a match. + +The regexp without the C<//x> modifier is + + /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/; + +and is a great start on an Obfuscated Perl entry :-) When working with +code and conditional expressions, the extended form of regexps is +almost necessary in creating and debugging regexps. + +=head2 Pragmas and debugging + +Speaking of debugging, there are several pragmas available to control +and debug regexps in Perl. We have already encountered one pragma in +the previous section, S<C<use re 'eval';> >, that allows variable +interpolation and code expressions to coexist in a regexp. The other +pragmas are + + use re 'taint'; + $tainted = <>; + @parts = ($tainted =~ /(\w+)\s+(\w+)/); # @parts is now tainted + +The C<taint> pragma causes any substrings from a match with a tainted +variable to be tainted as well. This is not normally the case, as +regexps are often used to extract the safe bits from a tainted +variable. Use C<taint> when you are not extracting safe bits, but are +performing some other processing. 
Both C<taint> and C<eval> pragmas +are lexically scoped, which means they are in effect only until +the end of the block enclosing the pragmas. + + use re 'debug'; + /^(.*)$/s; # output debugging info + + use re 'debugcolor'; + /^(.*)$/s; # output debugging info in living color + +The global C<debug> and C<debugcolor> pragmas allow one to get +detailed debugging info about regexp compilation and +execution. C<debugcolor> is the same as debug, except the debugging +information is displayed in color on terminals that can display +termcap color sequences. Here is example output: + + % perl -e 'use re "debug"; "abc" =~ /a*b+c/;' + Compiling REx `a*b+c' + size 9 first at 1 + 1: STAR(4) + 2: EXACT <a>(0) + 4: PLUS(7) + 5: EXACT <b>(0) + 7: EXACT <c>(9) + 9: END(0) + floating `bc' at 0..2147483647 (checking floating) minlen 2 + Guessing start of match, REx `a*b+c' against `abc'... + Found floating substr `bc' at offset 1... + Guessed: match at offset 0 + Matching REx `a*b+c' against `abc' + Setting an EVAL scope, savestack=3 + 0 <> <abc> | 1: STAR + EXACT <a> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 1 <a> <bc> | 4: PLUS + EXACT <b> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 2 <ab> <c> | 7: EXACT <c> + 3 <abc> <> | 9: END + Match successful! + Freeing REx: `a*b+c' + +If you have gotten this far into the tutorial, you can probably guess +what the different parts of the debugging output tell you. The first +part + + Compiling REx `a*b+c' + size 9 first at 1 + 1: STAR(4) + 2: EXACT <a>(0) + 4: PLUS(7) + 5: EXACT <b>(0) + 7: EXACT <c>(9) + 9: END(0) + +describes the compilation stage. C<STAR(4)> means that there is a +starred object, in this case C<'a'>, and if it matches, goto line 4, +i.e., C<PLUS(7)>. The middle lines describe some heuristics and +optimizations performed before a match: + + floating `bc' at 0..2147483647 (checking floating) minlen 2 + Guessing start of match, REx `a*b+c' against `abc'... 
+ Found floating substr `bc' at offset 1... + Guessed: match at offset 0 + +Then the match is executed and the remaining lines describe the +process: + + Matching REx `a*b+c' against `abc' + Setting an EVAL scope, savestack=3 + 0 <> <abc> | 1: STAR + EXACT <a> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 1 <a> <bc> | 4: PLUS + EXACT <b> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 2 <ab> <c> | 7: EXACT <c> + 3 <abc> <> | 9: END + Match successful! + Freeing REx: `a*b+c' + +Each step is of the form S<C<< n <x> <y> >> >, with C<< <x> >> the +part of the string matched and C<< <y> >> the part not yet +matched. The S<C<< | 1: STAR >> > says that perl is at line number 1 +in the compilation list above. See +L<perldebguts/"Debugging regular expressions"> for much more detail. + +An alternative method of debugging regexps is to embed C<print> +statements within the regexp. This provides a blow-by-blow account of +the backtracking in an alternation: + + "that this" =~ m@(?{print "Start at position ", pos, "\n";}) + t(?{print "t1\n";}) + h(?{print "h1\n";}) + i(?{print "i1\n";}) + s(?{print "s1\n";}) + | + t(?{print "t2\n";}) + h(?{print "h2\n";}) + a(?{print "a2\n";}) + t(?{print "t2\n";}) + (?{print "Done at position ", pos, "\n";}) + @x; + +prints + + Start at position 0 + t1 + h1 + t2 + h2 + a2 + t2 + Done at position 4 + +=head1 BUGS + +Code expressions, conditional expressions, and independent expressions +are B<experimental>. Don't use them in production code. Yet. + +=head1 SEE ALSO + +This is just a tutorial. For the full story on perl regular +expressions, see the L<perlre> regular expressions reference page. + +For more information on the matching C<m//> and substitution C<s///> +operators, see L<perlop/"Regexp Quote-Like Operators">. For +information on the C<split> operation, see L<perlfunc/split>. 
+ +For an excellent all-around resource on the care and feeding of +regular expressions, see the book I<Mastering Regular Expressions> by +Jeffrey Friedl (published by O'Reilly, ISBN 1-56592-257-3). + +=head1 AUTHOR AND COPYRIGHT + +Copyright (c) 2000 Mark Kvale +All rights reserved. + +This document may be distributed under the same terms as Perl itself. + +=head2 Acknowledgments + +The inspiration for the stop codon DNA example came from the ZIP +code example in chapter 7 of I<Mastering Regular Expressions>. + +The author would like to thank Jeff Pinyan, Andrew Johnson, Peter +Haworth, Ronald J Kimball, and Joe Smith for all their helpful +comments. + +=cut + diff --git a/contrib/perl5/pod/perlutil.pod b/contrib/perl5/pod/perlutil.pod new file mode 100644 index 0000000000000..be7a345f79678 --- /dev/null +++ b/contrib/perl5/pod/perlutil.pod @@ -0,0 +1,185 @@ +=head1 NAME + +perlutil - utilities packaged with the Perl distribution + +=head1 DESCRIPTION + +Along with the Perl interpreter itself, the Perl distribution installs a +range of utilities on your system. There are also several utilities +which are used by the Perl distribution itself as part of the install +process. This document exists to list all of these utilities, explain +what they are for and provide pointers to each module's documentation, +if appropriate. + +=head2 DOCUMENTATION + +=over 3 + +=item L<perldoc|perldoc> + +The main interface to Perl's documentation is C<perldoc>, although +if you're reading this, it's more than likely that you've already found +it. F<perldoc> will extract and format the documentation from any file +in the current directory, any Perl module installed on the system, or +any of the standard documentation pages, such as this one. Use +C<perldoc E<lt>nameE<gt>> to get information on any of the utilities +described in this document. 
+ +=item L<pod2man|pod2man> and L<pod2text|pod2text> + +If it's run from a terminal, F<perldoc> will usually call F<pod2man> to +translate POD (Plain Old Documentation - see L<perlpod> for an +explanation) into a man page, and then run F<man> to display it; if +F<man> isn't available, F<pod2text> will be used instead and the output +piped through your favourite pager. + +=item L<pod2html|pod2html> and L<pod2latex|pod2latex> + +As well as these two, there are two other converters: F<pod2html>, which +produces HTML pages from POD, and F<pod2latex>, which produces LaTeX +files. + +=item L<pod2usage|pod2usage> + +If you just want to know how to use the utilities described here, +F<pod2usage> will just extract the "USAGE" section; some of +the utilities will automatically call F<pod2usage> on themselves when +you call them with C<-help>. + +=item L<podselect|podselect> + +F<pod2usage> is a special case of F<podselect>, a utility to extract +named sections from documents written in POD. For instance, while +utilities have "USAGE" sections, Perl modules usually have "SYNOPSIS" +sections: C<podselect -s "SYNOPSIS" ...> will extract this section for +a given file. + +=item L<podchecker|podchecker> + +If you're writing your own documentation in POD, the F<podchecker> +utility will look for errors in your markup. + +=item L<splain|splain> + +F<splain> is an interface to L<perldiag> - paste in your error message +to it, and it'll explain it for you. + +=item L<roffitall|roffitall> + +The C<roffitall> utility is not installed on your system but lives in +the F<pod/> directory of your Perl source kit; it converts all the +documentation from the distribution to F<*roff> format, and produces a +typeset PostScript or text file of the whole lot. 
+ +=back + +=head2 CONVERTORS + +To help you convert legacy programs to Perl, we've included three +conversion filters: + +=over 3 + +=item L<a2p|a2p> + +F<a2p> converts F<awk> scripts to Perl programs; for example, C<a2p -F:> +on the simple F<awk> script C<{print $2}> will produce a Perl program +based around this code: + + while (<>) { + ($Fld1,$Fld2) = split(/[:\n]/, $_, 9999); + print $Fld2; + } + +=item L<s2p|s2p> + +Similarly, F<s2p> converts F<sed> scripts to Perl programs. F<s2p> run +on C<s/foo/bar> will produce a Perl program based around this: + + while (<>) { + chomp; + s/foo/bar/g; + print if $printit; + } + +=item L<find2perl|find2perl> + +Finally, F<find2perl> translates C<find> commands to Perl equivalents which +use the L<File::Find|File::Find> module. As an example, +C<find2perl . -user root -perm 4000 -print> produces the following callback +subroutine for C<File::Find>: + + sub wanted { + my ($dev,$ino,$mode,$nlink,$uid,$gid); + (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && + $uid == $uid{'root'} && + (($mode & 0777) == 04000) && + print("$name\n"); + } + +=back + +As well as these filters for converting other languages, the +L<pl2pm|pl2pm> utility will help you convert old-style Perl 4 libraries to +new-style Perl5 modules. + +=head2 Development + +There is a set of utilities which help you in developing Perl programs, +and in particular, extending Perl with C. + +=over 3 + +=item L<perlbug|perlbug> + +F<perlbug> is the recommended way to report bugs in the perl interpreter +itself or any of the standard library modules back to the developers; +please read through the documentation for F<perlbug> thoroughly before +using it to submit a bug report. + +=item L<h2ph|h2ph> + +Back before Perl had the XS system for connecting with C libraries, +programmers used to get library constants by reading through the C +header files. 
You may still see C<require 'syscall.ph'> or similar +around - the F<.ph> file should be created by running F<h2ph> on the +corresponding F<.h> file. See the F<h2ph> documentation for more on how +to convert a whole bunch of header files at once. + +=item L<c2ph|c2ph> and L<pstruct|pstruct> + +F<c2ph> and F<pstruct>, which are actually the same program but behave +differently depending on how they are called, provide another way of +getting at C with Perl - they'll convert C structures and union declarations +to Perl code. This is deprecated in favour of F<h2xs> these days. + +=item L<h2xs|h2xs> + +F<h2xs> converts C header files into XS modules, and will try and write +as much glue between C libraries and Perl modules as it can. It's also +very useful for creating skeletons of pure Perl modules. + +=item L<dprofpp|dprofpp> + +Perl comes with a profiler, the F<Devel::DProf> module. The +F<dprofpp> utility analyzes the output of this profiler and tells you +which subroutines are taking up the most run time. See L<Devel::DProf> +for more information. + +=item L<perlcc|perlcc> + +F<perlcc> is the interface to the experimental Perl compiler suite. + +=back + +=head2 SEE ALSO + +L<perldoc|perldoc>, L<pod2man|pod2man>, L<perlpod>, +L<pod2html|pod2html>, L<pod2usage|pod2usage>, L<podselect|podselect>, +L<podchecker|podchecker>, L<splain|splain>, L<perldiag>, +L<roffitall|roffitall>, L<a2p|a2p>, L<s2p|s2p>, L<find2perl|find2perl>, +L<File::Find|File::Find>, L<pl2pm|pl2pm>, L<perlbug|perlbug>, +L<h2ph|h2ph>, L<c2ph|c2ph>, L<h2xs|h2xs>, L<dprofpp|dprofpp>, +L<Devel::DProf>, L<perlcc|perlcc> + +=cut |