eaiovnaovbqoebvqoeavibavo Common/CC.pm000064400000006046147634422050006634 0ustar00package Regexp::Common::CC; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::_support qw /luhn/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my @cards = ( # Name Prefix Length mod 10 [Mastercard => '5[1-5]', 16, 1], [Visa => '4', [13, 16], 1], [Amex => '3[47]', 15, 1], # Carte Blanche ['Diners Club' => '3(?:0[0-5]|[68])', 14, 1], [Discover => '6011', 16, 1], [enRoute => '2(?:014|149)', 15, 0], [JCB => [['3', 16, 1], ['2131|1800', 15, 1]]], ); foreach my $card (@cards) { my ($name, $prefix, $length, $mod) = @$card; # Skip the harder ones for now. next if ref $prefix || ref $length; next unless $mod; my $times = $length + $mod; pattern name => [CC => $name], version => 5.006, create => sub { use re 'eval'; qr <((?=($prefix))[0-9]{$length}) (?(?{Regexp::Common::_support::luhn $1})|(?!))>x } ; } 1; __END__ =pod =head1 NAME Regexp::Common::CC -- provide patterns for credit card numbers. =head1 SYNOPSIS use Regexp::Common qw /CC/; while (<>) { /^$RE{CC}{Mastercard}$/ and print "Mastercard card number\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. This module offers patterns for credit card numbers of several major credit card types. Currently, the supported cards are: I, I, I, and I. =head1 SEE ALSO L for a general description of how to use this interface. =over 4 =item L Credit Card Validation - Check Digits =item L Everything you ever wanted to know about CC's =item L Luhn formula =back =head1 AUTHORS Damian Conway S<(I)> and Abigail S<(I)>. =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. Send them in to S>. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/lingua.pm000064400000004627147634422050007631 0ustar00package Regexp::Common::lingua; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; pattern name => [qw /lingua palindrome -chars=[A-Za-z]/], create => sub { use re 'eval'; my $keep = exists $_ [1] -> {-keep}; my $ch = $_ [1] -> {-chars}; my $idx = $keep ? "1:$ch" : "0:$ch"; my $r = "(??{\$Regexp::Common::lingua::pd{'" . $idx . "'}})"; $Regexp::Common::lingua::pd {$idx} = $keep ? qr /($ch|($ch)($r)?\2)/ : qr /$ch|($ch)($r)?\1/; # print "[$ch]: ", $Regexp::Common::lingua::pd {$idx}, "\n"; # $Regexp::Common::lingua::pd {$idx}; }, version => 5.006 ; 1; __END__ =pod =head1 NAME Regexp::Common::lingua -- provide regexes for language related stuff. =head1 SYNOPSIS use Regexp::Common qw /lingua/; while (<>) { /^$RE{lingua}{palindrome}$/ and print "is a palindrome\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{lingua}{palindrome}> Returns a pattern that recognizes a palindrome, a string that is the same if you reverse it. By default, it only matches strings consisting of letters, but this can be changed using the C<{-chars}> option. This option takes a character class (default is C<[A-Za-z]>) as argument. If C<{-keep}> is used, only C<$1> will be set, and set to the entire match. This pattern requires at least perl 5.6.0. =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Many regexes are missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/number.pm000064400000032633147634422050007640 0ustar00package Regexp::Common::number; use Config; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2013031101'; sub _croak { require Carp; goto &Carp::croak; } my $digits = join ("", 0 .. 9, "A" .. "Z"); sub int_creator { my $flags = $_ [1]; my ($sep, $group, $base, $places, $sign) = @{$flags} {qw /-sep -group -base -places -sign/}; # Deal with the bases. _croak "Base must be between 1 and 36" unless $base >= 1 && $base <= 36; my $chars = substr $digits, 0, $base; $sep = ',' if exists $flags -> {-sep} && !defined $flags -> {-sep}; my $max = $group; $max = $2 if $group =~ /^\s*(\d+)\s*,\s*(\d+)\s*$/; my $quant = $places ? "{$places}" : "+"; return $sep ? qq {(?k:(?k:$sign)(?k:[$chars]{1,$max}} . qq {(?:$sep} . qq {[$chars]{$group})*))} : qq {(?k:(?k:$sign)(?k:[$chars]$quant))} } sub real_creator { my ($base, $places, $radix, $sep, $group, $expon, $sign) = @{$_[1]}{-base, -places, -radix, -sep, -group, -expon, -sign}; _croak "Base must be between 1 and 36" unless $base >= 1 && $base <= 36; $sep = ',' if exists $_[1]->{-sep} && !defined $_[1]->{-sep}; if ($base > 14 && $expon =~ /^[Ee]$/) {$expon = 'G'} foreach ($radix, $sep, $expon) {$_ = "[$_]" if 1 == length} my $chars = substr $digits, 0, $base; return $sep ? qq {(?k:(?i)(?k:$sign)(?k:(?=$radix?[$chars])} . qq {(?k:[$chars]{1,$group}(?:(?:$sep)[$chars]{$group})*)} . qq {(?:(?k:$radix)(?k:[$chars]{$places}))?)} . qq {(?:(?k:$expon)(?k:(?k:$sign)(?k:[$chars]+))|))} : qq {(?k:(?i)(?k:$sign)(?k:(?=$radix?[$chars])} . qq {(?k:[$chars]*)(?:(?k:$radix)(?k:[$chars]{$places}))?)} . qq {(?:(?k:$expon)(?k:(?k:$sign)(?k:[$chars]+))|))}; } sub decimal_creator { my ($base, $places, $radix, $sep, $group, $sign) = @{$_[1]}{-base, -places, -radix, -sep, -group, -sign}; _croak "Base must be between 1 and 36" unless $base >= 1 && $base <= 36; $sep = ',' if exists $_[1]->{-sep} && !defined $_[1]->{-sep}; foreach ($radix, $sep) {$_ = "[$_]" if 1 == length} my $chars = substr $digits, 0, $base; return $sep ? qq {(?k:(?i)(?k:$sign)(?k:(?=$radix?[$chars])} . qq {(?k:[$chars]{1,$group}(?:(?:$sep)[$chars]{$group})*)} . qq {(?:(?k:$radix)(?k:[$chars]{$places}))?))} : qq {(?k:(?i)(?k:$sign)(?k:(?=$radix?[$chars])} . qq {(?k:[$chars]*)(?:(?k:$radix)(?k:[$chars]{$places}))?))} } pattern name => [qw (num int -sep= -base=10 -group=3 -sign=[-+]?)], create => \&int_creator, ; pattern name => [qw (num real -base=10), '-places=0,', qw (-radix=[.] -sep= -group=3 -expon=E -sign=[-+]?)], create => \&real_creator, ; pattern name => [qw (num decimal -base=10), '-places=0,', qw (-radix=[.] -sep= -group=3 -sign=[-+]?)], create => \&decimal_creator, ; sub real_synonym { my ($name, $base) = @_; pattern name => ['num', $name, '-places=0,', '-radix=[.]', '-sep=', '-group=3', '-expon=E', '-sign=[-+]?'], create => sub {my %flags = (%{$_[1]}, -base => $base); real_creator (undef, \%flags); } ; } real_synonym (hex => 16); real_synonym (dec => 10); real_synonym (oct => 8); real_synonym (bin => 2); # 2147483647 pattern name => [qw (num square)], create => sub { use re 'eval'; my $sixty_four_bits = $Config {use64bitint}; # # CPAN testers claim it fails on 5.8.8 and darwin 9.0. # $sixty_four_bits = 0 if $Config {osname} eq 'darwin' && $Config {osvers} eq '9.0' && $] == 5.008008; my $num = $sixty_four_bits ? '0*[1-8]?[0-9]{1,15}' : '0*(?:2(?:[0-0][0-9]{8}' . '|1(?:[0-3][0-9]{7}' . '|4(?:[0-6][0-9]{6}' . '|7(?:[0-3][0-9]{5}' . '|4(?:[0-7][0-9]{4}' . '|8(?:[0-2][0-9]{3}' . '|3(?:[0-5][0-9]{2}' . '|6(?:[0-3][0-9]{1}' . '|4[0-7])))))))))|1?[0-9]{1,9}'; qr {($num)(?(?{sqrt ($^N) == int sqrt ($^N)})|(?!))} }, version => 5.008; ; pattern name => [qw (num roman)], create => '(?xi)(?=[MDCLXVI]) (?k:M{0,3} (D?C{0,3}|CD|CM)? (L?X{0,3}|XL|XC)? (V?I{0,3}|IV|IX)?)' ; 1; __END__ =pod =head1 NAME Regexp::Common::number -- provide regexes for numbers =head1 SYNOPSIS use Regexp::Common qw /number/; while (<>) { /^$RE{num}{int}$/ and print "Integer\n"; /^$RE{num}{real}$/ and print "Real\n"; /^$RE{num}{real}{-base => 16}$/ and print "Hexadecimal real\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{num}{int}{-base}{-sep}{-group}{-places}{-sign}> Returns a pattern that matches an integer. If C<< -base => I >> is specified, the integer is in base I, with C<< 2 <= I <= 36 >>. For bases larger than 10, upper case letters are used. The default base is 10. If C<< -sep => I

>> is specified, the pattern I

is required as a grouping marker within the number. If this option is not given, no grouping marker is used. If C<< -group => I >> is specified, digits between grouping markers must be grouped in sequences of exactly I digits. The default value of I is 3. If C<< -group => I >> is specified, digits between grouping markers must be grouped in sequences of at least I digits, and at most I digits. This option is ignored unless the C<< -sep >> option is used. If C<< -places => I >> is specified, the integer recognized must be exactly I digits wide. If C<< -places => I >> is specified, the integer must be at least I wide, and at most I characters. There is no default, which means that integers are unlimited in size. This option is ignored if the C<< -sep >> option is used. If C<< -sign => I

>> is used, it's a pattern the leading sign has to match. This defaults to C<< [-+]? >>, which means the number is optionally preceded by a minus or a plus. If you want to match unsigned integers, use C<< $RE{num}{int}{-sign => ''} >>. For example: $RE{num}{int} # match 1234567 $RE{num}{int}{-sep=>','} # match 1,234,567 $RE{num}{int}{-sep=>',?'} # match 1234567 or 1,234,567 $RE{num}{int}{-sep=>'.'}{-group=>4} # match 1.2345.6789 Under C<-keep> (see L): =over 4 =item $1 captures the entire number =item $2 captures the optional sign of the number =item $3 captures the complete set of digits =back =head2 C<$RE{num}{real}{-base}{-radix}{-places}{-sep}{-group}{-expon}> Returns a pattern that matches a floating-point number. If C<-base=I> is specified, the number is assumed to be in that base (with A..Z representing the digits for 11..36). By default, the base is 10. If C<-radix=I

> is specified, the pattern I

is used as the radix point for the number (i.e. the "decimal point" in base 10). The default is C. If C<-places=I> is specified, the number is assumed to have exactly I places after the radix point. If C<-places=I> is specified, the number is assumed to have between I and I places after the radix point. By default, the number of places is unrestricted. If C<-sep=I

> specified, the pattern I

is required as a grouping marker within the pre-radix section of the number. By default, no separator is allowed. If C<-group=I> is specified, digits between grouping separators must be grouped in sequences of exactly I characters. The default value of I is 3. If C<-expon=I

> is specified, the pattern I

is used as the exponential marker. The default value of I

is C. If C<-sign=I

> is specified, the pattern I

is used to match the leading sign (and the sign of the exponent). This defaults to C<< [-+]? >>, means means that an optional plus or minus sign can be used. For example: $RE{num}{real} # matches 123.456 or -0.1234567 $RE{num}{real}{-places=>2} # matches 123.45 or -0.12 $RE{num}{real}{-places=>'0,3'} # matches 123.456 or 0 or 9.8 $RE{num}{real}{-sep=>'[,.]?'} # matches 123,456 or 123.456 $RE{num}{real}{-base=>3'} # matches 121.102 Under C<-keep>: =over 4 =item $1 captures the entire match =item $2 captures the optional sign of the number =item $3 captures the complete mantissa =item $4 captures the whole number portion of the mantissa =item $5 captures the radix point =item $6 captures the fractional portion of the mantissa =item $7 captures the optional exponent marker =item $8 captures the entire exponent value =item $9 captures the optional sign of the exponent =item $10 captures the digits of the exponent =back =head2 C<$RE{num}{dec}{-radix}{-places}{-sep}{-group}{-expon}> A synonym for C<< $RE{num}{real}{-base=>10}{...} >> =head2 C<$RE{num}{oct}{-radix}{-places}{-sep}{-group}{-expon}> A synonym for C<< $RE{num}{real}{-base=>8}{...} >> =head2 C<$RE{num}{bin}{-radix}{-places}{-sep}{-group}{-expon}> A synonym for C<< $RE{num}{real}{-base=>2}{...} >> =head2 C<$RE{num}{hex}{-radix}{-places}{-sep}{-group}{-expon}> A synonym for C<< $RE{num}{real}{-base=>16}{...} >> =head2 C<$RE{num}{decimal}{-base}{-radix}{-places}{-sep}{-group}> The same as C<$RE{num}{real}>, except that an exponent isn't allowed. Hence, this returns a pattern matching I numbers. If C<-base=I> is specified, the number is assumed to be in that base (with A..Z representing the digits for 11..36). By default, the base is 10. If C<-radix=I

> is specified, the pattern I

is used as the radix point for the number (i.e. the "decimal point" in base 10). The default is C. If C<-places=I> is specified, the number is assumed to have exactly I places after the radix point. If C<-places=I> is specified, the number is assumed to have between I and I places after the radix point. By default, the number of places is unrestricted. If C<-sep=I

> specified, the pattern I

is required as a grouping marker within the pre-radix section of the number. By default, no separator is allowed. If C<-group=I> is specified, digits between grouping separators must be grouped in sequences of exactly I characters. The default value of I is 3. For example: $RE{num}{decimal} # matches 123.456 or -0.1234567 $RE{num}{decimal}{-places=>2} # matches 123.45 or -0.12 $RE{num}{decimal}{-places=>'0,3'} # matches 123.456 or 0 or 9.8 $RE{num}{decimal}{-sep=>'[,.]?'} # matches 123,456 or 123.456 $RE{num}{decimal}{-base=>3'} # matches 121.102 Under C<-keep>: =over 4 =item $1 captures the entire match =item $2 captures the optional sign of the number =item $3 captures the complete mantissa =item $4 captures the whole number portion of the mantissa =item $5 captures the radix point =item $6 captures the fractional portion of the mantissa =back =head2 C<$RE{num}{square}> Returns a pattern that matches a (decimal) square. Because Perl's arithmetic is lossy when using integers over about 53 bits, this pattern only recognizes numbers less than 9000000000000000, if one uses a Perl that is configured to use 64 bit integers. Otherwise, the limit is 2147483647. These restrictions were introduced in versions 2.116 and 2.117 of Regexp::Common. Regardless whether C<-keep> was set, the matched number will be returned in C<$1>. This pattern is available for version 5.008 and up. =head2 C<$RE{num}{roman}> Returns a pattern that matches an integer written in Roman numbers. Case doesn't matter. Only the more modern style, that is, no more than three repetitions of a letter, is recognized. The largest number matched is I, or 3999. Larger numbers cannot be expressed using ASCII characters. A future version will be able to deal with the Unicode symbols to match larger Roman numbers. Under C<-keep>, the number will be captured in $1. =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2013, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/zip.pm000064400000052354147634422050007154 0ustar00package Regexp::Common::zip; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; # # Prefer '[0-9]' over \d, because the latter may include more # in Unicode string. # my %code = ( Australia => [qw /AUS? AU AUS/], Belgium => [qw /BE? BE B/], Denmark => [qw /DK DK DK/], France => [qw /FR? FR F/], Germany => [qw /DE? DE D/], Greenland => [qw /DK DK DK/], Italy => [qw /IT? IT I/], Netherlands => [qw /NL NL NL/], Norway => [qw /NO? NO N/], Spain => [qw /ES? ES E/], USA => [qw /USA? US USA/], ); # Returns the empty string if the argument is undefined, the argument otherwise. sub __ {defined $_ [0] ? $_ [0] : ""} # Used for allowable options. If the value starts with 'y', the option is # required ("{1,1}" is returned, if the value starts with 'n', the option # is disallowed ("{0,0}" is returned), otherwise, the option is allowed, # but not required ("{0,1}" is returned). sub _t { if (defined $_ [0]) { if ($_ [0] =~ /^y/i) {return "{1,1}"} if ($_ [0] =~ /^n/i) {return "{0,0}"} } "{0,1}" } # Returns the (sub)pattern for the country named '$name', and the # -country option '$country'. sub _c { my ($name, $country) = @_; if (defined $country && $country ne "") { if ($country eq 'iso') {return $code {$name} [1]} if ($country eq 'cept') {return $code {$name} [2]} return $country; } $code {$name} [0] } my %zip = ( Australia => "(?k:(?k:[1-8][0-9]|9[0-7]|0?[28]|0?9(?=09))(?k:[0-9]{2}))", # Postal codes of the form 'DDDD', with the first # two digits 02, 08 or 20-97. Leading 0 may be omitted. # 909 and 0909 are valid as well - but no other postal # codes starting with 9 or 09. Belgium => "(?k:(?k:[1-9])(?k:[0-9]{3}))", # Postal codes of the form: 'DDDD', with the first # digit representing the province; the others # distribution sectors. Postal codes do not start # with a zero. Denmark => "(?k:(?k:[1-9])(?k:[0-9])(?k:[0-9]{2}))", # Postal codes of the form: 'DDDD', with the first # digit representing the distribution region, the # second digit the distribution district. Postal # codes do not start with a zero. Postal codes # starting with '39' are in Greenland. France => "(?k:(?k:[0-8][0-9]|9[0-8])(?k:[0-9]{3}))", # Postal codes of the form: 'DDDDD'. All digits are used. # First two digits indicate the department, and range # from 01 to 98, or 00 for army. Germany => "(?k:(?k:[0-9])(?k:[0-9])(?k:[0-9]{3}))", # Postal codes of the form: 'DDDDD'. All digits are used. # First digit is the distribution zone, second a # distribution region. Other digits indicate the # distribution district and postal town. Greenland => "(?k:(?k:39)(?k:[0-9]{2}))", # Postal codes of Greenland are part of the Danish # system. Codes in Greenland start with 39. Italy => "(?k:(?k:[0-9])(?k:[0-9])(?k:[0-9])(?k:[0-9])(?k:[0-9]))", # First digit: region. # Second digit: province. # Third digit: capital/province (odd for capital). # Fourth digit: route. # Fifth digit: place on route (0 for small places) Norway => "(?k:[0-9]{4})", # Four digits, no significance (??). Spain => "(?k:(?k:0[1-9]|[1-4][0-9]|5[0-2])(?k:[0-9])(?k:[0-9]{2}))", # Five digits, first two indicate the province. # Third digit: large town, main delivery rounds. # Last 2 digits: delivery area, secondary delivery route # or link to rural areas. Switzerland => "(?k:[1-9][0-9]{3})", # Four digits, first is district, second is area, # third is route, fourth is post office number. ); my %alternatives = ( Australia => [qw /Australian/], France => [qw /French/], Germany => [qw /German/], ); while (my ($country, $zip) = each %zip) { my @names = ($country); push @names => @{$alternatives {$country}} if $alternatives {$country}; foreach my $name (@names) { my $pat_name = $name eq "Denmark" && $] < 5.00503 ? [zip => $name, qw /-country=/] : [zip => $name, qw /-prefix= -country=/]; pattern name => $pat_name, create => sub { my $pt = _t $_ [1] {-prefix}; my $cn = _c $country => $_ [1] {-country}; my $pfx = "(?:(?k:$cn)-)"; "(?k:$pfx$pt$zip)"; }, ; } } # Postal codes of the form 'DDDD LL', with F, I, O, Q, U and Y not # used, SA, SD and SS unused combinations, and the first digit # cannot be 0. No specific meaning to the letters or digits. foreach my $country (qw /Netherlands Dutch/) { pattern name => ['zip', $country => qw /-prefix= -country=/, "-sep= "], create => sub { my $pt = _t $_ [1] {-prefix}; # Unused letters: F, I, O, Q, U, Y. # Unused combinations: SA, SD, SS. my $num = '[1-9][0-9]{3}'; my $let = '[A-EGHJ-NPRTVWXZ][A-EGHJ-NPRSTVWXZ]|' . 'S[BCEGHJ-NPRTVWXZ]'; my $sep = __ $_ [1] {-sep}; my $cn = _c Netherlands => $_ [1] {-country}; my $pfx = "(?:(?k:$cn)-)"; "(?k:$pfx$pt(?k:(?k:$num)(?k:$sep)(?k:$let)))"; }, ; } # Postal codes of the form 'DDDDD' or 'DDDDD-DDDD'. All digits are used, # none carry any specific meaning. pattern name => [qw /zip US -prefix= -country= -extended= -sep=-/], create => sub { my $pt = _t $_ [1] {-prefix}; my $et = _t $_ [1] {-extended}; my $sep = __ $_ [1] {-sep}; my $cn = _c USA => $_ [1] {-country}; my $pfx = "(?:(?k:$cn)-)"; # my $zip = "(?k:[0-9]{5})"; # my $ext = "(?:(?k:$sep)(?k:[0-9]{4}))"; my $zip = "(?k:(?k:[0-9]{3})(?k:[0-9]{2}))"; my $ext = "(?:(?k:$sep)(?k:(?k:[0-9]{2})(?k:[0-9]{2})))"; "(?k:$pfx$pt(?k:$zip$ext$et))"; }, version => 5.00503, ; # pattern name => [qw /zip British/, "-sep= "], # create => sub { # my $sep = $_ [1] -> {-sep}; # # my $london = '(?:EC[1-4]|WC[12]|S?W1)[A-Z]'; # my $single = '[BGLMS][0-9]{1,2}'; # my $double = '[A-Z]{2}[0-9]{1,2}'; # # my $left = "(?:$london|$single|$double)"; # my $right = '[0-9][ABD-HJLNP-UW-Z]{2}'; # # "(?k:(?k:$left)(?k:$sep)(?k:$right))"; # }, # ; # # pattern name => [qw /zip Canadian/, "-sep= "], # create => sub { # my $sep = $_ [1] -> {-sep}; # # my $left = '[A-Z][0-9][A-Z]'; # my $right = '[0-9][A-Z][0-9]'; # # "(?k:(?k:$left)(?k:$sep)(?k:$right))"; # }, # ; 1; __END__ =pod =head1 NAME Regexp::Common::zip -- provide regexes for postal codes. =head1 SYNOPSIS use Regexp::Common qw /zip/; while (<>) { /^$RE{zip}{Netherlands}$/ and print "Dutch postal code\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. This module offers patterns for zip or postal codes of many different countries. They all have the form C<$RE{zip}{Country}[{options}]>. The following common options are used: =head2 C<{-prefix=[yes|no|allow]}> and C<{-country=PAT}>. Postal codes can be prefixed with a country abbreviation. That is, a dutch postal code of B<1234 AB> can also be written as B. By default, all the patterns will allow the prefixes. But this can be changed with the C<-prefix> option. With C<-prefix=yes>, the returned pattern requires a country prefix, while C<-prefix=no> disallows a prefix. Any argument that doesn't start with a C or a C allows a country prefix, but doesn't require them. The prefixes used are, unfortunally, not always the same. Officially, ISO country codes need to be used, but the usage of I codes (the same ones as used on cars) is common too. By default, each postal code will recognize a country prefix that's either the ISO standard or the CEPT code. That is, German postal codes may prefixed with either C or C. The recognized prefix can be changed with the C<-country> option, which takes a (sub)pattern as argument. The arguments C and C are special, and indicate the language prefix should be the ISO country code, or the CEPT code. Examples: /$RE{zip}{Netherlands}/; # Matches '1234 AB' and 'NL-1234 AB'. /$RE{zip}{Netherlands}{-prefix => 'no'}/; # Matches '1234 AB' but not 'NL-1234 AB'. /$RE{zip}{Netherlands}{-prefix => 'yes'}/; # Matches 'NL-1234 AB' but not '1234 AB'. /$RE{zip}{Germany}/; # Matches 'DE-12345' and 'D-12345'. /$RE{zip}{Germany}{-country => 'iso'}/; # Matches 'DE-12345' but not 'D-12345'. /$RE{zip}{Germany}{-country => 'cept'}/; # Matches 'D-12345' but not 'DE-12345'. /$RE{zip}{Germany}{-country => 'GER'}/; # Matches 'GER-12345'. =head2 C<{-sep=PAT}> Some countries have postal codes that consist of two parts. Typically there is an official way of separating those parts; but in practise people tend to use different separators. For instance, if the official way to separate parts is to use a space, it happens that the space is left off. The C<-sep> option can be given a pattern as argument which indicates what to use as a separator between the parts. Examples: /$RE{zip}{Netherlands}/; # Matches '1234 AB' but not '1234AB'. /$RE{zip}{Netherlands}{-sep => '\s*'}/; # Matches '1234 AB' and '1234AB'. =head2 C<$RE{zip}{Australia}> Returns a pattern that recognizes Australian postal codes. Australian postal codes consist of four digits; the first two digits, which range from '10' to '97', indicate the state. Territories use '02' or '08' as starting digits; the leading zero is optional. '0909' is the only postal code starting with '09' (the leading zero is optional here as well) - this is the postal code for the Nothern Territory University). The (optional) country prefixes are I (ISO country code) and I (CEPT code). Regexp::Common 2.107 and before used C<$RE{zip}{Australia}>. This is still supported. If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The state or territory. =item $5 The last two digits. =back =head2 C<$RE{zip}{Belgium}> Returns a pattern than recognizes Belgian postal codes. Belgian postal codes consist of 4 digits, of which the first indicates the province. The (optional) country prefixes are I (ISO country code) and I (CEPT code). If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The digit indicating the province. =item $5 The last three digits of the postal code. =back =head2 C<$RE{zip}{Denmark}> Returns a pattern that recognizes Danish postal codes. Danish postal codes consist of four numbers; the first digit (which cannot be 0), indicates the distribution region, the second the distribution district. The (optional) country prefix is I, which is both the ISO country code and the CEPT code. If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The digit indicating the distribution region. =item $5 The digit indicating the distribution district. =item $6 The last two digits of the postal code. =back =head2 C<$RE{zip}{France}> Returns a pattern that recognizes French postal codes. French postal codes consist of five numbers; the first two numbers, which range from '01' to '98', indicate the department. The (optional) country prefixes are I (ISO country code) and I (CEPT code). Regexp::Common 2.107 and before used C<$RE{zip}{French}>. This is still supported. If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The department. =item $5 The last three digits. =back =head2 C<$RE{zip}{Germany}> Returns a pattern that recognizes German postal codes. German postal codes consist of five numbers; the first number indicating the distribution zone, the second the distribution region, while the latter three indicate the distribution district and the postal town. The (optional) country prefixes are I (ISO country code) and I (CEPT code). Regexp::Common 2.107 and before used C<$RE{zip}{German}>. This is still supported. If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The distribution zone. =item $5 The distribution region. =item $6 The distribution district and postal town. =back =head2 C<$RE{zip}{Greenland}> Returns a pattern that recognizes postal codes from Greenland. Greenland, being part of Denmark, uses Danish postal codes. All postal codes of Greenland start with 39. The (optional) country prefix is I, which is both the ISO country code and the CEPT code. If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 39, being the distribution region and distribution district for Greenland. =item $5 The last two digits of the postal code. =back =head2 C<$RE{zip}{Italy}> Returns a pattern recognizing Italian postal codes. Italian postal codes consist of 5 digits. The first digit indicates the region, the second the province. The third digit is odd for province capitals, and even for the province itself. The fourth digit indicates the route, and the fifth a place on the route (0 for small places, alphabetically for the rest). The country prefix is either I (the ISO country code), or I (the CEPT code). If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The region. =item $5 The province. =item $6 Capital or province. =item $7 The route. =item $8 The place on the route. =back =head2 C<$RE{zip}{Netherlands}> Returns a pattern that recognizes Dutch postal codes. Dutch postal codes consist of 4 digits and 2 letters, separated by a space. The separator can be changed using the C<{-sep}> option, as discussed above. The (optional) country prefix is I, which is both the ISO country code and the CEPT code. Regexp::Common 2.107 and earlier used C<$RE{zip}{Dutch}>. This is still supported. If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The digits part of the postal code. =item $5 The separator between the digits and the letters. =item $6 The letters part of the postal code. =back =head2 C<< $RE{zip}{Norway} >> Returns a pattern that recognizes Norwegian postal codes. Norwegian postal codes consist of four digits. The country prefix is either I (the ISO country code), or I (the CEPT code). If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =back =head2 C<< $RE{zip}{Spain} >> Returns a pattern that recognizes Spanish postal codes. Spanish postal codes consist of 5 digits. The first 2 indicate one of Spains fifties provinces (in alphabetical order), starting with C<00>. The third digit indicates a main city or the main delivery rounds. The last two digits are the delivery area, secondary delivery route or a link to rural areas. The country prefix is either I (the ISO country code), or I (the CEPT code). If C<{-keep}> is used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The two digits indicating the province. =item $5 The digit indicating the main city or main delivery route. =item $6 The digits indicating the delivery area, secondary delivery route or a link to rural areas. =back =head2 C<< $RE{zip}{Switzerland} >> Returns a pattern that recognizes Swiss postal codes. Swiss postal codes consist of 4 digits. The first indicates the district, starting with 1. The second indicates the area, the third, the route, and the fourth the post office number. =head2 C<< $RE{zip}{US}{-extended => [yes|no|allow]} >> Returns a pattern that recognizes US zip codes. US zip codes consist of 5 digits, with an optional 4 digit extension. By default, extensions are allowed, but not required. This can be influenced by the C<-extended> option. If its argument starts with a C, extensions are required; if the argument starts with a C, extensions will not be recognized. If an extension is used, a dash is used to separate the main part from the extension, but this can be changed with the C<-sep> option. The country prefix is either I (the ISO country code), or I (the CEPT code). If C<{-keep}> is being used, the following variables will be set: =over 4 =item $1 The entire postal code. =item $2 The country code prefix. =item $3 The postal code without the country prefix. =item $4 The first 5 digits of the postal code. =item $5 The first three digits of the postal code, indicating a sectional center or a large city. New in Regexp::Common 2.119. =item $6 The last 2 digits of the 5 digit part of the postal code, indicating a post office facility or delivery area. New in Regexp::Common 2.119. =item $7 The separator between the 5 digit part and the 4 digit part. Up to Regexp::Common 2.118, this used to be $5. =item $8 The 4 digit part of the postal code (if any). Up to Regexp::Common 2.118, this used to be $6. =item $9 The first two digits of the 4 digit part of the postal code, indicating a sector, or several blocks. New in Regexp::Common 2.119. =item $10 The last two digits of the 4 digit part of the postal code, indicating a segment or one side of a street. New in Regexp::Common 2.119. =back You need at least version 5.005_03 to be able to use US postal codes. Older versions contain a bug that let the pattern match invalid US postal codes. =head3 Questions =over 4 =item Can the 5 digit part of the zip code (in theory) start with 000? =item Can the 5 digit part of the zip code (in theory) end with 00? =item Can the 4 digit part of the zip code (in theory) start with 00? =item Can the 4 digit part of the zip code (in theory) end with 00? =back =head1 SEE ALSO L for a general description of how to use this interface. =over 4 =item L Frank's compulsive guide to postal addresses. =item L Postal addressing systems. =item L Postal code information. =item L Links to Postcode Pages. =item L Information about Australian postal codes. =item L Information about US postal codes. =item L =back =head1 AUTHORS Damian Conway S<(I)> and Abigail S<(I)>. =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Zip codes for most countries are missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/balanced.pm000064400000012427147634422050010100 0ustar00package Regexp::Common::balanced; { use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2013030901'; my %closer = ( '{'=>'}', '('=>')', '['=>']', '<'=>'>' ); my %cache; sub nested { my ($start, $finish) = @_; return $cache {$start} {$finish} if exists $cache {$start} {$finish}; my @starts = map {s/\\(.)/$1/g; $_} grep {length} $start =~ /([^|\\]+|\\.)+/gs; my @finishes = map {s/\\(.)/$1/g; $_} grep {length} $finish =~ /([^|\\]+|\\.)+/gs; push @finishes => ($finishes [-1]) x (@starts - @finishes); my @re; local $" = "|"; foreach my $begin (@starts) { my $end = shift @finishes; my $qb = quotemeta $begin; my $qe = quotemeta $end; my $fb = quotemeta substr $begin => 0, 1; my $fe = quotemeta substr $end => 0, 1; my $tb = quotemeta substr $begin => 1; my $te = quotemeta substr $end => 1; my $add; if ($fb eq $fe) { push @re => qq /(?:$qb(?:(?>[^$fb]+)|$fb(?!$tb)(?!$te)|(?-1))*$qe)/; } else { my @clauses = "(?>[^$fb$fe]+)"; push @clauses => "$fb(?!$tb)" if length $tb; push @clauses => "$fe(?!$te)" if length $te; push @clauses => "(?-1)"; push @re => qq /(?:$qb(?:@clauses)*$qe)/; } } $cache {$start} {$finish} = qr /(@re)/; } pattern name => [qw /balanced -parens=() -begin= -end=/], create => sub { my $flag = $_[1]; unless (defined $flag -> {-begin} && length $flag -> {-begin} && defined $flag -> {-end} && length $flag -> {-end}) { my @open = grep {index ($flag->{-parens}, $_) >= 0} ('[','(','{','<'); my @close = map {$closer {$_}} @open; $flag -> {-begin} = join "|" => @open; $flag -> {-end} = join "|" => @close; } return nested @$flag {qw /-begin -end/}; }, version => 5.010, ; } 1; __END__ =pod =head1 NAME Regexp::Common::balanced -- provide regexes for strings with balanced parenthesized delimiters or arbitrary delimiters. =head1 SYNOPSIS use Regexp::Common qw /balanced/; while (<>) { /$RE{balanced}{-parens=>'()'}/ and print q{balanced parentheses\n}; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{balanced}{-parens}> Returns a pattern that matches a string that starts with the nominated opening parenthesis or bracket, contains characters and properly nested parenthesized subsequences, and ends in the matching parenthesis. More than one type of parenthesis can be specified: $RE{balanced}{-parens=>'(){}'} in which case all specified parenthesis types must be correctly balanced within the string. Since version 2013030901, C<< $1 >> will always be set (to the entire matched substring), regardless whether C<< {-keep} >> is used or not. =head2 C<< $RE{balanced}{-begin => "begin"}{-end => "end"} >> Returns a pattern that matches a string that is properly balanced using the I and I strings as start and end delimiters. Multiple sets of begin and end strings can be given by separating them by C<|>s (which can be escaped with a backslash). qr/$RE{balanced}{-begin => "do|if|case"}{-end => "done|fi|esac"}/ will match properly balanced strings that either start with I and end with I, start with I and end with I, or start with I and end with I. If I<-end> contains less cases than I<-begin>, the last case of I<-end> is repeated. If it contains more cases than I<-begin>, the extra cases are ignored. If either of I<-begin> or I<-end> isn't given, or is empty, I<< -begin => '(' >> and I<< -end => ')' >> are assumed. Since version 2013030901, C<< $1 >> will always be set (to the entire matched substring), regardless whether C<< {-keep} >> is used or not. =head2 Note Since version 2013030901 the pattern will make of the recursive construct C<< (?-1) >>, instead of using the problematic C<< (??{ }) >> construct. This fixes an problem that was introduced in the 5.17 development track. This also means the pattern is no longer available for Perls older than 5.010. =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2013, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/net.pm000064400000031060147634422050007127 0ustar00package Regexp::Common::net; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2013031301'; my %IPunit = ( dec => q{(?k:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})}, oct => q{(?k:[0-3]?[0-7]{1,2})}, hex => q{(?k:[0-9a-fA-F]{1,2})}, bin => q{(?k:[0-1]{1,8})}, ); my %MACunit = ( %IPunit, hex => q{(?k:[0-9a-fA-F]{1,2})}, ); my %IPv6unit = ( hex => q {(?k:[0-9a-f]{1,4})}, HEX => q {(?k:[0-9A-F]{1,4})}, HeX => q {(?k:[0-9a-fA-F]{1,4})}, ); sub dec {$_}; sub bin {oct "0b$_"} my $IPdefsep = '[.]'; my $MACdefsep = ':'; my $IPv6defsep = ':'; pattern name => [qw (net IPv4)], create => "(?k:$IPunit{dec}$IPdefsep$IPunit{dec}$IPdefsep" . "$IPunit{dec}$IPdefsep$IPunit{dec})", ; pattern name => [qw (net MAC)], create => "(?k:" . join ($MACdefsep => ($MACunit{hex}) x 6) . ")", subs => sub { $_ [1] = join ":" => map {sprintf "%02x" => hex} split /$MACdefsep/ => $_ [1] if $_ [1] =~ /$_[0]/ }, ; foreach my $type (qw /dec oct hex bin/) { pattern name => [qw (net IPv4), $type, "-sep=$IPdefsep"], create => sub {my $sep = $_ [1] -> {-sep}; "(?k:$IPunit{$type}$sep$IPunit{$type}$sep" . "$IPunit{$type}$sep$IPunit{$type})" }, ; pattern name => [qw (net MAC), $type, "-sep=$MACdefsep"], create => sub {my $sep = $_ [1] -> {-sep}; "(?k:" . join ($sep => ($MACunit{$type}) x 6) . ")", }, subs => sub { return if $] < 5.006 and $type eq 'bin'; $_ [1] = join ":" => map {sprintf "%02x" => eval $type} $2, $3, $4, $5, $6, $7 if $_ [1] =~ $RE {net} {MAC} {$type} {-sep => $_ [0] -> {flags} {-sep}} {-keep}; }, ; } my %cache6; pattern name => [qw (net IPv6), "-sep=$IPv6defsep", "-style=HeX"], create => sub { my $style = $_ [1] {-style}; my $sep = $_ [1] {-sep}; return $cache6 {$style, $sep} if $cache6 {$style, $sep}; my @re; die "Impossible style '$style'\n" unless exists $IPv6unit {$style}; # # Nothing missing # push @re => join $sep => ($IPv6unit {$style}) x 8; # # For "double colon" representations, at least 2 units must # be omitted, leaving us with at most 6 units. 0 units is also # possible. Note we can have at most one double colon. # for (my $l = 0; $l <= 6; $l ++) { # # We prefer to do longest match, so larger $r gets priority # for (my $r = 6 - $l; $r >= 0; $r --) { # # $l is the number of blocks left of the double colon, # $r is the number of blocks left of the double colon, # $m is the number of omitted blocks # my $m = 8 - $l - $r; my $patl = $l ? ($IPv6unit {$style} . $sep) x $l : $sep; my $patr = $r ? ($sep . $IPv6unit {$style}) x $r : $sep; my $patm = "(?k:)" x $m; my $pat = $patl . $patm . $patr; push @re => "(?:$pat)"; } } local $" = "|"; $cache6 {$style, $sep} = qq /(?k:(?|@re))/; }, version => 5.010 ; my $letter = "[A-Za-z]"; my $let_dig = "[A-Za-z0-9]"; my $let_dig_hyp = "[-A-Za-z0-9]"; # Domain names, from RFC 1035. pattern name => [qw (net domain -nospace= -rfc1101=)], create => sub { my $rfc1101 = exists $_ [1] {-rfc1101} && !defined $_ [1] {-rfc1101}; my $lead = $rfc1101 ? "(?!$RE{net}{IPv4}(?:[.]|\$))$let_dig" : $letter; if (exists $_ [1] {-nospace} && !defined $_ [1] {-nospace}) { return "(?k:$lead(?:(?:$let_dig_hyp){0,61}$let_dig)?" . "(?:\\.$lead(?:(?:$let_dig_hyp){0,61}$let_dig)?)*)" } else { return "(?k: |(?:$lead(?:(?:$let_dig_hyp){0,61}$let_dig)?" . "(?:\\.$lead(?:(?:$let_dig_hyp){0,61}$let_dig)?)*))" } }, ; 1; __END__ =head1 NAME Regexp::Common::net -- provide regexes for IPv4 addresses. =head1 SYNOPSIS use Regexp::Common qw /net/; while (<>) { /$RE{net}{IPv4}/ and print "Dotted decimal IP address"; /$RE{net}{IPv4}{hex}/ and print "Dotted hexadecimal IP address"; /$RE{net}{IPv4}{oct}{-sep => ':'}/ and print "Colon separated octal IP address"; /$RE{net}{IPv4}{bin}/ and print "Dotted binary IP address"; /$RE{net}{MAC}/ and print "MAC address"; /$RE{net}{MAC}{oct}{-sep => " "}/ and print "Space separated octal MAC address"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. This modules gives you regular expressions for various style IPv4 and MAC (or ethernet) addresses. =head2 C<$RE{net}{IPv4}> Returns a pattern that matches a valid IP address in "dotted decimal". Note that while C<318.99.183.11> is not a valid IP address, it does match C, but this is because C<318.99.183.11> contains a valid IP address, namely C<18.99.183.11>. To prevent the unwanted matching, one needs to anchor the regexp: C. For this pattern and the next four, under C<-keep> (See L): =over 4 =item $1 captures the entire match =item $2 captures the first component of the address =item $3 captures the second component of the address =item $4 captures the third component of the address =item $5 captures the final component of the address =back =head2 C<$RE{net}{IPv4}{dec}{-sep}> Returns a pattern that matches a valid IP address in "dotted decimal" If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{IPv4}{hex}{-sep}> Returns a pattern that matches a valid IP address in "dotted hexadecimal", with the letters C to C capitalized. If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. C<< -sep="" >> and C<< -sep=" " >> are useful alternatives. =head2 C<$RE{net}{IPv4}{oct}{-sep}> Returns a pattern that matches a valid IP address in "dotted octal" If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{IPv4}{bin}{-sep}> Returns a pattern that matches a valid IP address in "dotted binary" If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{MAC}> Returns a pattern that matches a valid MAC or ethernet address as colon separated hexadecimals. For this pattern, and the next four, under C<-keep> (See L): =over 4 =item $1 captures the entire match =item $2 captures the first component of the address =item $3 captures the second component of the address =item $4 captures the third component of the address =item $5 captures the fourth component of the address =item $6 captures the fifth component of the address =item $7 captures the sixth and final component of the address =back This pattern, and the next four, have a C method as well, which will transform a matching MAC address into so called canonical format. Canonical format means that every component of the address will be exactly two hexadecimals (with a leading zero if necessary), and the components will be separated by a colon. The C method will not work for binary MAC addresses if the Perl version predates 5.6.0. =head2 C<$RE{net}{MAC}{dec}{-sep}> Returns a pattern that matches a valid MAC address as colon separated decimals. If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{MAC}{hex}{-sep}> Returns a pattern that matches a valid MAC address as colon separated hexadecimals, with the letters C to C in lower case. If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{MAC}{oct}{-sep}> Returns a pattern that matches a valid MAC address as colon separated octals. If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{MAC}{bin}{-sep}> Returns a pattern that matches a valid MAC address as colon separated binary numbers. If C<< -sep=I

>> is specified the pattern I

is used as the separator. By default I

is C. =head2 C<$RE{net}{IPv6}{-sep => ':'}{-style => 'HeX'}> Returns a pattern matching IPv6 numbers. An IPv6 address consists of eigth groups of four hexadecimal digits, separated by colons. In each group, leading zeros may be omitted. Two or more consecutive groups consisting of only zeros may be omitted (including any colons separating them), resulting into two sets of groups, separated by a double colon. (Each of the groups may be empty; C<< :: >> is a valid address, equal to C<< 0000:0000:0000:0000:0000:0000:0000:0000 >>). The hex numbers may be in either case. If the C<< -sep >> option is used, its argument is a pattern that matches the separator that separates groups. This defaults to C<< : >>. The C<< -style >> option is used to denote which case the hex numbers may be. The default style, C<< 'HeX' >> indicates both lower case letters C<< 'a' >> to C<< 'f' >> and upper case letters C<< 'A' >> to C<< 'F' >> will be matched. The style C<< 'HEX' >> restricts matching to upper case letters, and C<< 'hex' >> only matches lower case letters. If C<< {-keep} >> is used, C<< $1 >> to C<< $9 >> will be set. C<< $1 >> will be set to the matched address, while C<< $2 >> to C<< $9 >> will be set to each matched group. If a group is omitted because it contains all zeros, its matching variable will be the empty string. Example: "2001:db8:85a3::8a2e:370:7334" =~ /$RE{net}{IPv6}{-keep}/; print $2; # '2001' print $4; # '85a3' print $6; # Empty string print $8; # '370' Perl 5.10 (or later) is required for this pattern. =head2 C<$RE{net}{domain}> Returns a pattern to match domains (and hosts) as defined in RFC 1035. Under I{-keep} only the entire domain name is returned. RFC 1035 says that a single space can be a domainname too. So, the pattern returned by C<$RE{net}{domain}> recognizes a single space as well. This is not always what people want. If you want to recognize domainnames, but not a space, you can do one of two things, either use /(?! )$RE{net}{domain}/ or use the C<{-nospace}> option (without an argument). RFC 1035 does B allow host or domain names to start with a digits; however, this restriction is relaxed in RFC 1101; this RFC allows host and domain names to start with a digit, as long as the first part of a domain does not look like an IP address. If the C<< {-rfc1101} >> option is given (as in C<< $RE {net} {domain} {-rfc1101} >>), we will match using the relaxed rules. =head1 REFERENCES =over 4 =item B Mockapetris, P.: I. November 1987. =item B Mockapetris, P.: I. April 1987. =back =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway I. =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2013, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/delimited.pm000064400000010105147634422050010276 0ustar00package Regexp::Common::delimited; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; sub gen_delimited { my ($dels, $escs) = @_; # return '(?:\S*)' unless $dels =~ /\S/; if (length $escs) { $escs .= substr ($escs, -1) x (length ($dels) - length ($escs)); } my @pat = (); my $i; for ($i=0; $i < length $dels; $i++) { my $del = quotemeta substr ($dels, $i, 1); my $esc = length($escs) ? quotemeta substr ($escs, $i, 1) : ""; if ($del eq $esc) { push @pat, "(?k:$del)(?k:[^$del]*(?:(?:$del$del)[^$del]*)*)(?k:$del)"; } elsif (length $esc) { push @pat, "(?k:$del)(?k:[^$esc$del]*(?:$esc.[^$esc$del]*)*)(?k:$del)"; } else { push @pat, "(?k:$del)(?k:[^$del]*)(?k:$del)"; } } my $pat = join '|', @pat; return "(?k:$pat)"; } sub _croak { require Carp; goto &Carp::croak; } pattern name => [qw( delimited -delim= -esc=\\ )], create => sub {my $flags = $_[1]; _croak 'Must specify delimiter in $RE{delimited}' unless length $flags->{-delim}; return gen_delimited (@{$flags}{-delim, -esc}); }, ; pattern name => [qw( quoted -esc=\\ )], create => sub {my $flags = $_[1]; return gen_delimited (q{"'`}, $flags -> {-esc}); }, ; 1; __END__ =pod =head1 NAME Regexp::Common::delimited -- provides a regex for delimited strings =head1 SYNOPSIS use Regexp::Common qw /delimited/; while (<>) { /$RE{delimited}{-delim=>'"'}/ and print 'a \" delimited string'; /$RE{delimited}{-delim=>'/'}/ and print 'a \/ delimited string'; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{delimited}{-delim}{-esc}> Returns a pattern that matches a single-character-delimited substring, with optional internal escaping of the delimiter. When C<-delim=I> is specified, each character in the sequence I is a possible delimiter. There is no default delimiter, so this flag must always be specified. If C<-esc=I> is specified, each character in the sequence I is the delimiter for the corresponding character in the C<-delim=I> list. The default escape is backslash. For example: $RE{delimited}{-delim=>'"'} # match "a \" delimited string" $RE{delimited}{-delim=>'"'}{-esc=>'"'} # match "a "" delimited string" $RE{delimited}{-delim=>'/'} # match /a \/ delimited string/ $RE{delimited}{-delim=>q{'"}} # match "string" or 'string' Under C<-keep> (See L): =over 4 =item $1 captures the entire match =item $2 captures the opening delimiter (provided only one delimiter was specified) =item $3 captures delimited portion of the string (provided only one delimiter was specified) =item $4 captures the closing delimiter (provided only one delimiter was specified) =back =head2 $RE{quoted}{-esc} A synonym for C<$RE{delimited}{q{-delim='"`}{...}}> =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/profanity.pm000064400000010551147634422050010356 0ustar00package Regexp::Common::profanity; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $profanity = '(?:cvff(?:\\ gnxr|\\-gnxr|gnxr|r(?:ef|[feq])|vat|l)?|dhvzf?|fuvg(?:g(?:r(?:ef|[qe])|vat|l)|r(?:ef|[fqel])|vat|[fr])?|g(?:heqf?|jngf?)|jnax(?:r(?:ef|[eq])|vat|f)?|n(?:ef(?:r(?:\\ ubyr|\\-ubyr|ubyr|[fq])|vat|r)|ff(?:\\ ubyrf?|\\-ubyrf?|rq|ubyrf?|vat))|o(?:hyy(?:\\ fuvg(?:g(?:r(?:ef|[qe])|vat)|f)?|\\-fuvg(?:g(?:r(?:ef|[qe])|vat)|f)?|fuvg(?:g(?:r(?:ef|[qe])|vat)|f)?)|ybj(?:\\ wbof?|\\-wbof?|wbof?))|p(?:bpx(?:\\ fhpx(?:ref?|vat)|\\-fhpx(?:ref?|vat)|fhpx(?:ref?|vat))|enc(?:c(?:r(?:ef|[eq])|vat|l)|f)?|h(?:agf?|z(?:vat|zvat|f)))|qvpx(?:\\ urnq|\\-urnq|rq|urnq|vat|yrff|f)|s(?:hpx(?:rq|vat|f)?|neg(?:r[eq]|vat|[fl])?|rygpu(?:r(?:ef|[efq])|vat)?)|un(?:eq[\\-\\ ]?ba|ys(?:\\ n[fe]|\\-n[fe]|n[fe])frq)|z(?:bgure(?:\\ shpx(?:ref?|vat)|\\-shpx(?:ref?|vat)|shpx(?:ref?|vat))|hgu(?:n(?:\\ shpx(?:ref?|vat|[nnn])|\\-shpx(?:ref?|vat|[nnn])|shpx(?:ref?|vat|[nnn]))|re(?:\\ shpx(?:ref?|vat)|\\-shpx(?:ref?|vat)|shpx(?:ref?|vat)))|reqr?))'; my $contextual = '(?:c(?:bex|e(?:bax|vpxf?)|hff(?:vrf|l)|vff(?:\\ gnxr|\\-gnxr|gnxr|r(?:ef|[feq])|vat|l)?)|dhvzf?|ebbg(?:r(?:ef|[eq])|vat|f)?|f(?:bq(?:q(?:rq|vat)|f)?|chax|perj(?:rq|vat|f)?|u(?:nt(?:t(?:r(?:ef|[qe])|vat)|f)?|vg(?:g(?:r(?:ef|[qe])|vat|l)|r(?:ef|[fqel])|vat|[fr])?))|g(?:heqf?|jngf?|vgf?)|jnax(?:r(?:ef|[eq])|vat|f)?|n(?:ef(?:r(?:\\ ubyr|\\-ubyr|ubyr|[fq])|vat|r)|ff(?:\\ ubyrf?|\\-ubyrf?|rq|ubyrf?|vat))|o(?:ba(?:r(?:ef|[fe])|vat|r)|h(?:ttre|yy(?:\\ fuvg(?:g(?:r(?:ef|[qe])|vat)|f)?|\\-fuvg(?:g(?:r(?:ef|[qe])|vat)|f)?|fuvg(?:g(?:r(?:ef|[qe])|vat)|f)?))|n(?:fgneq|yy(?:r(?:ef|[qe])|vat|f)?)|yb(?:bql|j(?:\\ wbof?|\\-wbof?|wbof?)))|p(?:bpx(?:\\ fhpx(?:ref?|vat)|\\-fhpx(?:ref?|vat)|fhpx(?:ref?|vat)|f)?|enc(?:c(?:r(?:ef|[eq])|vat|l)|f)?|h(?:agf?|z(?:vat|zvat|f)))|q(?:batf?|vpx(?:\\ urnq|\\-urnq|rq|urnq|vat|yrff|f)?)|s(?:hpx(?:rq|vat|f)?|neg(?:r[eq]|vat|[fl])?|rygpu(?:r(?:ef|[efq])|vat)?)|u(?:hzc(?:r(?:ef|[eq])|vat|f)?|n(?:eq[\\-\\ ]?ba|ys(?:\\ n[fe]|\\-n[fe]|n[fe])frq))|z(?:bgure(?:\\ shpx(?:ref?|vat)|\\-shpx(?:ref?|vat)|shpx(?:ref?|vat))|hgu(?:n(?:\\ shpx(?:ref?|vat|[nnn])|\\-shpx(?:ref?|vat|[nnn])|shpx(?:ref?|vat|[nnn]))|re(?:\\ shpx(?:ref?|vat)|\\-shpx(?:ref?|vat)|shpx(?:ref?|vat)))|reqr?))'; tr/A-Za-z/N-ZA-Mn-za-m/ foreach $profanity, $contextual; pattern name => [qw (profanity)], create => '(?:\b(?k:' . $profanity . ')\b)', ; pattern name => [qw (profanity contextual)], create => '(?:\b(?k:' . $contextual . ')\b)', ; 1; __END__ =pod =head1 NAME Regexp::Common::profanity -- provide regexes for profanity =head1 SYNOPSIS use Regexp::Common qw /profanity/; while (<>) { /$RE{profanity}/ and print "Contains profanity\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 $RE{profanity} Returns a pattern matching words -- such as Carlin's "big seven" -- that are most likely to give offense. Note that correct anatomical terms are deliberately I included in the list. Under C<-keep> (see L): =over 4 =item $1 captures the entire word =back =head2 C<$RE{profanity}{contextual}> Returns a pattern matching words that are likely to give offense when used in specific contexts, but which also have genuinely non-offensive meanings. Under C<-keep> (see L): =over 4 =item $1 captures the entire word =back =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/_support.pm000064400000004466147634422050010226 0ustar00package Regexp::Common::_support; BEGIN { # This makes sure 'use warnings' doesn't bomb out on 5.005_*; # warnings won't be enabled on those old versions though. if ($] < 5.006 && !exists $INC {"warnings.pm"}) { $INC {"warnings.pm"} = 1; no strict 'refs'; *{"warnings::unimport"} = sub {0}; } } use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; # # Returns true/false, depending whether the given the argument # satisfies the LUHN checksum. # See http://www.webopedia.com/TERM/L/Luhn_formula.html. # # Note that this function is intended to be called from regular # expression, so it should NOT use a regular expression in any way. # sub luhn { my $arg = shift; my $even = 0; my $sum = 0; while (length $arg) { my $num = chop $arg; return if $num lt '0' || $num gt '9'; if ($even && (($num *= 2) > 9)) {$num = 1 + ($num % 10)} $even = 1 - $even; $sum += $num; } !($sum % 10) } sub import { my $pack = shift; my $caller = caller; no strict 'refs'; *{$caller . "::" . $_} = \&{$pack . "::" . $_} for @_; } 1; __END__ =pod =head1 NAME Regexp::Common::support -- Support functions for Regexp::Common. =head1 SYNOPSIS use Regexp::Common::_support qw /luhn/; luhn ($number) # Returns true/false. =head1 DESCRIPTION This module contains some subroutines to be used by other C modules. It's not intended to be used directly. Subroutines from the module may disappear without any notice, or their meaning or interface may change without notice. =over 4 =item luhn This subroutine returns true if its argument passes the luhn checksum test. =back =head1 SEE ALSO L. =head1 AUTHOR Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/SEN.pm000064400000006731147634422050006775 0ustar00package Regexp::Common::SEN; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; =begin does_not_exist sub par11 { my $string = shift; my $sum = 0; for my $i (0 .. length ($string) - 1) { my $c = substr ($string, $i, 1); $sum += $c * (length ($string) - $i) } !($sum % 11) } =end does_not_exist =cut # http://www.ssa.gov/history/ssn/geocard.html pattern name => [qw /SEN USA SSN -sep=-/], create => sub { my $sep = $_ [1] {-sep}; "(?k:(?k:[1-9][0-9][0-9]|0[1-9][0-9]|00[1-9])$sep" . "(?k:[1-9][0-9]|0[1-9])$sep" . "(?k:[1-9][0-9][0-9][0-9]|0[1-9][0-9][0-9]|" . "00[1-9][0-9]|000[1-9]))" }, ; =begin does_not_exist It's not clear whether this is the right checksum. # http://www.google.nl/search?q=cache:8m1zKNYrEO0J:www.enschede.nl/nieuw/projecten/aanbesteding/integratie/pve%2520Bijlage%25207.5.doc+Sofi+nummer+formaat&hl=en&start=56&lr=lang_en|lang_nl&ie=UTF-8 pattern name => [qw /SEN Netherlands SoFi/], create => sub { # 9 digits (d1 d2 d3 d4 d5 d6 d7 d8 d9) # 9*d1 + 8*d2 + 7*d3 + 6*d4 + 5*d5 + 4*d6 + 3*d7 + 2*d8 + 1*d9 # == 0 mod 11. qr /([0-9]{9})(?(?{par11 ($^N)})|(?!))/; } ; =end does_not_exist =cut 1; __END__ =pod =head1 NAME Regexp::Common::SEN -- provide regexes for Social-Economical Numbers. =head1 SYNOPSIS use Regexp::Common qw /SEN/; while (<>) { /^$RE{SEN}{USA}{SSN}$/ and print "Social Security Number\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{SEN}{USA}{SSN}{-sep}> Returns a pattern that matches an American Social Security Number (SSN). SSNs consist of three groups of numbers, separated by a hypen (C<->). This pattern only checks for a valid structure, that is, it validates whether a number is valid SSN, was a valid SSN, or maybe a valid SSN in the future. There are almost a billion possible SSNs, and about 400 million are in use, or have been in use. If C<-sep=I

> is specified, the pattern I

is used as the separator between the groups of numbers. Under C<-keep> (see L): =over 4 =item $1 captures the entire SSN. =item $2 captures the first group of digits (the area number). =item $3 captures the second group of digits (the group number). =item $4 captures the third group of digits (the serial number). =back =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHORS Damian Conway and Abigail. =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI.pm000064400000006540147634422050007005 0ustar00package Regexp::Common::URI; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use Exporter (); use vars qw /@EXPORT_OK @ISA/; @ISA = qw /Exporter/; @EXPORT_OK = qw /register_uri/; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; # Use 'require' here, not 'use', so we delay running them after we are compiled. # We also do it using an 'eval'; this saves us from have repeated similar # lines. The eval is further explained in 'perldoc -f require'. my @uris = qw /fax file ftp gopher http pop prospero news tel telnet tv wais/; foreach my $uri (@uris) { eval "require Regexp::Common::URI::$uri"; die $@ if $@; } my %uris; sub register_uri { my ($scheme, $uri) = @_; $uris {$scheme} = $uri; } pattern name => [qw (URI)], create => sub {my $uri = join '|' => values %uris; $uri =~ s/\(\?k:/(?:/g; "(?k:$uri)"; }, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI -- provide patterns for URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{HTTP}/ and print "Contains an HTTP URI.\n"; } =head1 DESCRIPTION Patterns for the following URIs are supported: fax, file, FTP, gopher, HTTP, news, NTTP, pop, prospero, tel, telnet, tv and WAIS. Each is documented in the I>, manual page, for the appropriate scheme (in lowercase), except for I URIs which are found in I. =head2 C<$RE{URI}> Return a pattern that recognizes any of the supported URIs. With C<{-keep}>, only the entire URI is returned (in C<$1>). =head1 REFERENCES =over 4 =item B<[DRAFT-URI-TV]> Zigmond, D. and Vickers, M: I. December 2000. =item B<[DRAFT-URL-FTP]> Casey, James: I. November 1996. =item B<[RFC 1035]> Mockapetris, P.: I. November 1987. =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =item B<[RFC 2616]> Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P. and Berners-Lee, Tim: I. June 1999. =item B<[RFC 2806]> Vaha-Sipila, A.: I. April 2000. =back =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/list.pm000064400000010216147634422050007314 0ustar00package Regexp::Common::list; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; sub gen_list_pattern { my ($pat, $sep, $lsep) = @_; $lsep = $sep unless defined $lsep; return "(?k:(?:(?:$pat)(?:$sep))*(?:$pat)(?k:$lsep)(?:$pat))"; } my $defpat = '.*?\S'; my $defsep = '\s*,\s*'; pattern name => ['list', "-pat=$defpat", "-sep=$defsep", '-lastsep'], create => sub {gen_list_pattern (@{$_[1]}{-pat, -sep, -lastsep})}, ; pattern name => ['list', 'conj', '-word=(?:and|or)'], create => sub {gen_list_pattern($defpat, $defsep, '\s*,?\s*'.$_[1]->{-word}.'\s*'); }, ; pattern name => ['list', 'and'], create => sub {gen_list_pattern ($defpat, $defsep, '\s*,?\s*and\s*')}, ; pattern name => ['list', 'or'], create => sub {gen_list_pattern ($defpat, $defsep, '\s*,?\s*or\s*')}, ; 1; __END__ =pod =head1 NAME Regexp::Common::list -- provide regexes for lists =head1 SYNOPSIS use Regexp::Common qw /list/; while (<>) { /$RE{list}{-pat => '\w+'}/ and print "List of words"; /$RE{list}{-pat => $RE{num}{real}}/ and print "List of numbers"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{list}{-pat}{-sep}{-lastsep}> Returns a pattern matching a list of (at least two) substrings. If C<-pat=I

> is specified, it defines the pattern for each substring in the list. By default, I

is C. In Regexp::Common 0.02 or earlier, the default pattern was C. But that will match a single space, causing unintended parsing of C as a list of four elements instead of 3 (with C<-word> being C<(?:and)>). One consequence is that a list of the form "a,,b" will no longer be parsed. Use the pattern C to be able to parse this, but see the previous remark. If C<-sep=I

> is specified, it defines the pattern I

to be used as a separator between each pair of substrings in the list, except the final two. By default I

is C. If C<-lastsep=I

> is specified, it defines the pattern I

to be used as a separator between the final two substrings in the list. By default I

is the same as the pattern specified by the C<-sep> flag. For example: $RE{list}{-pat=>'\w+'} # match a list of word chars $RE{list}{-pat=>$RE{num}{real}} # match a list of numbers $RE{list}{-sep=>"\t"} # match a tab-separated list $RE{list}{-lastsep=>',\s+and\s+'} # match a proper English list Under C<-keep>: =over 4 =item $1 captures the entire list =item $2 captures the last separator =back =head2 C<$RE{list}{conj}{-word=I}> An alias for C<< $RE{list}{-lastsep=>'\s*,?\s*I\s*'} >> If C<-word> is not specified, the default pattern is C. For example: $RE{list}{conj}{-word=>'et'} # match Jean, Paul, et Satre $RE{list}{conj}{-word=>'oder'} # match Bonn, Koln oder Hamburg =head2 C<$RE{list}{and}> An alias for C<< $RE{list}{conj}{-word=>'and'} >> =head2 C<$RE{list}{or}> An alias for C<< $RE{list}{conj}{-word=>'or'} >> =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/whitespace.pm000064400000003633147634422050010502 0ustar00package Regexp::Common::whitespace; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; pattern name => [qw (ws crop)], create => '(?:^\s+|\s+$)', subs => sub {$_[1] =~ s/^\s+//; $_[1] =~ s/\s+$//;} ; 1; __END__ =pod =head1 NAME Regexp::Common::whitespace -- provides a regex for leading or trailing whitescape =head1 SYNOPSIS use Regexp::Common qw /whitespace/; while (<>) { s/$RE{ws}{crop}//g; # Delete surrounding whitespace } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. =head2 C<$RE{ws}{crop}> Returns a pattern that identifies leading or trailing whitespace. For example: $str =~ s/$RE{ws}{crop}//g; # Delete surrounding whitespace The call: $RE{ws}{crop}->subs($str); is optimized (but probably still slower than doing the s///g explicitly). This pattern does not capture under C<-keep>. =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/comment.pm000064400000075527147634422050010023 0ustar00package Regexp::Common::comment; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my @generic = ( {languages => [qw /ABC Forth/], to_eol => ['\\\\']}, # This is for just a *single* backslash. {languages => [qw /Ada Alan Eiffel lua/], to_eol => ['--']}, {languages => [qw /Advisor/], to_eol => ['#|//']}, {languages => [qw /Advsys CQL Lisp LOGO M MUMPS REBOL Scheme SMITH zonefile/], to_eol => [';']}, {languages => ['Algol 60'], from_to => [[qw /comment ;/]]}, {languages => [qw {ALPACA B C C-- LPC PL/I}], from_to => [[qw {/* */}]]}, {languages => [qw /awk fvwm2 Icon m4 mutt Perl Python QML R Ruby shell Tcl/], to_eol => ['#']}, {languages => [[BASIC => 'mvEnterprise']], to_eol => ['[*!]|REM']}, {languages => [qw /Befunge-98 Funge-98 Shelta/], id => [';']}, {languages => ['beta-Juliet', 'Crystal Report', 'Portia', 'Ubercode'], to_eol => ['//']}, {languages => ['BML'], from_to => [['']], }, {languages => [qw /C++/, 'C#', qw /Cg ECMAScript FPL Java JavaScript/], to_eol => ['//'], from_to => [[qw {/* */}]]}, {languages => [qw /CLU LaTeX slrn TeX/], to_eol => ['%']}, {languages => [qw /False/], from_to => [[qw !{ }!]]}, {languages => [qw /Fortran/], to_eol => ['!']}, {languages => [qw /Haifu/], id => [',']}, {languages => [qw /ILLGOL/], to_eol => ['NB']}, {languages => [qw /INTERCAL/], to_eol => [q{(?:(?:PLEASE(?:\s+DO)?|DO)\s+)?(?:NOT|N'T)}]}, {languages => [qw /J/], to_eol => ['NB[.]']}, {languages => [qw /JavaDoc/], from_to => [[qw {/** */}]]}, {languages => [qw /Nickle/], to_eol => ['#'], from_to => [[qw {/* */}]]}, {languages => [qw /Oberon/], from_to => [[qw /(* *)/]]}, {languages => [[qw /Pascal Delphi/], [qw /Pascal Free/], [qw /Pascal GPC/]], to_eol => ['//'], from_to => [[qw !{ }!], [qw !(* *)!]]}, {languages => [[qw /Pascal Workshop/]], id => [qw /"/], from_to => [[qw !{ }!], [qw !(* *)!], [qw !/* */!]]}, {languages => [qw /PEARL/], to_eol => ['!'], from_to => [[qw {/* */}]]}, {languages => [qw /PHP/], to_eol => ['#', '//'], from_to => [[qw {/* */}]]}, {languages => [qw !PL/B!], to_eol => ['[.;]']}, {languages => [qw !PL/SQL!], to_eol => ['--'], from_to => [[qw {/* */}]]}, {languages => [qw /Q-BAL/], to_eol => ['`']}, {languages => [qw /Smalltalk/], id => ['"']}, {languages => [qw /SQL/], to_eol => ['-{2,}']}, {languages => [qw /troff/], to_eol => ['\\\"']}, {languages => [qw /vi/], to_eol => ['"']}, {languages => [qw /*W/], from_to => [[qw {|| !!}]]}, {languages => [qw /ZZT-OOP/], to_eol => ["'"]}, ); my @plain_or_nested = ( [Caml => undef, "(*" => "*)"], [Dylan => "//", "/*" => "*/"], [Haskell => "-{2,}", "{-" => "-}"], [Hugo => "!(?!\\\\)", "!\\" => "\\!"], [SLIDE => "#", "(*" => "*)"], ['Modula-2' => undef, "(*" => "*)"], ['Modula-3' => undef, "(*" => "*)"], ); # # Helper subs. # sub combine { local $_ = join "|", @_; if (@_ > 1) { s/\(\?k:/(?:/g; $_ = "(?k:$_)"; } $_ } sub to_eol ($) {"(?k:(?k:$_[0])(?k:[^\\n]*)(?k:\\n))"} sub id ($) {"(?k:(?k:$_[0])(?k:[^$_[0]]*)(?k:$_[0]))"} # One char only! sub from_to { my ($begin, $end) = @_; my $qb = quotemeta $begin; my $qe = quotemeta $end; my $fe = quotemeta substr $end => 0, 1; my $te = quotemeta substr $end => 1; "(?k:(?k:$qb)(?k:(?:[^$fe]+|$fe(?!$te))*)(?k:$qe))"; } my $count = 0; sub nested { my ($begin, $end) = @_; $count ++; my $r = '(??{$Regexp::Common::comment ['. $count . ']})'; my $qb = quotemeta $begin; my $qe = quotemeta $end; my $fb = quotemeta substr $begin => 0, 1; my $fe = quotemeta substr $end => 0, 1; my $tb = quotemeta substr $begin => 1; my $te = quotemeta substr $end => 1; use re 'eval'; my $re; if ($fb eq $fe) { $re = qr /(?:$qb(?:(?>[^$fb]+)|$fb(?!$tb)(?!$te)|$r)*$qe)/; } else { local $" = "|"; my @clauses = "(?>[^$fb$fe]+)"; push @clauses => "$fb(?!$tb)" if length $tb; push @clauses => "$fe(?!$te)" if length $te; push @clauses => $r; $re = qr /(?:$qb(?:@clauses)*$qe)/; } $Regexp::Common::comment [$count] = qr/$re/; } # # Process data. # foreach my $info (@plain_or_nested) { my ($language, $mark, $begin, $end) = @$info; pattern name => [comment => $language], create => sub {my $re = nested $begin => $end; my $prefix = defined $mark ? $mark . "[^\n]*\n|" : ""; exists $_ [1] -> {-keep} ? qr /($prefix$re)/ : qr /$prefix$re/ }, version => 5.006, ; } foreach my $group (@generic) { my $pattern = combine +(map {to_eol $_} @{$group -> {to_eol}}), (map {from_to @$_} @{$group -> {from_to}}), (map {id $_} @{$group -> {id}}), ; foreach my $language (@{$group -> {languages}}) { pattern name => [comment => ref $language ? @$language : $language], create => $pattern, ; } } # # Other languages. # # http://www.pascal-central.com/docs/iso10206.txt pattern name => [qw /comment Pascal/], create => '(?k:' . '(?k:[{]|[(][*])' . '(?k:[^}*]*(?:[*](?![)])[^}*]*)*)' . '(?k:[}]|[*][)])' . ')' ; # http://www.templetons.com/brad/alice/language/ pattern name => [qw /comment Pascal Alice/], create => '(?k:(?k:[{])(?k:[^}\n]*)(?k:[}]))' ; # http://westein.arb-phys.uni-dortmund.de/~wb/a68s.txt pattern name => [qw (comment), 'Algol 68'], create => q {(?k:(?:#[^#]*#)|} . q {(?:\bco\b(?:[^c]+|\Bc|\bc(?!o\b))*\bco\b)|} . q {(?:\bcomment\b(?:[^c]+|\Bc|\bc(?!omment\b))*\bcomment\b))} ; # See rules 91 and 92 of ISO 8879 (SGML). # Charles F. Goldfarb: "The SGML Handbook". # Oxford: Oxford University Press. 1990. ISBN 0-19-853737-9. # Ch. 10.3, pp 390. pattern name => [qw (comment HTML)], create => q {(?k:(?k:))}, ; pattern name => [qw /comment SQL MySQL/], create => q {(?k:(?:#|-- )[^\n]*\n|} . q {/\*(?:(?>[^*;"']+)|"[^"]*"|'[^']*'|\*(?!/))*(?:;|\*/))}, ; # Anything that isn't <>[]+-., # http://home.wxs.nl/~faase009/Ha_BF.html pattern name => [qw /comment Brainfuck/], create => '(?k:[^<>\[\]+\-.,]+)' ; # Squeak is a variant of Smalltalk-80. # http://www.squeak. # http://mucow.com/squeak-qref.html pattern name => [qw /comment Squeak/], create => '(?k:(?k:")(?k:[^"]*(?:""[^"]*)*)(?k:"))' ; # # Scores of less than 5 or above 17.... # http://www.cliff.biffle.org/esoterica/beatnik.html @Regexp::Common::comment::scores = (1, 3, 3, 2, 1, 4, 2, 4, 1, 8, 5, 1, 3, 1, 1, 3, 10, 1, 1, 1, 1, 4, 4, 8, 4, 10); { my ($s, $x); pattern name => [qw /comment Beatnik/], create => sub { use re 'eval'; my $re = qr {\b([A-Za-z]+)\b (?(?{($s, $x) = (0, lc $^N); $s += $Regexp::Common::comment::scores [ord (chop $x) - ord ('a')] while length $x; $s >= 5 && $s < 18})XXX|)}x; $re; }, version => 5.008, ; } # http://www.cray.com/craydoc/manuals/007-3692-005/html-007-3692-005/ # (Goto table of contents/3.3 Source Form) # Fortran, in fixed format. Comments start with a C, c or * in the first # column, or a ! anywhere, but the sixth column. Then end with a newline. pattern name => [qw /comment Fortran fixed/], create => '(?k:(?k:(?:^[Cc*]|(? [qw /comment COBOL/], create => '(?<=^......)(?k:(?k:[*])(?k:[^\n]*)(?k:\n))', version => '5.008', ; 1; __END__ =pod =head1 NAME Regexp::Common::comment -- provide regexes for comments. =head1 SYNOPSIS use Regexp::Common qw /comment/; while (<>) { /$RE{comment}{C}/ and print "Contains a C comment\n"; /$RE{comment}{C++}/ and print "Contains a C++ comment\n"; /$RE{comment}{PHP}/ and print "Contains a PHP comment\n"; /$RE{comment}{Java}/ and print "Contains a Java comment\n"; /$RE{comment}{Perl}/ and print "Contains a Perl comment\n"; /$RE{comment}{awk}/ and print "Contains an awk comment\n"; /$RE{comment}{HTML}/ and print "Contains an HTML comment\n"; } use Regexp::Common qw /comment RE_comment_HTML/; while (<>) { $_ =~ RE_comment_HTML() and print "Contains an HTML comment\n"; } =head1 DESCRIPTION Please consult the manual of L for a general description of the works of this interface. Do not use this module directly, but load it via I. This modules gives you regular expressions for comments in various languages. =head2 THE LANGUAGES Below, the comments of each of the languages are described. The patterns are available as C<$RE{comment}{I}>, foreach language I. Some languages have variants; it's described at the individual languages how to get the patterns for the variants. Unless mentioned otherwise, C<{-keep}> sets C<$1>, C<$2>, C<$3> and C<$4> to the entire comment, the opening marker, the content of the comment, and the closing marker (for many languages, the latter is a newline) respectively. =over 4 =item ABC Comments in I start with a backslash (C<\>), and last till the end of the line. See L. =item Ada Comments in I start with C<-->, and last till the end of the line. =item Advisor I is a language used by the HP product I. Comments for this language start with either C<#> or C, and last till the end of the line. =item Advsys Comments for the I language start with C<;> and last till the end of the line. See also L. =item Alan I comments start with C<-->, and last till the end of the line. See also L. =item Algol 60 Comments in the I language start with the keyword C, and end with a C<;>. See L. =item Algol 68 In I, comments are either delimited by C<#>, or by one of the keywords C or C. The keywords should not be part of another word. See L. With C<{-keep}>, only C<$1> will be set, returning the entire comment. =item ALPACA The I language has comments starting with C and ending with C<*/>. =item awk The I programming language uses comments that start with C<#> and end at the end of the line. =item B The I language has comments starting with C and ending with C<*/>. =item BASIC There are various forms of BASIC around. Currently, we only support the variant supported by I, whose pattern is available as C<$RE{comment}{BASIC}{mvEnterprise}>. Comments in this language start with a C, a C<*> or the keyword C, and end till the end of the line. See L. =item Beatnik The esotoric language I only uses words consisting of letters. Words are scored according to the rules of Scrabble. Words scoring less than 5 points, or 18 points or more are considered comments (although the compiler might mock at you if you score less than 5 points). Regardless whether C<{-keep}>, C<$1> will be set, and set to the entire comment. This pattern requires I or newer. =item beta-Juliet The I programming language has comments that start with C and that continue till the end of the line. See also L. =item Befunge-98 The esotoric language I uses comments that start and end with a C<;>. See L. =item BML I, or I is an HTML templating language that uses comments starting with C<< >, and ending with C<< c_?> >>. See L. =item Brainfuck The minimal language I uses only eight characters, C>, C>, C<[>, C<]>, C<+>, C<->, C<.> and C<,>. Any other characters are considered comments. With C<{-keep}>, C<$1> is set to the entire comment. =item C The I language has comments starting with C and ending with C<*/>. =item C-- The I language has comments starting with C and ending with C<*/>. See L. =item C++ The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. =item C# The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. See L. =item Caml Comments in I start with C<(*>, end with C<*)>, and can be nested. See L and L. =item Cg The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. See L. =item CLU In C, a comment starts with a procent sign (C<%>), and ends with the next newline. See L and L. =item COBOL Traditionally, comments in I are indicated by an asteriks in the seventh column. This is what the pattern matches. Modern compiler may more lenient though. See L, and L. Due to a bug in the regexp engine of perl 5.6.x, this regexp is only available in version 5.8.0 and up. =item CQL Comments in the chess query language (I) start with a semi colon (C<;>) and last till the end of the line. See L. =item Crystal Report The formula editor in I uses comments that start with C, and end with the end of the line. =item Dylan There are two types of comments in I. They either start with C, or are nested comments, delimited with C and C<*/>. Under C<{-keep}>, only C<$1> will be set, returning the entire comment. This pattern requires I or newer. =item ECMAScript The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. I is Netscapes implementation of I. See L, and L. =item Eiffel I comments start with C<-->, and last till the end of the line. =item False In I, comments start with C<{> and end with C<}>. See L =item FPL The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. =item Forth Comments in Forth start with C<\>, and end with the end of the line. See also L. =item Fortran There are two forms of I. There's free form I, which has comments that start with C, and end at the end of the line. The pattern for this is given by C<$RE{Fortran}>. Fixed form I, which has been obsoleted, has comments that start with C, C or C<*> in the first column, or with C anywhere, but the sixth column. The pattern for this are given by C<$RE{Fortran}{fixed}>. See also L. =item Funge-98 The esotoric language I uses comments that start and end with a C<;>. =item fvwm2 Configuration files for I have comments starting with a C<#> and lasting the rest of the line. =item Haifu I, an esotoric language using haikus, has comments starting and ending with a C<,>. See L. =item Haskell There are two types of comments in I. They either start with at least two dashes, or are nested comments, delimited with C<{-> and C<-}>. Under C<{-keep}>, only C<$1> will be set, returning the entire comment. This pattern requires I or newer. =item HTML In I, comments only appear inside a I. A comment declaration starts with a C!>, and ends with a C>. Inside this declaration, we have zero or more comments. Comments starts with C<--> and end with C<-->, and are optionally followed by whitespace. The pattern C<$RE{comment}{HTML}> recognizes those comment declarations (and hence more than a comment). Note that this is not the same as something that starts with C!--> and ends with C<--E>, because the following will be matched completely: Second Comment Do not be fooled by what your favourite browser thinks is an HTML comment. If C<{-keep}> is used, the following are returned: =over 4 =item $1 captures the entire comment declaration. =item $2 captures the MDO (markup declaration open), C!>. =item $3 captures the content between the MDO and the MDC. =item $4 captures the (last) comment, without the surrounding dashes. =item $5 captures the MDC (markup declaration close), C>. =back =item Hugo There are two types of comments in I. They either start with C (which cannot be followed by a C<\>), or are nested comments, delimited with C and C<\!>. Under C<{-keep}>, only C<$1> will be set, returning the entire comment. This pattern requires I or newer. =item Icon I has comments that start with C<#> and end at the next new line. See L, L, and L. =item ILLGOL The esotoric language I uses comments starting with I and lasting till the end of the line. See L. =item INTERCAL Comments in INTERCAL are single line comments. They start with one of the keywords C or C, and can optionally be preceded by the keywords C and C. If both keywords are used, C precedes C. Keywords are separated by whitespace. =item J The language I uses comments that start with C, and that last till the end of the line. See L, and L. =item Java The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. =item JavaDoc The I documentation syntax is demarked with a subset of ordinary Java comments to separate it from code. Comments start with C end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. See L. =item JavaScript The I language has two forms of comments. Comments that start with C and last till the end of the line, and comments that start with C, and end with C<*/>. If C<{-keep}> is used, only C<$1> will be set, and set to the entire comment. I is Netscapes implementation of I. See L, and L. =item LaTeX The documentation language I uses comments starting with C<%> and ending at the end of the line. =item Lisp Comments in I start with a semi-colon (C<;>) and last till the end of the line. =item LPC The I language has comments starting with C and ending with C<*/>. =item LOGO Comments for the language I start with C<;>, and last till the end of the line. =item lua Comments for the I language start with C<-->, and last till the end of the line. See also L. =item M, MUMPS In C (aka C), comments start with a semi-colon, and last till the end of a line. The language specification requires the semi-colon to be preceded by one or more Is. Those characters default to a space, but that's configurable. This requirement, of preceding the comment with linestart characters is B tested for. See L, L, and L. =item m4 By default, the preprocessor language I uses single line comments, that start with a C<#> and continue to the end of the line, including the newline. The pattern C<$RE {comment} {m4}> matches such comments. In I, it is possible to change the starting token though. See L, L, and L. =item Modula-2 In C, comments start with C<(*>, and end with C<*)>. Comments may be nested. See L. =item Modula-3 In C, comments start with C<(*>, and end with C<*)>. Comments may be nested. See L. =item mutt Configuration files for I have comments starting with a C<#> and lasting the rest of the line. =item Nickle The I language has one line comments starting with C<#> (like Perl), or multiline comments delimited by C and C<*/> (like C). Under C<-keep>, only C<$1> will be set. See also L. =item Oberon Comments in I start with C<(*> and end with C<*)>. See L. =item Pascal There are many implementations of Pascal. This modules provides pattern for comments of several implementations. =over 4 =item C<$RE{comment}{Pascal}> This is the pattern that recognizes comments according to the Pascal ISO standard. This standard says that comments start with either C<{>, or C<(*>, and end with C<}> or C<*)>. This means that C<{*)> and C<(*}> are considered to be comments. Many Pascal applications don't allow this. See L =item C<$RE{comment}{Pascal}{Alice}> The I compiler accepts comments that start with C<{> and end with C<}>. Comments are not allowed to contain newlines. See L. =item C<$RE{comment}{Pascal}{Delphi}>, C<$RE{comment}{Pascal}{Free}> and C<$RE{comment}{Pascal}{GPC}> The I, I and the I implementations of Pascal all have comments that either start with C and last till the end of the line, are delimited with C<{> and C<}> or are delimited with C<(*> and C<*)>. Patterns for those comments are given by C<$RE{comment}{Pascal}{Delphi}>, C<$RE{comment}{Pascal}{Free}> and C<$RE{comment}{Pascal}{GPC}> respectively. These patterns only set C<$1> when C<{-keep}> is used, which will then include the entire comment. See L, L and L. =item C<$RE{comment}{Pascal}{Workshop}> The I compiler, from SUN Microsystems, allows comments that are delimited with either C<{> and C<}>, delimited with C<(*)> and C<*>), delimited with C, and C<*/>, or starting and ending with a double quote (C<">). When C<{-keep}> is used, only C<$1> is set, and returns the entire comment. See L. =back =item PEARL Comments in I start with a C and last till the end of the line, or start with C and end with C<*/>. With C<{-keep}>, C<$1> will be set to the entire comment. =item PHP Comments in I start with either C<#> or C and last till the end of the line, or are delimited by C and C<*/>. With C<{-keep}>, C<$1> will be set to the entire comment. =item PL/B In I, comments start with either C<.> or C<;>, and end with the next newline. See L. =item PL/I The I language has comments starting with C and ending with C<*/>. =item PL/SQL In I, comments either start with C<--> and run till the end of the line, or start with C and end with C<*/>. =item Perl I uses comments that start with a C<#>, and continue till the end of the line. =item Portia The I programming language has comments that start with C, and last till the end of the line. =item Python I uses comments that start with a C<#>, and continue till the end of the line. =item Q-BAL Comments in the I language start with C<`> (a backtick), and contine till the end of the line. =item QML In C, comments start with C<#> and last till the end of the line. See L. =item R The statistical language I uses comments that start with a C<#> and end with the following new line. See L. =item REBOL Comments for the I language start with C<;> and last till the end of the line. =item Ruby Comments in I start with C<#> and last till the end of the time. =item Scheme I comments start with C<;>, and last till the end of the line. See L. =item shell Comments in various Is start with a C<#> and end at the end of the line. =item Shelta The esotoric language I uses comments that start and end with a C<;>. See L. =item SLIDE The I language has two froms of comments. First there is the line comment, which starts with a C<#> and includes the rest of the line (just like Perl). Second, there is the multiline, nested comment, which are delimited by C<(*> and C<*)>. Under C{-keep}>, only C<$1> is set, and is set to the entire comment. This pattern needs at least Perl version 5.6.0. See L. =item slrn Configuration files for I have comments starting with a C<%> and lasting the rest of the line. =item Smalltalk I uses comments that start and end with a double quote, C<">. =item SMITH Comments in the I language start with C<;>, and last till the end of the line. =item Squeak In the Smalltalk variant I, comments start and end with C<">. Double quotes can appear inside comments by doubling them. =item SQL Standard I uses comments starting with two or more dashes, and ending at the end of the line. I does not follow the standard. Instead, it allows comments that start with a C<#> or C<-- > (that's two dashes and a space) ending with the following newline, and comments starting with C, and ending with the next C<;> or C<*/> that isn't inside single or double quotes. A pattern for this is returned by C<$RE{comment}{SQL}{MySQL}>. With C<{-keep}>, only C<$1> will be set, and it returns the entire comment. =item Tcl In I, comments start with C<#> and continue till the end of the line. =item TeX The documentation language I uses comments starting with C<%> and ending at the end of the line. =item troff The document formatting language I uses comments starting with C<\">, and continuing till the end of the line. =item Ubercode The Windows programming language I uses comments that start with C and continue to the end of the line. See L. =item vi In configuration files for the editor I, one can use comments starting with C<">, and ending at the end of the line. =item *W In the language I<*W>, comments start with C<||>, and end with C. =item zonefile Comments in DNS Is start with C<;>, and continue till the end of the line. =item ZZT-OOP The in-game language I uses comments that start with a C<'> character, and end at the following newline. See L. =back =head1 REFERENCES =over 4 =item B<[Go 90]> Charles F. Goldfarb: I. Oxford: Oxford University Press. B<1990>. ISBN 0-19-853737-9. Ch. 10.3, pp 390-391. =back =head1 SEE ALSO L for a general description of how to use this interface. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/file.pm000064400000004057147634422050007725 0ustar00package Regexp::Common::URI::file; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$host $fpath/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $scheme = 'file'; my $uri = "(?k:(?k:$scheme)://(?k:(?k:(?:$host|localhost)?)" . "(?k:/(?k:$fpath))))"; register_uri $scheme => $uri; pattern name => [qw (URI file)], create => $uri, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::file -- Returns a pattern for file URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{file}/ and print "Contains a file URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{file} Returns a pattern that matches I URIs, as defined by RFC 1738. File URIs have the form: "file:" "//" [ host | "localhost" ] "/" fpath Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The part of the URI following "file://". =item $4 The hostname. =item $5 The path name, including the leading slash. =item $6 The path name, without the leading slash. =back =head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/http.pm000064400000005553147634422050007767 0ustar00package Regexp::Common::URI::http; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC2396 qw /$host $port $path_segments $query/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $http_uri = "(?k:(?k:http)://(?k:$host)(?::(?k:$port))?" . "(?k:/(?k:(?k:$path_segments)(?:[?](?k:$query))?))?)"; my $https_uri = $http_uri; $https_uri =~ s/http/https?/; register_uri HTTP => $https_uri; pattern name => [qw (URI HTTP), "-scheme=http"], create => sub { my $scheme = $_ [1] -> {-scheme}; my $uri = $http_uri; $uri =~ s/http/$scheme/; $uri; } ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::http -- Returns a pattern for HTTP URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{HTTP}/ and print "Contains an HTTP URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{HTTP}{-scheme} Provides a regex for an HTTP URI as defined by RFC 2396 (generic syntax) and RFC 2616 (HTTP). If C<< -scheme => I

>> is specified the pattern I

is used as the scheme. By default I

is C. C and C are reasonable alternatives. The syntax for an HTTP URI is: "http:" "//" host [ ":" port ] [ "/" path [ "?" query ]] Under C<{-keep}>, the following are returned: =over 4 =item $1 The entire URI. =item $2 The scheme. =item $3 The host (name or address). =item $4 The port (if any). =item $5 The absolute path, including the query and leading slash. =item $6 The absolute path, including the query, without the leading slash. =item $7 The absolute path, without the query or leading slash. =item $8 The query, without the question mark. =back =head1 REFERENCES =over 4 =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =item B<[RFC 2616]> Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P. and Berners-Lee, Tim: I. June 1999. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/RFC2396.pm000064400000012076147634422050007744 0ustar00package Regexp::Common::URI::RFC2396; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; use vars qw /@EXPORT @EXPORT_OK %EXPORT_TAGS @ISA/; use Exporter (); @ISA = qw /Exporter/; my %vars; BEGIN { $vars {low} = [qw /$digit $upalpha $lowalpha $alpha $alphanum $hex $escaped $mark $unreserved $reserved $pchar $uric $urics $userinfo $userinfo_no_colon $uric_no_slash/]; $vars {parts} = [qw /$query $fragment $param $segment $path_segments $ftp_segments $rel_segment $abs_path $rel_path $path/]; $vars {connect} = [qw /$port $IPv4address $toplabel $domainlabel $hostname $host $hostport $server $reg_name $authority/]; $vars {URI} = [qw /$scheme $net_path $opaque_part $hier_part $relativeURI $absoluteURI $URI_reference/]; } use vars map {@$_} values %vars; @EXPORT = (); @EXPORT_OK = map {@$_} values %vars; %EXPORT_TAGS = (%vars, ALL => [@EXPORT_OK]); # RFC 2396, base definitions. $digit = '[0-9]'; $upalpha = '[A-Z]'; $lowalpha = '[a-z]'; $alpha = '[a-zA-Z]'; # lowalpha | upalpha $alphanum = '[a-zA-Z0-9]'; # alpha | digit $hex = '[a-fA-F0-9]'; $escaped = "(?:%$hex$hex)"; $mark = "[\\-_.!~*'()]"; $unreserved = "[a-zA-Z0-9\\-_.!~*'()]"; # alphanum | mark # %61-%7A, %41-%5A, %30-%39 # a - z A - Z 0 - 9 # %21, %27, %28, %29, %2A, %2D, %2E, %5F, %7E # ! ' ( ) * - . _ ~ $reserved = "[;/?:@&=+\$,]"; $pchar = "(?:[a-zA-Z0-9\\-_.!~*'():\@&=+\$,]|$escaped)"; # unreserved | escaped | [:@&=+$,] $uric = "(?:[;/?:\@&=+\$,a-zA-Z0-9\\-_.!~*'()]|$escaped)"; # reserved | unreserved | escaped $urics = "(?:(?:[;/?:\@&=+\$,a-zA-Z0-9\\-_.!~*'()]+|" . "$escaped)*)"; $query = $urics; $fragment = $urics; $param = "(?:(?:[a-zA-Z0-9\\-_.!~*'():\@&=+\$,]+|$escaped)*)"; $segment = "(?:$param(?:;$param)*)"; $path_segments = "(?:$segment(?:/$segment)*)"; $ftp_segments = "(?:$param(?:/$param)*)"; # NOT from RFC 2396. $rel_segment = "(?:(?:[a-zA-Z0-9\\-_.!~*'();\@&=+\$,]*|$escaped)+)"; $abs_path = "(?:/$path_segments)"; $rel_path = "(?:$rel_segment(?:$abs_path)?)"; $path = "(?:(?:$abs_path|$rel_path)?)"; $port = "(?:$digit*)"; $IPv4address = "(?:$digit+[.]$digit+[.]$digit+[.]$digit+)"; $toplabel = "(?:$alpha"."[-a-zA-Z0-9]*$alphanum|$alpha)"; $domainlabel = "(?:(?:$alphanum"."[-a-zA-Z0-9]*)?$alphanum)"; $hostname = "(?:(?:$domainlabel\[.])*$toplabel\[.]?)"; $host = "(?:$hostname|$IPv4address)"; $hostport = "(?:$host(?::$port)?)"; $userinfo = "(?:(?:[a-zA-Z0-9\\-_.!~*'();:&=+\$,]+|$escaped)*)"; $userinfo_no_colon = "(?:(?:[a-zA-Z0-9\\-_.!~*'();&=+\$,]+|$escaped)*)"; $server = "(?:(?:$userinfo\@)?$hostport)"; $reg_name = "(?:(?:[a-zA-Z0-9\\-_.!~*'()\$,;:\@&=+]*|$escaped)+)"; $authority = "(?:$server|$reg_name)"; $scheme = "(?:$alpha"."[a-zA-Z0-9+\\-.]*)"; $net_path = "(?://$authority$abs_path?)"; $uric_no_slash = "(?:[a-zA-Z0-9\\-_.!~*'();?:\@&=+\$,]|$escaped)"; $opaque_part = "(?:$uric_no_slash$urics)"; $hier_part = "(?:(?:$net_path|$abs_path)(?:[?]$query)?)"; $relativeURI = "(?:(?:$net_path|$abs_path|$rel_path)(?:[?]$query)?"; $absoluteURI = "(?:$scheme:(?:$hier_part|$opaque_part))"; $URI_reference = "(?:(?:$absoluteURI|$relativeURI)?(?:#$fragment)?)"; 1; __END__ =pod =head1 NAME Regexp::Common::URI::RFC2396 -- Definitions from RFC2396; =head1 SYNOPSIS use Regexp::Common::URI::RFC2396 qw /:ALL/; =head1 DESCRIPTION This package exports definitions from RFC2396. It's intended usage is for Regexp::Common::URI submodules only. Its interface might change without notice. =head1 REFERENCES =over 4 =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =back =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/prospero.pm000064400000004172147634422050010655 0ustar00package Regexp::Common::URI::prospero; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$host $port $ppath $fieldname $fieldvalue $fieldspec/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $scheme = 'prospero'; my $uri = "(?k:(?k:$scheme)://(?k:$host)(?::(?k:$port))?" . "/(?k:$ppath)(?k:$fieldspec*))"; register_uri $scheme => $uri; pattern name => [qw (URI prospero)], create => $uri, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::prospero -- Returns a pattern for prospero URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{prospero}/ and print "Contains a prospero URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{prospero} Returns a pattern that matches I URIs, as defined by RFC 1738. prospero URIs have the form: "prospero:" "//" host [ ":" port ] "/" path [ fieldspec ] * Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The I. =item $3 The I. =item $4 The I, if given. =item $5 The propero path. =item $6 The field specifications, if given. There can be more field specifications; they will all be returned in C<$6>. =back =head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Abigail. (I). =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/fax.pm000064400000005324147634422050007562 0ustar00package Regexp::Common::URI::fax; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC2806 qw /$fax_subscriber $fax_subscriber_no_future/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $fax_scheme = 'fax'; my $fax_uri = "(?k:(?k:$fax_scheme):(?k:$fax_subscriber))"; my $fax_uri_nf = "(?k:(?k:$fax_scheme):(?k:$fax_subscriber_no_future))"; register_uri $fax_scheme => $fax_uri; pattern name => [qw (URI fax)], create => $fax_uri ; pattern name => [qw (URI fax nofuture)], create => $fax_uri_nf ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::fax -- Returns a pattern for fax URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{fax}/ and print "Contains a fax URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{fax} Returns a pattern that matches I URIs, as defined by RFC 2806. Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The phone number, including any possible add-ons like ISDN subaddress, a post dial part, area specifier, service provider, etc. =back =head2 C<$RE{URI}{fax}{nofuture}> As above (including what's returned by C<{-keep}>), with the exception that I are not allowed. Without allowing those I, it becomes much easier to check a URI if the correct syntax for post dial, service provider, phone context, etc has been used - otherwise the regex could always classify them as a I. =head1 REFERENCES =over 4 =item B<[RFC 1035]> Mockapetris, P.: I. November 1987. =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =item B<[RFC 2806]> Vaha-Sipila, A.: I. April 2000. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/tel.pm000064400000005370147634422050007571 0ustar00package Regexp::Common::URI::tel; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC2806 qw /$telephone_subscriber $telephone_subscriber_no_future/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $tel_scheme = 'tel'; my $tel_uri = "(?k:(?k:$tel_scheme):(?k:$telephone_subscriber))"; my $tel_uri_nf = "(?k:(?k:$tel_scheme):(?k:$telephone_subscriber_no_future))"; register_uri $tel_scheme => $tel_uri; pattern name => [qw (URI tel)], create => $tel_uri ; pattern name => [qw (URI tel nofuture)], create => $tel_uri_nf ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::tel -- Returns a pattern for telephone URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{tel}/ and print "Contains a telephone URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{tel} Returns a pattern that matches I URIs, as defined by RFC 2806. Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The phone number, including any possible add-ons like ISDN subaddress, a post dial part, area specifier, service provider, etc. =back =head2 C<$RE{URI}{tel}{nofuture}> As above (including what's returned by C<{-keep}>), with the exception that I are not allowed. Without allowing those I, it becomes much easier to check a URI if the correct syntax for post dial, service provider, phone context, etc has been used - otherwise the regex could always classify them as a I. =head1 REFERENCES =over 4 =item B<[RFC 1035]> Mockapetris, P.: I. November 1987. =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =item B<[RFC 2806]> Vaha-Sipila, A.: I. April 2000. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/RFC2806.pm000064400000017622147634422050007742 0ustar00package Regexp::Common::URI::RFC2806; use Regexp::Common::URI::RFC1035 qw /$domain/; use Regexp::Common::URI::RFC2396 qw /$unreserved $escaped $hex/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; use vars qw /@EXPORT @EXPORT_OK %EXPORT_TAGS @ISA/; use Exporter (); @ISA = qw /Exporter/; my %vars; BEGIN { $vars {low} = [qw /$dtmf_digit $wait_for_dial_tone $one_second_pause $pause_character $visual_separator $phonedigit $escaped_no_dquote $quoted_string $token_char $token_chars/]; $vars {parts} = [qw /$future_extension/]; $vars {connect} = [qw /$provider_hostname $provider_tag $service_provider $private_prefix $local_network_prefix $global_network_prefix $network_prefix/]; $vars {phone} = [qw /$phone_context_ident $phone_context_tag $area_specifier $post_dial $isdn_subaddress $t33_subaddress $local_phone_number $local_phone_number_no_future $base_phone_number $global_phone_number $global_phone_number_no_future $telephone_subscriber $telephone_subscriber_no_future/]; $vars {fax} = [qw /$fax_local_phone $fax_local_phone_no_future $fax_global_phone $fax_global_phone_no_future $fax_subscriber $fax_subscriber_no_future/]; $vars {modem} = [qw //]; } use vars map {@$_} values %vars; @EXPORT = (); @EXPORT_OK = map {@$_} values %vars; %EXPORT_TAGS = (%vars, ALL => [@EXPORT_OK]); # RFC 2806, URIs for tel, fax & modem. $dtmf_digit = "(?:[*#ABCD])"; $wait_for_dial_tone= "(?:w)"; $one_second_pause = "(?:p)"; $pause_character = "(?:[wp])"; # wait_for_dial_tone | one_second_pause. $visual_separator = "(?:[\\-.()])"; $phonedigit = "(?:[0-9\\-.()])"; # DIGIT | visual_separator $escaped_no_dquote = "(?:%(?:[01]$hex)|2[013-9A-Fa-f]|[3-9A-Fa-f]$hex)"; $quoted_string = "(?:%22(?:(?:%5C(?:$unreserved|$escaped))|" . "$unreserved+|$escaped_no_dquote)*%22)"; # It is unclear wether we can allow only unreserved # characters to unescaped, or can we also use uric # characters that are unescaped? Or pchars? $token_char = "(?:[!'*\\-.0-9A-Z_a-z~]|" . "%(?:2[13-7ABDEabde]|3[0-9]|4[1-9A-Fa-f]|" . "5[AEFaef]|6[0-9A-Fa-f]|7[0-9ACEace]))"; # Only allowing unreserved chars to be unescaped. $token_chars = "(?:(?:[!'*\\-.0-9A-Z_a-z~]+|" . "%(?:2[13-7ABDEabde]|3[0-9]|4[1-9A-Fa-f]|" . "5[AEFaef]|6[0-9A-Fa-f]|7[0-9ACEace]))*)"; $future_extension = "(?:;$token_chars" . "(?:=(?:(?:$token_chars(?:[?]$token_chars)?)|" . "$quoted_string))?)"; $provider_hostname = $domain; $provider_tag = "(?:tsp)"; $service_provider = "(?:;$provider_tag=$provider_hostname)"; $private_prefix = "(?:(?:[!'E-OQ-VX-Z_e-oq-vx-z~]|" . "(?:%(?:2[124-7CFcf]|3[AC-Fac-f]|4[05-9A-Fa-f]|" . "5[1-689A-Fa-f]|6[05-9A-Fa-f]|" . "7[1-689A-Ea-e])))" . "(?:[!'()*\\-.0-9A-Z_a-z~]+|" . "(?:%(?:2[1-9A-Fa-f]|3[AC-Fac-f]|" . "[4-6][0-9A-Fa-f]|7[0-9A-Ea-e])))*)"; $local_network_prefix = "(?:[0-9\\-.()*#ABCDwp]+)"; $global_network_prefix = "(?:[+][0-9\\-.()]+)"; $network_prefix = "(?:$global_network_prefix|$local_network_prefix)"; $phone_context_ident = "(?:$network_prefix|$private_prefix)"; $phone_context_tag = "(?:phone-context)"; $area_specifier = "(?:;$phone_context_tag=$phone_context_ident)"; $post_dial = "(?:;postd=[0-9\\-.()*#ABCDwp]+)"; $isdn_subaddress = "(?:;isub=[0-9\\-.()]+)"; $t33_subaddress = "(?:;tsub=[0-9\\-.()]+)"; $local_phone_number= "(?:[0-9\\-.()*#ABCDwp]+$isdn_subaddress?" . "$post_dial?$area_specifier" . "(?:$area_specifier|$service_provider|" . "$future_extension)*)"; $local_phone_number_no_future = "(?:[0-9\\-.()*#ABCDwp]+$isdn_subaddress?" . "$post_dial?$area_specifier" . "(?:$area_specifier|$service_provider)*)"; $fax_local_phone = "(?:[0-9\\-.()*#ABCDwp]+$isdn_subaddress?" . "$t33_subaddress?$post_dial?$area_specifier" . "(?:$area_specifier|$service_provider|" . "$future_extension)*)"; $fax_local_phone_no_future = "(?:[0-9\\-.()*#ABCDwp]+$isdn_subaddress?" . "$t33_subaddress?$post_dial?$area_specifier" . "(?:$area_specifier|$service_provider)*)"; $base_phone_number = "(?:[0-9\\-.()]+)"; $global_phone_number = "(?:[+]$base_phone_number$isdn_subaddress?" . "$post_dial?" . "(?:$area_specifier|$service_provider|" . "$future_extension)*)"; $global_phone_number_no_future = "(?:[+]$base_phone_number$isdn_subaddress?" . "$post_dial?" . "(?:$area_specifier|$service_provider)*)"; $fax_global_phone = "(?:[+]$base_phone_number$isdn_subaddress?" . "$t33_subaddress?$post_dial?" . "(?:$area_specifier|$service_provider|" . "$future_extension)*)"; $fax_global_phone_no_future = "(?:[+]$base_phone_number$isdn_subaddress?" . "$t33_subaddress?$post_dial?" . "(?:$area_specifier|$service_provider)*)"; $telephone_subscriber = "(?:$global_phone_number|$local_phone_number)"; $telephone_subscriber_no_future = "(?:$global_phone_number_no_future|" . "$local_phone_number_no_future)"; $fax_subscriber = "(?:$fax_global_phone|$fax_local_phone)"; $fax_subscriber_no_future = "(?:$fax_global_phone_no_future|" . "$fax_local_phone_no_future)"; 1; __END__ =pod =head1 NAME Regexp::Common::URI::RFC2806 -- Definitions from RFC2806; =head1 SYNOPSIS use Regexp::Common::URI::RFC2806 qw /:ALL/; =head1 DESCRIPTION This package exports definitions from RFC2806. It's intended usage is for Regexp::Common::URI submodules only. Its interface might change without notice. =head1 REFERENCES =over 4 =item B<[RFC 2616]> Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P. and Berners-Lee, Tim: I. June 1999. =back =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/RFC1738.pm000064400000011345147634422050007741 0ustar00package Regexp::Common::URI::RFC1738; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; use vars qw /@EXPORT @EXPORT_OK %EXPORT_TAGS @ISA/; use Exporter (); @ISA = qw /Exporter/; my %vars; BEGIN { $vars {low} = [qw /$digit $digits $hialpha $lowalpha $alpha $alphadigit $safe $extra $national $punctuation $unreserved $unreserved_range $reserved $uchar $uchars $xchar $xchars $hex $escape/]; $vars {connect} = [qw /$port $hostnumber $toplabel $domainlabel $hostname $host $hostport $user $password $login/]; $vars {parts} = [qw /$fsegment $fpath $group $article $grouppart $search $database $wtype $wpath $psegment $fieldname $fieldvalue $fieldspec $ppath/]; } use vars map {@$_} values %vars; @EXPORT = qw /$host/; @EXPORT_OK = map {@$_} values %vars; %EXPORT_TAGS = (%vars, ALL => [@EXPORT_OK]); # RFC 1738, base definitions. # Lowlevel definitions. $digit = '[0-9]'; $digits = '[0-9]+'; $hialpha = '[A-Z]'; $lowalpha = '[a-z]'; $alpha = '[a-zA-Z]'; # lowalpha | hialpha $alphadigit = '[a-zA-Z0-9]'; # alpha | digit $safe = '[-$_.+]'; $extra = "[!*'(),]"; $national = '[][{}|\\^~`]'; $punctuation = '[<>#%"]'; $unreserved_range = q [-a-zA-Z0-9$_.+!*'(),]; # alphadigit | safe | extra $unreserved = "[$unreserved_range]"; $reserved = '[;/?:@&=]'; $hex = '[a-fA-F0-9]'; $escape = "(?:%$hex$hex)"; $uchar = "(?:$unreserved|$escape)"; $uchars = "(?:(?:$unreserved|$escape)*)"; $xchar = "(?:[$unreserved_range;/?:\@&=]|$escape)"; $xchars = "(?:(?:[$unreserved_range;/?:\@&=]|$escape)*)"; # Connection related stuff. $port = "(?:$digits)"; $hostnumber = "(?:$digits\[.]$digits\[.]$digits\[.]$digits)"; $toplabel = "(?:$alpha\[-a-zA-Z0-9]*$alphadigit|$alpha)"; $domainlabel = "(?:(?:$alphadigit\[-a-zA-Z0-9]*)?$alphadigit)"; $hostname = "(?:(?:$domainlabel\[.])*$toplabel)"; $host = "(?:$hostname|$hostnumber)"; $hostport = "(?:$host(?::$port)?)"; $user = "(?:(?:[$unreserved_range;?&=]|$escape)*)"; $password = "(?:(?:[$unreserved_range;?&=]|$escape)*)"; $login = "(?:(?:$user(?::$password)?\@)?$hostport)"; # Parts (might require more if we add more URIs). # FTP/file $fsegment = "(?:(?:[$unreserved_range:\@&=]|$escape)*)"; $fpath = "(?:$fsegment(?:/$fsegment)*)"; # NNTP/news. $group = "(?:$alpha\[-A-Za-z0-9.+_]*)"; $article = "(?:(?:[$unreserved_range;/?:&=]|$escape)+" . '@' . "$host)"; $grouppart = "(?:[*]|$article|$group)"; # It's important that # $article goes before # $group. # WAIS. $search = "(?:(?:[$unreserved_range;:\@&=]|$escape)*)"; $database = $uchars; $wtype = $uchars; $wpath = $uchars; # prospero $psegment = "(?:(?:[$unreserved_range?:\@&=]|$escape)*)"; $fieldname = "(?:(?:[$unreserved_range?:\@&]|$escape)*)"; $fieldvalue = "(?:(?:[$unreserved_range?:\@&]|$escape)*)"; $fieldspec = "(?:;$fieldname=$fieldvalue)"; $ppath = "(?:$psegment(?:/$psegment)*)"; # # The various '(?:(?:[$unreserved_range ...]|$escape)*)' above need # some loop unrolling to speed up the match. # 1; __END__ =pod =head1 NAME Regexp::Common::URI::RFC1738 -- Definitions from RFC1738; =head1 SYNOPSIS use Regexp::Common::URI::RFC1738 qw /:ALL/; =head1 DESCRIPTION This package exports definitions from RFC1738. It's intended usage is for Regexp::Common::URI submodules only. Its interface might change without notice. =head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =back =head1 AUTHOR Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/RFC1035.pm000064400000004005147634422050007722 0ustar00package Regexp::Common::URI::RFC1035; use Regexp::Common qw /pattern clean no_defaults/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; use vars qw /@EXPORT @EXPORT_OK %EXPORT_TAGS @ISA/; use Exporter (); @ISA = qw /Exporter/; my %vars; BEGIN { $vars {low} = [qw /$digit $letter $let_dig $let_dig_hyp $ldh_str/]; $vars {parts} = [qw /$label $subdomain/]; $vars {domain} = [qw /$domain/]; } use vars map {@$_} values %vars; @EXPORT = qw /$host/; @EXPORT_OK = map {@$_} values %vars; %EXPORT_TAGS = (%vars, ALL => [@EXPORT_OK]); # RFC 1035. $digit = "[0-9]"; $letter = "[A-Za-z]"; $let_dig = "[A-Za-z0-9]"; $let_dig_hyp = "[-A-Za-z0-9]"; $ldh_str = "(?:[-A-Za-z0-9]+)"; $label = "(?:$letter(?:(?:$ldh_str){0,61}$let_dig)?)"; $subdomain = "(?:$label(?:[.]$label)*)"; $domain = "(?: |(?:$subdomain))"; 1; __END__ =pod =head1 NAME Regexp::Common::URI::RFC1035 -- Definitions from RFC1035; =head1 SYNOPSIS use Regexp::Common::URI::RFC1035 qw /:ALL/; =head1 DESCRIPTION This package exports definitions from RFC1035. It's intended usage is for Regexp::Common::URI submodules only. Its interface might change without notice. =head1 REFERENCES =over 4 =item B<[RFC 1035]> Mockapetris, P.: I. November 1987. =back =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/RFC2384.pm000064400000004464147634422050007743 0ustar00package Regexp::Common::URI::RFC2384; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI::RFC1738 qw /$unreserved_range $escape $hostport/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; use vars qw /@EXPORT @EXPORT_OK %EXPORT_TAGS @ISA/; use Exporter (); @ISA = qw /Exporter/; my %vars; BEGIN { $vars {low} = [qw /$achar_range $achar $achars $achar_more/]; $vars {connect} = [qw /$enc_sasl $enc_user $enc_ext $enc_auth_type $auth $user_auth $server/]; $vars {parts} = [qw /$pop_url/]; } use vars map {@$_} values %vars; @EXPORT = qw /$host/; @EXPORT_OK = map {@$_} values %vars; %EXPORT_TAGS = (%vars, ALL => [@EXPORT_OK]); # RFC 2384, POP3. # Lowlevel definitions. $achar_range = "$unreserved_range&=~"; $achar = "(?:[$achar_range]|$escape)"; $achars = "(?:(?:[$achar_range]+|$escape)*)"; $achar_more = "(?:(?:[$achar_range]+|$escape)+)"; $enc_sasl = $achar_more; $enc_user = $achar_more; $enc_ext = "(?:[+](?:APOP|$achar_more))"; $enc_auth_type = "(?:$enc_sasl|$enc_ext)"; $auth = "(?:;AUTH=(?:[*]|$enc_auth_type))"; $user_auth = "(?:$enc_user$auth?)"; $server = "(?:(?:$user_auth\@)?$hostport)"; $pop_url = "(?:pop://$server)"; 1; __END__ =pod =head1 NAME Regexp::Common::URI::RFC2384 -- Definitions from RFC2384; =head1 SYNOPSIS use Regexp::Common::URI::RFC2384 qw /:ALL/; =head1 DESCRIPTION This package exports definitions from RFC2384. It's intended usage is for Regexp::Common::URI submodules only. Its interface might change without notice. =head1 REFERENCES =over 4 =item B<[RFC 2384]> Gellens, R.: I August 1998. =back =head1 AUTHOR Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/tv.pm000064400000003751147634422050007437 0ustar00# TV URLs. # Internet draft: draft-zigmond-tv-url-03.txt package Regexp::Common::URI::tv; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC2396 qw /$hostname/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $tv_scheme = 'tv'; my $tv_url = "(?k:(?k:$tv_scheme):(?k:$hostname)?)"; register_uri $tv_scheme => $tv_url; pattern name => [qw (URI tv)], create => $tv_url, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::tv -- Returns a pattern for tv URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{tv}/ and print "Contains a tv URI.\n"; } =head1 DESCRIPTION =head2 C<$RE{URI}{tv}> Returns a pattern that recognizes TV uris as per an Internet draft [DRAFT-URI-TV]. Under C<{-keep}>, the following are returned: =over 4 =item $1 The entire URI. =item $2 The scheme. =item $3 The host. =back =head1 REFERENCES =over 4 =item B<[DRAFT-URI-TV]> Zigmond, D. and Vickers, M: I. December 2000. =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/pop.pm000064400000004054147634422050007601 0ustar00package Regexp::Common::URI::pop; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$host $port/; use Regexp::Common::URI::RFC2384 qw /$enc_user $enc_auth_type/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $scheme = "pop"; my $uri = "(?k:(?k:$scheme)://(?:(?k:$enc_user)" . "(?:;AUTH=(?k:[*]|$enc_auth_type))?\@)?" . "(?k:$host)(?::(?k:$port))?)"; register_uri $scheme => $uri; pattern name => [qw (URI POP)], create => $uri, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::pop -- Returns a pattern for POP URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{POP}/ and print "Contains a POP URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{POP} Returns a pattern that matches I URIs, as defined by RFC 2384. POP URIs have the form: "pop:" "//" [ user [ ";AUTH" ( "*" | auth_type ) ] "@" ] host [ ":" port ] Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The I. =item $3 The I, if given. =item $4 The I, if given (could be a I<*>). =item $5 The I. =item $6 The I, if given. =back =head1 REFERENCES =over 4 =item B<[RFC 2384]> Gellens, R.: I. August 1998. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Abigail. (I). =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/telnet.pm000064400000004413147634422050010275 0ustar00package Regexp::Common::URI::telnet; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$user $password $host $port/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $telnet_uri = "(?k:(?k:telnet)://(?:(?k:(?k:$user)(?::(?k:$password))?)\@)?" . "(?k:(?k:$host)(?::(?k:$port))?)(?k:/)?)"; register_uri telnet => $telnet_uri; pattern name => [qw (URI telnet)], create => $telnet_uri, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::telnet -- Returns a pattern for telnet URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{telnet}/ and print "Contains a telnet URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{telnet} Returns a pattern that matches I URIs, as defined by RFC 1738. Telnet URIs have the form: "telnet:" "//" [ user [ ":" password ] "@" ] host [ ":" port ] [ "/" ] Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The username:password combo, or just the username if there is no password. =item $4 The username, if given. =item $5 The password, if given. =item $6 The host:port combo, or just the host if there's no port. =item $7 The host. =item $8 The port, if given. =item $9 The trailing slash, if any. =back =head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/RFC1808.pm000064400000010455147634422050007740 0ustar00package Regexp::Common::URI::RFC1808; BEGIN { # This makes sure 'use warnings' doesn't bomb out on 5.005_*; # warnings won't be enabled on those old versions though. if ($] < 5.006 && !exists $INC {"warnings.pm"}) { $INC {"warnings.pm"} = 1; no strict 'refs'; *{"warnings::unimport"} = sub {0}; } } use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; use vars qw /@EXPORT @EXPORT_OK %EXPORT_TAGS @ISA/; use Exporter (); @ISA = qw /Exporter/; my %vars; BEGIN { $vars {low} = [qw /$punctuation $reserved_range $reserved $national $extra $safe $digit $digits $hialpha $lowalpha $alpha $alphadigit $hex $escape $unreserved_range $unreserved $uchar $uchars $pchar_range $pchar $pchars/], $vars {parts} = [qw /$fragment $query $param $params $segment $fsegment $path $net_loc $scheme $rel_path $abs_path $net_path $relativeURL $generic_RL $absoluteURL $URL/], } use vars map {@$_} values %vars; @EXPORT = qw /$host/; @EXPORT_OK = map {@$_} values %vars; %EXPORT_TAGS = (%vars, ALL => [@EXPORT_OK]); # RFC 1808, base definitions. # Lowlevel definitions. $punctuation = '[<>#%"]'; $reserved_range = q [;/?:@&=]; $reserved = "[$reserved_range]"; $national = '[][{}|\\^~`]'; $extra = "[!*'(),]"; $safe = '[-$_.+]'; $digit = '[0-9]'; $digits = '[0-9]+'; $hialpha = '[A-Z]'; $lowalpha = '[a-z]'; $alpha = '[a-zA-Z]'; # lowalpha | hialpha $alphadigit = '[a-zA-Z0-9]'; # alpha | digit $hex = '[a-fA-F0-9]'; $escape = "(?:%$hex$hex)"; $unreserved_range = q [-a-zA-Z0-9$_.+!*'(),]; # alphadigit | safe | extra $unreserved = "[$unreserved_range]"; $uchar = "(?:$unreserved|$escape)"; $uchars = "(?:(?:$unreserved+|$escape)*)"; $pchar_range = qq [$unreserved_range:\@&=]; $pchar = "(?:[$pchar_range]|$escape)"; $pchars = "(?:(?:[$pchar_range]+|$escape)*)"; # Parts $fragment = "(?:(?:[$unreserved_range$reserved_range]+|$escape)*)"; $query = "(?:(?:[$unreserved_range$reserved_range]+|$escape)*)"; $param = "(?:(?:[$pchar_range/]+|$escape)*)"; $params = "(?:$param(?:;$param)*)"; $segment = "(?:(?:[$pchar_range]+|$escape)*)"; $fsegment = "(?:(?:[$pchar_range]+|$escape)+)"; $path = "(?:$fsegment(?:/$segment)*)"; $net_loc = "(?:(?:[$pchar_range;?]+|$escape)*)"; $scheme = "(?:(?:[-a-zA-Z0-9+.]+|$escape)+)"; $rel_path = "(?:$path?(?:;$params)?(?:?$query)?)"; $abs_path = "(?:/$rel_path)"; $net_path = "(?://$net_loc$abs_path?)"; $relativeURL = "(?:$net_path|$abs_path|$rel_path)"; $generic_RL = "(?:$scheme:$relativeURL)"; $absoluteURL = "(?:$generic_RL|" . "(?:$scheme:(?:[$unreserved_range$reserved_range]+|$escape)*))"; $URL = "(?:(?:$absoluteURL|$relativeURL)(?:#$fragment)?)"; 1; __END__ =pod =head1 NAME Regexp::Common::URI::RFC1808 -- Definitions from RFC1808; =head1 SYNOPSIS use Regexp::Common::URI::RFC1808 qw /:ALL/; =head1 DESCRIPTION This package exports definitions from RFC1808. It's intended usage is for Regexp::Common::URI submodules only. Its interface might change without notice. =head1 REFERENCES =over 4 =item B<[RFC 1808]> Fielding, R.: I. June 1995. =back =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/news.pm000064400000005412147634422050007756 0ustar00package Regexp::Common::URI::news; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$grouppart $group $article $host $port $digits/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $news_scheme = 'news'; my $news_uri = "(?k:(?k:$news_scheme):(?k:$grouppart))"; my $nntp_scheme = 'nntp'; my $nntp_uri = "(?k:(?k:$nntp_scheme)://(?k:(?k:(?k:$host)(?::(?k:$port))?)" . "/(?k:$group)(?:/(?k:$digits))?))"; register_uri $news_scheme => $news_uri; register_uri $nntp_scheme => $nntp_uri; pattern name => [qw (URI news)], create => $news_uri, ; pattern name => [qw (URI NNTP)], create => $nntp_uri, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::news -- Returns a pattern for file URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{news}/ and print "Contains a news URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{news} Returns a pattern that matches I URIs, as defined by RFC 1738. News URIs have the form: "news:" ( "*" | group | article "@" host ) Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The part of the URI following "news://". =back =head2 $RE{URI}{NNTP} Returns a pattern that matches I URIs, as defined by RFC 1738. NNTP URIs have the form: "nntp://" host [ ":" port ] "/" group [ "/" digits ] Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The part of the URI following "nntp://". =item $4 The host and port, separated by a colon. If no port was given, just the host. =item $5 The host. =item $6 The port, if given. =item $7 The group. =item $8 The digits, if given. =back =head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/wais.pm000064400000004642147634422050007751 0ustar00package Regexp::Common::URI::wais; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$host $port $search $database $wtype $wpath/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $scheme = 'wais'; my $uri = "(?k:(?k:$scheme)://(?k:$host)(?::(?k:$port))?/(?k:(?k:$database)" . "(?k:[?](?k:$search)|/(?k:$wtype)/(?k:$wpath))?))"; register_uri $scheme => $uri; pattern name => [qw (URI WAIS)], create => $uri, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::wais -- Returns a pattern for WAIS URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{WAIS}/ and print "Contains a WAIS URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{WAIS} Returns a pattern that matches I URIs, as defined by RFC 1738. WAIS URIs have the form: "wais:" "//" host [ ":" port ] "/" database [ ( "?" search ) | ( "/" wtype "/" wpath ) ] Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The I. =item $3 The I. =item $4 The I, if given. =item $5 The I, followed by I or I, if given. =item $6 The I. =item $7 The part following the I if given, including the question mark or slash. =item $8 The I part, if given. =item $9 The I, if given. =item $10 The I, if given. =back =head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/gopher.pm000064400000010701147634422050010263 0ustar00package Regexp::Common::URI::gopher; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC1738 qw /$host $port $uchars/; use Regexp::Common::URI::RFC1808 qw /$pchars $pchar_range/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $pchars_notab = "(?:(?:[$pchar_range]+|" . "%(?:[1-9a-fA-F][0-9a-fA-F]|0[0-8a-fA-F]))*)"; my $gopherplus_string = $pchars; my $search = $pchars; my $search_notab = $pchars_notab; my $selector = $pchars; my $selector_notab = $pchars_notab; my $gopher_type = "(?:[0-9+IgT])"; my $scheme = "gopher"; my $uri = "(?k:(?k:$scheme)://(?k:$host)(?::(?k:$port))?" . "/(?k:(?k:$gopher_type)(?k:$selector)))"; my $uri_notab = "(?k:(?k:$scheme)://(?k:$host)(?::(?k:$port))?" . "/(?k:(?k:$gopher_type)(?k:$selector_notab)" . "(?:%09(?k:$search_notab)(?:%09(?k:$gopherplus_string))?)?))"; register_uri $scheme => $uri; pattern name => [qw (URI gopher -notab=)], create => sub { exists $_ [1] {-notab} && !defined $_ [1] {-notab} ? $uri_notab : $uri}, ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::gopher -- Returns a pattern for gopher URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{gopher}/ and print "Contains a gopher URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{gopher}{-notab} Gopher URIs are poorly defined. Originally, RFC 1738 defined gopher URIs, but they were later redefined in an internet draft. One that was expired in June 1997. The internet draft for gopher URIs defines them as follows: "gopher:" "//" host [ ":" port ] "/" gopher-type selector [ "%09" search [ "%09" gopherplus_string ]] Unfortunally, a I is defined in such a way that characters may be escaped using the URI escape mechanism. This includes tabs, which escaped are C<%09>. Hence, the syntax cannot distinguish between a URI that has both a I and a I part, and an URI where the I includes an escaped tab. (The text of the draft forbids tabs to be present in the I though). C<$RE{URI}{gopher}> follows the defined syntax. To disallow escaped tabs in the I and I parts, use C<$RE{URI}{gopher}{-notab}>. There are other differences between the text and the given syntax. According to the text, selector strings cannot have tabs, linefeeds or carriage returns in them. The text also allows the entire I, (the part after the slash following the hostport) to be empty; if this is empty the slash may be omitted as well. However, this isn't reflected in the syntax. Under C<{-keep}>, the following are returned: =over 4 =item $1 The entire URI. =item $2 The scheme. =item $3 The host (name or address). =item $4 The port (if any). =item $5 The "gopher-path", the part after the / following the host and port. =item $6 The gopher-type. =item $7 The selector. (When no C<{-notab}> is used, this includes the search and gopherplus_string, including the separating escaped tabs). =item $8 The search, if given. (Only when C<{-notab}> is given). =item $9 The gopherplus_string, if given. (Only when C<{-notab}> is given). =back head1 REFERENCES =over 4 =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =item B<[RFC 1808]> Fielding, R.: I. June 1995. =item B<[GOPHER URL]> Krishnan, Murali R., Casey, James: "A Gopher URL Format". Expired Internet draft I. December 1996. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common/URI/ftp.pm000064400000012604147634422050007574 0ustar00package Regexp::Common::URI::ftp; use Regexp::Common qw /pattern clean no_defaults/; use Regexp::Common::URI qw /register_uri/; use Regexp::Common::URI::RFC2396 qw /$host $port $ftp_segments $userinfo $userinfo_no_colon/; use strict; use warnings; use vars qw /$VERSION/; $VERSION = '2010010201'; my $ftp_uri = "(?k:(?k:ftp)://(?:(?k:$userinfo)(?k:)\@)?(?k:$host)" . "(?::(?k:$port))?(?k:/(?k:(?k:$ftp_segments)" . "(?:;type=(?k:[AIai]))?))?)"; my $ftp_uri_password = "(?k:(?k:ftp)://(?:(?k:$userinfo_no_colon)" . "(?::(?k:$userinfo_no_colon))?\@)?(?k:$host)" . "(?::(?k:$port))?(?k:/(?k:(?k:$ftp_segments)" . "(?:;type=(?k:[AIai]))?))?)"; register_uri FTP => $ftp_uri; pattern name => [qw (URI FTP), "-type=[AIai]", "-password="], create => sub { my $uri = exists $_ [1] -> {-password} && !defined $_ [1] -> {-password} ? $ftp_uri_password : $ftp_uri; my $type = $_ [1] -> {-type}; $uri =~ s/\[AIai\]/$type/; $uri; } ; 1; __END__ =pod =head1 NAME Regexp::Common::URI::ftp -- Returns a pattern for FTP URIs. =head1 SYNOPSIS use Regexp::Common qw /URI/; while (<>) { /$RE{URI}{FTP}/ and print "Contains an FTP URI.\n"; } =head1 DESCRIPTION =head2 $RE{URI}{FTP}{-type}{-password}; Returns a regex for FTP URIs. Note: FTP URIs are not formally defined. RFC 1738 defines FTP URLs, but parts of that RFC have been obsoleted by RFC 2396. However, the differences between RFC 1738 and RFC 2396 are such that they aren't applicable straightforwardly to FTP URIs. There are two main problems: =over 4 =item Passwords. RFC 1738 allowed an optional username and an optional password (separated by a colon) in the FTP URL. Hence, colons were not allowed in either the username or the password. RFC 2396 strongly recommends passwords should not be used in URIs. It does allow for I instead. This userinfo part may contain colons, and hence contain more than one colon. The regexp returned follows the RFC 2396 specification, unless the I<{-password}> option is given; then the regex allows for an optional username and password, separated by a colon. =item The ;type specifier. RFC 1738 does not allow semi-colons in FTP path names, because a semi-colon is a reserved character for FTP URIs. The semi-colon is used to separate the path from the option I specifier. However, in RFC 2396, paths consist of slash separated segments, and each segment is a semi-colon separated group of parameters. Straigthforward application of RFC 2396 would mean that a trailing I specifier couldn't be distinguished from the last segment of the path having a two parameters, the last one starting with I. Therefore we have opted to disallow a semi-colon in the path part of an FTP URI. Furthermore, RFC 1738 allows three values for the type specifier, I, I and I (either upper case or lower case). However, the internet draft about FTP URIs B<[DRAFT-FTP-URL]> (which expired in May 1997) notes the lack of consistent implementation of the I parameter and drops I from the set of possible values. We follow this practise; however, RFC 1738 behaviour can be archieved by using the I<-type => "[ADIadi]"> parameter. =back FTP URIs have the following syntax: "ftp:" "//" [ userinfo "@" ] host [ ":" port ] [ "/" path [ ";type=" value ]] When using I<{-password}>, we have the syntax: "ftp:" "//" [ user [ ":" password ] "@" ] host [ ":" port ] [ "/" path [ ";type=" value ]] Under C<{-keep}>, the following are returned: =over 4 =item $1 The complete URI. =item $2 The scheme. =item $3 The userinfo, or if I<{-password}> is used, the username. =item $4 If I<{-password}> is used, the password, else C. =item $5 The hostname or IP address. =item $6 The port number. =item $7 The full path and type specification, including the leading slash. =item $8 The full path and type specification, without the leading slash. =item $9 The full path, without the type specification nor the leading slash. =item $10 The value of the type specification. =back =head1 REFERENCES =over 4 =item B<[DRAFT-URL-FTP]> Casey, James: I. November 1996. =item B<[RFC 1738]> Berners-Lee, Tim, Masinter, L., McCahill, M.: I. December 1994. =item B<[RFC 2396]> Berners-Lee, Tim, Fielding, R., and Masinter, L.: I. August 1998. =back =head1 SEE ALSO L for other supported URIs. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2009, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT. =cut Common.pm000064400000064347147634422050006357 0ustar00package Regexp::Common; use 5.00473; use strict; BEGIN { # This makes sure 'use warnings' doesn't bomb out on 5.005_*; # warnings won't be enabled on those old versions though. # Since all other files use this file, we can use 'use warnings' # elsewhere as well, but *AFTER* 'use Regexp::Common'. if ($] < 5.006) { $INC {"warnings.pm"} = 1; no strict 'refs'; *{"warnings::unimport"} = sub {0}; } } use warnings; use vars qw /$VERSION %RE %sub_interface $AUTOLOAD/; $VERSION = '2013031301'; sub _croak { require Carp; goto &Carp::croak; } sub _carp { require Carp; goto &Carp::carp; } sub new { my ($class, @data) = @_; my %self; tie %self, $class, @data; return \%self; } sub TIEHASH { my ($class, @data) = @_; bless \@data, $class; } sub FETCH { my ($self, $extra) = @_; return bless ref($self)->new(@$self, $extra), ref($self); } my %imports = map {$_ => "Regexp::Common::$_"} qw /balanced CC comment delimited lingua list net number profanity SEN URI whitespace zip/; sub import { shift; # Shift off the class. tie %RE, __PACKAGE__; { no strict 'refs'; *{caller() . "::RE"} = \%RE; } my $saw_import; my $no_defaults; my %exclude; foreach my $entry (grep {!/^RE_/} @_) { if ($entry eq 'pattern') { no strict 'refs'; *{caller() . "::pattern"} = \&pattern; next; } # This used to prevent $; from being set. We still recognize it, # but we won't do anything. if ($entry eq 'clean') { next; } if ($entry eq 'no_defaults') { $no_defaults ++; next; } if (my $module = $imports {$entry}) { $saw_import ++; eval "require $module;"; die $@ if $@; next; } if ($entry =~ /^!(.*)/ && $imports {$1}) { $exclude {$1} ++; next; } # As a last resort, try to load the argument. my $module = $entry =~ /^Regexp::Common/ ? $entry : "Regexp::Common::" . $entry; eval "require $module;"; die $@ if $@; } unless ($saw_import || $no_defaults) { foreach my $module (values %imports) { next if $exclude {$module}; eval "require $module;"; die $@ if $@; } } my %exported; foreach my $entry (grep {/^RE_/} @_) { if ($entry =~ /^RE_(\w+_)?ALL$/) { my $m = defined $1 ? $1 : ""; my $re = qr /^RE_${m}.*$/; while (my ($sub, $interface) = each %sub_interface) { next if $exported {$sub}; next unless $sub =~ /$re/; { no strict 'refs'; *{caller() . "::$sub"} = $interface; } $exported {$sub} ++; } } else { next if $exported {$entry}; _croak "Can't export unknown subroutine &$entry" unless $sub_interface {$entry}; { no strict 'refs'; *{caller() . "::$entry"} = $sub_interface {$entry}; } $exported {$entry} ++; } } } sub AUTOLOAD { _croak "Can't $AUTOLOAD" } sub DESTROY {} my %cache; my $fpat = qr/^(-\w+)/; sub _decache { my @args = @{tied %{$_[0]}}; my @nonflags = grep {!/$fpat/} @args; my $cache = get_cache(@nonflags); _croak "Can't create unknown regex: \$RE{" . join("}{",@args) . "}" unless exists $cache->{__VAL__}; _croak "Perl $] does not support the pattern " . "\$RE{" . join("}{",@args) . "}.\nYou need Perl $cache->{__VAL__}{version} or later" unless ($cache->{__VAL__}{version}||0) <= $]; my %flags = ( %{$cache->{__VAL__}{default}}, map { /$fpat\Q$;\E(.*)/ ? ($1 => $2) : /$fpat/ ? ($1 => undef) : () } @args); $cache->{__VAL__}->_clone_with(\@args, \%flags); } use overload q{""} => \&_decache; sub get_cache { my $cache = \%cache; foreach (@_) { $cache = $cache->{$_} || ($cache->{$_} = {}); } return $cache; } sub croak_version { my ($entry, @args) = @_; } sub pattern { my %spec = @_; _croak 'pattern() requires argument: name => [ @list ]' unless $spec{name} && ref $spec{name} eq 'ARRAY'; _croak 'pattern() requires argument: create => $sub_ref_or_string' unless $spec{create}; if (ref $spec{create} ne "CODE") { my $fixed_str = "$spec{create}"; $spec{create} = sub { $fixed_str } } my @nonflags; my %default; foreach ( @{$spec{name}} ) { if (/$fpat=(.*)/) { $default{$1} = $2; } elsif (/$fpat\s*$/) { $default{$1} = undef; } else { push @nonflags, $_; } } my $entry = get_cache(@nonflags); if ($entry->{__VAL__}) { _carp "Overriding \$RE{" . join("}{",@nonflags) . "}"; } $entry->{__VAL__} = bless { create => $spec{create}, match => $spec{match} || \&generic_match, subs => $spec{subs} || \&generic_subs, version => $spec{version}, default => \%default, }, 'Regexp::Common::Entry'; foreach (@nonflags) {s/\W/X/g} my $subname = "RE_" . join ("_", @nonflags); $sub_interface{$subname} = sub { push @_ => undef if @_ % 2; my %flags = @_; my $pat = $spec{create}->($entry->{__VAL__}, {%default, %flags}, \@nonflags); if (exists $flags{-keep}) { $pat =~ s/\Q(?k:/(/g; } else { $pat =~ s/\Q(?k:/(?:/g; } return exists $flags {-i} ? qr /(?i:$pat)/ : qr/$pat/; }; return 1; } sub generic_match {$_ [1] =~ /$_[0]/} sub generic_subs {$_ [1] =~ s/$_[0]/$_[2]/} sub matches { my ($self, $str) = @_; my $entry = $self -> _decache; $entry -> {match} -> ($entry, $str); } sub subs { my ($self, $str, $newstr) = @_; my $entry = $self -> _decache; $entry -> {subs} -> ($entry, $str, $newstr); return $str; } package Regexp::Common::Entry; # use Carp; use overload q{""} => sub { my ($self) = @_; my $pat = $self->{create}->($self, $self->{flags}, $self->{args}); if (exists $self->{flags}{-keep}) { $pat =~ s/\Q(?k:/(/g; } else { $pat =~ s/\Q(?k:/(?:/g; } if (exists $self->{flags}{-i}) { $pat = "(?i)$pat" } return $pat; }; sub _clone_with { my ($self, $args, $flags) = @_; bless { %$self, args=>$args, flags=>$flags }, ref $self; } 1; __END__ =pod =head1 NAME Regexp::Common - Provide commonly requested regular expressions =head1 SYNOPSIS # STANDARD USAGE use Regexp::Common; while (<>) { /$RE{num}{real}/ and print q{a number}; /$RE{quoted}/ and print q{a ['"`] quoted string}; /$RE{delimited}{-delim=>'/'}/ and print q{a /.../ sequence}; /$RE{balanced}{-parens=>'()'}/ and print q{balanced parentheses}; /$RE{profanity}/ and print q{a #*@%-ing word}; } # SUBROUTINE-BASED INTERFACE use Regexp::Common 'RE_ALL'; while (<>) { $_ =~ RE_num_real() and print q{a number}; $_ =~ RE_quoted() and print q{a ['"`] quoted string}; $_ =~ RE_delimited(-delim=>'/') and print q{a /.../ sequence}; $_ =~ RE_balanced(-parens=>'()'} and print q{balanced parentheses}; $_ =~ RE_profanity() and print q{a #*@%-ing word}; } # IN-LINE MATCHING... if ( $RE{num}{int}->matches($text) ) {...} # ...AND SUBSTITUTION my $cropped = $RE{ws}{crop}->subs($uncropped); # ROLL-YOUR-OWN PATTERNS use Regexp::Common 'pattern'; pattern name => ['name', 'mine'], create => '(?i:J[.]?\s+A[.]?\s+Perl-Hacker)', ; my $name_matcher = $RE{name}{mine}; pattern name => [ 'lineof', '-char=_' ], create => sub { my $flags = shift; my $char = quotemeta $flags->{-char}; return '(?:^$char+$)'; }, match => sub { my ($self, $str) = @_; return $str !~ /[^$self->{flags}{-char}]/; }, subs => sub { my ($self, $str, $replacement) = @_; $_[1] =~ s/^$self->{flags}{-char}+$//g; }, ; my $asterisks = $RE{lineof}{-char=>'*'}; # DECIDING WHICH PATTERNS TO LOAD. use Regexp::Common qw /comment number/; # Comment and number patterns. use Regexp::Common qw /no_defaults/; # Don't load any patterns. use Regexp::Common qw /!delimited/; # All, but delimited patterns. =head1 DESCRIPTION By default, this module exports a single hash (C<%RE>) that stores or generates commonly needed regular expressions (see L<"List of available patterns">). There is an alternative, subroutine-based syntax described in L<"Subroutine-based interface">. =head2 General syntax for requesting patterns To access a particular pattern, C<%RE> is treated as a hierarchical hash of hashes (of hashes...), with each successive key being an identifier. For example, to access the pattern that matches real numbers, you specify: $RE{num}{real} and to access the pattern that matches integers: $RE{num}{int} Deeper layers of the hash are used to specify I: arguments that modify the resulting pattern in some way. The keys used to access these layers are prefixed with a minus sign and may have a value; if a value is given, it's done by using a multidimensional key. For example, to access the pattern that matches base-2 real numbers with embedded commas separating groups of three digits (e.g. 10,101,110.110101101): $RE{num}{real}{-base => 2}{-sep => ','}{-group => 3} Through the magic of Perl, these flag layers may be specified in any order (and even interspersed through the identifier keys!) so you could get the same pattern with: $RE{num}{real}{-sep => ','}{-group => 3}{-base => 2} or: $RE{num}{-base => 2}{real}{-group => 3}{-sep => ','} or even: $RE{-base => 2}{-group => 3}{-sep => ','}{num}{real} etc. Note, however, that the relative order of amongst the identifier keys I significant. That is: $RE{list}{set} would not be the same as: $RE{set}{list} =head2 Flag syntax In versions prior to 2.113, flags could also be written as C<{"-flag=value"}>. This no longer works, although C<{"-flag$;value"}> still does. However, C<< {-flag => 'value'} >> is the preferred syntax. =head2 Universal flags Normally, flags are specific to a single pattern. However, there is two flags that all patterns may specify. =over 4 =item C<-keep> By default, the patterns provided by C<%RE> contain no capturing parentheses. However, if the C<-keep> flag is specified (it requires no value) then any significant substrings that the pattern matches are captured. For example: if ($str =~ $RE{num}{real}{-keep}) { $number = $1; $whole = $3; $decimals = $5; } Special care is needed if a "kept" pattern is interpolated into a larger regular expression, as the presence of other capturing parentheses is likely to change the "number variables" into which significant substrings are saved. See also L<"Adding new regular expressions">, which describes how to create new patterns with "optional" capturing brackets that respond to C<-keep>. =item C<-i> Some patterns or subpatterns only match lowercase or uppercase letters. If one wants the do case insensitive matching, one option is to use the C regexp modifier, or the special sequence C<(?i)>. But if the functional interface is used, one does not have this option. The C<-i> switch solves this problem; by using it, the pattern will do case insensitive matching. =back =head2 OO interface and inline matching/substitution The patterns returned from C<%RE> are objects, so rather than writing: if ($str =~ /$RE{some}{pattern}/ ) {...} you can write: if ( $RE{some}{pattern}->matches($str) ) {...} For matching this would seem to have no great advantage apart from readability (but see below). For substitutions, it has other significant benefits. Frequently you want to perform a substitution on a string without changing the original. Most people use this: $changed = $original; $changed =~ s/$RE{some}{pattern}/$replacement/; The more adept use: ($changed = $original) =~ s/$RE{some}{pattern}/$replacement/; Regexp::Common allows you do write this: $changed = $RE{some}{pattern}->subs($original=>$replacement); Apart from reducing precedence-angst, this approach has the added advantages that the substitution behaviour can be optimized from the regular expression, and the replacement string can be provided by default (see L<"Adding new regular expressions">). For example, in the implementation of this substitution: $cropped = $RE{ws}{crop}->subs($uncropped); the default empty string is provided automatically, and the substitution is optimized to use: $uncropped =~ s/^\s+//; $uncropped =~ s/\s+$//; rather than: $uncropped =~ s/^\s+|\s+$//g; =head2 Subroutine-based interface The hash-based interface was chosen because it allows regexes to be effortlessly interpolated, and because it also allows them to be "curried". For example: my $num = $RE{num}{int}; my $commad = $num->{-sep=>','}{-group=>3}; my $duodecimal = $num->{-base=>12}; However, the use of tied hashes does make the access to Regexp::Common patterns slower than it might otherwise be. In contexts where impatience overrules laziness, Regexp::Common provides an additional subroutine-based interface. For each (sub-)entry in the C<%RE> hash (C<$RE{key1}{key2}{etc}>), there is a corresponding exportable subroutine: C. The name of each subroutine is the underscore-separated concatenation of the I keys that locate the same pattern in C<%RE>. Flags are passed to the subroutine in its argument list. Thus: use Regexp::Common qw( RE_ws_crop RE_num_real RE_profanity ); $str =~ RE_ws_crop() and die "Surrounded by whitespace"; $str =~ RE_num_real(-base=>8, -sep=>" ") or next; $offensive = RE_profanity(-keep); $str =~ s/$offensive/$bad{$1}++; ""/ge; Note that, unlike the hash-based interface (which returns objects), these subroutines return ordinary C'd regular expressions. Hence they do not curry, nor do they provide the OO match and substitution inlining described in the previous section. It is also possible to export subroutines for all available patterns like so: use Regexp::Common 'RE_ALL'; Or you can export all subroutines with a common prefix of keys like so: use Regexp::Common 'RE_num_ALL'; which will export C and C (and if you have create more patterns who have first key I, those will be exported as well). In general, I will export all subroutines whose pattern names have first keys I ... I. =head2 Adding new regular expressions You can add your own regular expressions to the C<%RE> hash at run-time, using the exportable C subroutine. It expects a hash-like list of key/value pairs that specify the behaviour of the pattern. The various possible argument pairs are: =over 4 =item C [ @list ]> A required argument that specifies the name of the pattern, and any flags it may take, via a reference to a list of strings. For example: pattern name => [qw( line of -char )], # other args here ; This specifies an entry C<$RE{line}{of}>, which may take a C<-char> flag. Flags may also be specified with a default value, which is then used whenever the flag is specified without an explicit value (but not when the flag is omitted). For example: pattern name => [qw( line of -char=_ )], # default char is '_' # other args here ; =item C $sub_ref_or_string> A required argument that specifies either a string that is to be returned as the pattern: pattern name => [qw( line of underscores )], create => q/(?:^_+$)/ ; or a reference to a subroutine that will be called to create the pattern: pattern name => [qw( line of -char=_ )], create => sub { my ($self, $flags) = @_; my $char = quotemeta $flags->{-char}; return '(?:^$char+$)'; }, ; If the subroutine version is used, the subroutine will be called with three arguments: a reference to the pattern object itself, a reference to a hash containing the flags and their values, and a reference to an array containing the non-flag keys. Whatever the subroutine returns is stringified as the pattern. No matter how the pattern is created, it is immediately postprocessed to include or exclude capturing parentheses (according to the value of the C<-keep> flag). To specify such "optional" capturing parentheses within the regular expression associated with C, use the notation C<(?k:...)>. Any parentheses of this type will be converted to C<(...)> when the C<-keep> flag is specified, or C<(?:...)> when it is not. It is a Regexp::Common convention that the outermost capturing parentheses always capture the entire pattern, but this is not enforced. =item C $sub_ref> An optional argument that specifies a subroutine that is to be called when the C<$RE{...}-Ematches(...)> method of this pattern is invoked. The subroutine should expect two arguments: a reference to the pattern object itself, and the string to be matched against. It should return the same types of values as a C does. pattern name => [qw( line of -char )], create => sub {...}, match => sub { my ($self, $str) = @_; $str !~ /[^$self->{flags}{-char}]/; }, ; =item C $sub_ref> An optional argument that specifies a subroutine that is to be called when the C<$RE{...}-Esubs(...)> method of this pattern is invoked. The subroutine should expect three arguments: a reference to the pattern object itself, the string to be changed, and the value to be substituted into it. The third argument may be C, indicating the default substitution is required. The subroutine should return the same types of values as an C does. For example: pattern name => [ 'lineof', '-char=_' ], create => sub {...}, subs => sub { my ($self, $str, $ignore_replacement) = @_; $_[1] =~ s/^$self->{flags}{-char}+$//g; }, ; Note that such a subroutine will almost always need to modify C<$_[1]> directly. =item C $minimum_perl_version> If this argument is given, it specifies the minimum version of perl required to use the new pattern. Attempts to use the pattern with earlier versions of perl will generate a fatal diagnostic. =back =head2 Loading specific sets of patterns. By default, all the sets of patterns listed below are made available. However, it is possible to indicate which sets of patterns should be made available - the wanted sets should be given as arguments to C. Alternatively, it is also possible to indicate which sets of patterns should not be made available - those sets will be given as argument to the C statement, but are preceeded with an exclaimation mark. The argument I indicates none of the default patterns should be made available. This is useful for instance if all you want is the C subroutine. Examples: use Regexp::Common qw /comment number/; # Comment and number patterns. use Regexp::Common qw /no_defaults/; # Don't load any patterns. use Regexp::Common qw /!delimited/; # All, but delimited patterns. It's also possible to load your own set of patterns. If you have a module C that makes patterns available, you can have it made available with use Regexp::Common qw /my_patterns/; Note that the default patterns will still be made available - only if you use I, or mention one of the default sets explicitely, the non mentioned defaults aren't made available. =head2 List of available patterns The patterns listed below are currently available. Each set of patterns has its own manual page describing the details. For each pattern set named I, the manual page I describes the details. Currently available are: =over 4 =item Regexp::Common::balanced Provides regexes for strings with balanced parenthesized delimiters. =item Regexp::Common::comment Provides regexes for comments of various languages (43 languages currently). =item Regexp::Common::delimited Provides regexes for delimited strings. =item Regexp::Common::lingua Provides regexes for palindromes. =item Regexp::Common::list Provides regexes for lists. =item Regexp::Common::net Provides regexes for IPv4 addresses and MAC addresses. =item Regexp::Common::number Provides regexes for numbers (integers and reals). =item Regexp::Common::profanity Provides regexes for profanity. =item Regexp::Common::whitespace Provides regexes for leading and trailing whitespace. =item Regexp::Common::zip Provides regexes for zip codes. =back =head2 Forthcoming patterns and features Future releases of the module will also provide patterns for the following: * email addresses * HTML/XML tags * more numerical matchers, * mail headers (including multiline ones), * more URLS * telephone numbers of various countries * currency (universal 3 letter format, Latin-1, currency names) * dates * binary formats (e.g. UUencoded, MIMEd) If you have other patterns or pattern generators that you think would be generally useful, please send them to the maintainer -- preferably as source code using the C subroutine. Submissions that include a set of tests will be especially welcome. =head1 DIAGNOSTICS =over 4 =item C The subroutine-based interface didn't recognize the requested subroutine. Often caused by a spelling mistake or an incompletely specified name. =item C Regexp::Common doesn't have a generator for the requested pattern. Often indicates a mispelt or missing parameter. =item C The requested pattern requires advanced regex features (e.g. recursion) that not available in your version of Perl. Time to upgrade. =item C<< pattern() requires argument: name => [ @list ] >> Every user-defined pattern specification must have a name. =item C<< pattern() requires argument: create => $sub_ref_or_string >> Every user-defined pattern specification must provide a pattern creation mechanism: either a pattern string or a reference to a subroutine that returns the pattern string. =item C The C<< $RE{num}{real}{-base=>'I'} >> pattern uses the characters [0-9A-Z] to represent the digits of various bases. Hence it only produces regular expressions for bases up to hexatricensimal. =item C The pattern has no default delimiter. You need to write: C<< $RE{delimited}{-delim=>I'} >> for some character I =back =head1 ACKNOWLEDGEMENTS Deepest thanks to the many people who have encouraged and contributed to this project, especially: Elijah, Jarkko, Tom, Nat, Ed, and Vivek. Further thanks go to: Alexandr Ciornii, Blair Zajac, Bob Stockdale, Charles Thomas, Chris Vertonghen, the CPAN Testers, David Hand, Fany, Geoffrey Leach, Hermann-Marcus Behrens, Jerome Quelin, Jim Cromie, Lars Wilke, Linda Julien, Mike Arms, Mike Castle, Mikko, Murat Uenalan, RafaE<235>l Garcia-Suarez, Ron Savage, Sam Vilain, Slaven Rezic, Smylers, Tim Maher, and all the others I've forgotten. =head1 AUTHOR Damian Conway (damian@conway.org) =head1 MAINTAINANCE This package is maintained by Abigail S<(I)>. =head1 BUGS AND IRRITATIONS Bound to be plenty. For a start, there are many common regexes missing. Send them in to I. There are some POD issues when installing this module using a pre-5.6.0 perl; some manual pages may not install, or may not install correctly using a perl that is that old. You might consider upgrading your perl. =head1 NOT A BUG =over 4 =item * The various patterns are not anchored. That is, a pattern like C<< $RE {num} {int} >> will match against "abc4def", because a substring of the subject matches. This is by design, and not a bug. If you want the pattern to be anchored, use something like: my $integer = $RE {num} {int}; $subj =~ /^$integer$/ and print "Matches!\n"; =back =head1 LICENSE and COPYRIGHT This software is Copyright (c) 2001 - 2011, Damian Conway and Abigail. This module is free software, and maybe used under any of the following licenses: 1) The Perl Artistic License. See the file COPYRIGHT.AL. 2) The Perl Artistic License 2.0. See the file COPYRIGHT.AL2. 3) The BSD Licence. See the file COPYRIGHT.BSD. 4) The MIT Licence. See the file COPYRIGHT.MIT.