use strict;
# Sample document that the pattern below is run against.
my $str = '<doc>hello</doc>';

# $regex: XML 1.0 well-formedness check written as one compiled pattern.
#
# The pattern is anchored with ^ and $ and describes, in order: an optional
# XML declaration, any number of "misc" items (whitespace, comments,
# processing instructions), an optional DOCTYPE with an internal subset,
# exactly one root element, and trailing misc items.  Named groups double as
# subroutines: a group such as (?<nm> ...) is defined where it is first
# needed and re-used elsewhere through the recursive call (?&nm).  End tags
# are tied to their start tag with the back reference \k<tag>.
#
# Perl-specific points a maintainer must keep in mind:
#   * Subroutine calls are spelled (?&name).  The PCRE spelling \g<name> is
#     not accepted by Perl, where \g only introduces a back reference.
#   * Perl locates the closing delimiter before it parses the pattern, so a
#     literal slash must never appear between the delimiters, not even in a
#     comment.  The pattern itself writes the slash as \x2F.
#   * '@' and '$' inside the pattern are escaped so that they are matched
#     literally instead of being interpolated as Perl variables.
my $regex = qr/(?x)
# XML 1.0 well-formed validation implemented as a single regular expression.
# dedicated to the public domain
# Advanced features of regex engine used: Example: Alternative:
# - utf8 matching expand UTF8 sequences in char classes
# - utf8 \x{D800}-\x{DFFF} disallowed and max value make explicit if not enforced by regex engine
# - fixed number of repetitions {3} expand (max 5)
# - ignore whitespace extended syntax (?x) remove spaces and comment lines
# - ascii escaped hex characters \xhh alt escape notation
# - unicode hex in character class \x{hhh} \uhhhh \Uhhhhhhhh or raw utf8
# - named capture group aka subroutine definition (?<xyz>) flatten + numbered
# - function call (?&nm) flatten
# - recursive function call for {element}, {kids} (?&element) programmatic check
# - back reference (end tag matching) \k<tag> programmatic check
# Tested with:
# - PCRE2
# Implemented well-formedness constraints:
# - PEs in Internal subset
# - External Subset (by virtue of not loading externals)
# - Element Type Match
# - Legal Character
# - In DTD
# TODO:
# - optimisation: make anon groups non capturing (?: )
# - optimisation: atomic groups, eager or lazy, etc
# - unicode order character at start
# - support character set other than assume engine is utf8 and ignore encoding...
^ (<\?xml
(?<s> [\t\n\r\x20])+
version (?&s)*=(?&s)* ( "1\.0" | '1\.0' )
( (?&s)+ encoding (?&s)*=(?&s)* ( "[A-Za-z][A-Za-z0-9._-]*"
| '[A-Za-z][A-Za-z0-9._-]*' ) )?
( (?&s)+ standalone (?&s)*=(?&s)* ( "(yes|no)" | '(yes|no)' ) )?
(?&s)* \?>)?
(?<misc> (?&s)+
| (?<comment> <!-- ( -? [^\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}-] )* -->)
| (?<procinst> <\?
( [MmLl] | (?<firstnoxml> [a-kn-wyzA-KN-WYZ_:] |
[\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}] |
[\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}] )
(?<latter> [XxMmLl] | (?<xmlatter> [0-9.-] | (?&firstnoxml) | [\xB7\x{300}-\x{36F}\x{203F}-\x{2040}] ) )*
| [Xx] ( [XxLl] | (?&xmlatter) ) (?&latter)*
| [Xx] [Mm] ( [XxMm] | (?&xmlatter) ) (?&latter)*
| [Xx] [Mm] [Ll] (?&latter)+ )
( (?&s)+ ( \? [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]
| [^?\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] )* \?? )? \?> )
)*
(
<!DOCTYPE (?&s)+ (?<nm> ([XxMmLl] | (?&firstnoxml)) (?&latter)*)
( (?&s)+
(?<world>
( SYSTEM
| (?<idpublic> PUBLIC (?&s)+ ('[\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#\@\$_%-]*'
| "['\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#\@\$_%-]*" ) ) )
(?&s)+ ( " [^"\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* "
| ' [^'\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* ' )
)
)? (?&s)*
( \[
(
<!ELEMENT (?&s)+ (?&nm) (?&s)+
( EMPTY | ANY
| \( (?&s)* \#PCDATA ( (?&s)* \)
| ( (?<or> (?&s)* \| (?&s)*) (?&nm) )* (?&s)* \)\* )
|
(?<kids>
\( (?&s)* ( (?&nm) [?*+]? | (?&kids) ) (
( (?&or) ( (?&nm) [?*+]? | (?&kids) ) )+
| ( (?&s)* , (?&s)* ( (?&nm) [?*+]? | (?&kids) ) )*
) (?&s)*
\) [?*+]?
)
) (?&s)* >
| <!ENTITY (?&s)+ (
( % (?&s)+ )? (?&nm) (?&s)+
( " ( [^"%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
# WFC: PEs in Internal Subset | % (?&nm) ;
| (?<ref> & (?&nm) ; )
| (?<hash> &\#
( 0* ( 9 | 10 | 13
| 3[2-9] | [4-9][0-9]
| [1-9][0-9][0-9][0-9]?
| [1-47-9][0-9]{4} | 5[0-48-9][0-9]{3} | 55[0-1][0-9][0-9] | 552[0-8][0-9] | 5529[0-5]
| 5734[4-9] | 573[5-9][0-9] | 57[4-9][0-9][0-9]
| 6[0-46-9][0-9]{3} | 65[0-46-9][0-9][0-9] | 655[0-24-9][0-9] | 6553[0-36-9]
| [1-9][0-9]{5}
| 10[0-9]{5} | 110[0-9]{4} | 111[0-3][0-9]{3} | 11140[0-9][0-9] | 111410[0-9] | 111411[01] )
| x 0* ( [9aAdD]
| [2-9a-f] [0-9a-fA-F]
| [1-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
| [1-9a-cA-CeE] [0-9a-fA-F]{3}
| [dD] [0-7] [0-9a-fA-F] [0-9a-fA-F]
| [fF] [0-9a-eA-E] [0-9a-fA-F] [0-9a-fA-F]
| [fF] [fF] [0-9a-eA-E] [0-9a-fA-F]
| [fF] [fF] [fF] [0-9a-dA-D]
| (10|[1-9a-fA-F]) [0-9a-fA-F]{4} )
) ; )
)* "
| ' ( [^'%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
# | % (?&nm) ;
| (?&ref)
| (?&hash) )* ' )
| (?&nm) (?&s)+ (?&world) ( (?&s)+ NDATA (?&s)+ (?&nm) )?
| % (?&s)+ (?&nm) (?&s)+ (?&world)
) (?&s)* >
| <!NOTATION (?&s)+ (?&nm) (?&s)+ ( (?&world) | (?&idpublic) ) (?&s)* >
| <!ATTLIST (?&s)+ (?&nm) ( (?&s)+ (?&nm) (?&s)+
( CDATA | ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS
| NOTATION (?&s)+ \( (?&s)* (?&nm) ( (?&or) (?&nm) )* (?&s)* \)
| \( (?&s)* (?&latter)+ ( (?&or) (?&latter)+ )* (?&s)* \)
)
(?&s)+
( \#REQUIRED
| \#IMPLIED
| ( \#FIXED (?&s)+ )?
(?<attvalue> " ( [^"<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | (?&ref) | (?&hash) )* "
| ' ( [^'<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | (?&ref) | (?&hash) )* ' )
)
)*
(?&s)* >
| (?&procinst)
| (?&comment)
| % (?&nm) ;
| (?&s)+
)*
] (?&s)*
)? >
)?
(?&misc)*
(?<element>
< (?<tag> (?&nm))
( (?&s)+ (?&nm) (?&s)*=(?&s)* (?&attvalue) )* (?&s)*
( \x2F >
| >
(?<data> >+ | ( >* ( [^><&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
| [^\]<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )+ ) )?
(
( (?&element)
| (?&ref)
| (?&hash)
| <!\[CDATA\[ >* ( [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
| [^\]\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )* ]]>
| (?&procinst)
| (?&comment) )
(?&data)?
)*
< \x2F \k<tag> (?&s)* > )
)
(?&misc)*
$
/xup;
# Report the overall match when $str satisfies $regex.  ${^MATCH} is the text
# that matched; its start and end offsets are held in $-[0] and $+[0].
print "Whole match is ${^MATCH} and its start/end positions can be obtained via \$-[0] and \$+[0]\n"
    if $str =~ /$regex/;
# Further hints for extending this sample:
#   - capture group N is in $N, with offsets $-[N] and $+[N]
#   - ${^PREMATCH} and ${^POSTMATCH} are also available with the use of '/p'
#   - named capture groups can be read via $+{name}
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Perl, please visit: http://perldoc.perl.org/perlre.html