# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# XML 1.0 well-formedness check expressed as a single extended-syntax (?x)
# pattern.  The header inside the string lists the engine features the
# pattern relies on -- named groups written (?<name>...), subroutine calls
# \g<name> (including recursion for <element> and <kids>), \k<tag> back
# references and \x{...} escapes -- and states that it was tested with PCRE2.
# NOTE(review): Python's stdlib `re` spells named groups (?P<name>...) and has
# no subroutine calls, so this string presumably needs a PCRE2-compatible
# engine -- confirm before relying on it from Python.
#
# Named sub-patterns defined below (each is reused via \g<name>):
#   s, misc, comment, procinst, firstnoxml, latter, xmlatter, nm, world,
#   idpublic, or, kids, ref, hash, attvalue, element, tag, data
#
# In the <hash> (numeric character reference) branch, every hexadecimal
# alternative accepts both letter cases for every digit, so that e.g.
# &#xA0; and &#xa0; are treated alike.
regex = r"""
(?x)
# XML 1.0 well-formed validation implemented as a single regular expression.
# dedicated to the public domain
# Advanced features of regex engine used: Example: Alternative:
# - utf8 matching expand UTF8 sequences in char classes
# - utf8 \x{D800}-\x{DFFF} disallowed and max value make explicit if not enforced by regex engine
# - fixed number of repetitions {3} expand (max 5)
# - ignore whitespacd extended syntax (?x) remove spaces and comment lines
# - ascii escaped hex characters \xhh alt escape notation
# - unicode hex in character class \x{hhh} \uhhhh \Uhhhhhhhh or raw utf8
# - named capture group aka subroutine definition (?<xyz>) flatten + numbered
# - function call \g<nm> flatten
# - recursive function call for {element}, {kids} \g<element> programmatic check
# - back reference (end tag matching) \k<tag> programmatic check
# Tested with:
# - PCRE2
# Implemented well-formedness constraints:
# - PEs in Internal subset
# - External Subset (by virtue of not loading externals)
# - Element Type Match
# - Legal Character
# - In DTD
# TODO:
# - optimisation: make anon groups non capturing (?: )
# - optimisation: atomic groups, eager/lazy, etc
# - unicode order character at start
# - support character set other than assume engine is utf8 and ignore encoding...
^ (<\?xml
(?<s> [\t\n\r\x20])+
version \g<s>*=\g<s>* ( "1\.0" | '1\.0' )
( \g<s>+ encoding \g<s>*=\g<s>* ( "[A-Za-z][A-Za-z0-9._-]*"
| '[A-Za-z][A-Za-z0-9._-]*' ) )?
( \g<s>+ standalone \g<s>*=\g<s>* ( "(yes|no)" | '(yes|no)' ) )?
\g<s>* \?>)?
(?<misc> \g<s>+
| (?<comment> <!-- ( -? [^\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}-] )* -->)
| (?<procinst> <\?
( [MmLl] | (?<firstnoxml> [a-kn-wyzA-KN-WYZ_:] |
[\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}] |
[\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}] )
(?<latter> [XxMmLl] | (?<xmlatter> [0-9.-] | \g<firstnoxml> | [\xB7\x{300}-\x{36F}\x{203F}-\x{2040}] ) )*
| [Xx] ( [XxLl] | \g<xmlatter> ) \g<latter>*
| [Xx] [Mm] ( [XxMm] | \g<xmlatter> ) \g<latter>*
| [Xx] [Mm] [Ll] \g<latter>+ )
( \g<s>+ ( \? [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]
| [^?\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] )* \?? )? \?> )
)*
(
<!DOCTYPE \g<s>+ (?<nm> ([XxMmLl] | \g<firstnoxml>) \g<latter>*)
( \g<s>+
(?<world>
( SYSTEM
| (?<idpublic> PUBLIC \g<s>+ ('[\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#@$_%-]*'
| "['\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#@$_%-]*" ) ) )
\g<s>+ ( " [^"\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* "
| ' [^'\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* ' )
)
)? \g<s>*
( \[
(
<!ELEMENT \g<s>+ \g<nm> \g<s>+
( EMPTY | ANY
| \( \g<s>* \#PCDATA ( \g<s>* \)
| ( (?<or> \g<s>* \| \g<s>*) \g<nm> )* \g<s>* \)\* )
|
(?<kids>
\( \g<s>* ( \g<nm> [?*+]? | \g<kids> ) (
( \g<or> ( \g<nm> [?*+]? | \g<kids> ) )+
| ( \g<s>* , \g<s>* ( \g<nm> [?*+]? | \g<kids> ) )*
) \g<s>*
\) [?*+]?
)
) \g<s>* >
| <!ENTITY \g<s>+ (
( % \g<s>+ )? \g<nm> \g<s>+
( " ( [^"%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
# WFC: PEs in Internal Subset | % \g<nm> ;
| (?<ref> & \g<nm> ; )
| (?<hash> &\#
( 0* ( 9 | 10 | 13
| 3[2-9] | [4-9][0-9]
| [1-9][0-9][0-9][0-9]?
| [1-47-9][0-9]{4} | 5[0-48-9][0-9]{3} | 55[0-1][0-9][0-9] | 552[0-8][0-9] | 5529[0-5]
| 5734[4-9] | 573[5-9][0-9] | 57[4-9][0-9][0-9]
| 6[0-46-9][0-9]{3} | 65[0-46-9][0-9][0-9] | 655[0-24-9][0-9] | 6553[0-36-9]
| [1-9][0-9]{5}
| 10[0-9]{5} | 110[0-9]{4} | 111[0-3][0-9]{3} | 11140[0-9][0-9] | 111410[0-9] | 111411[01] )
| x 0* ( [9aAdD]
| [2-9a-fA-F] [0-9a-fA-F]
| [1-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
| [1-9a-cA-CeE] [0-9a-fA-F]{3}
| [dD] [0-7] [0-9a-fA-F] [0-9a-fA-F]
| [fF] [0-9a-eA-E] [0-9a-fA-F] [0-9a-fA-F]
| [fF] [fF] [0-9a-eA-E] [0-9a-fA-F]
| [fF] [fF] [fF] [0-9a-dA-D]
| (10|[1-9a-fA-F]) [0-9a-fA-F]{4} )
) ; )
)* "
| ' ( [^'%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
# | % \g<nm> ;
| \g<ref>
| \g<hash> )* ' )
| \g<nm> \g<s>+ \g<world> ( \g<s>+ NDATA \g<s>+ \g<nm> )?
| % \g<s>+ \g<nm> \g<s>+ \g<world>
) \g<s>* >
| <!NOTATION \g<s>+ \g<nm> \g<s>+ ( \g<world> | \g<idpublic> ) \g<s>* >
| <!ATTLIST \g<s>+ \g<nm> ( \g<s>+ \g<nm> \g<s>+
( CDATA | ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS
| NOTATION \g<s>+ \( \g<s>* \g<nm> ( \g<or> \g<nm> )* \g<s>* \)
| \( \g<s>* \g<latter>+ ( \g<or> \g<latter>+ )* \g<s>* \)
)
\g<s>+
( \#REQUIRED
| \#IMPLIED
| ( \#FIXED \g<s>+ )?
(?<attvalue> " ( [^"<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | \g<ref> | \g<hash> )* "
| ' ( [^'<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | \g<ref> | \g<hash> )* ' )
)
)*
\g<s>* >
| \g<procinst>
| \g<comment>
| % \g<nm> ;
| \g<s>+
)*
] \g<s>*
)? >
)?
\g<misc>*
(?<element>
< (?<tag> \g<nm>)
( \g<s>+ \g<nm> \g<s>*=\g<s>* \g<attvalue> )* \g<s>*
( \x2F >
| >
(?<data> >+ | ( >* ( [^><&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
| [^\]<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )+ ) )?
(
( \g<element>
| \g<ref>
| \g<hash>
| <!\[CDATA\[ >* ( [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
| [^\]\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )* ]]>
| \g<procinst>
| \g<comment> )
\g<data>?
)*
< \x2F \k<tag> \g<s>* > )
)
\g<misc>*
$
"""
# Demo driver: run the pattern against a minimal XML document and report the
# overall match followed by every capture group.
test_str = "<doc>hello</doc>"
try:
    matches = re.search(regex, test_str, re.VERBOSE | re.UNICODE)
except re.error as exc:
    # The pattern's header says it was tested with PCRE2 and it uses
    # (?<name>...) groups and \g<name> subroutine calls.  Re-raise the same
    # exception type with a hint so the failure is not mistaken for a typo
    # in the pattern.
    raise re.error("pattern is written for PCRE2 and was rejected by Python's re module: {0}".format(exc))
if matches:
    print ("Match was found at {start}-{end}: {match}".format(start = matches.start(), end = matches.end(), match = matches.group()))
    # Capture groups are numbered from 1; groups() yields them in that order,
    # with None for any group that did not take part in the match.
    for groupNum, group in enumerate(matches.groups(), start=1):
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = matches.start(groupNum), end = matches.end(groupNum), group = group))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work.
# If you find any syntax errors, feel free to submit a bug report.
# For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html