#include <StringConstants.au3> ; to declare the Constants of StringRegExp
#include <Array.au3> ; UDF needed for _ArrayDisplay and _ArrayConcatenate
; $sRegex: XML 1.0 well-formedness check written as one PCRE pattern in
; extended (?x) mode. The pattern is assembled line by line and joined with
; @CRLF so that the '#' comments embedded in the pattern end at each line break.
;
; Every double quote that belongs to the pattern itself is written as "" here:
; inside a double-quoted AutoIt string a lone " terminates the literal, so the
; un-doubled form does not parse.
;
; NOTE(review): the leading "(?xu)" comes from the generator; the 'u' inline
; option and the \x{10000}-\x{EFFFF} range may not be accepted by AutoIt's PCRE
; build - confirm against the StringRegExp documentation. If StringRegExp sets
; @error = 2, @extended holds the offset of the problem within the pattern.
Local $sRegex = "(?xu)(?x)" & @CRLF & _
"" & @CRLF & _
"# XML 1.0 well-formed validation implemented as a single regular expression." & @CRLF & _
"# dedicated to the public domain" & @CRLF & _
"" & @CRLF & _
"# Advanced features of regex engine used: Example: Alternative:" & @CRLF & _
"# - utf8 matching expand UTF8 sequences in char classes" & @CRLF & _
"# - utf8 \x{D800}-\x{DFFF} disallowed and max value make explicit if not enforced by regex engine" & @CRLF & _
"# - fixed number of repetitions {3} expand (max 5)" & @CRLF & _
"# - ignore whitespacd extended syntax (?x) remove spaces and comment lines" & @CRLF & _
"# - ascii escaped hex characters \xhh alt escape notation" & @CRLF & _
"# - unicode hex in character class \x{hhh} \uhhhh \Uhhhhhhhh or raw utf8" & @CRLF & _
"# - named capture group aka subroutine definition (?<xyz>) flatten + numbered" & @CRLF & _
"# - function call \g<nm> flatten" & @CRLF & _
"# - recursive function call for {element}, {kids} \g<element> programmatic check" & @CRLF & _
"# - back reference (end tag matching) \k<tag> programmatic check" & @CRLF & _
"" & @CRLF & _
"# Tested with:" & @CRLF & _
"# - PCRE2" & @CRLF & _
"" & @CRLF & _
"# Implemented well-formedness constraints:" & @CRLF & _
"# - PEs in Internal subset" & @CRLF & _
"# - External Subset (by virtue of not loading externals)" & @CRLF & _
"# - Element Type Match" & @CRLF & _
"# - Legal Character" & @CRLF & _
"# - In DTD" & @CRLF & _
"" & @CRLF & _
"# TODO:" & @CRLF & _
"# - optimisation: make anon groups non capturing (?: )" & @CRLF & _
"# - optimisation: atomic groups, eager/lazy, etc" & @CRLF & _
"# - unicode order character at start" & @CRLF & _
"# - support character set other than assume engine is utf8 and ignore encoding..." & @CRLF & _
"" & @CRLF & _
"^ (<\?xml" & @CRLF & _
" (?<s> [\t\n\r\x20])+" & @CRLF & _
" version \g<s>*=\g<s>* ( ""1\.0"" | '1\.0' )" & @CRLF & _
" ( \g<s>+ encoding \g<s>*=\g<s>* ( ""[A-Za-z][A-Za-z0-9._-]*""" & @CRLF & _
" | '[A-Za-z][A-Za-z0-9._-]*' ) )?" & @CRLF & _
" ( \g<s>+ standalone \g<s>*=\g<s>* ( ""(yes|no)"" | '(yes|no)' ) )?" & @CRLF & _
" \g<s>* \?>)?" & @CRLF & _
" (?<misc> \g<s>+" & @CRLF & _
" | (?<comment> <!-- ( -? [^\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}-] )* -->)" & @CRLF & _
" | (?<procinst> <\?" & @CRLF & _
" ( [MmLl] | (?<firstnoxml> [a-kn-wyzA-KN-WYZ_:] | " & @CRLF & _
" [\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}] |" & @CRLF & _
" [\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}] )" & @CRLF & _
" (?<latter> [XxMmLl] | (?<xmlatter> [0-9.-] | \g<firstnoxml> | [\xB7\x{300}-\x{36F}\x{203F}-\x{2040}] ) )*" & @CRLF & _
" | [Xx] ( [XxLl] | \g<xmlatter> ) \g<latter>*" & @CRLF & _
" | [Xx] [Mm] ( [XxMm] | \g<xmlatter> ) \g<latter>*" & @CRLF & _
" | [Xx] [Mm] [Ll] \g<latter>+ )" & @CRLF & _
" ( \g<s>+ ( \? [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]" & @CRLF & _
" | [^?\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] )* \?? )? \?> )" & @CRLF & _
" )*" & @CRLF & _
" (" & @CRLF & _
" <!DOCTYPE \g<s>+ (?<nm> ([XxMmLl] | \g<firstnoxml>) \g<latter>*)" & @CRLF & _
" ( \g<s>+ " & @CRLF & _
" (?<world>" & @CRLF & _
" ( SYSTEM" & @CRLF & _
" | (?<idpublic> PUBLIC \g<s>+ ('[\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#@$_%-]*'" & @CRLF & _
" | ""['\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#@$_%-]*"" ) ) )" & @CRLF & _
" \g<s>+ ( "" [^""\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* """ & @CRLF & _
" | ' [^'\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* ' )" & @CRLF & _
" )" & @CRLF & _
" )? \g<s>*" & @CRLF & _
" ( \[" & @CRLF & _
" (" & @CRLF & _
" <!ELEMENT \g<s>+ \g<nm> \g<s>+" & @CRLF & _
" ( EMPTY | ANY " & @CRLF & _
" | \( \g<s>* \#PCDATA ( \g<s>* \) " & @CRLF & _
" | ( (?<or> \g<s>* \| \g<s>*) \g<nm> )* \g<s>* \)\* )" & @CRLF & _
" | " & @CRLF & _
" (?<kids> " & @CRLF & _
" \( \g<s>* ( \g<nm> [?*+]? | \g<kids> ) ( " & @CRLF & _
" ( \g<or> ( \g<nm> [?*+]? | \g<kids> ) )+ " & @CRLF & _
" | ( \g<s>* , \g<s>* ( \g<nm> [?*+]? | \g<kids> ) )*" & @CRLF & _
" ) \g<s>*" & @CRLF & _
" \) [?*+]?" & @CRLF & _
" )" & @CRLF & _
" ) \g<s>* >" & @CRLF & _
" | <!ENTITY \g<s>+ (" & @CRLF & _
" ( % \g<s>+ )? \g<nm> \g<s>+ " & @CRLF & _
" ( "" ( [^""%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+" & @CRLF & _
"# WFC: PEs in Internal Subset | % \g<nm> ;" & @CRLF & _
" | (?<ref> & \g<nm> ; )" & @CRLF & _
" | (?<hash> &\# " & @CRLF & _
" ( 0* ( 9 | 10 | 13" & @CRLF & _
" | 3[2-9] | [4-9][0-9]" & @CRLF & _
" | [1-9][0-9][0-9][0-9]?" & @CRLF & _
" | [1-47-9][0-9]{4} | 5[0-48-9][0-9]{3} | 55[0-1][0-9][0-9] | 552[0-8][0-9] | 5529[0-5]" & @CRLF & _
" | 5734[4-9] | 573[5-9][0-9] | 57[4-9][0-9][0-9]" & @CRLF & _
" | 6[0-46-9][0-9]{3} | 65[0-46-9][0-9][0-9] | 655[0-24-9][0-9] | 6553[0-36-9]" & @CRLF & _
" | [1-9][0-9]{5}" & @CRLF & _
" | 10[0-9]{5} | 110[0-9]{4} | 111[0-3][0-9]{3} | 11140[0-9][0-9] | 111410[0-9] | 111411[01] )" & @CRLF & _
" | x 0* ( [9aAdD]" & @CRLF & _
" | [2-9a-f] [0-9a-fA-F]" & @CRLF & _
" | [1-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]" & @CRLF & _
" | [1-9a-cA-CeE] [0-9a-fA-F]{3}" & @CRLF & _
" | [dD] [0-7] [0-9a-fA-F] [0-9a-fA-F]" & @CRLF & _
" | [fF] [0-9a-eA-E] [0-9a-fA-F] [0-9a-fA-F]" & @CRLF & _
" | [fF] [fF] [0-9a-eA-E] [0-9a-fA-F]" & @CRLF & _
" | [fF] [fF] [fF] [0-9a-dA-D]" & @CRLF & _
" | (10|[1-9a-fA-F]) [0-9a-fA-F]{4} )" & @CRLF & _
" ) ; )" & @CRLF & _
" )* """ & @CRLF & _
" | ' ( [^'%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+" & @CRLF & _
"# | % \g<nm> ; " & @CRLF & _
" | \g<ref> " & @CRLF & _
" | \g<hash> )* ' )" & @CRLF & _
" | \g<nm> \g<s>+ \g<world> ( \g<s>+ NDATA \g<s>+ \g<nm> )?" & @CRLF & _
" | % \g<s>+ \g<nm> \g<s>+ \g<world>" & @CRLF & _
" ) \g<s>* >" & @CRLF & _
" | <!NOTATION \g<s>+ \g<nm> \g<s>+ ( \g<world> | \g<idpublic> ) \g<s>* >" & @CRLF & _
" | <!ATTLIST \g<s>+ \g<nm> ( \g<s>+ \g<nm> \g<s>+ " & @CRLF & _
" ( CDATA | ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS " & @CRLF & _
" | NOTATION \g<s>+ \( \g<s>* \g<nm> ( \g<or> \g<nm> )* \g<s>* \)" & @CRLF & _
" | \( \g<s>* \g<latter>+ ( \g<or> \g<latter>+ )* \g<s>* \)" & @CRLF & _
" )" & @CRLF & _
" \g<s>+" & @CRLF & _
" ( \#REQUIRED" & @CRLF & _
" | \#IMPLIED" & @CRLF & _
" | ( \#FIXED \g<s>+ )?" & @CRLF & _
" (?<attvalue> "" ( [^""<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | \g<ref> | \g<hash> )* """ & @CRLF & _
" | ' ( [^'<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | \g<ref> | \g<hash> )* ' )" & @CRLF & _
" )" & @CRLF & _
" )*" & @CRLF & _
" \g<s>* >" & @CRLF & _
" | \g<procinst>" & @CRLF & _
" | \g<comment>" & @CRLF & _
" | % \g<nm> ;" & @CRLF & _
" | \g<s>+" & @CRLF & _
" )*" & @CRLF & _
" ] \g<s>*" & @CRLF & _
" )? >" & @CRLF & _
" )?" & @CRLF & _
" \g<misc>* " & @CRLF & _
" (?<element>" & @CRLF & _
" < (?<tag> \g<nm>)" & @CRLF & _
" ( \g<s>+ \g<nm> \g<s>*=\g<s>* \g<attvalue> )* \g<s>*" & @CRLF & _
" ( \x2F >" & @CRLF & _
" | >" & @CRLF & _
" (?<data> >+ | ( >* ( [^><&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+" & @CRLF & _
" | [^\]<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )+ ) )?" & @CRLF & _
" (" & @CRLF & _
" ( \g<element>" & @CRLF & _
" | \g<ref>" & @CRLF & _
" | \g<hash>" & @CRLF & _
" | <!\[CDATA\[ >* ( [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+" & @CRLF & _
" | [^\]\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )* ]]>" & @CRLF & _
" | \g<procinst>" & @CRLF & _
" | \g<comment> )" & @CRLF & _
" \g<data>?" & @CRLF & _
" )*" & @CRLF & _
" < \x2F \k<tag> \g<s>* > )" & @CRLF & _
" )" & @CRLF & _
" \g<misc>*" & @CRLF & _
"$" & @CRLF & _
""
; Run the well-formedness pattern against a sample document and show the result.
Local $sString = "<doc>hello</doc>"
Local $aArray = StringRegExp($sString, $sRegex, $STR_REGEXPARRAYFULLMATCH)
; Capture the status macros immediately: any later function call resets them.
Local $iError = @error, $iOffset = @extended
If $iError = 2 Then
	; Bad pattern: @extended holds the offset of the error within the pattern.
	ConsoleWriteError("StringRegExp: invalid pattern at offset " & $iOffset & @CRLF)
ElseIf $iError <> 0 Then
	; No match: the return value is not an array, so there is nothing to display.
	ConsoleWrite("StringRegExp: no match (input is not well-formed XML)" & @CRLF)
Else
	; Present the entire match result
	_ArrayDisplay($aArray, "Result")
EndIf
; Please keep in mind that these code samples are automatically generated and are not guaranteed to work.
; If you find any syntax errors, feel free to submit a bug report.
; For a full regex reference for AutoIt, please visit: https://www.autoitscript.com/autoit3/docs/functions/StringRegExp.htm