// XML 1.0 well-formedness validator expressed as one anchored (^ ... $) pattern.
// Overall shape, top to bottom:
//   1. optional XML declaration (version, then optional encoding / standalone)
//   2. any number of "misc" items: whitespace, comments, processing instructions
//   3. optional <!DOCTYPE ...> with an optional internal subset
//      (ELEMENT, ENTITY, NOTATION, ATTLIST declarations, PE references)
//   4. more "misc" items
//   5. the root element, which recurses into itself via \g<element> and checks
//      the closing tag name with the \k<tag> back reference
//   6. trailing "misc" items
// Named groups such as <s>, <nm>, <ref>, <hash>, <attvalue> are defined at their
// first use and later re-invoked with \g<name> as subroutines.
// The pattern's own notes say it was tested with PCRE2.
// NOTE(review): the pattern is written in PCRE extended syntax ((?x), `#` comments,
// \g<name> subroutine calls, \x{hhhh} escapes) and spans many lines; none of this
// looks loadable as a JavaScript regex literal as written - confirm the target
// engine before relying on this declaration.
const regex = /(?x)
# XML 1.0 well-formed validation implemented as a single regular expression.
# dedicated to the public domain
# Advanced features of regex engine used: Example: Alternative:
# - utf8 matching expand UTF8 sequences in char classes
# - utf8 \x{D800}-\x{DFFF} disallowed and max value make explicit if not enforced by regex engine
# - fixed number of repetitions {3} expand (max 5)
# - ignore whitespacd extended syntax (?x) remove spaces and comment lines
# - ascii escaped hex characters \xhh alt escape notation
# - unicode hex in character class \x{hhh} \uhhhh \Uhhhhhhhh or raw utf8
# - named capture group aka subroutine definition (?<xyz>) flatten + numbered
# - function call \g<nm> flatten
# - recursive function call for {element}, {kids} \g<element> programmatic check
# - back reference (end tag matching) \k<tag> programmatic check
# Tested with:
# - PCRE2
# Implemented well-formedness constraints:
# - PEs in Internal subset
# - External Subset (by virtue of not loading externals)
# - Element Type Match
# - Legal Character
# - In DTD
# TODO:
# - optimisation: make anon groups non capturing (?: )
# - optimisation: atomic groups, eager/lazy, etc
# - unicode order character at start
# - support character set other than assume engine is utf8 and ignore encoding...
^ (<\?xml
(?<s> [\t\n\r\x20])+
version \g<s>*=\g<s>* ( "1\.0" | '1\.0' )
( \g<s>+ encoding \g<s>*=\g<s>* ( "[A-Za-z][A-Za-z0-9._-]*"
| '[A-Za-z][A-Za-z0-9._-]*' ) )?
( \g<s>+ standalone \g<s>*=\g<s>* ( "(yes|no)" | '(yes|no)' ) )?
\g<s>* \?>)?
(?<misc> \g<s>+
| (?<comment> <!-- ( -? [^\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}-] )* -->)
| (?<procinst> <\?
( [MmLl] | (?<firstnoxml> [a-kn-wyzA-KN-WYZ_:] |
[\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}] |
[\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}] )
(?<latter> [XxMmLl] | (?<xmlatter> [0-9.-] | \g<firstnoxml> | [\xB7\x{300}-\x{36F}\x{203F}-\x{2040}] ) )*
| [Xx] ( [XxLl] | \g<xmlatter> ) \g<latter>*
| [Xx] [Mm] ( [XxMm] | \g<xmlatter> ) \g<latter>*
| [Xx] [Mm] [Ll] \g<latter>+ )
( \g<s>+ ( \? [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]
| [^?\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] )* \?? )? \?> )
)*
(
<!DOCTYPE \g<s>+ (?<nm> ([XxMmLl] | \g<firstnoxml>) \g<latter>*)
( \g<s>+
(?<world>
( SYSTEM
| (?<idpublic> PUBLIC \g<s>+ ('[\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#@$_%-]*'
| "['\r\n\x20a-zA-Z0-9()+,.\x2F:=?;!*\#@$_%-]*" ) ) )
\g<s>+ ( " [^"\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* "
| ' [^'\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]* ' )
)
)? \g<s>*
( \[
(
<!ELEMENT \g<s>+ \g<nm> \g<s>+
( EMPTY | ANY
| \( \g<s>* \#PCDATA ( \g<s>* \)
| ( (?<or> \g<s>* \| \g<s>*) \g<nm> )* \g<s>* \)\* )
|
(?<kids>
\( \g<s>* ( \g<nm> [?*+]? | \g<kids> ) (
( \g<or> ( \g<nm> [?*+]? | \g<kids> ) )+
| ( \g<s>* , \g<s>* ( \g<nm> [?*+]? | \g<kids> ) )*
) \g<s>*
\) [?*+]?
)
) \g<s>* >
| <!ENTITY \g<s>+ (
( % \g<s>+ )? \g<nm> \g<s>+
( " ( [^"%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
# WFC: PEs in Internal Subset | % \g<nm> ;
| (?<ref> & \g<nm> ; )
| (?<hash> &\#
( 0* ( 9 | 10 | 13
| 3[2-9] | [4-9][0-9]
| [1-9][0-9][0-9][0-9]?
| [1-47-9][0-9]{4} | 5[0-48-9][0-9]{3} | 55[0-1][0-9][0-9] | 552[0-8][0-9] | 5529[0-5]
| 5734[4-9] | 573[5-9][0-9] | 57[4-9][0-9][0-9]
| 6[0-46-9][0-9]{3} | 65[0-46-9][0-9][0-9] | 655[0-24-9][0-9] | 6553[0-36-9]
| [1-9][0-9]{5}
| 10[0-9]{5} | 110[0-9]{4} | 111[0-3][0-9]{3} | 11140[0-9][0-9] | 111410[0-9] | 111411[01] )
| x 0* ( [9aAdD]
| [2-9a-f] [0-9a-fA-F]
| [1-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
| [1-9a-cA-CeE] [0-9a-fA-F]{3}
| [dD] [0-7] [0-9a-fA-F] [0-9a-fA-F]
| [fF] [0-9a-eA-E] [0-9a-fA-F] [0-9a-fA-F]
| [fF] [fF] [0-9a-eA-E] [0-9a-fA-F]
| [fF] [fF] [fF] [0-9a-dA-D]
| (10|[1-9a-fA-F]) [0-9a-fA-F]{4} )
) ; )
)* "
| ' ( [^'%&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
# | % \g<nm> ;
| \g<ref>
| \g<hash> )* ' )
| \g<nm> \g<s>+ \g<world> ( \g<s>+ NDATA \g<s>+ \g<nm> )?
| % \g<s>+ \g<nm> \g<s>+ \g<world>
) \g<s>* >
| <!NOTATION \g<s>+ \g<nm> \g<s>+ ( \g<world> | \g<idpublic> ) \g<s>* >
| <!ATTLIST \g<s>+ \g<nm> ( \g<s>+ \g<nm> \g<s>+
( CDATA | ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS
| NOTATION \g<s>+ \( \g<s>* \g<nm> ( \g<or> \g<nm> )* \g<s>* \)
| \( \g<s>* \g<latter>+ ( \g<or> \g<latter>+ )* \g<s>* \)
)
\g<s>+
( \#REQUIRED
| \#IMPLIED
| ( \#FIXED \g<s>+ )?
(?<attvalue> " ( [^"<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | \g<ref> | \g<hash> )* "
| ' ( [^'<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+ | \g<ref> | \g<hash> )* ' )
)
)*
\g<s>* >
| \g<procinst>
| \g<comment>
| % \g<nm> ;
| \g<s>+
)*
] \g<s>*
)? >
)?
\g<misc>*
(?<element>
< (?<tag> \g<nm>)
( \g<s>+ \g<nm> \g<s>*=\g<s>* \g<attvalue> )* \g<s>*
( \x2F >
| >
(?<data> >+ | ( >* ( [^><&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
| [^\]<&\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )+ ) )?
(
( \g<element>
| \g<ref>
| \g<hash>
| <!\[CDATA\[ >* ( [^>\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}]+
| [^\]\x00-\x08\x0B\x0C\x0E-\x1F\x{FFFE}\x{FFFF}] ]? > )* ]]>
| \g<procinst>
| \g<comment> )
\g<data>?
)*
< \x2F \k<tag> \g<s>* > )
)
\g<misc>*
$
/u;
// Alternative syntax using RegExp constructor
// const regex = new RegExp('(?x)
// # XML 1.0 well-formed validation implemented as a single regular expression.
// # dedicated to the public domain
// # Advanced features of regex engine used: Example: Alternative:
// # - utf8 matching expand UTF8 sequences in char classes
// # - utf8 \\x{D800}-\\x{DFFF} disallowed and max value make explicit if not enforced by regex engine
// # - fixed number of repetitions {3} expand (max 5)
// # - ignore whitespacd extended syntax (?x) remove spaces and comment lines
// # - ascii escaped hex characters \\xhh alt escape notation
// # - unicode hex in character class \\x{hhh} \\uhhhh \\Uhhhhhhhh or raw utf8
// # - named capture group aka subroutine definition (?<xyz>) flatten + numbered
// # - function call \\g<nm> flatten
// # - recursive function call for {element}, {kids} \\g<element> programmatic check
// # - back reference (end tag matching) \\k<tag> programmatic check
// # Tested with:
// # - PCRE2
// # Implemented well-formedness constraints:
// # - PEs in Internal subset
// # - External Subset (by virtue of not loading externals)
// # - Element Type Match
// # - Legal Character
// # - In DTD
// # TODO:
// # - optimisation: make anon groups non capturing (?: )
// # - optimisation: atomic groups, eager\/lazy, etc
// # - unicode order character at start
// # - support character set other than assume engine is utf8 and ignore encoding...
// ^ (<\\?xml
// (?<s> [\\t\\n\\r\\x20])+
// version \\g<s>*=\\g<s>* ( "1\\.0" | \'1\\.0\' )
// ( \\g<s>+ encoding \\g<s>*=\\g<s>* ( "[A-Za-z][A-Za-z0-9._-]*"
// | \'[A-Za-z][A-Za-z0-9._-]*\' ) )?
// ( \\g<s>+ standalone \\g<s>*=\\g<s>* ( "(yes|no)" | \'(yes|no)\' ) )?
// \\g<s>* \\?>)?
// (?<misc> \\g<s>+
// | (?<comment> <!-- ( -? [^\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}-] )* -->)
// | (?<procinst> <\\?
// ( [MmLl] | (?<firstnoxml> [a-kn-wyzA-KN-WYZ_:] |
// [\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}] |
// [\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}] )
// (?<latter> [XxMmLl] | (?<xmlatter> [0-9.-] | \\g<firstnoxml> | [\\xB7\\x{300}-\\x{36F}\\x{203F}-\\x{2040}] ) )*
// | [Xx] ( [XxLl] | \\g<xmlatter> ) \\g<latter>*
// | [Xx] [Mm] ( [XxMm] | \\g<xmlatter> ) \\g<latter>*
// | [Xx] [Mm] [Ll] \\g<latter>+ )
// ( \\g<s>+ ( \\? [^>\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]
// | [^?\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}] )* \\?? )? \\?> )
// )*
// (
// <!DOCTYPE \\g<s>+ (?<nm> ([XxMmLl] | \\g<firstnoxml>) \\g<latter>*)
// ( \\g<s>+
// (?<world>
// ( SYSTEM
// | (?<idpublic> PUBLIC \\g<s>+ (\'[\\r\\n\\x20a-zA-Z0-9()+,.\\x2F:=?;!*\\#@$_%-]*\'
// | "[\'\\r\\n\\x20a-zA-Z0-9()+,.\\x2F:=?;!*\\#@$_%-]*" ) ) )
// \\g<s>+ ( " [^"\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]* "
// | \' [^\'\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]* \' )
// )
// )? \\g<s>*
// ( \\[
// (
// <!ELEMENT \\g<s>+ \\g<nm> \\g<s>+
// ( EMPTY | ANY
// | \\( \\g<s>* \\#PCDATA ( \\g<s>* \\)
// | ( (?<or> \\g<s>* \\| \\g<s>*) \\g<nm> )* \\g<s>* \\)\\* )
// |
// (?<kids>
// \\( \\g<s>* ( \\g<nm> [?*+]? | \\g<kids> ) (
// ( \\g<or> ( \\g<nm> [?*+]? | \\g<kids> ) )+
// | ( \\g<s>* , \\g<s>* ( \\g<nm> [?*+]? | \\g<kids> ) )*
// ) \\g<s>*
// \\) [?*+]?
// )
// ) \\g<s>* >
// | <!ENTITY \\g<s>+ (
// ( % \\g<s>+ )? \\g<nm> \\g<s>+
// ( " ( [^"%&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+
// # WFC: PEs in Internal Subset | % \\g<nm> ;
// | (?<ref> & \\g<nm> ; )
// | (?<hash> &\\#
// ( 0* ( 9 | 10 | 13
// | 3[2-9] | [4-9][0-9]
// | [1-9][0-9][0-9][0-9]?
// | [1-47-9][0-9]{4} | 5[0-48-9][0-9]{3} | 55[0-1][0-9][0-9] | 552[0-8][0-9] | 5529[0-5]
// | 5734[4-9] | 573[5-9][0-9] | 57[4-9][0-9][0-9]
// | 6[0-46-9][0-9]{3} | 65[0-46-9][0-9][0-9] | 655[0-24-9][0-9] | 6553[0-36-9]
// | [1-9][0-9]{5}
// | 10[0-9]{5} | 110[0-9]{4} | 111[0-3][0-9]{3} | 11140[0-9][0-9] | 111410[0-9] | 111411[01] )
// | x 0* ( [9aAdD]
// | [2-9a-f] [0-9a-fA-F]
// | [1-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
// | [1-9a-cA-CeE] [0-9a-fA-F]{3}
// | [dD] [0-7] [0-9a-fA-F] [0-9a-fA-F]
// | [fF] [0-9a-eA-E] [0-9a-fA-F] [0-9a-fA-F]
// | [fF] [fF] [0-9a-eA-E] [0-9a-fA-F]
// | [fF] [fF] [fF] [0-9a-dA-D]
// | (10|[1-9a-fA-F]) [0-9a-fA-F]{4} )
// ) ; )
// )* "
// | \' ( [^\'%&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+
// # | % \\g<nm> ;
// | \\g<ref>
// | \\g<hash> )* \' )
// | \\g<nm> \\g<s>+ \\g<world> ( \\g<s>+ NDATA \\g<s>+ \\g<nm> )?
// | % \\g<s>+ \\g<nm> \\g<s>+ \\g<world>
// ) \\g<s>* >
// | <!NOTATION \\g<s>+ \\g<nm> \\g<s>+ ( \\g<world> | \\g<idpublic> ) \\g<s>* >
// | <!ATTLIST \\g<s>+ \\g<nm> ( \\g<s>+ \\g<nm> \\g<s>+
// ( CDATA | ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS
// | NOTATION \\g<s>+ \\( \\g<s>* \\g<nm> ( \\g<or> \\g<nm> )* \\g<s>* \\)
// | \\( \\g<s>* \\g<latter>+ ( \\g<or> \\g<latter>+ )* \\g<s>* \\)
// )
// \\g<s>+
// ( \\#REQUIRED
// | \\#IMPLIED
// | ( \\#FIXED \\g<s>+ )?
// (?<attvalue> " ( [^"<&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+ | \\g<ref> | \\g<hash> )* "
// | \' ( [^\'<&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+ | \\g<ref> | \\g<hash> )* \' )
// )
// )*
// \\g<s>* >
// | \\g<procinst>
// | \\g<comment>
// | % \\g<nm> ;
// | \\g<s>+
// )*
// ] \\g<s>*
// )? >
// )?
// \\g<misc>*
// (?<element>
// < (?<tag> \\g<nm>)
// ( \\g<s>+ \\g<nm> \\g<s>*=\\g<s>* \\g<attvalue> )* \\g<s>*
// ( \\x2F >
// | >
// (?<data> >+ | ( >* ( [^><&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+
// | [^\\]<&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}] ]? > )+ ) )?
// (
// ( \\g<element>
// | \\g<ref>
// | \\g<hash>
// | <!\\[CDATA\\[ >* ( [^>\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+
// | [^\\]\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}] ]? > )* ]]>
// | \\g<procinst>
// | \\g<comment> )
// \\g<data>?
// )*
// < \\x2F \\k<tag> \\g<s>* > )
// )
// \\g<misc>*
// $
// ', 'u')
// Sample input: a single element wrapping plain text.
const str = `<doc>hello</doc>`;
// When the regex is defined globally and reused, reset it first
// with `regex.lastIndex = 0` before calling `exec` again.
let m;
m = regex.exec(str);
if (m !== null) {
  // Walk every indexed entry of the match result and print it
  // together with its position in the result array.
  for (const [groupIndex, match] of m.entries()) {
    console.log(`Found match, group ${groupIndex}: ${match}`);
  }
}
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for JavaScript, please visit: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions