import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Demo that tries to apply an "XML 1.0 well-formedness" regular expression to a
 * small sample document and prints the overall match and every capture group.
 *
 * <p>The pattern was authored for PCRE2 (see the "Tested with" note inside the
 * pattern itself). It relies on subroutine calls ({@code \g<name>}) and on
 * recursion through those calls, neither of which is part of the
 * {@code java.util.regex} syntax. Compilation therefore fails on the JDK; this
 * class reports that failure in one readable line instead of letting the
 * exception escape from {@code main}.
 */
public class Example {

    /**
     * The PCRE2 pattern, kept character-for-character as generated. It is written in
     * extended (free-spacing) syntax, so the leading {@code #} lines are regex comments.
     */
    private static final String REGEX = "(?x)\n\n"
            + "# XML 1.0 well-formed validation implemented as a single regular expression.\n"
            + "# dedicated to the public domain\n\n"
            + "# Advanced features of regex engine used: Example: Alternative:\n"
            + "# - utf8 matching expand UTF8 sequences in char classes\n"
            + "# - utf8 \\x{D800}-\\x{DFFF} disallowed and max value make explicit if not enforced by regex engine\n"
            + "# - fixed number of repetitions {3} expand (max 5)\n"
            + "# - ignore whitespacd extended syntax (?x) remove spaces and comment lines\n"
            + "# - ascii escaped hex characters \\xhh alt escape notation\n"
            + "# - unicode hex in character class \\x{hhh} \\uhhhh \\Uhhhhhhhh or raw utf8\n"
            + "# - named capture group aka subroutine definition (?<xyz>) flatten + numbered\n"
            + "# - function call \\g<nm> flatten\n"
            + "# - recursive function call for \\{element\\}, \\{kids\\} \\g<element> programmatic check\n"
            + "# - back reference (end tag matching) \\k<tag> programmatic check\n\n"
            + "# Tested with:\n"
            + "# - PCRE2\n\n"
            + "# Implemented well-formedness constraints:\n"
            + "# - PEs in Internal subset\n"
            + "# - External Subset (by virtue of not loading externals)\n"
            + "# - Element Type Match\n"
            + "# - Legal Character\n"
            + "# - In DTD\n\n"
            + "# TODO:\n"
            + "# - optimisation: make anon groups non capturing (?: )\n"
            + "# - optimisation: atomic groups, eager/lazy, etc\n"
            + "# - unicode order character at start\n"
            + "# - support character set other than assume engine is utf8 and ignore encoding...\n\n"
            + "^ (<\\?xml\n"
            + " (?<s> [\\t\\n\\r\\x20])+\n"
            + " version \\g<s>*=\\g<s>* ( \"1\\.0\" | '1\\.0' )\n"
            + " ( \\g<s>+ encoding \\g<s>*=\\g<s>* ( \"[A-Za-z][A-Za-z0-9._-]*\"\n"
            + " | '[A-Za-z][A-Za-z0-9._-]*' ) )?\n"
            + " ( \\g<s>+ standalone \\g<s>*=\\g<s>* ( \"(yes|no)\" | '(yes|no)' ) )?\n"
            + " \\g<s>* \\?>)?\n"
            + " (?<misc> \\g<s>+\n"
            + " | (?<comment> <!-- ( -? [^\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}-] )* -->)\n"
            + " | (?<procinst> <\\?\n"
            + " ( [MmLl] | (?<firstnoxml> [a-kn-wyzA-KN-WYZ_:] | \n"
            + " [\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}] |\n"
            + " [\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}] )\n"
            + " (?<latter> [XxMmLl] | (?<xmlatter> [0-9.-] | \\g<firstnoxml> | [\\xB7\\x{300}-\\x{36F}\\x{203F}-\\x{2040}] ) )*\n"
            + " | [Xx] ( [XxLl] | \\g<xmlatter> ) \\g<latter>*\n"
            + " | [Xx] [Mm] ( [XxMm] | \\g<xmlatter> ) \\g<latter>*\n"
            + " | [Xx] [Mm] [Ll] \\g<latter>+ )\n"
            + " ( \\g<s>+ ( \\? [^>\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]\n"
            + " | [^?\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}] )* \\?? )? \\?> )\n"
            + " )*\n"
            + " (\n"
            + " <!DOCTYPE \\g<s>+ (?<nm> ([XxMmLl] | \\g<firstnoxml>) \\g<latter>*)\n"
            + " ( \\g<s>+ \n"
            + " (?<world>\n"
            + " ( SYSTEM\n"
            + " | (?<idpublic> PUBLIC \\g<s>+ ('[\\r\\n\\x20a-zA-Z0-9()+,.\\x2F:=?;!*\\#@$_%-]*'\n"
            + " | \"['\\r\\n\\x20a-zA-Z0-9()+,.\\x2F:=?;!*\\#@$_%-]*\" ) ) )\n"
            + " \\g<s>+ ( \" [^\"\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]* \"\n"
            + " | ' [^'\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]* ' )\n"
            + " )\n"
            + " )? \\g<s>*\n"
            + " ( \\[\n"
            + " (\n"
            + " <!ELEMENT \\g<s>+ \\g<nm> \\g<s>+\n"
            + " ( EMPTY | ANY \n"
            + " | \\( \\g<s>* \\#PCDATA ( \\g<s>* \\) \n"
            + " | ( (?<or> \\g<s>* \\| \\g<s>*) \\g<nm> )* \\g<s>* \\)\\* )\n"
            + " | \n"
            + " (?<kids> \n"
            + " \\( \\g<s>* ( \\g<nm> [?*+]? | \\g<kids> ) ( \n"
            + " ( \\g<or> ( \\g<nm> [?*+]? | \\g<kids> ) )+ \n"
            + " | ( \\g<s>* , \\g<s>* ( \\g<nm> [?*+]? | \\g<kids> ) )*\n"
            + " ) \\g<s>*\n"
            + " \\) [?*+]?\n"
            + " )\n"
            + " ) \\g<s>* >\n"
            + " | <!ENTITY \\g<s>+ (\n"
            + " ( % \\g<s>+ )? \\g<nm> \\g<s>+ \n"
            + " ( \" ( [^\"%&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+\n"
            + "# WFC: PEs in Internal Subset | % \\g<nm> ;\n"
            + " | (?<ref> & \\g<nm> ; )\n"
            + " | (?<hash> &\\# \n"
            + " ( 0* ( 9 | 10 | 13\n"
            + " | 3[2-9] | [4-9][0-9]\n"
            + " | [1-9][0-9][0-9][0-9]?\n"
            + " | [1-47-9][0-9]{4} | 5[0-48-9][0-9]{3} | 55[0-1][0-9][0-9] | 552[0-8][0-9] | 5529[0-5]\n"
            + " | 5734[4-9] | 573[5-9][0-9] | 57[4-9][0-9][0-9]\n"
            + " | 6[0-46-9][0-9]{3} | 65[0-46-9][0-9][0-9] | 655[0-24-9][0-9] | 6553[0-36-9]\n"
            + " | [1-9][0-9]{5}\n"
            + " | 10[0-9]{5} | 110[0-9]{4} | 111[0-3][0-9]{3} | 11140[0-9][0-9] | 111410[0-9] | 111411[01] )\n"
            + " | x 0* ( [9aAdD]\n"
            + " | [2-9a-f] [0-9a-fA-F]\n"
            + " | [1-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]\n"
            + " | [1-9a-cA-CeE] [0-9a-fA-F]{3}\n"
            + " | [dD] [0-7] [0-9a-fA-F] [0-9a-fA-F]\n"
            + " | [fF] [0-9a-eA-E] [0-9a-fA-F] [0-9a-fA-F]\n"
            + " | [fF] [fF] [0-9a-eA-E] [0-9a-fA-F]\n"
            + " | [fF] [fF] [fF] [0-9a-dA-D]\n"
            + " | (10|[1-9a-fA-F]) [0-9a-fA-F]{4} )\n"
            + " ) ; )\n"
            + " )* \"\n"
            + " | ' ( [^'%&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+\n"
            + "# | % \\g<nm> ; \n"
            + " | \\g<ref> \n"
            + " | \\g<hash> )* ' )\n"
            + " | \\g<nm> \\g<s>+ \\g<world> ( \\g<s>+ NDATA \\g<s>+ \\g<nm> )?\n"
            + " | % \\g<s>+ \\g<nm> \\g<s>+ \\g<world>\n"
            + " ) \\g<s>* >\n"
            + " | <!NOTATION \\g<s>+ \\g<nm> \\g<s>+ ( \\g<world> | \\g<idpublic> ) \\g<s>* >\n"
            + " | <!ATTLIST \\g<s>+ \\g<nm> ( \\g<s>+ \\g<nm> \\g<s>+ \n"
            + " ( CDATA | ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS \n"
            + " | NOTATION \\g<s>+ \\( \\g<s>* \\g<nm> ( \\g<or> \\g<nm> )* \\g<s>* \\)\n"
            + " | \\( \\g<s>* \\g<latter>+ ( \\g<or> \\g<latter>+ )* \\g<s>* \\)\n"
            + " )\n"
            + " \\g<s>+\n"
            + " ( \\#REQUIRED\n"
            + " | \\#IMPLIED\n"
            + " | ( \\#FIXED \\g<s>+ )?\n"
            + " (?<attvalue> \" ( [^\"<&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+ | \\g<ref> | \\g<hash> )* \"\n"
            + " | ' ( [^'<&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+ | \\g<ref> | \\g<hash> )* ' )\n"
            + " )\n"
            + " )*\n"
            + " \\g<s>* >\n"
            + " | \\g<procinst>\n"
            + " | \\g<comment>\n"
            + " | % \\g<nm> ;\n"
            + " | \\g<s>+\n"
            + " )*\n"
            + " ] \\g<s>*\n"
            + " )? >\n"
            + " )?\n"
            + " \\g<misc>* \n"
            + " (?<element>\n"
            + " < (?<tag> \\g<nm>)\n"
            + " ( \\g<s>+ \\g<nm> \\g<s>*=\\g<s>* \\g<attvalue> )* \\g<s>*\n"
            + " ( \\x2F >\n"
            + " | >\n"
            + " (?<data> >+ | ( >* ( [^><&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+\n"
            + " | [^\\]<&\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}] ]? > )+ ) )?\n"
            + " (\n"
            + " ( \\g<element>\n"
            + " | \\g<ref>\n"
            + " | \\g<hash>\n"
            + " | <!\\[CDATA\\[ >* ( [^>\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}]+\n"
            + " | [^\\]\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x{FFFE}\\x{FFFF}] ]? > )* ]]>\n"
            + " | \\g<procinst>\n"
            + " | \\g<comment> )\n"
            + " \\g<data>?\n"
            + " )*\n"
            + " < \\x2F \\k<tag> \\g<s>* > )\n"
            + " )\n"
            + " \\g<misc>*\n"
            + "$\n";

    /** Sample document the pattern is applied to. */
    private static final String SAMPLE = "<doc>hello</doc>";

    /**
     * Compiles {@link #REGEX}, searches {@link #SAMPLE} with it and, on a match, prints
     * the full match followed by every numbered capture group to standard output.
     *
     * <p>If the pattern cannot be compiled by {@code java.util.regex}, a one-line
     * explanation is written to standard error and the method returns without
     * attempting a match.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final Pattern pattern;
        try {
            // UNICODE_CASE was dropped: it only modifies CASE_INSENSITIVE matching,
            // which is not enabled here, so it had no effect on this pattern.
            pattern = Pattern.compile(REGEX, Pattern.COMMENTS);
        } catch (PatternSyntaxException e) {
            // Use getDescription()/getIndex() rather than getMessage(): the latter
            // echoes the entire multi-line pattern and buries the actual reason.
            System.err.println("Cannot compile the XML well-formedness pattern with java.util.regex: "
                    + e.getDescription() + " (near index " + e.getIndex() + "). "
                    + "The pattern was written for PCRE2 and uses subroutine calls (\\g<name>) "
                    + "and recursion, which java.util.regex does not implement.");
            return;
        }

        final Matcher matcher = pattern.matcher(SAMPLE);
        if (matcher.find()) {
            System.out.println("Full match: " + matcher.group(0));
            for (int i = 1; i <= matcher.groupCount(); i++) {
                System.out.println("Group " + i + ": " + matcher.group(i));
            }
        }
    }
}
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Java, please visit: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html