import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demonstrates splitting HTML-like text into "content followed by one piece of markup"
 * pairs with a single verbose regular expression, and prints every capture group of
 * every match to standard output.
 *
 * <p>The expression is written in Java's regex dialect: named groups use
 * {@code (?<name>...)}, named back-references use {@code \k<name>}, and group names
 * contain only ASCII letters and digits (underscores are not allowed in Java).
 */
public class Example {

    /**
     * Verbose ({@link Pattern#COMMENTS}) expression matching the text up to the next piece
     * of markup plus that markup. Capture groups, in index order:
     * <ol>
     *   <li>{@code content} - text preceding the markup</li>
     *   <li>{@code markup} - the whole CDATA section, comment, closing tag or opening tag</li>
     *   <li>{@code cdata} - body of a {@code <![CDATA[ ... ]]>} section</li>
     *   <li>{@code comment} - body of a {@code <!-- ... -->} comment</li>
     *   <li>{@code closeTag} - name of a closing tag</li>
     *   <li>{@code tag} - name of an opening tag</li>
     *   <li>{@code attributes} - the full attribute list of an opening tag</li>
     *   <li>{@code attribute} - the last attribute matched by the repetition</li>
     *   <li>{@code attributeName} - name of that attribute</li>
     *   <li>{@code attributeValue} - its value, including surrounding quotes if any</li>
     *   <li>{@code quoted} - helper: quoted value captured inside a lookahead</li>
     *   <li>{@code quote} - helper: the quote character that opened the value</li>
     *   <li>{@code isSelfClosing} - {@code "/"} when a slash directly precedes {@code >}</li>
     * </ol>
     *
     * <p>The quoted-value branch captures inside a lookahead and then consumes the capture
     * via a back-reference, so the engine cannot backtrack into the quoted string. Note
     * that the unquoted-value character class contains {@code /}, so an unquoted value
     * immediately followed by {@code />} absorbs the slash.
     */
    private static final String TAG_REGEX =
            "(?<content>.*?) # Content up to next tag\n"
            + "(?<markup> # Entire tag\n"
            + " <!\\[CDATA\\[(?<cdata>.+?)]]>| # <![CDATA[ ... ]]>\n"
            + " <!--(?<comment>.+?)-->| # <!-- Comment -->\n"
            + " </\\s*(?<closeTag>\\w+)\\s*>| # </tag>\n"
            + " <(?<tag>\\w+) # <tag ...\n"
            + " (?<attributes>\n"
            + " (?<attribute>\\s+\n"
            + "# <snip>: Use this part to get the attributes out of 'attributes' group.\n"
            + " (?<attributeName>\\w+)\n"
            + " (?:\\s*=\\s*\n"
            + " (?<attributeValue>\n"
            + " [\\w:/.\\-]+| # Unquoted\n"
            + " (?=(?<quoted> # Quoted\n"
            + " (?<quote>['\\\"]).*?(?<!\\\\)\\k<quote>))\n"
            + " \\k<quoted>\n"
            + " ))?\n"
            + "# </snip>\n"
            + " )*\n"
            + " )\\s*\n"
            + " (?<isSelfClosing>/?) # Self-closing indicator\n"
            + " >) # End of tag\n";

    /**
     * Compiled once. DOTALL lets {@code .} cross line breaks (content, comments and CDATA
     * may span lines); COMMENTS enables the whitespace and {@code #} comments used above.
     */
    private static final Pattern TAG_PATTERN =
            Pattern.compile(TAG_REGEX, Pattern.DOTALL | Pattern.COMMENTS);

    /** Sample input exercising quoting styles, comments, CDATA and malformed tags. */
    private static final String SAMPLE_TEXT =
            "In this case, $url will indeed contain http://example.com/whatever.jpg. But what happens when you start getting HTML like this:\n\n"
            + "<img src='http://example.com/whatever.jpg'>\n"
            + "or\n\n"
            + "<img src=http://example.com/whatever.jpg>\n"
            + "or\n\n"
            + "<img border=0 src=\"http://example.com/whatever.jpg\">\n"
            + "or\n\n"
            + "<img border src=\"http://example.com/whatever.jpg\">\n"
            + "or\n\n"
            + "<img\n"
            + " src=\"http://example.com/whatever.jpg\">\n"
            + "or you start getting false positives from\n\n"
            + "<!-- // commented out\n"
            + "<img src=\"http://example.com/outdated.png\">\n"
            + "-->\n\n"
            + "<script><![CDATA[ This is <b>not</b> parsed ]]></script>\n\n"
            + "<asd ASD=asd>\n\n"
            + "<!-- // commented out <img src=\"http://example.com/outdated.png\"> -->\n\n"
            + "No quotes:\n"
            + "<iframe src=test.html target=xyz></ iframe >\n"
            + "Self-closing tag:\n"
            + "<a href=test.html target=xyz/>\n"
            + "Self closing tag with a space before closure:\n"
            + "<a href=test.html target=xyz />\n"
            + "Double quotes:\n"
            + "<a href=\"test.html\" target=\"xyz\">\n"
            + "Single quotes:\n"
            + "<a href='test.html' target='xyz'>\n"
            + "Escaping double quotes:\n"
            + "<a href=\"test.html?val=1\" title=\"\\\"No rules exist\\\" Andre Breton's quote\">\n"
            + "Escaping single quotes (also with spaces between equals signs):\n"
            + "<a href = \"test.html?val=1\" title = 'Charlie\\'s Angels'>\n"
            + "Tag without closure (ignored):\n"
            + "<a href = \"test.html?val=1\" title='Charlie\\'s Angels'\n"
            + "Tag without opening (ignored):\n"
            + "a href = \"test.html?val=1\" title=\"Charlie\\\"s Angels\">\n\n";

    /**
     * Runs the pattern over the sample text and prints, for each match, the full match
     * followed by every capture group by index. Groups that did not take part in a match
     * are printed as {@code null}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final Matcher matcher = TAG_PATTERN.matcher(SAMPLE_TEXT);
        while (matcher.find()) {
            System.out.println("Full match: " + matcher.group(0));
            for (int i = 1; i <= matcher.groupCount(); i++) {
                System.out.println("Group " + i + ": " + matcher.group(i));
            }
        }
    }
}
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Java, please visit: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html