import re
# Pattern for spotting <a> tags whose visible link text shows a different
# domain than the one in the href attribute.
#   group 1: "name.tld" part of the href host
#   group 2: path part of the href (optional)
#   group 3: "name.tld" part of the host shown in the link text
#   group 4: path shown in the link text (optional)
# The negative lookahead (?!.*\1.*) rejects an anchor when the href domain
# (group 1) occurs anywhere in the remainder of the same line ("." does not
# match a newline because re.DOTALL is not set).
# re.MULTILINE only affects ^ and $, neither of which appears in the pattern.
regex = re.compile(r"(?i)<a\s+href=\"(?:https?:\/\/)?(?:w{3}\.)?(?:[^\"\/]*\.)?([a-z0-9_-]+\.[a-z0-9_-]{2,6})(\/[^\"]*)?\"[^>]*>(?!.*\1.*)(?:https?:\/\/)?(?:w{3}\.)?(?:[^\"\/]*\.)?([a-z0-9_-]+\.[a-z0-9_-]{2,6})(\/[^\"]*)?.*?<\/a>", flags=re.MULTILINE)

# Sample input, one anchor per line. The trailing "<-- MATCH" / "<-- NOT MATCH"
# notes are part of the text and record the intended outcome only; the script
# does not check them.
test_str = ("<a href=\"http://www.test1.net/dir1/index.html\" target=\"_blank\">test1.net/admin</a> <-- NOT MATCH\n"
    "<a href=\"https://test2.com\">THIS SITE</a> <-- NOT MATCH\n"
    "<a href=\"https://subdomain.test3.org\">test2.org</a> <-- MATCH\n"
    "<a href=\"http://www2.test4.com\" target=\"_blank\">https://global.test4.com/index.html</a> <-- NOT MATCH\n"
    "<a href=\"http://eu.test5.com\">https://evil.com/eu.test5.com/</a> <-- MATCH\n"
    "<a href=\"http://eu.site6.com/index.html\" target=\"_blank\">https://eu.evil.com</a> <-- MATCH\n"
    "<a href=\"https://site7.com/\">http://www.site7.com/123/test</a> <-- NOT MATCH")

matches = regex.finditer(test_str)

# Report every match, then each of its capture groups with its span.
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    # Optional groups that did not take part in the match are printed as
    # None with a span of -1 - -1.
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# NOTE: These code samples are automatically generated and are not guaranteed to work.
# If you find any syntax errors, feel free to submit a bug report.
# For a full regex reference for Python, see: https://docs.python.org/3/library/re.html