import re
# Pattern for http(s)/ftp(s) URLs, matched case-insensitively.
#   group 1: the whole URL (scheme, "://", then every character up to the
#            next double quote, '<' or whitespace)
#   group 2: the scheme only (http, https, ftp or ftps)
# The trailing negative lookahead rejects a candidate URL when either
#   - a '>' can be reached without first crossing another '<' or '>'
#     (presumably: the URL sits inside a tag, e.g. in an attribute), or
#   - a '</a' can be reached without first crossing a double quote
#     (presumably: the URL is the text of an <a> element).
regex = re.compile(r"((https?|ftps?):\/\/[^\"<\s]+)(?![^<>]*>|[^\"]*?<\/a)", flags=re.IGNORECASE)

# Sample input: bare URLs, quoted URLs, and URLs in <a>/<img>/<u> markup.
test_str = ("\"http://www.website.com\"\n"
            "https://www.website.com\n"
            "\"ftp://www.website.com\n"
            "ftps://www.website.com\n\n"
            "<u>http://www.website.com</u>\n"
            "This is a link http://www.website.com that is not linked ftp://www.website.com\n"
            "This is a long link http://www.website.com/index.htm?foo=bar\n\n"
            "<a href=\"http://www.website.com\" target=\"_blank\">http://website.com</a >\n"
            "<a href=\"https://www.website.com\">http://website.com</a>\n"
            "<a href=\"http://www.website.com\"><u>http://www.website.com</u></a>\n"
            "<a href=\"ftp://www.website.com\">ftp://www.website.com</a>\n\n\n"
            "<img src=\"http://www.website.com\" target=\"_blank\"/>\n\n"
            "<a href=\"http://www.website.com\" target=\"_blank\">\n"
            "http://website.com\n"
            "</a>\n\n\n"
            "<a href=\"http://www.website.com\">\n\n\n"
            "http://www.website.com\n\n"
            "http://www.website.com \n"
            " \n"
            "</a>\n\n"
            "Lorem ipsum Test dolor sit amet, consetetur sadhttp://url.comipscing elitr, sed diam <a href=\"http://Test.com/url\">Test</a> eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam http://url.com et justo...\n\n\n"
            "<a href=\"http://www.website.com\">\n\n"
            "http://www.website.com\n"
            "<img src=\"http://www.website.com\" target=\"_blank\"/>\n"
            "http://www.website.com\n"
            "</a>")

# finditer returns a lazy iterator of match objects; it is consumed once by
# the loop below.
matches = regex.finditer(test_str)

for match_num, match in enumerate(matches, start=1):
    # Whole-match span and text.
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")

    # Capture groups are numbered from 1 (group 0 is the whole match).
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html