import re
# Pattern under test. Immediately after a literal "<a" (look-behind) it captures:
#   group 1 - one or more characters preceding the href attribute,
#   group 2 - the opening quote character (" or '),
#   group 3 - the quoted href value (lazy, stops at the first closing quote),
#   group 4 - the closing quote, which must equal group 2 (back-reference \2).
# NOTE(review): with re.DOTALL the greedy `(.+)` also crosses newlines, so on
# the sample below everything from the first "<a" up to the last quoted href
# is consumed as a single match - confirm that this is the intended behavior.
regex = re.compile(r"(?<=<a)(.+)\s?href\s*=\s*(\"|')(.+?)(\2)", flags=re.DOTALL)

# Sample HTML document the pattern is exercised against. It mixes quoting
# styles, whitespace/newlines around "=", an unquoted href, text that merely
# contains the word "href", and a commented-out anchor.
test_str = ("<!DOCTYPE html>\n"
            "<html>\n"
            "<head>\n"
            "<title>Hyperlinks</title>\n"
            "<link href=\"theme.css\" rel=\"stylesheet\" />\n"
            "</head>\n"
            "<body>\n"
            "<ul><li><a href=\"/\" id=\"home\">Home</a></li><li><a\n\n"
            "class=\"selected\" href=\"/courses\">Courses</a>\n"
            "</li><li><a href =\n"
            "'/forum' >Forum</a></li><li><a class=\"href\"\n"
            "onclick=\"go()\" href= \"#\">Forum</a></li>\n"
            "<li><a id=\"js\" href =\n"
            "\"javascript:alert('hi yo')\" class=\"new\">click</a></li>\n"
            "<li><a id='nakov' href =\n"
            "http://www.nakov.com class='new'>nak</a></li></ul>\n"
            "a href=\"#empty\"></a>\n"
            "<a id=\"href\">href='fake'<img src='http://abv.bg/i.gif'\n"
            "alt='abv'/></a><a href=\"#\"><a href='hello'></a>\n"
            "<!-- This code is commented:\n"
            "<a href=\"#commented\">commentex hyperlink</a> -->\n"
            "</body>\n")

# Lazily iterate over every non-overlapping match in the sample text.
matches = regex.finditer(test_str)

# Report each match (numbered from 1) with its span, then each of its capture
# groups with their own spans. The loop bodies must be indented; without the
# indentation the script fails with an IndentationError before running.
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html