import re
# Pattern that finds opening <a ...> tags carrying an href attribute.
# Capture groups:
#   1 - attributes (plus trailing whitespace) that precede href, if any
#   2 - the href value exactly as written, including surrounding quotes
#   3 - value when single-quoted, 4 - value when double-quoted,
#   5 - value when unquoted
# Only one of groups 3-5 participates in any given match.
regex = re.compile(r"""
    <a\s+            # '<a' followed by whitespace
    ([^>]+\s+)?      # optional attributes before href, such as 'class=' or 'id='
    href\s*=\s*      # 'href=' with any whitespace around the '=' character
    (
        '([^']*)'    # '...' single-quoted value
        |            # ...or...
        "([^"]*)"    # "..." double-quoted value (closing quote is required)
        |            # ...or...
        ([^\s>]+)    # unquoted value: anything that is not '>' or whitespace
    )
    [^>]*>           # anything else up to the closing '>'
""", flags=re.VERBOSE | re.MULTILINE)
# Sample HTML document used as regex input. It mixes single-quoted,
# double-quoted and unquoted href values, attributes split across lines,
# an anchor without href, and an anchor inside an HTML comment.
test_str = "".join([
    '<!DOCTYPE html>\n',
    '<html>\n',
    '<head>\n',
    ' <title>Hyperlinks</title>\n',
    ' <link href="theme.css" rel="stylesheet" />\n',
    '</head>\n',
    '<body>\n',
    '<ul><li><a href="/" id="home">Home</a></li><li><a\n',
    ' class="selected" href=/courses>Courses</a>\n',
    '</li><li><a href = \n',
    "'/forum' >Forum</a></li><li><a class=\"href\"\n",
    'onclick="go()" href= "#">Forum</a></li>\n',
    '<li><a id="js" href =\n',
    "\"javascript:alert('hi yo')\" class=\"new\">click</a></li>\n",
    "<li><a id='nakov' href =\n",
    "http://www.nakov.com class='new'>nak</a></li></ul>\n",
    '<a href="#empty"></a>\n',
    "<a id=\"href\">href='fake'<img src='http://abv.bg/i.gif' \n",
    "alt='abv'/></a><a href=\"#\"><a href='hello'></a>\n",
    '<!-- This code is commented:\n',
    ' <a href="#commented">commentex hyperlink</a> -->\n',
    '</body>\n',
])
# Scan the sample document and report every hyperlink match, followed by
# the span and text of each capture group in that match.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    # A group that did not participate in the match is reported with
    # start/end of -1 and a value of None.
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Note: Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html