import re
# Pattern matching one <url>...</url> entry of a sitemap document.
#
# Layout of an accepted entry: up to three optional children
# (<lastmod>, <changefreq>, <priority>, in any order), then the <loc>
# element, then up to three more optional children.
#
# Capture groups (numbering is positional, 1-7):
#   1 mod         <lastmod> text found before <loc>
#   2 freq        <changefreq> text found before <loc>
#   3 prio        <priority> text found before <loc>
#   4 uri         <loc> text
#   5 mod_after   <lastmod> text found after <loc>
#   6 freq_after  <changefreq> text found after <loc>
#   7 prio_after  <priority> text found after <loc>
#
# Python's ``re`` needs ``(?P<name>...)`` for named groups and rejects a
# group name that is defined twice, so the groups following <loc> carry an
# ``_after`` suffix. Plain non-capturing groups are used instead of atomic
# groups so the pattern also compiles on interpreters older than 3.11.
regex = re.compile(r"""
<url>\s*
(?:
    (?:
        (?:<lastmod>\s*(?P<mod>[^<]+)\s*</lastmod>)
        |
        (?:<changefreq>\s*(?P<freq>\w+)\s*</changefreq>)
        |
        (?:<priority>\s*(?P<prio>[01](?:\.\d{1,2})?)\s*</priority>)
    )\s*
){0,3}\s*
<loc>\s*
(?P<uri>[^<]+)\s*
</loc>\s*
(?:
    (?:
        (?:<lastmod>\s*(?P<mod_after>[^<]+)\s*</lastmod>)
        |
        (?:<changefreq>\s*(?P<freq_after>\w+)\s*</changefreq>)
        |
        (?:<priority>\s*(?P<prio_after>[01](?:\.\d{1,2})?)\s*</priority>)
    )\s*
){0,3}\s*
</url>
""", flags=re.VERBOSE | re.UNICODE)
# Sample sitemap document used as the search subject: an XML declaration,
# a <urlset> wrapper and five <url> entries. The lines are joined with
# "\n", so the final "</urlset>" line has no trailing newline.
test_str = "\n".join([
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ' <url>',
    ' <lastmod>2005-01-01</lastmod>',
    ' <changefreq>monthly</changefreq>',
    ' <priority>0.8</priority>',
    ' <loc>http://www.example.com/</loc>',
    ' </url>',
    ' <url>',
    ' <loc>http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc>',
    ' <changefreq>weekly</changefreq>',
    ' </url>',
    ' <url>',
    ' <loc>http://www.example.com/catalog?item=73&desc=vacation_new_zealand</loc>',
    ' <lastmod>2004-12-23</lastmod>',
    ' <changefreq>weekly</changefreq>',
    ' </url>',
    ' <url>',
    ' <loc>http://www.example.com/catalog?item=74&desc=vacation_newfoundland</loc>',
    ' <lastmod>2004-12-23T18:00:15+00:00</lastmod>',
    ' <priority>0.3</priority>',
    ' </url>',
    ' <url>',
    ' <loc>http://www.example.com/catalog?item=83&desc=vacation_usa</loc>',
    ' <lastmod>2004-11-23</lastmod>',
    ' </url>',
    '</urlset>',
])
# Report every match of `regex` in `test_str`: first the span and text of
# the whole match, then one line per capture group.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    for group_num, group in enumerate(match.groups(), start=1):
        if group is None:
            # A group that took no part in the match has the value None and
            # start()/end() of -1; say so instead of printing "-1--1: None".
            print(f"Group {group_num} did not participate in the match")
            continue
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html