import re
# Matches the start of an opening HTML tag and captures the tag name in
# group 1. Closing tags ("</x>") and "<!...>" constructs are not matched,
# because the character right after "<" must be a word character.
# The name may be followed by a run of whitespace or by a single ">", which is
# then included in the overall match; with neither, only "<name" is matched.
# The whitespace branch uses `\s+` rather than `\s*`: `\s*` always succeeds on
# the empty string, which would make the ">" branch unreachable.
# re.MULTILINE has no effect on this pattern (it contains no ^/$ anchors).
regex = re.compile(r"<(\w+)(?:\s+|>)?", flags=re.MULTILINE)
# Sample HTML document used as input for the tag-matching demo.
# The Greek text in the description, feed title and navigation labels was
# stored as mojibake (UTF-8 bytes decoded with a single-byte code page); it is
# restored here to the intended words. The feed title word ("έργα") is a
# best-effort reconstruction.
# NOTE(review): the "</script?" on the Typekit line is kept exactly as in the
# sample; presumably a typo in the pasted markup - confirm before changing.
test_str = ("\n"
    "<!DOCTYPE html>\n"
    "<html lang=\"el\" dir=\"ltr\" class=\"no-js\">\n"
    "<head>\n"
    " <meta charset=\"utf-8\" />\n"
    " <title>Radial</title>\n"
    " <meta name=\"description\" content=\"Η Radial σχεδιάζει και αναπτύσσει websites, οργανώνει online καμπάνιες και δημιουργεί εταιρικές ταυτότητες.\" />\n"
    " \n"
    " <meta property=\"fb:page_id\" content=\"118981334805256\" />\n"
    " <meta property=\"fb:app_id\" content=\"166019300145537\" />\n"
    " \n"
    " \n"
    " <!--[if lt IE 9]><script type=\"text/javascript\" src=\"http://html5shiv.googlecode.com/svn/trunk/html5.js\"></script><![endif]-->\n"
    " <script type=\"text/javascript\">(function(H){H.className=H.className.replace(/\\bno-js\\b/,'js')})(document.documentElement);</script>\n"
    " \n"
    " <link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"http://static.radial.gr/styles/screen.css?43614\" />\n"
    " \n"
    " <script type=\"text/javascript\" src=\"http://use.typekit.com/ioa2dxy.js\"></script>\n"
    " <script type=\"text/javascript\">try{Typekit.load();}catch(e){}</script?\n"
    " \n"
    " <link rel=\"alternate\" type=\"application/rss+xml\" title=\"Τελευταία έργα\" href=\"/feed/\" />\n"
    " \n"
    " \n"
    " \n"
    "</head>\n\n"
    "<body id=\"home\" class=\"\">\n\n"
    "<header id=\"main-header\"><div class=\"wrapper\">\n"
    " <div id=\"logo\"><a href=\"/\"><img src=\"http://static.radial.gr/images/radial-logo.png\" alt=\"Radial\" /></a></div>\n"
    " <nav id=\"main-nav\">\n"
    " <ul>\n"
    " <li><a href=\"/projects/\">Έργα</a></li>\n"
    " <li><a href=\"/profile/\">Προφίλ</a></li>\n"
    " <li><a href=\"/contact/\">Επικοινωνία</a></li>\n"
    " </ul>\n"
    " </nav>\n"
    " <nav id=\"lang-nav\">\n"
    " <span>EΛ</span>\n"
    " <a href=\"/en/\">EN</a>\n"
    " </nav>\n\n"
    "<br />\n\n"
    "</div></header>\n\n"
    "<script type=\"text/javascript\">\n"
    " \n"
    " var _gaq = _gaq || [];\n"
    " _gaq.push(['_setAccount', 'UA-515952-44']);\n"
    " _gaq.push(['_trackPageview']);\n"
    " (function() {\n"
    " var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;\n"
    " ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';\n"
    " var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);\n"
    " })();\n\n\n"
    "</script>\n\n"
    "</body>\n"
    "</html>\n")
# Scan the sample document and report every match of the compiled pattern.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    # start()/end() are offsets into test_str; end() is exclusive.
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    # groups() yields the capture groups in order; numbering starts at 1
    # because group 0 is the whole match, which was reported above.
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html