import re
# Demo script: in every `<a href="....html">` link whose target contains a
# hyphen, replace the LAST hyphen of the target with an underscore.
#
# Pattern breakdown:
#   (?<=<a href=\")   lookbehind: match must start right after `<a href="`
#   ([^\"]*)          group 1: greedy run of non-quote chars, so it extends
#                     up to the last hyphen that still lets the rest match
#   -                 the hyphen that gets replaced
#   ([^&\"]*)         group 2: remainder of the target (no `&` or `"`)
#   (?=\.html\">)     lookahead: target must end in `.html">`
regex = re.compile(r"(?<=<a href=\")([^\"]*)-([^&\"]*)(?=\.html\">)", flags=re.MULTILINE)

# Sample breadcrumb markup used as the substitution input.
test_str = ("<div id=\"breadcrumb-section\">\n"
            " <ol id=\"breadcrumbs\">\n"
            " <li class=\"first\">\n"
            " <span>\n"
            " <a href=\"index.html\">Technologies</a>\n"
            " </span>\n"
            " </li>\n"
            " <li>\n"
            " <span>\n"
            " <a href=\"Some-Technologies-Documentation_218464400.html\">Some Technologies Documentation</a>\n"
            " </span>\n"
            " </li>\n"
            " <li>\n"
            " <span>\n"
            " <a href=\"Some-Other-Documentation_268370090.html\">Some Other Documentation</a>\n"
            " </span>\n"
            " </li>\n"
            " <li>\n"
            " <span>\n"
            " <a href=\"Another-Documentation_268370112.html\">Another Documentation</a>\n"
            " </span>\n"
            " </li>\n"
            " </ol>\n"
            "</div>")

# Python's re.sub refers to captured groups as \1 / \g<1>; the "$1" form used
# by other regex flavours would be inserted literally. A raw string keeps the
# backslashes intact for the replacement template.
subst = r"\1_\2"

result = regex.sub(subst, test_str)

if result:
    print(result)
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html