import re
# Pattern that isolates the span running from "Hello ... 進撃の巨人" through
# "Lorem ... ipsum", allowing arbitrary characters (e.g. markup) between the
# individual letters of each phrase.
#
# Flags:
#   re.VERBOSE - whitespace and "# ..." comments inside the pattern are ignored.
#   re.DOTALL  - "." also matches newlines, so the match can span several lines.
#
# Group 2 is the wanted span; group 1 (text before it) and the final group
# (text after it) are matched only so that the substitution can discard them.
regex = re.compile(r"""
(.*?) # Data before sentences (to be removed)
( # Capture Both sentences and text in between
H.*?e.*?l.*?l.*?o.*?\s # Hello[space]
(<.*?>)* # Optional Opening Tag(s)
進.*?撃.*?の.*?巨.*?人.*? # 進撃の巨人
(<\/.*?>)* # Optional Closing Tag(s)
(.*?) # Optional Data in between sentences
(<.*?>)* # Optional Opening Tag(s)
L.*?o.*?r.*?e.*?m.*?\s # Lorem[space]
(<.*?>)* # Optional Opening Tag(s)
i.*?p.*?s.*?u.*?m.*? # ipsum
)
(.*) # Data after sentences (to be removed)
""", flags=re.DOTALL | re.VERBOSE)

# Sample input: an HTML-like document containing both phrases, with tags
# interleaved between some of the characters.
test_str = ("\n"
            "<html>\n"
            "<body>\n"
            "<header>Hello <p> </p> 進撃<em>の巨</人!</em></header>\n"
            "random code\n"
            "random code\n"
            "<p>Lorem <span>ipsum<span>.<p>\n"
            "</body>\n"
            "</html>")

# Replace the whole match with group 2 only, dropping the leading and
# trailing text captured by the outer groups.
subst = "\\2"

result = regex.sub(subst, test_str)

# Only print when the substitution left something to show.
if result:
    print(result)
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html