import re
# Pattern for HTML entity bodies that are missing their leading "&":
#   - one of the named entities nbsp / quot / divide, starting at a word
#     boundary, or
#   - a "#" followed by one or more hex digits,
# each terminated by ";". The negative lookbehind rejects any candidate that
# is immediately preceded by "&", i.e. a well-formed entity.
# NOTE: re.MULTILINE does not change matching here because the pattern has no
# ^ / $ anchors; the flag is kept so the compiled pattern stays unchanged.
regex = re.compile(r"(?<!&)(?:\b(?:nbsp|quot|divide)|#[0-9a-f]+);", flags=re.MULTILINE | re.IGNORECASE)

# Sample text the pattern is run against.
# NOTE(review): the U+FFFD replacement character in the first line looks like
# an encoding loss in the original sample -- kept verbatim, confirm the source.
test_str = ("I have a huge HTML with several special chars, in the forms or \"�.\n"
    "Faulty HEX: #82173333;\n"
    "Some of them are wrong, because they lack the initial &.\n\n"
    "I would like to search for such wrong spacial chars. I know that I can search all the right special chars by means of the following regex:\n\n"
    " \\&(?:[a-z]+|#x?\\d+);\\\n"
    "But I'd need a regex useful to search the wrong ones (without the initial &). Can you help me? Thanks in advance\n\n"
    "Edit:\n\n"
    "As suggested, I post an example. My HTML cointains the following statement:\n\n"
    " <![CDATA[<nolink>blablabla blablabla</nolink>]]>nbsp;\n"
    "where we have 2 special HTML character:\n"
    "divide;\n"
    "÷\n"
    " \n"
    "quot;\n"
    "I'm interested in finding the second item, because it is wrong (laking the initial &).\n\n"
    "So the output of the requested regex should be: quot;")

# Report every match with its 1-based ordinal and its character offsets.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    # The current pattern has no capturing groups, so this loop prints nothing;
    # it is kept so that groups added to the pattern later are reported too.
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html