import re
# Chemical-formula matcher.
#
# The pattern was originally written with PCRE-only constructs: named groups
# spelled ``(?<name>...)`` and subroutine calls ``(?&name)``, including a
# recursive call to the whole molecular unit. Python's ``re`` module supports
# neither, so the pattern is assembled here from plain string fragments and the
# recursion is expanded to a fixed nesting depth.

# Mass number / atomic number: 1-3 digits with no leading zero.
_ISOTOPE = r"[1-9]\d{0,2}"
# Proportion: a decimal, a positive integer, or the letter 'n'.
_COUNT = r"(?:\d+\.\d+|[1-9]\d*|n)"
# Ionic charge: optional magnitude followed by a sign.
_CHARGE = r"\d*[+\-]"

# Count and charge may be written inline or wrapped in HTML sub/sup tags.
_SUB_COUNT = "(?:" + _COUNT + "|<sub>" + _COUNT + "</sub>)"
_SUP_CHARGE = "(?:" + _CHARGE + "|<sup>" + _CHARGE + "</sup>)"

# Atomic symbols accepted by the pattern, grouped by first letter.
_SYMBOL = "(?:" + "|".join((
    "A[cglmrstu]",
    "B[aehikr]?",
    "C[adeflmnorsu]?",
    "D[bsy]",
    "E[rsu]",
    "F[elmr]?",
    "G[ade]",
    "H[efgos]?",
    "I[nr]?",
    "Kr?",
    "L[airuv]",
    "M[cdgnot]",
    "N[abdehiop]?",
    "O[gs]?",
    "P[abdmortu]?",
    "R[abefghnu]",
    "S[bcegimnr]?",
    "T[abcehilms]",
    "U",
    "V",
    "W",
    "Xe",
    "Yb?",
    "Z[nr]",
)) + ")"

# A single element component, e.g. 35Cl3-
_ELEMENT = (
    "(?:" + _ISOTOPE + "|<sup>" + _ISOTOPE + "</sup>)?"  # optional mass number
    + "(?:" + _ISOTOPE + "|<sub>" + _ISOTOPE + "</sub>)?"  # optional atomic number
    + _SYMBOL
    + _SUB_COUNT + "?"  # optional proportion
    + _SUP_CHARGE + "?"  # optional ionic charge
    + r"(?::{1,3}|[@\-=])?"  # optional connectivity to the next component
)

# ``re`` cannot recurse, so bracketed sub-units are supported only up to this
# many levels of nesting. The deepest sample in ``test_str`` nests 2 levels.
_MAX_NESTING = 3


def _build_unit(depth):
    """Return the pattern for a molecular unit nested at most *depth* brackets deep."""
    unit = "(?:" + _ELEMENT + ")+"
    for _ in range(depth):
        # An ion: square brackets around a unit, with an optional charge.
        ion = r"\[" + unit + r"\]" + _SUP_CHARGE + "?"
        # A repeating unit: parentheses around a unit, with a mandatory
        # count and an optional charge.
        repeat = r"\(" + unit + r"\)" + _SUB_COUNT + _SUP_CHARGE + "?"
        unit = "(?:" + "|".join((_ELEMENT, ion, repeat)) + ")+"
    return unit


# Only the outermost unit is captured: the inlined fragments repeat, and ``re``
# requires every group name to be unique within a pattern.
regex = re.compile(
    r"(?<![\w\-+])"  # must start at a word/formula boundary
    + r"(?=[\w\-+@=<>()\[\]/.]{3})"  # minimum number of characters
    + "(?P<unit>" + _build_unit(_MAX_NESTING) + ")"
    + r"(?![\w\-+])"  # must end at a word/formula boundary
)

test_str = ("BrI\n"
            "CCl4\n"
            "CH3I\n"
            "C2H5Br\n"
            "H2O4S\n"
            "Al2(SO4)3\n"
            "[[ClO2]+[ClO4]-]\n"
            "[SO4]2-\n"
            "CB4.2\n"
            "CBn\n"
            "CaCl2\n"
            "(CH3)3CH\n"
            "[Co(NH3)6]3+Cl3-\n"
            "[Co(NH<sub>3</sub>)<sub>6</sub>]<sup>3+</sup>Cl<sub>3</sub><sup>-</sup>\n"
            "C<sub>4</sub>H<sub>10</sub>\n"
            "[As@Ni12As20]3-\n"
            "[As@Ni<sub>12</sub>As<sub>20</sub>]<sup>3-</sup>\n"
            "CH3-CH2-OH\n"
            "SO<sub>4</sub><sup>2-</sup>\n"
            "Fe0.95O\n"
            "[32PO4]3-\n"
            "18O16O\n"
            "Fe0.95O\n"
            "<sup>16</sup><sub>8</sub>O<sub>2</sub>\n"
            "My secret chemical is H2O don't tell\n\n"
            "UNiCoRn-PoOP\n\n\n"
            "h2o case sensitive\n"
            "fireH2O not at word break\n"
            "()3- no atomic symbol\n"
            "NOt ReAL AToMiC SYMbOLs\n"
            "SO-4 inverted ion and count\n"
            "Fe0O zero count\n"
            "0H2 invalid isotope mass\n"
            "100010H invalid atomic number\n"
            "9999999H mass or atomic number too large\n")

# Report every match and the span of each capture group.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Note: this code sample was generated automatically and is not guaranteed to work. If you find a syntax error, please submit a bug report. For a full regex reference for Python, see https://docs.python.org/3/library/re.html