import re
# Pattern for the lines of a code-point table: single code points, ranges,
# optional mapping targets, optional comments, and "Start/End Table" markers.
#
# The previous pattern was written in PCRE syntax -- (?'name'...) and
# (?<name>...) -- and reused the same group names in several alternatives.
# Python's ``re`` rejects both, so the pattern never compiled. Here every
# name appears exactly once, spelled (?P<name>...).
#
# Named groups (None when the group took no part in the match):
#   range_start, range_end  -- both ends of a range line
#   char                    -- the code point of a single-character line
#   replace                 -- space-separated mapping targets; the empty
#                              string when the mapping field is present but
#                              blank; None when there is no mapping field
#   comment                 -- text after the final "; "
#   appendix, appendix_type, appendix_number, appendix_order
#                           -- parts of a "Start Table A.1" / "End Table A.1"
#                              marker
regex = re.compile(r"""
    # --- table entry -------------------------------------------------------
    ^[ ]*                                   # entries begin a line, any indent;
                                            # the anchor keeps a number at the
                                            # end of a prose line from matching
    (?:
        (?P<range_start>[A-F0-9]{4,7})      # range: first code point
        -
        (?P<range_end>[A-F0-9]{4,7})        # range: last code point
      |
        (?P<char>[A-F0-9]{4,7})             # single code point
        (?:                                 # optional mapping field
            ;[ ]
            (?P<replace>
                (?:[A-F0-9]{4,7}(?:[ ][A-F0-9]{4,7})*)?
            )
            (?=;[ ])                        # a mapping is always followed by
                                            # a comment field; without this a
                                            # plain comment would be misread
        )?
    )
    (?:;[ ](?P<comment>.*))?                # optional trailing comment
    \n                                      # the match includes the newline
  |
    # --- table boundary marker ----------------------------------------------
    (?P<appendix>
        (?P<appendix_type>Start|End)\sTable\s
        # NOTE(review): only two-part numbers (letter, dot, one digit) match;
        # a multi-level number such as C.1.1 would not.
        (?P<appendix_number>\w\.(?P<appendix_order>\d))
    )
    (?=[ ]-----\n)                          # must be followed by the closing rule
""", flags=re.VERBOSE | re.MULTILINE)
# Sample input for the pattern: table entries mixed with a page footer, a
# blank line and a page header. Every line, including the last, ends in "\n".
test_str = "\n".join([
    " F001-AB12",
    " 0221",
    " 0234-024F",
    " 02AE-02AF",
    " 03AB; 03CB; Case map",
    " 03B0; 03C5 0308 0301; Case map",
    " 03C2; 03C3; Case map",
    " 03B0; 03C5 0308 0301 0A1B; Case map",
    " ----- Start Table A.1 -----",
    " 03D0; 03B2; Case map",
    " 03D1; 03B8; Case map",
    " ----- End Table A.1 -----",
    " 03D2; 03C5; Additional folding",
    " 03D3; 03CD; Additional folding",
    " 00DF; 0073 0073; Case map",
    " 037B-037D",
    " 037F-0383",
    " 038B",
    "Hoffman & Blanchet Standards Track [Page 89]",
    "",
    "RFC 3454 Preparation of Internationalized Strings December 2002",
    " 1806; ; Map to nothing",
    " 1806; ; Map to nothing",
    " 1806; ; Map to nothing",
    " 1806; ; Map to nothing",
    " F0000-FFFFD",
    " 100000-10FFFD",
    " F0000",
    " 013B; 013C; Case map",
    " 013D; 013E; Case map",
    " 0080-009F; [CONTROL CHARACTERS]",
    " 06DD; ARABIC END OF AYAH",
    " 070F; SYRIAC ABBREVIATION MARK",
    " 180E; MONGOLIAN VOWEL SEPARATOR",
    " 200C; ZERO WIDTH NON-JOINER",
    " 200D; ZERO WIDTH JOINER",
    " 2028; LINE SEPARATOR",
    " 2029; PARAGRAPH SEPARATOR",
    " 2060; WORD JOINER",
    " 2061; FUNCTION APPLICATION",
    " 2062; INVISIBLE TIMES",
    " 013F; 0140; Case map",
    " 0141; 0142; Case map",
    " 0143; 0144; Case map",
    " 0145; 0146; Case map",
    " 0147; 0148; Case map",
    " 0149; 02BC 006E; Case map",
    " 014A; 014B; Case map",
    " 014C; 014D; Case map",
]) + "\n"
# Print every match found in the sample text, then each of its capture
# groups in numeric order.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    # A group that took no part in the match reports -1 for both offsets and
    # None for its text, so it prints as "-1--1: None".
    for group_num, group in enumerate(match.groups(), start=1):
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work.
# If you find any syntax errors, feel free to submit a bug report.
# For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html