import re
# Line-oriented pattern for "(number, number, token)" triples.
#
# Each line is either empty or holds one triple: optional spaces/tabs, "(",
# two runs of digits and one run of non-whitespace characters separated by
# commas, then ")". Spaces and tabs are tolerated around every delimiter.
# Because the whole triple sits in an optional group, an empty line also
# matches, with all three capture groups unset.
#
# VERBOSE lets the pattern be laid out one element per line, and MULTILINE
# makes ^ and $ anchor at every line rather than only at the string ends.
regex = re.compile(
    r"""
^
(?:
[ \t]*[(][ \t]*
(\d+)
[ \t]*,[ \t]*
(\d+)
[ \t]*,[ \t]*
(\S+)
[ \t]*[)][ \t]*
)?
$
""",
    re.X | re.M | re.U,
)
# Sample input for the pattern: one "(number, number, token)" triple per line,
# with varying amounts of whitespace around the parentheses and commas.
# Runs of triples are separated by an empty line, and the first number starts
# again at 0 after each empty line (presumably per-sentence character
# offsets -- confirm against whatever produced this data).
# Some tokens are themselves "(" or ")", and a few lines carry trailing
# spaces, so the literal must be kept exactly as written.
test_str = ("( 0, 12, Tokenization ) \n"
"( 13 , 15 , is ) \n"
" ( 16, 22, widely)\n"
"( 23, 31, regarded )\n"
"(32, 34, as )\n"
"(35, 36, a) \n"
"(37, 43, solved ) \n"
"(44, 51, problem)\n"
"(52, 55, due)\n"
"(56, 58, to)\n"
"(59, 62, the)\n"
"(63, 67, high)\n"
"(68, 76, accuracy)\n"
"(77, 81, that)\n"
"(82, 91, rulebased)\n"
"(92, 102, tokenizers)\n"
"(103, 110, achieve)\n"
"(110, 111, .)\n\n"
"(0, 3, But)\n"
"(4, 14, rule-based)\n"
"(15, 25, tokenizers)\n"
"(26, 29, are)\n"
"(30, 34, hard)\n"
"(35, 37, to)\n"
"(38, 46, maintain)\n"
"(47, 50, and)\n"
"(51, 56, their)\n"
"(57, 62, rules)\n"
"(63, 71, language)\n"
"(72, 80, specific)\n"
"(80, 81, .)\n\n"
"(0, 2, We)\n"
"(3, 7, show)\n"
"(8, 12, that)\n"
"(13, 17, high)\n"
"(18, 26, accuracy)\n"
"(27, 31, word)\n"
"(32, 35, and)\n"
"(36, 44, sentence)\n"
"(45, 57, segmentation)\n"
"(58, 61, can)\n"
"(62, 64, be)\n"
"(65, 73, achieved)\n"
"(74, 76, by)\n"
"(77, 82, using)\n"
"(83, 93, supervised)\n"
"(94, 102, sequence)\n"
"(103, 111, labeling)\n"
"(112, 114, on)\n"
"(115, 118, the)\n"
"(119, 128, character)\n"
"(129, 134, level)\n"
"(135, 143, combined)\n"
"(144, 148, with)\n"
"(149, 161, unsupervised)\n"
"(162, 169, feature)\n"
"(170, 178, learning)\n"
"(178, 179, .)\n\n"
"(0, 2, We)\n"
"(3, 12, evaluated)\n"
"(13, 16, our)\n"
"(17, 23, method)\n"
"(24, 26, on)\n"
"(27, 32, three)\n"
"(33, 42, languages)\n"
"(43, 46, and)\n"
"(47, 55, obtained)\n"
"(56, 61, error)\n"
"(62, 67, rates)\n"
"(68, 70, of)\n"
"(71, 75, 0.27)\n"
"(76, 77, ‰)\n"
"(78, 79, ()\n"
"(79, 86, English)\n"
"(86, 87, ))\n"
"(87, 88, ,)\n"
"(89, 93, 0.35)\n"
"(94, 95, ‰)\n"
"(96, 97, ( )\n"
"(97, 102, Dutch)\n"
"(102, 103, ) )\n"
"(104, 107, and)\n"
"(108, 112, 0.76)\n"
"(113, 114, ‰)\n"
"(115, 116, ()\n"
"(116, 123, Italian)\n"
"(123, 124, ))\n"
"(125, 128, for)\n"
"(129, 132, our)\n"
"(133, 137, best)\n"
"(138, 144, models)\n"
"(144, 145, .)")
# Walk every match of the pattern in the sample text and report the span and
# text of the match, followed by the span and text of each capture group.
matches = regex.finditer(test_str)
for match_num, match in enumerate(matches, start=1):
    print(f"Match {match_num} was found at {match.start()}-{match.end()}: {match.group()}")
    for group_num, group in enumerate(match.groups(), start=1):
        if group is None:
            # The triple in the pattern is optional, so an empty line matches
            # with no groups set. Match.start()/end() return -1 for such a
            # group, which would otherwise be printed as a bogus "-1--1" span.
            print(f"Group {group_num} did not participate in the match")
            continue
        print(f"Group {group_num} found at {match.start(group_num)}-{match.end(group_num)}: {group}")
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html