# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# Pattern that extracts the subtitle text of every cue in an SRT document:
#   \> .+\s+       end of the "start --> end" timestamp line plus its newline
#   (?P<text>...)  the cue text itself, lazily matched and possibly multi-line
#   (?=...)        stop before "blank line + numeric cue index line", or at
#                  the end of the input (trailing whitespace is left out)
# Python's re module spells named groups (?P<name>...); the (?<name>...) form
# raises re.error at compile time.
# The lookahead insists on a blank line before the next index so that a number
# ending a subtitle line (e.g. "d'en faire 20") stays part of the text.
regex = r"\> .+\s+(?P<text>[\w\W]+?)(?=\s*\n\s*\n\d+\s*\n|\s*$)"
test_str = ("1\n"
"00:00:00,000 --> 00:00:00,500\n"
"moi j'ai\n"
"fait des \n\n"
"2\n"
"00:00:00,500 --> 00:00:01,780\n"
"rêves lucides\n\n"
"3\n"
"00:00:01,780 --> 00:00:03,340\n"
"mais j’ai\n"
"jamais essayé\n\n"
"4\n"
"00:00:03,340 --> 00:00:03,820\n"
"d'en faire\n\n"
"5\n"
"00:00:03,820 --> 00:00:04,700\n"
"ou quoi\n\n"
"6\n"
"00:00:04,700 --> 00:00:05,500\n"
"J'ai dû\n"
"en faire\n\n"
"7\n"
"00:00:05,500 --> 00:00:06,720\n"
"genre cinq\n\n"
"8\n"
"00:00:06,720 --> 00:00:07,380\n"
"mais il y en\n\n"
"9\n"
"00:00:07,380 --> 00:00:09,160\n"
"a deux des cinq\n"
"où vraiment\n\n"
"10\n"
"00:00:09,160 --> 00:00:10,260\n"
"je faisais\n"
"ce que je veux\n\n"
"11\n"
"00:00:10,260 --> 00:00:11,080\n"
"Genre j'ai volé\n\n"
"12\n"
"00:00:11,080 --> 00:00:12,100\n"
"tout ce à quoi\n"
"je pensais\n\n"
"13\n"
"00:00:12,100 --> 00:00:12,720\n"
"je le faisais\n\n"
"14\n"
"00:00:12,720 --> 00:00:13,260\n"
"c'était\n"
"vraiment\n\n"
"15\n"
"00:00:13,260 --> 00:00:14,040\n"
"tranquille\n"
"bon c'était\n\n"
"16\n"
"00:00:14,040 --> 00:00:14,640\n"
"pendant\n"
"ma période\n\n"
"17\n"
"00:00:14,640 --> 00:00:16,140\n"
"où j'étais\n"
"un peu\n\n"
"18\n"
"00:00:16,140 --> 00:00:16,800\n"
"dans un autre\n"
"monde\n\n"
"19\n"
"00:00:16,800 --> 00:00:17,360\n"
"de manière\n"
"générale\n\n"
"20\n"
"00:00:17,360 --> 00:00:17,720\n"
"Et j'ai fait\n\n"
"21\n"
"00:00:17,720 --> 00:00:18,820\n"
"l'inverse aussi\n\n"
"22\n"
"00:00:18,820 --> 00:00:20,460\n"
"où j'étais\n"
"réveillé Ça\n\n"
"23\n"
"00:00:20,460 --> 00:00:20,800\n"
"j'ai jamais\n\n"
"24\n"
"00:00:20,800 --> 00:00:21,280\n"
"fait Par contre\n\n"
"25\n"
"00:00:21,280 --> 00:00:21,900\n"
"et heureusement\n\n"
"26\n"
"00:00:21,900 --> 00:00:22,580\n"
"parce que\n"
"ça a l'air\n\n"
"27\n"
"00:00:22,580 --> 00:00:23,680\n"
"tellement\n"
"flippant\n\n"
"28\n"
"00:00:23,680 --> 00:00:24,740\n"
"J'adorais\n"
"vivre ça\n\n"
"29\n"
"00:00:24,740 --> 00:00:26,100\n"
"ah t’es unn\n"
"malade\n\n"
"30\n"
"00:00:26,100 --> 00:00:26,740\n"
"Mais attendez\n\n"
"31\n"
"00:00:26,740 --> 00:00:27,960\n"
"une expérience\n"
"négative\n\n"
"32\n"
"00:00:27,960 --> 00:00:28,920\n"
"est une bonne\n"
"expérience\n\n"
"33\n"
"00:00:28,920 --> 00:00:30,620\n"
"Moi j'ai\n"
"une amoureuse\n\n"
"34\n"
"00:00:30,620 --> 00:00:31,520\n"
"qui vit ça\n\n"
"35\n"
"00:00:31,520 --> 00:00:32,140\n"
"très souvent\n\n"
"36\n"
"00:00:32,140 --> 00:00:32,460\n"
"j’ai pas envie\n\n"
"37\n"
"00:00:32,460 --> 00:00:32,880\n"
"d'en faire 20\n\n"
"38\n"
"00:00:32,880 --> 00:00:33,740\n"
"mais et je peux\n"
"te dire\n\n"
"39\n"
"00:00:33,740 --> 00:00:35,020\n"
"qu'elle n'en\n"
"peut plus quoi")
# Lazily iterate over every cue match in the sample document.
matches = re.finditer(regex, test_str)
for matchNum, match in enumerate(matches, start=1):
    # Report the whole match (timestamp tail + text) with its span.
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    # Capture groups are numbered from 1; group 0 is the whole match printed above.
    for groupNum in range(1, len(match.groups()) + 1):
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html