# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# Pattern that finds every complete line which appears again, verbatim and as
# a whole line, somewhere later in the text.
#   group 1 - the line being examined
#   group 2 - the later identical line, captured inside the lookahead
# The `^` and `$` anchors are meant to match at every line boundary, so the
# pattern only works when it is applied with re.MULTILINE.
regex = r"^(.+)$(?=[\s\S]*^(\1)$[\s\S]*)"

# Sample mailing-list archive text used as the search subject.
test_str = ("\n\n"
    "An HTML attachment was scrubbed...\n"
    "URL: https://list.something.edu/mailman/attachments/200908002/7452/attachment.html\n"
    "From SomeAddress at SomeDomain.org SomeDate\n"
    "From: SomeonesEmail at SomeDomain.org (FirstName LastName)\n"
    "Date: [the date]\n"
    "Subject: [ListservName] SomeSubject\n"
    "In-Reply-To: <SomeIDnumber>\n"
    "References: <%SomeEmailAddress>\n"
    " <SomeIDnumber>\n"
    "Message-ID: <SomeIDnumber>\n\n"
    "Hey there, Everyone,\n\n"
    "I completely disagree with this. It's crap!\n\n"
    "Sincerely,\n"
    "SomePerson\n\n\n\n"
    "On SomeDate, Someone wrote:\n"
    "From: SomeonesEmail at SomeDomain.org (FirstName LastName)\n"
    "Date: [the date]\n"
    "Subject: [ListservName] SomeSubject\n"
    "In-Reply-To: <SomeIDnumber>\n"
    "References: <%SomeEmailAddress>\n"
    " <SomeIDnumber>\n"
    "Message-ID: <SomeIDnumber>\n\n"
    "Hey PersonA,\n\n"
    "This is my advice. It is really good advice. I hope you take it since I think it's pertinent.\n\n"
    "Sincerely,\n"
    "PersonZ\n\n\n\n\n"
    "An HTML attachment was scrubbed...\n"
    "URL: https://list.something.edu/mailman/attachments/200908002/7452/attachment.html\n"
    "From SomeAdress at SomeDomain.org SomeDate\n"
    "From: SomeonesEmail at SomeDomain.org (FirstName LastName)\n"
    "Date: [the date]\n"
    "Subject: [ListservName] SomeSubject\n"
    "In-Reply-To: <SomeIDnumber>\n"
    "References: <%SomeEmailAddress>\n"
    " <SomeIDnumber>\n"
    "Message-ID: <SomeIDnumber>\n\n"
    "Good Afternoon, PersonA,\n\n"
    "But have you considered this aspect? It changes everything, so this is my advice. It is even better advice! \n\n"
    "Sincerely,\n"
    "PersonB\n\n\n\n"
    "On SomeDate, PersonZ wrote:\n\n"
    "Hey PersonA,\n\n"
    "This is my advice. It\n"
    "is really\n"
    "good advice. I hope you take it since I\n"
    "think it's pertinent.\n\n"
    "Sincerely,\n"
    "PersonZ\n\n\n\n"
    "From SomeAddress at SomeDomain.org SomeDate \n"
    "From: SomeonesEmail at SomeDomain.org (FirstName LastName)\n"
    "Date: [the date]\n"
    "Subject: [ListservName] SomeSubject\n"
    "In-Reply-To: <SomeIDnumber>\n"
    "References: <%SomeEmailAddress>\n"
    " <SomeIDnumber>\n"
    "Message-ID: <SomeIDnumber>\n\n"
    "Thank you all for your feedback on this issue. You've given me a lot to\n"
    "consider. Cheers.\n\n\n\n"
    "[And so on.]\n\n")

# re.MULTILINE is required: without it `^` only matches at the very start of
# the text (which begins with a blank line), so the pattern can never match.
matches = re.finditer(regex, test_str, re.MULTILINE)

# Report every match, followed by the span and text of each capture group.
for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))

    # Capture groups are numbered from 1; group 0 is the whole match above.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html