# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# Pattern searched for in test_str, e.g. "b1308 pspE" or "b0361 insD-1":
#   b\d{4}         a literal "b" followed by exactly four digits
#   \s             one whitespace character
#   [a-zA-Z]{3,4}  three or four ASCII letters
#   (-\d)?         optional capture group 1: a hyphen and a single digit
regex = r"b\d{4}\s[a-zA-Z]{3,4}(-\d)?"

# Sample input: adjacent string literals are concatenated into one multi-line string.
test_str = ("D b1308 pspE; thiosulfate sulfurtransferase PspE K03972 pspE; phage shock protein E\n"
            "B 09193 Unclassified: signaling and cellular processes-6\n"
            "C 99977 Transport\n"
            "D b2347 yfdC; inner membrane protein YfdC K21990 yfdC; formate-nitrite transporter family protein\n"
            "D b3657 yicJ; putative xyloside transporter YicJ K03292 TC.GPH; glycoside/pentoside/hexuronide:cation symporter, GPH family\n"
            "D b3876 yihO; putative sulfoquinovose transporter K03292 TC.GPH; glycoside/pentoside/hexuronide:cation symporter, GPH family\n"
            "D b0361 insD-1; IS2 element protein K07497 K07497; putative transposase\n"
            "D b1402 insD-2; IS2 insertion element protein InsB K07497 K07497; putative transposase")

# finditer returns a lazy, single-use iterator of match objects.
# re.MULTILINE only changes how ^ and $ behave; the pattern uses neither, so the
# flag does not affect which matches are found.
matches = re.finditer(regex, test_str, re.MULTILINE)

# Report every match (numbered from 1) with its character span in test_str.
for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum, start=match.start(), end=match.end(), match=match.group()))

    # Capture groups are numbered from 1 (group 0 is the whole match). When the
    # optional group did not take part in the match, group() is None and
    # start()/end() are -1.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum, start=match.start(groupNum), end=match.end(groupNum),
            group=match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html