# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# Pattern with two alternatives:
#   [a-zA-Z]+\d+        ASCII letters immediately followed by digits (e.g. "S9", "Exynos9810")
#   (?<=\d)[a-zA-Z]+    ASCII letters immediately preceded by a digit (e.g. "GB" in "4GB")
# The pattern defines no capture groups.
regex = r"[a-zA-Z]+\d+|(?<=\d)[a-zA-Z]+"

# Sample text: Chinese sentences with embedded English / English+digit tokens.
test_str = ("新发布的三星盖乐世S9采用Exynos9810处理器,RAM为4GB,电池容量为3000mAh。\n"
            "期望将S9,Exynos9810,GB,mAh提取出来\n\n"
            "如何将一个句子中的英文词组或者英文+数字词组识别出来\n"
            "新发布的三星盖乐世S9的特性如下:1.采用Exynos9810处理器;2.RAM为4GB;3.电池容量为3000mAh。\n"
            "期望将S9,Exynos9810,GB,mAh提取出来,该用什么方法,希望得到各位的指点,谢谢。")

# re.MULTILINE only changes how ^ and $ behave; the pattern uses neither,
# so the flag is kept purely for parity with the generated sample.
matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    # Report the whole match with its [start, end) offsets in test_str.
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))

    # Capture groups are numbered from 1; with the current pattern there are
    # none, so this loop only produces output if groups are added to `regex`.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum),
        ))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work.
# If you find any syntax errors, feel free to submit a bug report.
# For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html