import Foundation
// Finds phrases of capitalised words built around the keyword "Study" or "Test":
// the keyword must have at least one capitalised word directly before it and/or
// directly after it, separated by spaces or tabs only (a phrase never spans lines).
//
// Building blocks, written inline because NSRegularExpression (ICU) supports neither
// PCRE's `(?(DEFINE)...)` groups nor `(?&name)` subroutine calls:
//   marker  \b[A-Z][-\w]*\b    a word that starts with an uppercase ASCII letter
//   ws      [\ \t]+            horizontal whitespace; the space is escaped because
//                              `.allowCommentsAndWhitespace` ignores bare whitespace
//   needle  \b(?:Study|Test)   the keyword itself
//
// First alternative:  (marker ws)+ needle (ws marker)*   -> "before", optionally "after"
// Second alternative: needle (ws marker)+                -> "after" only
let pattern = #"""
(?:
  (?: \b[A-Z][-\w]*\b [\ \t]+ )+
  \b(?:Study|Test)
  (?: [\ \t]+ \b[A-Z][-\w]*\b )*
|
  \b(?:Study|Test)
  (?: [\ \t]+ \b[A-Z][-\w]*\b )+
)
"""#

// The pattern is a compile-time constant, so a failure here is a programming error;
// stop with a message that includes the underlying error instead of a bare `try!` trap.
let regex: NSRegularExpression = {
    do {
        return try NSRegularExpression(
            pattern: pattern,
            options: [.anchorsMatchLines, .allowCommentsAndWhitespace]
        )
    } catch {
        fatalError("Invalid regular expression: \(error)")
    }
}()

let testString = #"""
I have been analyzing large amounts of text data. This is what I got so far:
(([A-Z][\w-]*)+\s+(\b(Study|Test)\b)(\s[A-Z][\w-]*)*)|(\b(Study|Test)\b)(\s[A-Z][\w-]*)+
Types of phrases I would like to capture:
Europe National Longitudinal Study
Longitudinal Study
Study Initiative
Longitudinal Study Initiative
I want to capture the word 'Study' or 'Test' ONLY if it is surrounded by the words starting with a capital letter. The ideal regex would achieve all of this + it would ignore\escape certain words like 'of' or 'the'.
*the above regex is super slow with the str.findall function, I guess there must be a better solution
** I used https://regex101.com for testing and then run it in Jupyter, Python 3
"""#

// NSRange offsets are expressed in UTF-16 code units.
let stringRange = NSRange(location: 0, length: testString.utf16.count)
let matches = regex.matches(in: testString, range: stringRange)

// One entry per match. Each entry starts with the whole matched phrase (range 0),
// followed by any capture groups that took part in the match. Range 0 is included
// because the pattern has no capture groups, so starting at 1 would drop every match.
var result: [[String]] = matches.compactMap { (match: NSTextCheckingResult) -> [String]? in
    let groups: [String] = (0 ..< match.numberOfRanges).compactMap { (index: Int) -> String? in
        let nsRange = match.range(at: index)
        // A group that did not participate in the match reports NSNotFound.
        guard nsRange.location != NSNotFound,
              let range = Range(nsRange, in: testString) else { return nil }
        return String(testString[range])
    }
    return groups.isEmpty ? nil : groups
}
print(result)
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Swift 5.2, please visit: https://developer.apple.com/documentation/foundation/nsregularexpression