import Foundation
// Matches Markdown links whose visible label looks like a domain name while the
// real target is an http(s) URL:
//   - `\[` opens the label, followed by any number of `http://` / `https://`
//     prefixes and any number of `word.` subdomain labels (neither is captured);
//   - capture group 1 is a word followed by one or more `.tld` suffixes taken
//     from the fixed list below (`co` and `app` are listed twice; the repeats
//     are redundant but harmless);
//   - `]\(` … `\)` wraps the target, which must start with `http://` or
//     `https://`, contain a dotted host, and may have `/word` path segments.
let pattern: String = #"\[(?:http://|https://)*(?:\w+\.)*(\w+(?:\.(?:com|org|net|edu|gov|info|biz|io|co|app|co|uk|de|jp|ca|dev|app|gg))+)]\((?:http://|https://)(?:\w+\.)+\w+(?:/\w+)*\)"#

// The pattern is a fixed literal, so failing to compile it is a programming
// error and `try!` is allowed to trap. The pattern contains no `^`/`$` anchors,
// which are what `.anchorsMatchLines` changes.
let regex: NSRegularExpression = try! NSRegularExpression(
    pattern: pattern,
    options: [.anchorsMatchLines]
)
// Sample text that `regex` is run against further down. It mixes links that
// should and should not match with explanatory notes and alternative patterns.
// It is a raw multi-line literal (`#"""` … `"""#`), so the backslashes and
// quotes inside are taken literally; everything between the delimiters is data,
// not code, and must not be edited for style.
let testString = #"""
Normal links don't get caught:
[do not catch this](https://example.com)
orthis.com
Neither do links with full stops in the message:
(messages. with. full stops)[https://example.com]
even if they forget a space
[whoops.nospace](https://example.com)
because we catch based on tld:
[catchthis.com](https://malicious.link)
[catchthis.org](https://malicious.link)
[catchthis.net](https://malicious.link)
[catchthis.edu](https://malicious.link)
[catchthis.gov](https://malicious.link)
[catchthis.info](https://malicious.link)
[catchthis.biz](https://malicious.link)
[catchthis.io](https://malicious.link)
[catchthis.co](https://malicious.link)
[catchthis.uk](https://malicious.link)
[catchthis.de](https://malicious.link)
[catchthis.jp](https://malicious.link)
[www.catchthis.com](https://malicious.link)
[https://catchthis.com](https://malicious.link)
[http://catchthis.com](http://malicious.link)
any combination of the above also gets matched for multiple tld urls:
[link.co.jp.org.net](https://malicious.link)
This is perfect because we can block any malicious link with any tld or any number of subdomains, but have a controlled list of tlds that links with a fake url begin with. Since most non-standard tlds are sketchy, we don't even need that many:
[link.com](http://any.malicious.li.nk/anything/at/all)
Any number of subdomains also get caught:
[auth.google.com](https://malicious.website.com)
[any.number.at.all.com](https://malicious.link)
This method of having a set tld list means almost zero false positives, with the drawback of people having to recognise sketchy urls themselves:
[linkwitha.sketchytld](https://malicious.link) // not caught
If you want a wider net with a higher chance of false positives, replace the subdomains with the word matcher wildcard (\w+):
\[(?:\w+\.)*(\w+(?:\.(?:\w+))+)]\((?:http://|https://)(?:\w+\.)+\w+(?:/\w+)*\)
Or a much shorter one that doesn't catch http:// links but that is short enough for Discord: [discord already blocks "fake" links with https in the title but not ones without it]
\[(\w+\.?)*]\((https?://)(\w+\.?)*\)
a longer method with subdomain denylisting is also short enough for Discord:
\[(?:(?:www|auth|login)\.)*(\w+(?:\.(?:com|org|net|edu|gov|info|biz|io|co|app|co|uk|de|jp|ca|dev|app|gg))+)]\((?:http://|https://)(?:\w+\.)+\w+(?:/\w+)*\)
Since this compiles to a shorter resulting regex (add more subdomains after auth to catch more. )
"""#
// NSRegularExpression addresses text in UTF-16 code units, so the search range
// spans the sample's full UTF-16 length.
let stringRange = NSRange(location: 0, length: testString.utf16.count)
let matches = regex.matches(in: testString, range: stringRange)

// One inner array per match, holding the text of every capture group that took
// part in that match. Range 0 (the whole match) is skipped, and a match that
// contributes no captured text is left out entirely.
var result: [[String]] = matches.compactMap { match -> [String]? in
    // A group that did not participate is reported as {NSNotFound, 0}.
    let unmatched = NSMakeRange(NSNotFound, 0)
    let captured: [String] = (1 ..< match.numberOfRanges)
        .map { match.range(at: $0) }
        .filter { !NSEqualRanges($0, unmatched) }
        .map { (testString as NSString).substring(with: $0) }
    return captured.isEmpty ? nil : captured
}
print(result)
// Please keep in mind that these code samples are automatically generated and
// are not guaranteed to work. If you find any syntax errors, feel free to
// submit a bug report. For a full regex reference for Swift 5.2, please visit:
// https://developer.apple.com/documentation/foundation/nsregularexpression