# Pattern for `<a href="...">text</a>` elements. Capture groups, in order:
#   1. domain ("name.tld") taken from the href attribute
#   2. optional path of the href (begins with "/")
#   3. domain ("name.tld") taken from the start of the link text
#   4. optional path that follows the link-text domain
# `(?i)` enables case-insensitive matching. The trailing `m` flag makes `.`
# match newlines as well, so the `(?!.*\1.*)` lookahead rejects a candidate
# when group 1 reappears anywhere in the remaining subject, not only on the
# current line.
re = /(?i)<a\s+href="(?:https?:\/\/)?(?:w{3}\.)?(?:[^"\/]*\.)?([a-z0-9_-]+\.[a-z0-9_-]{2,6})(\/[^"]*)?"[^>]*>(?!.*\1.*)(?:https?:\/\/)?(?:w{3}\.)?(?:[^"\/]*\.)?([a-z0-9_-]+\.[a-z0-9_-]{2,6})(\/[^"]*)?.*?<\/a>/m

# Sample subject: seven anchor tags, one per line, each followed by the
# author's expected outcome.
# NOTE(review): the `<-- MATCH` / `<-- NOT MATCH` markers are expectations
# embedded in the test data, not verified results -- confirm against the
# printed output.
# The raw heredoc keeps the text literal; `chomp` drops the final newline the
# heredoc would otherwise append.
str = <<'HTML'.chomp
<a href="http://www.test1.net/dir1/index.html" target="_blank">test1.net/admin</a> <-- NOT MATCH
<a href="https://test2.com">THIS SITE</a> <-- NOT MATCH
<a href="https://subdomain.test3.org">test2.org</a> <-- MATCH
<a href="http://www2.test4.com" target="_blank">https://global.test4.com/index.html</a> <-- NOT MATCH
<a href="http://eu.test5.com">https://evil.com/eu.test5.com/</a> <-- MATCH
<a href="http://eu.site6.com/index.html" target="_blank">https://eu.evil.com</a> <-- MATCH
<a href="https://site7.com/">http://www.site7.com/123/test</a> <-- NOT MATCH
HTML

# Print one line per match: the array of the four capture groups
# (groups that did not participate are shown as nil).
str.scan(re).each { |groups| puts groups.inspect }
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Ruby, please visit: http://ruby-doc.org/core-2.2.0/Regexp.html