using System;
using System.Text.RegularExpressions;
/// <summary>
/// Sample program that finds phrases in which the word "Study" or "Test" is
/// adjacent (on the left, the right, or both sides) to one or more words that
/// start with a capital letter, and prints each match with its index.
/// </summary>
public class Example
{
    /// <summary>
    /// Builds the phrase-matching pattern from named fragments.
    /// </summary>
    /// <remarks>
    /// The pattern was originally written with PCRE's <c>(?(DEFINE)...)</c>
    /// block and <c>(?&amp;name)</c> subroutine calls. The .NET regex engine
    /// supports neither and rejects <c>(?&amp;</c> when parsing the pattern, so
    /// the fragments are composed as ordinary strings instead.
    /// </remarks>
    /// <returns>A pattern equivalent to <c>both | before | after</c>.</returns>
    private static string BuildPattern()
    {
        // A word starting with an upper-case ASCII letter; may contain '-' and word characters.
        string marker = @"\b[A-Z][-\w]*\b";
        // Horizontal whitespace only, so a phrase never spans a line break.
        string ws = @"[ \t]+";
        // The keyword being searched for.
        string needle = @"\b(?:Study|Test)";

        // One or more capitalized words before / after the keyword.
        string pre = "(?:" + marker + ws + ")+";
        string post = "(?:" + ws + marker + ")+";

        string both = pre + needle + post;
        string before = pre + needle;
        string after = needle + post;

        // Order matters: try the longest form first, exactly as the original alternation did.
        return both + "|" + before + "|" + after;
    }

    /// <summary>
    /// Runs the pattern over the sample text and writes every match to the console.
    /// </summary>
    public static void Main()
    {
        string pattern = BuildPattern();
        string input = @"
I have been analyzing large amounts of text data. This is what I got so far:
(([A-Z][\w-]*)+\s+(\b(Study|Test)\b)(\s[A-Z][\w-]*)*)|(\b(Study|Test)\b)(\s[A-Z][\w-]*)+
Types of phrases I would like to capture:
Europe National Longitudinal Study
Longitudinal Study
Study Initiative
Longitudinal Study Initiative
I want to capture the word 'Study' or 'Test' ONLY if it is surrounded by the words starting with a capital letter. The ideal regex would achieve all of this + it would ignore\escape certain words like 'of' or 'the'.
*the above regex is super slow with the str.findall function, I guess there must be a better solution
** I used https://regex101.com for testing and then run it in Jupyter, Python 3
";
        // No RegexOptions are needed: the composed pattern contains no layout
        // whitespace (IgnorePatternWhitespace) and no ^/$ anchors (Multiline).
        foreach (Match m in Regex.Matches(input, pattern))
        {
            Console.WriteLine("'{0}' found at index {1}.", m.Value, m.Index);
        }
    }
}
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for C#, please visit: https://msdn.microsoft.com/en-us/library/system.text.regularexpressions.regex(v=vs.110).aspx