using System;
using System.Text.RegularExpressions;
/// <summary>
/// Console sample that runs a Windows path / URI validation regex over a
/// multi-line test corpus and prints every match together with its start index.
/// </summary>
public class Example
{
    /// <summary>
    /// Applies the path pattern to the embedded sample text in multiline mode and
    /// writes one line per match to standard output.
    /// </summary>
    public static void Main()
    {
        // .NET port of a pattern that was originally written for PCRE.
        // The PCRE-only constructs are not accepted by System.Text.RegularExpressions
        // (the Regex constructor rejects them with an ArgumentException), so:
        //   * possessive quantifiers  X*+ / X++   became atomic groups (?>X*) / (?>X+)
        //   * subroutine calls        (?&sep)     were inlined as [\\/], i.e. the same
        //     "backslash or forward slash" alternative that the 'sep' group defines.
        // The named groups 'protocol', 'sep', 'IP' and 'folder' are kept.
        // NOTE: with RegexOptions.Multiline, .NET's '$' matches only before '\n'; if this
        // file is saved with CRLF line endings the sample lines end in '\r' and will not match.
        string pattern = @"^(?!$|(?:(?>\.*)(?>\w+))?[\\/]$)(?:\.[\\/])?(?:(?<protocol>(?:file:(?<sep>\\|\/){3})?[A-Za-z]:[\\/]|[\\/]{2}|https?:[\\/]{2})(?!$)(?<IP>(?:\d+\.){3}(?>\d+)(?::(?>\d+))?(?:[\\/](?!$)|$))?|[\\/])?(?<folder>(?:\w+|(?!\.[\\/])\.{1,3}\w*|\w+\.\.?|\w+\.\w+)(?:[\\/]|$))*$";

        // Test corpus: free text mixed with PASS/FAIL sample paths, one per line.
        // The PCRE patterns embedded further down are plain data here; they are never compiled.
        string input = @"Somewhat advanced help is required (this is like a boss fight)
Hello dear people!
background:
i am creating an application that looks up both strings, and folders in the same time
i would like to create a regex pattern to identify an uri in windows based on which my application may get a string, or reference to a folder in which there are multiple other files with strings, or so
i expect only file:///, https?://, smb (the one starting with double slash), or no marked protocolls to work with
my approach to this is that as i am reading an uri string, i am taking named groups of a match i am determining which kind of uri have i got from a user
i am actually mostly complete already, and the purpose of this post is out of bug finding, or refactoring purpose, and i have got a newer version that does not work yet
i am going to provide a currently working pattern that is ran in PHP 8.1.31 and PCRE 10.39 2021-10-29 that is working very flimsical because it for example requires multiple named groups for the same deal, because it can expect folders, and files to be named, but not trimmed, and sometimes it just runs into errors that render matching right by accident ""does not match"", however the thing is i do not wish to run into an error by accident, and then be unable to determine the required pattern correctly
during refactoring i would like to avoid to use the backward kind of look around, and i would like to preserve the current way i determine which character may a folder name possess (i mean specifically the brackets' clause \[\^...\])
i would also like to opt into the compatibility to other flavors specifically to google sheets, and notepad++ in this priority order with keeping the current pcre one (if possible)
i have started to work on a new pattern that should be more robust, but i did not get it to work, and i would like to grant that regex pattern here as well with the exact same specifications, and almost the same if not exactly the same functionality as proclaimed, and as the current pattern works
it is very important that i would like to rather focus on the ability to get every single possible deal into a variable via a named group, to actually match anything
deals to get via named groupings:
what is the relative path if any (including relative paths that does not name any folder, nany file)
what is the root folder if any (including relative path, smb root (""//"" and so), drive (c:/), including any protocoll all the way up to double, or triple slashes for example ""file:///c:/"" counts as a root, but ""c:/"" also counts as a root)
what is the last folder name in the path if it is not a file (a separator character will explicitly determine the last name as a folder name in case its name contains a dot, else it is a file when its name contains a dot)
what is the file name if any (with the difference that files may possess extensions, yet folder names may contain dots, and after a file name there can not be a separator character)
what is the file extension if any (with 3 types of extensions i expect out of which one is any extension)
whether the file is with .lnk extension (such that i can recursively go as deep as i please)
whether the file is with .url extension
what is the ipv4 if any (i expect to be able to both refer solely to the ip with the respective protocol before it, and to refer to any path under the ip)
what is the path from the first letter up until the name of the folder, or file (with either one excluded such that i will be able to create a file into the given folder before i attempt to read anything)
bonus deal: i did not figure out a way to name the separator anything, so i would like to know what the separator is because as of my current knowledge it can either be backslash, or forward slash, yet both my current patterns only work with the forward one
expected match, and ~~mismatch~~ examples to both current, and new patterns
i expect to be able to recognize any folder both alone, and along the path in the following ways:
PASS
./
../
folder_name
.folder
..folder
folder.
fodler..
folder.txt/
i expect relative paths to be recognized
PASS
../../../
./../../
i expect paths that can be joined to another folder recognized
PASS
/folder/file.txt
i expect separator character to not be before any protocol
FAIL
///server
/http://folder
/c:/folder/folder
PASS
C:/fodler/dofler/difle.dxd
i expect to be able to name any file a dot (witht the file's name possibly only the last one in the path)
PASS
..txt
FAIL
..txt/
PASS
...txt
../..txt
../folder/..txt
../folder/..txt/
i expect that a folder name along the path, and the last folder's name is not expected to be a dot, or two dots, but ""close calls"" are expected
FAIL
.././other/.././folder.
PASS
../f./other/..f/.d/folder.
i expect that when i refer to an ip address i must use the protocoll before it
FAIL
123.123.123.123:234
PASS
http://123.123.123.123:234
FAIL
123.123.123.123
PASS
http://123.123.123.123
https://823.123.123.123:2340
https://823.123.123.123:2340
https://823.123.123.123
https://823.123.123.123
i expect that i can have the same folder, and file structure after i have used ipv4 with its protocoll
PASS
https://823.123.123.123:234/notfile./.folder/some_more_folders/..txt
https://823.123.123.123:234/notfile./.folder/some_more_folders/..txt
https://823.123.123.123:234/notfile./.folder/some_more_folders/..txt/
https://823.123.123.123:234/notfile./.folder/some_more_folders/..txt/
i expect that i can not use an ipv4 as a folder itself
PASS
https://823.123.123.123:2340
https://823.123.123.123:2340
FAIL
https://823.123.123.123:234/
PASS
https://823.123.123.123:234
https://823.123.123.123
https://823.123.123.123
FAIL
https://823.123.123.123/
PASS
https://823.123.123.123
i expect protocols to not be alone
FAIL
http://
file:///
file:///C:/
PASS
C:/folder
c:/folder
i expect that i can not stack separators along the path, for example just two slashes indicate smb protocoll, but without anything else, i would not use it
FAIL
//
PASS
//server
//anything
FAIL
//server//folder
PASS
//server/folder
FAIL
file://c:/folder
PASS
file:///c:/folder
FAIL
file:///c:/folder//
PASS
file:///c:/folder/..txt
FAIL
file:///c:/folder//..txt
file:///c:/folder//folder
c://
i a am done with matches, and mismatches. let me provide you the new prototype not working pattern, and then the current that works (to some extent)
next...
(
?#all definitions first...
)
(
?
(
DEFINE
)
(
?'separator_s'
\/
)
(
?'smb_root_s'
\g'separator_s'{2}
)
(
?'root_middle_s'
\:
\g'separator_s'{2}
)
(
?'drive_root_s'[a-z]
\:
\g'separator_s'
)
(
?'file_root_s'file
\g'root_middle_s'
\g'separator_s'
\g'drive_root_s'
)
(
?'ip_num_s'
\d{1,3}
)
(
?'ipv4_gate_s'
\d+
)
(
?'web_root_s'https?
\g'root_middle_s'
)
(
?'ipv4_s'
(
?:
\g'ip_num_s'
\.
)
{3}
\g'ip_num_s'
(
?:
\:
\g'ipv4_gate_s'
)
?
)
(
?'separator_root_s'
\g'separator_s'?
)
(
?'relative_root_s'
\.{1,2}
\g'separator_s'
(
?:
\.{2}
\g'separator_s'
)
*
)
(
?'not_name_s'[^\v\t\\\/\:\*\""\?\<\>\|]
)
(
?'not_name_nand_dot_s'[^\.\v\t\\\/\:\*\""\?\<\>\|]
)
(
?'any_extension_s'[a-z0-9]
)
(
?'any_name_s'
(
?:
\g'not_name_nand_dot_s'
\g'not_name_s'*?|
\.
\g'not_name_nand_dot_s'
\g'not_name_s'*?|
\.{1,2}
(
?=
\.
\g'any_extension_s'
)
|
\.
\.
\g'not_name_s'+?
)
)
(
?'body_s'
(
?:
\g'any_name_s'
\g'separator_s'
)
*
)
)
(
?#definition has ended, pattern from now on
)
^
(
?<body>
(
?<root>
\g'file_root_s'|
\g'drive_root_s'|
\g'smb_root_s'|
(
?<relative_root>
\g'relative_root_s'
)
|
(
?<separator_root>
\g'separator_root_s'
)
|
(
?<web_root>
\g'web_root_s'
)
(
?:
(
?<ipv4>
\g'ipv4_s'
)
\g'separator_s'
)
?
)
?
\g'body_s'
)
(
?:
\k<relative_root>|
\k<web_root>
\k<ipv4>|
\k<body>
(
?<name>
\g'any_name_s'
)
(
?:
\g'separator_s'|
(
?:
\.
(
?:
(
?<shortcut_extension>lnk
)
|
(
?<web_extension>url
)
|
(
?<non_particular_extension>
\g'any_extension_s'+
)
)
)
)
?
)
$
(
?#all definitions first...
)
(
?
(
DEFINE
)
(
?'separator_s'
\/
)
(
?'smb_root_s'
\g'separator_s'{2}
)
(
?'root_middle_s'
\:
\g'separator_s'{2}
)
(
?'drive_root_s'[a-z]
\:
\g'separator_s'
)
(
?'file_root_s'file
\g'root_middle_s'
\g'separator_s'
\g'drive_root_s'
)
(
?'ip_num_s'
\d{1,3}
)
(
?'ipv4_gate_s'
\d+
)
(
?'web_root_s'https?
\g'root_middle_s'
)
(
?'ipv4_s'
(
?:
\g'ip_num_s'
\.
)
{3}
\g'ip_num_s'
(
?:
\:
\g'ipv4_gate_s'
)
?
)
(
?'separator_root_s'
\g'separator_s'?
)
(
?'relative_root_s'
\.{1,2}
\g'separator_s'
(
?:
\.{2}
\g'separator_s'
)
*
)
(
?'not_name_s'[^\v\t\\\/\:\*\""\?\<\>\|]
)
(
?'not_name_nand_dot_s'[^\.\v\t\\\/\:\*\""\?\<\>\|]
)
(
?'any_extension_s'[a-z0-9]
)
(
?'any_name_s'
(
?:
\g'not_name_nand_dot_s'
\g'not_name_s'*?|
\.
\g'not_name_nand_dot_s'
\g'not_name_s'*?|
\.{1,2}
(
?=
\.
\g'any_extension_s'
)
|
\.
\.
\g'not_name_s'+?
)
)
(
?'body_s'
(
?:
\g'any_name_s'
\g'separator_s'
)
*
)
)
(
?#definition has ended, pattern from now on
)
^
(
?<relative_root_excluzive_body>
(
?<excluzive_relative_root>
\g'relative_root_s'
)
)
(
?=$
)
|
(
?:
(
?<web_root_excluzive_body>
\g'web_root_s'
)
(
?<excluzive_ipv4>
\g'ipv4_s'
)
)
(
?=$
)
|
(
?:
(
?<body>
(
?:
\g'file_root_s'|
\g'drive_root_s'|
\g'smb_root_s'|
(
?<relative_root>
\g'relative_root_s'
)
|
(
?<separator_root>
\g'separator_root_s'
)
|
(
?<web_root>
\g'web_root_s'
)
(
?:
(
?<ipv4>
\g'ipv4_s'
)
\g'separator_s'
)
?
)
?
\g'body_s'
)
(
?<name>
\g'any_name_s'
)
(
?:
\g'separator_s'|
(
?<extension>
\.
(
?:
(
?<shortcut_extension>lnk
)
|
(
?<web_extension>url
)
|
(
?<non_particular_extension>
\g'any_extension_s'+
)
)
)
)
?
)
$";

        // Multiline makes ^ and $ anchor at each line of the corpus, so every
        // sample path is validated as a whole line.
        RegexOptions options = RegexOptions.Multiline;

        foreach (Match m in Regex.Matches(input, pattern, options))
        {
            Console.WriteLine("'{0}' found at index {1}.", m.Value, m.Index);
        }
    }
}
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for C#, please visit: https://msdn.microsoft.com/en-us/library/system.text.regularexpressions.regex(v=vs.110).aspx