Regular Expressions 101

Community Patterns

Find next valid markup element of XML content

0

Regular Expression
PCRE (PHP <7.3)

/
(?P<Element><(?P<TagName>[:_A-z][-.0-9:_A-z\xB7]*)(?:[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*[\x09\x0A\x0D\x20]*=[\x09\x0A\x0D\x20]*(?:"(?:[^<&"]|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*"|'(?:[^<&']|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*'))*[\x09\x0A\x0D\x20]*(?:>(?:(?:[^<&\]]|](?!]>))*(?:(?:(?P>Element)|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));|<!\[CDATA\[(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x5D]|](?!]>))*]]>|<\?[:_A-z][-.0-9:_A-z\xB7]*(?<!(?i:\?xml))(?:[\x09\x0A\x0D\x20]+(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x3F]|\?(?!>))*)?\?>|<!--(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x2D]|-(?!-))*-->)(?:[^<&\]]|](?!]>))*)*)<\/(?P=TagName)[\x09\x0A\x0D\x20]*|\/)>)|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));|<!\[CDATA\[(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x5D]|](?!]>))*]]>|<\?[:_A-z][-.0-9:_A-z\xB7]*(?<!(?i:\?xml))(?:[\x09\x0A\x0D\x20]+(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x3F]|\?(?!>))*)?\?>|<!--(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x2D]|-(?!-))*-->|<!DOCTYPE[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*(?:[\x09\x0A\x0D\x20]+(?:SYSTEM[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*')|PUBLIC[\x09\x0A\x0D\x20]+(?:"[\x0A\x0D\x20\x21\x23-\x25\x27-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*"|'[\x0A\x0D\x20\x21\x23-\x25\x28-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*')[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*')))?[\x09\x0A\x0D\x20]*(?:\[(?:(?:<!ELEMENT[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*[\x09\x0A\x0D\x20]+(?:EMPTY|ANY|\([\x09\x0A\x0D\x20]*#PCDATA(?:(?:[\x09\x0A\x0D\x20]*\|[\x09\x0A\x0D\x20]*[:_A-z][-.0-9:_A-z\xB7]*)*[\x09\x0A\x0D\x20]*\)\*|[\x09\x0A\x0D\x20]*\))|(?:(?P<choice>\([\x09\x0A\x0D\x20]*(?:[:_A-z][-.0-9:_A-z\xB7]*|(?P>choice)|(?P>seq))[?*+]?(?:[\x09\x0A\x0D\x20]*\|[\x09\x0A\x0D\x20]*(?:[:_A-z][-.0-9:_A-z\xB7]*|(?P>choice)|(?P>seq))[?*+]?)+[\x09\x0A\x0D\x20]*\))|(?P<seq>\([\x09\x0A\x0D\x20]*(?:[:_A-z][-.0-9:_A-z\xB7]*|(?P>choice)|(?P>seq))[?*+]?(?:[\x09\x0A\x0D\x20]*,[\x09\x0A\x0D\x20]*(?:[:_A-z][-.0-9:_A-z\xB7]*|(?P>choice)|(?P>seq))[?*+]?)*[\x09\x0A\x0D\x20]*\)))[?*+]?)[\x
09\x0A\x0D\x20]*>|<!ATTLIST[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*(?:[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*[\x09\x0A\x0D\x20]+(?:CDATA|(?:ID(?:REFS?)?|ENTIT(?:Y|IES)|NMTOKENS?)|(?:NOTATION[\x09\x0A\x0D\x20]+\([\x09\x0A\x0D\x20]*[:_A-z][-.0-9:_A-z\xB7]*(?:[\x09\x0A\x0D\x20]*\|[\x09\x0A\x0D\x20]*[:_A-z][-.0-9:_A-z\xB7]*)*[\x09\x0A\x0D\x20]*\)|\([\x09\x0A\x0D\x20]*(?:[-.0-9:_A-z\xB7])+(?:[\x09\x0A\x0D\x20]*\|[\x09\x0A\x0D\x20]*(?:[-.0-9:_A-z\xB7])+)*[\x09\x0A\x0D\x20]*\)))[\x09\x0A\x0D\x20]+(?:#(?:REQUIRED|IMPLIED)|(?:#FIXED[\x09\x0A\x0D\x20]+)?(?:"(?:[^<&"]|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*"|'(?:[^<&']|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*')))*[\x09\x0A\x0D\x20]*>|(?:<!ENTITY[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*[\x09\x0A\x0D\x20]+(?:(?:"(?:[^%&"]|%[:_A-z][-.0-9:_A-z\xB7]*;|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*"|'(?:[^%&']|%[:_A-z][-.0-9:_A-z\xB7]*;|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*')|(?:SYSTEM[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*')|PUBLIC[\x09\x0A\x0D\x20]+(?:"[\x0A\x0D\x20\x21\x23-\x25\x27-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*"|'[\x0A\x0D\x20\x21\x23-\x25\x28-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*')[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*'))(?:[\x09\x0A\x0D\x20]+NDATA[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*)?)[\x09\x0A\x0D\x20]*>|<!ENTITY[\x09\x0A\x0D\x20]+%[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*[\x09\x0A\x0D\x20]+(?:(?:"(?:[^%&"]|%[:_A-z][-.0-9:_A-z\xB7]*;|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*"|'(?:[^%&']|%[:_A-z][-.0-9:_A-z\xB7]*;|&(?:[:_A-z][-.0-9:_A-z\xB7]*|#(?:[0-9]+|x[0-9a-fA-F]+));)*')|(?:SYSTEM[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*')|PUBLIC[\x09\x0A\x0D\x20]+(?:"[\x0A\x0D\x20\x21\x23-\x25\x27-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*"|'[\x0A\x0D\x20\x21\x23-\x25\x28-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*')[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*')))[\x09\x0A\x0D\x20]*>)|<!NOTATION[\x09\x0A\x0D\x20]+[:_A-z][-.0-9:_A-z\xB7]*[\x09\x0A\x0D\x20]+(?:(?:SYSTEM[\x09\
x0A\x0D\x20]+(?:"[^"]*"|'[^']*')|PUBLIC[\x09\x0A\x0D\x20]+(?:"[\x0A\x0D\x20\x21\x23-\x25\x27-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*"|'[\x0A\x0D\x20\x21\x23-\x25\x28-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*')[\x09\x0A\x0D\x20]+(?:"[^"]*"|'[^']*'))|PUBLIC[\x09\x0A\x0D\x20]+(?:"[\x0A\x0D\x20\x21\x23-\x25\x27-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*"|'[\x0A\x0D\x20\x21\x23-\x25\x28-\x2F\x3A\x3B\x3D\x3F\x40_0-9A-z]*'))[\x09\x0A\x0D\x20]*>|<\?[:_A-z][-.0-9:_A-z\xB7]*(?<!(?i:\?xml))(?:[\x09\x0A\x0D\x20]+(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x3F]|\?(?!>))*)?\?>|<!--(?:[^\x01-\x08\x0B\x0C\x0E-\x1F\x2D]|-(?!-))*-->)|(?:%[:_A-z][-.0-9:_A-z\xB7]*;|[\x09\x0A\x0D\x20]+))*][\x09\x0A\x0D\x20]*)?>
/
gm

Description

I created this regexp from the XML specification (https://www.w3.org/TR/xml), but in a simplified form (for example, it allows only a narrowed set of tag-name characters). It is a recursive regexp that uses a backreference. I have not tested it extensively with a regex-directed engine, so it is not optimized for one. I used the regexp to check a text-directed regular expression engine that I developed (Windows users can try this engine at https://www.regex.hu).

Submitted by GyRos - 2 years ago