Smart Way to do Regex // Council of Elrond

Below is a sample code that I found to get complex regex pattern instead of write it manually good example to make smaller parts first the combine to a very complex pattern.


import re

# convert the {name} test to rules in predefined format in macros
def regex_expand(macros, pattern, guarded = True):
    output = []
    pos = 0
    size = len(pattern)
    while pos < size:
        ch = pattern[pos]
        if ch == '\\':
            output.append(pattern[pos:pos + 2])
            pos += 2
            continue
        elif ch != '{':
            output.append(ch)
            pos += 1
            continue
        p2 = pattern.find('}', pos)
        if p2 < 0:
            output.append(ch)
            pos += 1
            continue
        p3 = p2 + 1
        name = pattern[pos + 1:p2].strip('\r\n\t ')
        if name == '':
            output.append(pattern[pos:p3])
            pos = p3
            continue
        elif name[0].isdigit():
            output.append(pattern[pos:p3])
            pos = p3
            continue
        elif ('<' in name) or ('>' in name):
            raise ValueError('invalid pattern name "%s"'%name)
        if name not in macros:
            raise ValueError('{%s} is undefined'%name)
        if guarded:
            output.append('(?:' + macros[name] + ')')
        else:
            output.append(macros[name])
        pos = p3
    return ''.join(output)

# given a set of rules, create rules dictionary
def regex_build(code, macros = None, capture = True):
    defined = {}
    if macros is not None:
        for k, v in macros.items():
            defined[k] = v
    line_num = 0
    for line in code.split('\n'):
        line_num += 1
        line = line.strip('\r\n\t ')
        if (not line) or line.startswith('#'):
            continue
        pos = line.find('=')
        if pos < 0:
            raise ValueError('%d: not a valid rule'%line_num)
        head = line[:pos].strip('\r\n\t ')
        body = line[pos + 1:].strip('\r\n\t ')
        if (not head):
            raise ValueError('%d: empty rule name'%line_num)
        elif head[0].isdigit():
            raise ValueError('%d: invalid rule name "%s"'%(line_num, head))
        elif ('<' in head) or ('>' in head):
            raise ValueError('%d: invalid rule name "%s"'%(line_num, head))
        try:
            pattern = regex_expand(defined, body, guarded = not capture)
        except ValueError as e:
            raise ValueError('%d: %s'%(line_num, str(e)))
        try:
            re.compile(pattern)
        except re.error:
            raise ValueError('%d: invalid pattern "%s"'%(line_num, pattern))
        if not capture:
            defined[head] = pattern
        else:
            defined[head] = '(?P<%s>%s)'%(head, pattern)
    return defined

# define a set of rules
rules = r'''
    protocol = http|https
    login_name = [^:@\r\n\t ]+
    login_pass = [^@\r\n\t ]+
    login = {login_name}(:{login_pass})?
    host = [^:/@\r\n\t ]+
    port = \d+
    optional_port = (?:[:]{port})?
    path = /[^\r\n\t ]*
    url = {protocol}://({login}[@])?{host}{optional_port}{path}?
'''

# convert above rules to dictionary
m = regex_build(rules, capture = True)

# print dictionary info
for k, v in m.items():
    print(k, '=', v)

print()

# use final rule "url" to regex a text
pattern = m['url']
s = re.match(pattern, 'https://name:pass@www.test.com:8080/abcdefg')

# print the complete list of matches
print('matched: "%s"'%s.group(0))
print()

# print the matched items based on rule defined names
for name in ('url', 'login_name', 'login_pass', 'host', 'port', 'path'):
    print('subgroup:', name, '=', s.group(name))

Output:

C:\Users\Administrator\AppData\Local\Programs\Python\Python311\python.exe F:\Python_Projects\Regex_Test\Regex_test.py 
protocol = (?P<protocol>http|https)
login_name = (?P<login_name>[^:@\r\n\t ]+)
login_pass = (?P<login_pass>[^@\r\n\t ]+)
login = (?P<login>(?P<login_name>[^:@\r\n\t ]+)(:(?P<login_pass>[^@\r\n\t ]+))?)
host = (?P<host>[^:/@\r\n\t ]+)
port = (?P<port>\d+)
optional_port = (?P<optional_port>(?:[:](?P<port>\d+))?)
path = (?P<path>/[^\r\n\t ]*)
url = (?P<url>(?P<protocol>http|https)://((?P<login>(?P<login_name>[^:@\r\n\t ]+)(:(?P<login_pass>[^@\r\n\t ]+))?)[@])?(?P<host>[^:/@\r\n\t ]+)(?P<optional_port>(?:[:](?P<port>\d+))?)(?P<path>/[^\r\n\t ]*)?)

matched: "https://name:pass@www.test.com:8080/abcdefg"

subgroup: url = https://name:pass@www.test.com:8080/abcdefg
subgroup: login_name = name
subgroup: login_pass = pass
subgroup: host = www.test.com
subgroup: port = 8080
subgroup: path = /abcdefg

Process finished with exit code 0