glob() and exponential backtracking

Lukas Mai

2017-08-09

glob: foo??bar*.gif

Who's affected?

Why does this happen?

bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length && pat[p] == str[s]) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
it works!
… but it's just string equality
metacharacters?
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    return matches_at(str, 0, pat, 0);
}

bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    return matches_at(str, 0, pat, 0);
}

bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    return matches_at(str, 0, pat, 0);
}

bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s <= str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    return matches_at(str, 0, pat, 0);
}

bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s <= str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    return matches_at(str, 0, pat, 0);
}

bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s <= str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}

We're done … ?

what happen

    for (; s <= str.length; s++) {
        if (matches_at(str, s, pat, p+1)) {
            return true;
        }
    }

matches("aaaaaaaaaa", "a*a*a*a*b")

a*   # loop: s .. str.length / 1 .. 10
  a*   # loop: s .. str.length / 2 .. 10
    a*   # loop: s .. str.length / 3 .. 10
      a*   # loop: s .. str.length / 4 .. 10
        b    # fail: backtrack

Backtracking

make backtracking explicit
but first: a minor optimization (that makes the following slides simpler)
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s <= str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }


            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s <= str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }


            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s < str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
                return matches_at(str, s, pat, p+1);

            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s < str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
                p++;
                next;
            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s < str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
                p++;
                next;
            } else if (s < str.length &&
                       (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s < str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                for (; s < str.length; s++) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                while (s < str.length) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                    s++;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                    s++;
                    next;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                    s++;
                    next;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                    s++;
                    next;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    if (matches_at(str, s, pat, p+1)) {
                        return true;
                    }
                    s++;
                    cont.push([s+1, p]);
                    p++;
                    next;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                    p++;
                    next;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                    p++;
                    next;
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (!cont.empty()) {
            [s, p] = cont.pop();
            next;
        }
        return false;
    }
    return true;
}
bool matches_at(string str, int s, string pat, int p) {
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (!cont.empty()) {
            [s, p] = cont.pop();
            next;
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (!cont.empty()) {
            [s, p] = cont.pop();
            next;
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (!cont.empty()) {
            [s, p] = cont.pop();
            next;
        }
        return false;
    }
    return true;
}

Redundant retries

bool matches(string str, string pat) {
    int s = 0, p = 0;
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                cont.clear();
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (!cont.empty()) {
            [s, p] = cont.pop();
            next;
        }
        return false;
    }
    return true;
}
🆒 it actually works!
but wait, there's more!
bool matches(string str, string pat) {
    int s = 0, p = 0;
    stack cont;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                cont.clear();
                if (s < str.length) {
                    cont.push([s+1, p]);
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (!cont.empty()) {
            [s, p] = cont.pop();
            next;
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    int cont_s = -1, cont_p;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                cont_s = -1;
                if (s < str.length) {
                    [cont_s, cont_p] = [s+1, p];
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (cont_s != -1) {
            [s, p] = [cont_s, cont_p];
            next;
        }
        return false;
    }
    return true;
}
bool matches(string str, string pat) {
    int s = 0, p = 0;
    int cont_s = -1, cont_p;
    while (s < str.length || p < pat.length) {
        if (p < pat.length) {
            if (pat[p] == '*') {
                cont_s = -1;
                if (s < str.length) {
                    [cont_s, cont_p] = [s+1, p];
                }
                p++;
                next;
            }
            if (s < str.length && (pat[p] == '?' || pat[p] == str[s])) {
                s++; p++;
                next;
            }
        }
        if (cont_s != -1) {
            [s, p] = [cont_s, cont_p];
            next;
        }
        return false;
    }
    return true;
}

In the wild

Regexery

Python

What would that look like in Perl?
sub glob2re {
    my ($pat) = @_;

    return qr/\A\Q$pat\E\z/;
}

"aaaaaaaaaa" =~ glob2re("a*a*a*b*");

# same exponential backtracking problems
sub glob2re {
    my ($pat) = @_;
    $pat = quotemeta $pat;
    return qr/\A$pat\z/;
}

"aaaaaaaaaa" =~ glob2re("a*a*a*b*");

# same exponential backtracking problems
sub glob2re {
    my ($pat) = @_;
    $pat =~ s/(\W)/\\$1/g;
    return qr/\A$pat\z/;
}

"aaaaaaaaaa" =~ glob2re("a*a*a*b*");

# same exponential backtracking problems
sub glob2re {
    my ($pat) = @_;
    $pat =~ s/(\W)/
        $1 eq '*' ? '.*?' :
        $1 eq '?' ? '.' :
        "\\$1"
    /eg;
    return qr/\A$pat\z/s;
}

"aaaaaaaaaa" =~ glob2re("a*a*a*b*");

# same exponential backtracking problems
sub glob2re {
    my ($pat) = @_;
    $pat =~ s/(\W)/
        $1 eq '*' ? '(*PRUNE).*?' :
        $1 eq '?' ? '.' :
        "\\$1"
    /eg;
    return qr/\A$pat\z/s;
}

"aaaaaaaaaa" =~ glob2re("a*a*a*b*");
# fixed!
# same exponential backtracking problems

Conclusions

function glob2re(pat) {
    function tr(pat) {
        return pat.replace(/\W/g, function (m0) {
            return (
                m0 === '?' ? '[\\s\\S]' :
                '\\' + m0
            );
        });
    }

    var n = 1;
    pat = pat.replace(/\W[^*]*/g, function (m0, mp, ms) {
        if (m0.charAt(0) !== '*') {
            return tr(m0);
        }
        var eos = mp + m0.length === ms.length ? '$' : '';
        return '(?=([\\s\\S]*?' + tr(m0.substr(1)) + eos + '))\\' + n++;
    });
    return new RegExp('^' + pat + '$');
}

console.log(glob2re('a*a*a*b'));
// /^a(?=([\s\S]*?a))\1(?=([\s\S]*?a))\2(?=([\s\S]*?b$))\3$/
function glob2re(pat) {
    function tr(pat) {
        return pat.replace(/\W/g, function (m0) {
            return (
                m0 === '?' ? '[\\s\\S]' :
                '\\' + m0
            );
        });
    }

    var n = 1;
    pat = pat.replace(/\W[^*]*/g, function (m0, mp, ms) {
        if (m0.charAt(0) !== '*') {
            return tr(m0);
        }
        var eos = mp + m0.length === ms.length ? '$' : '';
        return '(?=([\\s\\S]*?' + tr(m0.substr(1)) + eos + '))\\' + n++;
    });
    return new RegExp('^' + pat + '$');
}

console.log(glob2re('a*a*a*b'));
// /^a(?=([\s\S]*?a))\1(?=([\s\S]*?a))\2(?=([\s\S]*?b$))\3$/
{END,FIN}{E,} (END|FIN)E?