Searching with a double-asterisk glob pattern in Java behaves differently compared to Python

You need to override how the regex pattern for the PathMatcher is generated. As this happens in classes that you can’t extend, you need to copy the code.

But first, let’s look at what is happening. For this, you need to dive deep. First of all, this is the regex pattern that is generated (on Windows) for the glob Tests/**/*.java:

^Tests\\.*\\[^\\]*\.java$

The pattern forces two separators (\) to be present in the path.

^Tests          - starts with "Tests"
\\              - first required separator
.*              - any string
\\              - second required separator
[^\\]*          - match everything that is not a separator
\.              - a dot
java$           - ends with "java"

The secret to this is hidden in sun/nio/fs/Globs.java where the glob pattern is translated to a regex:

private static String toRegexPattern(String globPattern, boolean isDos) {
    boolean inGroup = false;
    StringBuilder regex = new StringBuilder("^");

    int i = 0;
    while (i < globPattern.length()) {
        char c = globPattern.charAt(i++);
        switch (c) {
            case '\':
                // escape special characters
                if (i == globPattern.length()) {
                    throw new PatternSyntaxException("No character to escape",
                            globPattern, i - 1);
                }
                char next = globPattern.charAt(i++);
                if (isGlobMeta(next) || isRegexMeta(next)) {
                    regex.append('\');
                }
                regex.append(next);
                break;
            case "https://stackoverflow.com/":
                if (isDos) {
                    regex.append("\\");
                } else {
                    regex.append(c);
                }
                break;
            case '[':
                // don't match name separator in class
                if (isDos) {
                    regex.append("[[^\\]&&[");
                } else {
                    regex.append("[[^/]&&[");
                }
                if (next(globPattern, i) == '^') {
                    // escape the regex negation char if it appears
                    regex.append("\^");
                    i++;
                } else {
                    // negation
                    if (next(globPattern, i) == '!') {
                        regex.append('^');
                        i++;
                    }
                    // hyphen allowed at start
                    if (next(globPattern, i) == '-') {
                        regex.append('-');
                        i++;
                    }
                }
                boolean hasRangeStart = false;
                char last = 0;
                while (i < globPattern.length()) {
                    c = globPattern.charAt(i++);
                    if (c == ']') {
                        break;
                    }
                    if (c == "https://stackoverflow.com/" || (isDos && c == '\')) {
                        throw new PatternSyntaxException("Explicit 'name separator' in class",
                                globPattern, i - 1);
                    }
                    // TBD: how to specify ']' in a class?
                    if (c == '\' || c == '[' ||
                            c == '&' && next(globPattern, i) == '&') {
                        // escape '', '[' or "&&" for regex class
                        regex.append('\');
                    }
                    regex.append(c);

                    if (c == '-') {
                        if (!hasRangeStart) {
                            throw new PatternSyntaxException("Invalid range",
                                    globPattern, i - 1);
                        }
                        if ((c = next(globPattern, i++)) == EOL || c == ']') {
                            break;
                        }
                        if (c < last) {
                            throw new PatternSyntaxException("Invalid range",
                                    globPattern, i - 3);
                        }
                        regex.append(c);
                        hasRangeStart = false;
                    } else {
                        hasRangeStart = true;
                        last = c;
                    }
                }
                if (c != ']') {
                    throw new PatternSyntaxException("Missing ']", globPattern, i - 1);
                }
                regex.append("]]");
                break;
            case '{':
                if (inGroup) {
                    throw new PatternSyntaxException("Cannot nest groups",
                            globPattern, i - 1);
                }
                regex.append("(?:(?:");
                inGroup = true;
                break;
            case '}':
                if (inGroup) {
                    regex.append("))");
                    inGroup = false;
                } else {
                    regex.append('}');
                }
                break;
            case ',':
                if (inGroup) {
                    regex.append(")|(?:");
                } else {
                    regex.append(',');
                }
                break;
            case '*':
                if (next(globPattern, i) == '*') {
                    // crosses directory boundaries
                    regex.append(".*");
                    i++;
                } else {
                    // within directory boundary
                    if (isDos) {
                        regex.append("[^\\]*");
                    } else {
                        regex.append("[^/]*");
                    }
                }
                break;
            case '?':
               if (isDos) {
                   regex.append("[^\\]");
               } else {
                   regex.append("[^/]");
               }
               break;

            default:
                if (isRegexMeta(c)) {
                    regex.append('\');
                }
                regex.append(c);
        }
    }

    if (inGroup) {
        throw new PatternSyntaxException("Missing '}", globPattern, i - 1);
    }

    return regex.append('$').toString();
}

Specifically, the cases for '/' and '*':

case '/':
    // every '/' in the glob becomes a mandatory separator in the regex
    if (isDos) {
        regex.append("\\\\");
    } else {
        regex.append(c);
    }
    break;

and

case '*':
    if (next(globPattern, i) == '*') {
        // crosses directory boundaries
        regex.append(".*");
        i++;
    } else {
        // within directory boundary: anything except the name separator
        if (isDos) {
            regex.append("[^\\\\]*");
        } else {
            regex.append("[^/]*");
        }
    }
    break;

So in theory, it is possible to get the same result as Python with the same pattern, but you would need to implement your own PathMatcher as well as your own syntax/pattern parser.

While the PathMatcher is as easy as:

return new PathMatcher() {
    @Override
    public boolean matches(Path path) {
        return pattern.matcher(path.toString()).matches();
    }
};

The syntax/pattern parser is much more complex.

I have decided to improve the '*' case a bit like this:

case '*':
    if (next(globPattern, i) == '*') {
        // crosses directory boundaries
        regex.append(".*");
        i++;
        // following IF is new
        if (next(globPattern, i) == '/') {
            // ignore forced path delimiter after double asterisk
            i++;
        }
    } else {
        // within directory boundary
        if (isDos) {
            regex.append("[^\\\\]*");
        } else {
            regex.append("[^/]*");
        }
    }
    break;

It now returns all the results, but I can’t guarantee that it doesn’t break other cases. Basically, it consumes the forced path delimiter following a double asterisk.

The full code (most of it is copied, and I assume a Windows file system):

/**
 * A {@link PathMatcher} that translates a glob expression into a regular
 * expression and matches it against {@link Path#toString()}.
 *
 * <p>The translation differs from the stock one in a single point: a name
 * separator that directly follows a double asterisk is not required, so the
 * double asterisk may also match zero directories.
 */
class GlobPathMatcher implements PathMatcher {

    private static final String regexMetaChars = ".^$+{[]|()";
    private static final String globMetaChars = "\\*?[{";

    /** Sentinel returned by {@link #next} when the index is past the end of the glob. */
    private static final char EOL = 0; // TBD

    private final Pattern pattern;

    /**
     * Creates a matcher that uses the backslash as name separator and matches
     * case-insensitively (Windows conventions).
     *
     * @param pattern the expression in {@code syntax:glob} form, e.g. {@code glob:*.java}
     * @throws IllegalArgumentException if {@code pattern} contains no colon
     * @throws PatternSyntaxException   if the glob part is malformed
     */
    public GlobPathMatcher(String pattern) {
        this(pattern, true);
    }

    /**
     * Creates a matcher for the given separator convention.
     *
     * @param pattern the expression in {@code syntax:glob} form; the part before
     *                the first colon is not interpreted
     * @param isDos   {@code true} for backslash separators and case-insensitive
     *                matching, {@code false} for forward-slash separators and
     *                case-sensitive matching
     * @throws IllegalArgumentException if {@code pattern} contains no colon
     * @throws PatternSyntaxException   if the glob part is malformed
     */
    public GlobPathMatcher(String pattern, boolean isDos) {
        // Split on the FIRST colon only, so that a glob which itself contains
        // a colon (such as a drive letter) is not truncated.
        int colon = pattern.indexOf(':');
        if (colon < 0) {
            throw new IllegalArgumentException(
                    "Expected 'syntax:pattern' but got: " + pattern);
        }
        String glob = pattern.substring(colon + 1);
        int flags = isDos ? (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE) : 0;
        this.pattern = Pattern.compile(toRegexPattern(glob, isDos), flags);
    }

    /**
     * Tells whether the string form of the path matches the glob in full.
     *
     * @param path the path to test
     * @return {@code true} if the whole of {@code path.toString()} matches
     */
    @Override
    public boolean matches(Path path) {
        return pattern.matcher(path.toString()).matches();
    }

    private static boolean isRegexMeta(char c) {
        return regexMetaChars.indexOf(c) != -1;
    }

    private static boolean isGlobMeta(char c) {
        return globMetaChars.indexOf(c) != -1;
    }

    /** Returns the character at index {@code i}, or {@link #EOL} past the end. */
    private static char next(String glob, int i) {
        if (i < glob.length()) {
            return glob.charAt(i);
        }
        return EOL;
    }

    /**
     * Translates a glob pattern into the source of an anchored regular expression.
     *
     * @param globPattern the glob expression, without the syntax prefix
     * @param isDos       {@code true} to use the backslash as name separator in
     *                    the generated regex, {@code false} to use the forward slash
     * @return the regex source, starting with {@code ^} and ending with {@code $}
     * @throws PatternSyntaxException on a trailing escape character, an invalid
     *         range or name separator inside a character class, an unterminated
     *         class, a nested group, or an unterminated group
     */
    private static String toRegexPattern(String globPattern, boolean isDos) {
        boolean inGroup = false;
        StringBuilder regex = new StringBuilder("^");

        int i = 0;
        while (i < globPattern.length()) {
            char c = globPattern.charAt(i++);
            switch (c) {
                case '\\':
                    // escape special characters
                    if (i == globPattern.length()) {
                        throw new PatternSyntaxException("No character to escape", globPattern,
                                i - 1);
                    }
                    char next = globPattern.charAt(i++);
                    if (isGlobMeta(next) || isRegexMeta(next)) {
                        regex.append('\\');
                    }
                    regex.append(next);
                    break;
                case '/':
                    // the glob always uses '/'; the regex uses the chosen separator
                    if (isDos) {
                        regex.append("\\\\");
                    } else {
                        regex.append(c);
                    }
                    break;
                case '[':
                    // don't match name separator in class
                    if (isDos) {
                        regex.append("[[^\\\\]&&[");
                    } else {
                        regex.append("[[^/]&&[");
                    }
                    if (next(globPattern, i) == '^') {
                        // escape the regex negation char if it appears
                        regex.append("\\^");
                        i++;
                    } else {
                        // negation
                        if (next(globPattern, i) == '!') {
                            regex.append('^');
                            i++;
                        }
                        // hyphen allowed at start
                        if (next(globPattern, i) == '-') {
                            regex.append('-');
                            i++;
                        }
                    }
                    boolean hasRangeStart = false;
                    char last = 0;
                    while (i < globPattern.length()) {
                        c = globPattern.charAt(i++);
                        if (c == ']') {
                            break;
                        }
                        if (c == '/' || (isDos && c == '\\')) {
                            throw new PatternSyntaxException(
                                    "Explicit 'name separator' in class", globPattern, i - 1);
                        }
                        // TBD: how to specify ']' in a class?
                        if (c == '\\' || c == '[' || c == '&' && next(globPattern, i) == '&') {
                            // escape '\', '[' or "&&" for regex class
                            regex.append('\\');
                        }
                        regex.append(c);

                        if (c == '-') {
                            if (!hasRangeStart) {
                                throw new PatternSyntaxException("Invalid range", globPattern,
                                        i - 1);
                            }
                            if ((c = next(globPattern, i++)) == EOL || c == ']') {
                                break;
                            }
                            if (c < last) {
                                throw new PatternSyntaxException("Invalid range", globPattern,
                                        i - 3);
                            }
                            regex.append(c);
                            hasRangeStart = false;
                        } else {
                            hasRangeStart = true;
                            last = c;
                        }
                    }
                    if (c != ']') {
                        throw new PatternSyntaxException("Missing ']", globPattern, i - 1);
                    }
                    regex.append("]]");
                    break;
                case '{':
                    if (inGroup) {
                        throw new PatternSyntaxException("Cannot nest groups", globPattern,
                                i - 1);
                    }
                    regex.append("(?:(?:");
                    inGroup = true;
                    break;
                case '}':
                    if (inGroup) {
                        regex.append("))");
                        inGroup = false;
                    } else {
                        regex.append('}');
                    }
                    break;
                case ',':
                    if (inGroup) {
                        regex.append(")|(?:");
                    } else {
                        regex.append(',');
                    }
                    break;
                case '*':
                    if (next(globPattern, i) == '*') {
                        // crosses directory boundaries
                        regex.append(".*");
                        i++;
                        if (next(globPattern, i) == '/') {
                            // ignore forced path delimiter, so that the double
                            // asterisk can also stand for "no directory at all"
                            i++;
                        }
                    } else {
                        // within directory boundary
                        if (isDos) {
                            regex.append("[^\\\\]*");
                        } else {
                            regex.append("[^/]*");
                        }
                    }
                    break;
                case '?':
                    // exactly one character that is not a name separator
                    if (isDos) {
                        regex.append("[^\\\\]");
                    } else {
                        regex.append("[^/]");
                    }
                    break;

                default:
                    if (isRegexMeta(c)) {
                        regex.append('\\');
                    }
                    regex.append(c);
            }
        }

        if (inGroup) {
            throw new PatternSyntaxException("Missing '}", globPattern, i - 1);
        }

        return regex.append('$').toString();
    }

}

And you use it with:

/**
 * Walks the current working directory and prints every path, relative to it,
 * that the glob matcher accepts.
 *
 * @param args unused
 * @throws IOException if the directory tree cannot be walked
 */
public static void main(String[] args) throws IOException {
    PathMatcher pathMatcher = new GlobPathMatcher("glob:" + "Tests/**/*.java");
    Path base = Paths.get(".").toAbsolutePath();
    // The stream returned by Files.walk keeps directories open until it is
    // closed, so it must be used inside try-with-resources.
    try (java.util.stream.Stream<Path> paths = Files.walk(base)) {
        paths.map(base::relativize)
                .filter(pathMatcher::matches)
                .forEach(System.out::println);
    }
}

From looking at Python’s glob, it seems to work completely differently from Java’s, in that it does not generate a regex but simply traverses the folder structure and matches if the base (“Tests”) and the filename (“*.java”) match (I am no Python expert, so this might be completely wrong).

Leave a Comment