LinuxLists.cc - [RFC] globmatch() helper function

2008-12-17 10:43:00

Subject: [RFC] globmatch() helper function

I was just noticing that the latest libata drive blacklist update
(libbata: fix Seagate NCQ+FLUSH blacklist) is suffering an NxM explosion
of model numbers and firmware versions:

Any of the 6 models
"ST31500341AS"
"ST31000333AS"
"ST3640623AS"
"ST3640323AS"
"ST3320813AS"
"ST3320613AS"

With any of the 5 firmware versions
"SD15"
"SD16"
"SD17"
"SD18"
"SD19"

Requires 30 entries in ata_device_blacklist[].

With some simple globbing, that could be reduced to four entries:
"ST31500341AS", "SD1[5-9]"
"ST31000333AS", "SD1[5-9]"
"ST3640?23AS", "SD1[5-9]"
"ST3320?13AS", "SD1[5-9]"

There's already a trailing-* special case in the matching code, but
a generic helper in lib/ might be more useful.

So I whipped up the following. This code uses shell globbing for
compactness; a full regexp engine doesn't seem called for.

This has to be turned into a patch with a header file, but any comments
on the following code? (MODULE_LICSENSE("Public domain"))

Other places it could be used:
drivers/acpi/blacklist.c:
{
.callback = dmi_enable_osi_linux,
.ident = "Lenovo ThinkPad x61",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad [RTX]61"),
},
},

drivers/ata/pata_hpt366.c:
drivers/ata/pata_hpt37x.c:
static const char * const bad_ata33[] = {
"Maxtor 90256D2", "Maxtor 90288D2", "Maxtor 90340D2", "Maxtor 90432D2", "Maxtor 90500D4",
"Maxtor 90510D3", "Maxtor 90576D4", "Maxtor 90625D5", "Maxtor 90648D3", "Maxtor 90648D5",
"Maxtor 90650U2", "Maxtor 90680D4", "Maxtor 90720D5", "Maxtor 90750D6", "Maxtor 90840D[67]",
"Maxtor 90845D[456]", "Maxtor 90845U3", "Maxtor 90875D7", "Maxtor 90910D8", "Maxtor 91008D7",
"Maxtor 91020D6", "Maxtor 91020U3", "Maxtor 91080D5", "Maxtor 91190D7", "Maxtor 91303D6",
"Maxtor 91360U4", "Maxtor 91512D7", "Maxtor 92040U6", "Maxtor 90432D3", "Maxtor 90510D4",
"Maxtor 91000D8", "Maxtor 91152D8", "Maxtor 91360D8", "Maxtor 91728D8", "Maxtor 92720U8",
NULL
};

static const char * const bad_ata66_4[] = {
"IBM-DTLA-3050[234]0",
"IBM-DTLA-3070[147]5",
"IBM-DTLA-3070[236]0",
"IC35L0[1-46]0AVER07-0",
"WDC AC310200R",
NULL
};

drivers/ide/ide-pio-blacklist.c: ide_pio_blacklist[]

And possibly drivers/scsi/scsi_devinfo.c

--- Code starts here ---

#include <stdbool.h>
#include <string.h> /* For strchr */

#ifndef __pure
/* Kernel compatibility #define */
#define __pure __attribute__((pure))
#endif

/**
* is_in_class - globmatch() helper, returns true if character is in class.
* @c: Character to be tested.
* @pat: Character class to test for inclusion in, terminated by ']'.
*
* Evaluate the "body" of a character class. Given the class "[!a-z]",
* this expects @pat to point to the a, and proceeds until the closing ].
* The caller must skip the opening bracket and the complement character.
*
* The caller must verify that a terminating ] exists; this will NOT
* stop at a trailing nul byte. Also, the first character may be ];
* that is not taken as a terminator.
*
* Comparison is done using unsigned bytes, 0...255.
*/
static bool __pure
is_in_class(unsigned char c, char const *pat)
{
/*
* Iterate over each span in the character class.
* a span is either a single character x, or a
* range x-y.
*/
do {
unsigned char c1 = *pat++, c2 = c1;

if (pat[0] == '-' && pat[1] != ']') {
c2 = pat[1];
pat += 2;
}
/* Any special action if c1 > c2? */
if (c1 <= c && c <= c2)
return true;
} while (*pat != ']');
return false;
}

/**
* globmatch - Shell-style pattern matching, like !fnmatch(pat, str, 0)
* @pat: Pattern to match. Metacharacters are ?, *, [ and \.
* @str: String to match. the pattern must match the entire string.
*
* Perform shell-style glob matching, returning true if the match succeeds.
* Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT treat / or
* leading . specially; it isn't actually used for pathnames.
*
* This is small and simple implementation intended for device blacklists
* where the pattern is part of the kernel. It recurses on each * in
* the pattern (although the stack frame is small), so shouldn't be
* fed pathological patterns with many *s.
*/
static bool __pure
globmatch(char const *pat, char const *str)
{
/*
* Loop over each token (character or class) in pat, matching
* it against the remaining unmatched tail of str. Return false
* on mismatch, or true after matching the trailing nul bytes.
*
*/
do {
char const *end;
bool inv;

/*
* (To handle * properly, which may match zero bytes, each case is
* required to increment the str pointer itself.
*/
switch (pat[0]) {
case '?':
if (!*str++)
return false;
break;
case '*':
if (pat[1] == '\0') /* Optimize trailing * case */
return true;
/* Recurse on each possible tail of str */
while (!globmatch(pat+1, str))
if (!*str++)
return false;
break;
case '[':
/* Character class */
/* Is it an inverted character class? */
inv = (pat[1] == '^' || pat[1] == '!');
/* If no terminating ], interpret as plain text. */
end = strchr(pat+2+inv, ']');
if (!end)
goto def;
pat += 1 + inv;
if (is_in_class(*str++, pat) == inv)
return false;
pat = end;
break;
case '\\':
pat++;
/* FALLLTHROUGH */
default:
def:
if (*pat != *str++)
return false;
break;
}
} while (*pat++);
return true;
}

/* Test code */
#include <stdio.h>
#include <stdlib.h>

static void
test(char const *pat, char const *str, bool expected)
{
bool match = globmatch(pat, str);
printf("\"%s\" vs. \"%s\": %s %s\n", pat, str, match ? "match" : "mismatch",
match == expected ? "OK" : "*** ERROR ***");
if (match != expected)
exit(1);
}

int
main(void)
{
test("a", "a", true);
test("a", "b", false);
test("a", "aa", false);
test("a", "", false);
test("", "", true);
test("", "a", false);
test("?", "a", true);
test("?", "aa", false);
test("??", "a", false);
test("?x?", "axb", true);
test("?x?", "abx", false);
test("?x?", "xab", false);
test("*??", "a", false);
test("*??", "ab", true);
test("*??", "abc", true);
test("??*", "a", false);
test("??*", "ab", true);
test("??*", "abc", true);
test("?*?", "a", false);
test("?*?", "ab", true);
test("?*?", "abc", true);
test("*b", "b", true);
test("*b", "ab", true);
test("*b", "aab", true);
test("*bc", "abbc", true);
test("*bc", "bc", true);
test("*bc", "bbc", true);
test("*abcd*", "abcabcabcabcdefg", true);
test("*abcd*", "abcabcabcabcefg", false);
test("[a]", "a", true);
test("[^a]", "a", false);
test("[!a]", "a", false);
test("[!a]", "b", true);
test("[ab]", "a", true);
test("[ab]", "b", true);
test("[ab]", "c", false);
test("[^ab]", "c", true);
test("[a-c]", "b", true);
test("[a-c]", "d", false);
test("[]a-ceg-ik[]", "a", true);
test("[]a-ceg-ik[]", "]", true);
test("[]a-ceg-ik[]", "[", true);
test("[]a-ceg-ik[]", "h", true);
test("[]a-ceg-ik[]", "f", false);
test("[!]a-ceg-ik[]", "h", false);
test("[!]a-ceg-ik[]", "]", false);
test("[!]a-ceg-ik[]", "f", true);

return 0;
}

2008-12-18 21:53:25

by George Spelvin

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Casey Dahlin <[email protected]> wrote:
> George Spelvin wrote:
>> Eureka!
>>
>> Never mind all of this angst; I've figured out a non-recursive way
>> to do it. Thanks for pushing me to think a bit harder and find it.
>
> Yours is better, but I may as well post my solution:

Interesting. You can see how complex that gets.

Here's my solution, currently a stand-alone executable:

#include <stdbool.h>
#include <string.h> /* For strchr */
#include <assert.h>

#define WARN_ON(x) assert(!(x))

#ifndef __pure
/* Kernel compatibility #define */
#define __pure __attribute__((pure))
#endif

/**
* is_in_class - globmatch() helper, returns true if character is in class.
* @c: Character to be tested.
* @pat: Character class to test for inclusion in, terminated by ']'.
*
* Evaluate the "body" of a character class. Given the class "[!a-z]",
* this expects @pat to point to the a, and proceeds until the closing ].
* The caller must skip the opening bracket and the complement character.
*
* The caller must verify that a terminating ] exists; this will NOT
* stop at a trailing nul byte. Also, the first character may be ];
* that is not taken as a terminator.
*
* Comparison is done using unsigned bytes, 0...255.
*/
static bool __pure
is_in_class(unsigned char c, char const *pat)
{
/*
* Iterate over each span in the character class.
* a span is either a single character x, or a
* range x-y.
*/
do {
unsigned char c1 = *pat++, c2 = c1;

if (pat[0] == '-' && pat[1] != ']') {
c2 = pat[1];
pat += 2;
}
/* Any special action if c1 > c2? */
if (c1 <= c && c <= c2)
return true;
} while (*pat != ']');
return false;
}

/**
* globmatch - Shell-style pattern matching, like !fnmatch(pat, str, 0)
* @pat: Pattern to match. Metacharacters are ?, *, [ and \.
* @str: String to match. the pattern must match the entire string.
* @maxdepth: Maximum number of (non-trailing) * permitted in pattern.
*
* Perform shell-style glob matching, returning true if it matches.
*
* Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT
* treat / or leading . specially; it isn't actually used for pathnames.
*
* This is small and simple implementation intended for device blacklists
* where a string is matched against a number of patterns. Thus,
* it does not proprocess the patterns. Run-time is at most quadratic
* in strlen(@str).
*/
static bool __pure
globmatch(char const *pat, char const *str)
{
/*
* Backtrack to previous * on mismatch nad retry starting one
* character later in the string. It can be proved that there's
* never a need to backtrack multiple levels.
*/
char const *back_pat = 0, *back_str = back_str;

/*
* Loop over each token (character or class) in pat, matching
* it against the remaining unmatched tail of str. Return false
* on mismatch, or true after matching the trailing nul bytes.
*/
do {
/*
* (To handle * properly, which may match zero bytes, each
* case is required to increment the str pointer itself.)
*/
switch (pat[0]) {
char c;
case '?': /* Wildcard: anything but nul */
if (!*str++)
return false;
break;
case '*': /* Any-length wildcard */
/* This doesn't loop... */
if (pat[1] == '\0') /* Optimize trailing * case */
return true;
back_pat = pat;
back_str = str;
break;
case '[': { /* Character class */
/* Is it an inverted character class? */
bool inv = (pat[1] == '^' || pat[1] == '!');
/* If no terminating ], interpret as plain text. */
char const *end = strchr(pat+2+inv, ']');
if (!end)
goto def; /* Or return -1 for malformed pattern? */
pat += 1 + inv;
c = *str++;
if (is_in_class(c, pat) == inv)
goto backtrack;
pat = end;
}
break;
case '\\':
pat++;
/* FALLLTHROUGH */
default: /* Literal character */
def:
c = *str++;
if (*pat == c)
break;
backtrack:
if (c == '\0' || !back_pat)
return false; /* No point continuing */
/* Try again from last *, one character later in str. */
pat = back_pat;
str = ++back_str;
break;
}
} while (*pat++);
return true;
}

/* Test code */
#include <stdio.h>
#include <stdlib.h>

static void
test(char const *pat, char const *str, bool expected)
{
bool match = globmatch(pat, str);
printf("\"%s\" vs. \"%s\": %s %s\n", pat, str, match ? "match" : "mismatch",
match == expected ? "OK" : "*** ERROR ***");
if (match != expected)
exit(1);
}

int
main(void)
{
test("a", "a", true);
test("a", "b", false);
test("a", "aa", false);
test("a", "", false);
test("", "", true);
test("", "a", false);
test("?", "a", true);
test("?", "aa", false);
test("??", "a", false);
test("?x?", "axb", true);
test("?x?", "abx", false);
test("?x?", "xab", false);
test("*??", "a", false);
test("*??", "ab", true);
test("*??", "abc", true);
test("??*", "a", false);
test("??*", "ab", true);
test("??*", "abc", true);
test("?*?", "a", false);
test("?*?", "ab", true);
test("?*?", "abc", true);
test("*b", "b", true);
test("*b", "ab", true);
test("*b", "aab", true);
test("*bc", "abbc", true);
test("*bc", "bc", true);
test("*bc", "bbc", true);
test("*ac*", "abacadaeafag", true);
test("*ac*ae*ag*", "abacadaeafag", true);
test("*a*b*[bc]*[ef]*g*", "abacadaeafag", true);
test("*a*b*[ef]*[cd]*g*", "abacadaeafag", false);
test("*abcd*", "abcabcabcabcdefg", true);
test("*ab*cd*", "abcabcabcabcdefg", true);
test("*abcd*abcdef*", "abcabcdabcdeabcdefg", true);
test("*abcd*", "abcabcabcabcefg", false);
test("*ab*cd*", "abcabcabcabcefg", false);
test("[a]", "a", true);
test("[^a]", "a", false);
test("[!a]", "a", false);
test("[!a]", "b", true);
test("[ab]", "a", true);
test("[ab]", "b", true);
test("[ab]", "c", false);
test("[^ab]", "c", true);
test("[a-c]", "b", true);
test("[a-c]", "d", false);
test("[]a-ceg-ik[]", "a", true);
test("[]a-ceg-ik[]", "]", true);
test("[]a-ceg-ik[]", "[", true);
test("[]a-ceg-ik[]", "h", true);
test("[]a-ceg-ik[]", "f", false);
test("[!]a-ceg-ik[]", "h", false);
test("[!]a-ceg-ik[]", "]", false);
test("[!]a-ceg-ik[]", "f", true);
test("[!]a-ceg-ik[]", "f", true);

return 0;
}

2008-12-17 13:28:22

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

"George Spelvin" <[email protected]> writes:

Wow, finally a name.

> break;
> case '*':
> if (pat[1] == '\0') /* Optimize trailing * case */
> return true;
> /* Recurse on each possible tail of str */
> while (!globmatch(pat+1, str))
> if (!*str++)
> return false;

I'm uneasy with the unbounded recursion. Sure currently all the users
are controlled in kernel source code and expect to put in sane patterns.
But if someone ever adds a user controlled glob in some way it will be
trivial to crash/overwrite memory with the limited kernel stack.
And with such a generalized function it's likely to be used more
in the future.

-Andi

--
[email protected]

2008-12-17 15:18:07

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Wed, 2008-12-17 at 14:28 +0100, Andi Kleen wrote:
> "George Spelvin" <[email protected]> writes:
>
> Wow, finally a name.
>
> > break;
> > case '*':
> > if (pat[1] == '\0') /* Optimize trailing * case */
> > return true;
> > /* Recurse on each possible tail of str */
> > while (!globmatch(pat+1, str))
> > if (!*str++)
> > return false;
>
> I'm uneasy with the unbounded recursion. Sure currently all the users
> are controlled in kernel source code and expect to put in sane patterns.
> But if someone ever adds a user controlled glob in some way it will be
> trivial to crash/overwrite memory with the limited kernel stack.
> And with such a generalized function it's likely to be used more
> in the future.

ftrace has a globbing thing in there somewhere as well and that does
indeed take user input.

Using recursion in kernel code is indeed not recommended, what Andi said
we have tiny stacks.

2008-12-17 15:37:24

by George Spelvin

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Andi Kleen <[email protected]> wrote:

> I'm uneasy with the unbounded recursion. Sure currently all the users
> are controlled in kernel source code and expect to put in sane patterns.
> But if someone ever adds a user controlled glob in some way it will be
> trivial to crash/overwrite memory with the limited kernel stack.
> And with such a generalized function it's likely to be used more
> in the future.

I was just trying to keep the code small and elegant, and adding a
recursion counter or explicit stack would complicate it.

Further, even ignoring the stack space issue, allowing uncontrolled
patterns exposes a second, more insidious bug: due to the naive
backtracking, the run time is (potentially) exponential in the number
of *s present. That could itself crash a non-preemptive kernel with a
watchdog enabled.

The simple fix is a very low limit on non-trailing * patterns (like
maybe 2), but then it's not generalized any more...

I'm willing to be persuaded, but the cost of making it robust against
pathological patterns is significant. Is it really worth it?
And if it's not actually robust, why include a half-assed solution?

Does anyone else have an opinion on the matter?

2008-12-17 15:56:15

by Steven Rostedt

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Wed, 2008-12-17 at 16:15 +0100, Peter Zijlstra wrote:
> On Wed, 2008-12-17 at 14:28 +0100, Andi Kleen wrote:
> > "George Spelvin" <[email protected]> writes:
> >
> > Wow, finally a name.
> >
> > > break;
> > > case '*':
> > > if (pat[1] == '\0') /* Optimize trailing * case */
> > > return true;
> > > /* Recurse on each possible tail of str */
> > > while (!globmatch(pat+1, str))
> > > if (!*str++)
> > > return false;
> >
> > I'm uneasy with the unbounded recursion. Sure currently all the users
> > are controlled in kernel source code and expect to put in sane patterns.
> > But if someone ever adds a user controlled glob in some way it will be
> > trivial to crash/overwrite memory with the limited kernel stack.
> > And with such a generalized function it's likely to be used more
> > in the future.
>
> ftrace has a globbing thing in there somewhere as well and that does
> indeed take user input.
>
> Using recursion in kernel code is indeed not recommended, what Andi said
> we have tiny stacks.

ftrace has a very limited glob feature, and requires no recursion.
Basically, we allow:

*match

match*

*match*

We do not allow match*two, that would be the same as match*

I need to look at your code (I would like a generalized glob feature for
user input). Can you accomplish the same with using a loop instead of
recursion?

-- Steve

2008-12-17 16:03:23

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

> I need to look at your code (I would like a generalized glob feature for
> user input). Can you accomplish the same with using a loop instead of
> recursion?

Standard * always requires a stack, it's a recursive problem unless
you change the semantics to non greedy. While you can open code the
stack it would be be easier to just limit the recursion. Not more than
10 or so should be plenty.

But the other problem is that Linux doesn't accept anonymous code
contributions.

-Andi

--
[email protected]

2008-12-17 16:04:29

by George Spelvin

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Peter Zijlstra <[email protected]> wrote:
> ftrace has a globbing thing in there somewhere as well and that does
> indeed take user input.

Yes, but (I just looked at the code), its patterns take the form:
- Optional leading *
- Literal text
- Optional trailing *

Thus, only one case (*literal*) requires any backtracking, and it's
just strstr().

> Using recursion in kernel code is indeed not recommended, what Andi said
> we have tiny stacks.

The stack frame is tiny, and it only recurses on a *, so if the number
of *s in the pattern is bounded, the stack usage is bounded.

For the intended application (compile-time constant device blacklists),
this is not a problem at all. Indeed, nothing but a trailing * is even
needed; I only implemented the feature to be compatible with fnmatch().
(Which is part of the reason I'm reluctant to do anything heroic to
make it work.)

The problem is, what if some future thoughtless person feeds user data
into the pattern argument?

I could just take support for non-trailing * out entirely. That would be
a different sort of documentation burden.

Or I could just add an explicit 2-level stack. If you overflow the stack,
matching always fails. Unfortunately, the code will be larger.

Do people think that would be, on balance, better? It would be plenty
good enough for the blacklist application.

2008-12-17 16:22:50

by Steven Rostedt

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Wed, 2008-12-17 at 11:04 -0500, George Spelvin wrote:

>
> The problem is, what if some future thoughtless person feeds user data
> into the pattern argument?
>
> I could just take support for non-trailing * out entirely. That would be
> a different sort of documentation burden.
>
> Or I could just add an explicit 2-level stack. If you overflow the stack,
> matching always fails. Unfortunately, the code will be larger.
>
> Do people think that would be, on balance, better? It would be plenty
> good enough for the blacklist application.

Having a static function do the work and pass in a "depth" parameter
should be sufficient. As Andi mentioned, a depth of 10 should be plenty.

Like this:

static bool
globmatch_internal(const char *pat, const char *str, int depth)
{
if (depth > 10)
return false;

[...]

while (!globmatch_internal(pat+1, str, depth+1))
[...]
}

bool globmatch(const char *pat, const char *str)
{
return globmatch_internal(pat, str, 0);
}

Make sure you include a "Signed-off-by:" as well.

-- Steve

2008-12-17 16:23:10

by Tejun Heo

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Hello,

George Spelvin wrote:
> Do people think that would be, on balance, better? It would be plenty
> good enough for the blacklist application.

Just pass a depth parameter and trigger WARN_ON() and return -EINVAL
when it exceeds ten. It's a five minute change and should be enough
for kernel usages.

Thanks.

--
tejun

2008-12-17 16:31:42

by Steven Rostedt

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Thu, 2008-12-18 at 01:22 +0900, Tejun Heo wrote:
> Hello,
>
> George Spelvin wrote:
> > Do people think that would be, on balance, better? It would be plenty
> > good enough for the blacklist application.
>
> Just pass a depth parameter and trigger WARN_ON() and return -EINVAL
> when it exceeds ten. It's a five minute change and should be enough
> for kernel usages.

If this is ever expected to be used by userspace, I would not include
the WARN_ON. If this is a generic function, then I'll include in in
ftrace as well, and that takes userspace input. The last thing I want is
a DoS because of printk's to the serial console because some userspace
app is constantly writing bad patterns to this file.

-- Steve

2008-12-17 16:34:23

by Tejun Heo

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Steven Rostedt wrote:
> On Thu, 2008-12-18 at 01:22 +0900, Tejun Heo wrote:
>> Hello,
>>
>> George Spelvin wrote:
>>> Do people think that would be, on balance, better? It would be plenty
>>> good enough for the blacklist application.
>> Just pass a depth parameter and trigger WARN_ON() and return -EINVAL
>> when it exceeds ten. It's a five minute change and should be enough
>> for kernel usages.
>
> If this is ever expected to be used by userspace, I would not include
> the WARN_ON. If this is a generic function, then I'll include in in
> ftrace as well, and that takes userspace input. The last thing I want is
> a DoS because of printk's to the serial console because some userspace
> app is constantly writing bad patterns to this file.

Well, then, how about printk_ratelimit()? Having one too many
asterisk will be a very rare occasion and when it happens it's
something which can easily escape attention, so I think some form of
whining is in order.

Thanks.

--
tejun

2008-12-17 16:37:28

by Steven Rostedt

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Thu, 2008-12-18 at 01:33 +0900, Tejun Heo wrote:
> Steven Rostedt wrote:
> > On Thu, 2008-12-18 at 01:22 +0900, Tejun Heo wrote:
> >> Hello,
> >>
> >> George Spelvin wrote:
> >>> Do people think that would be, on balance, better? It would be plenty
> >>> good enough for the blacklist application.
> >> Just pass a depth parameter and trigger WARN_ON() and return -EINVAL
> >> when it exceeds ten. It's a five minute change and should be enough
> >> for kernel usages.
> >
> > If this is ever expected to be used by userspace, I would not include
> > the WARN_ON. If this is a generic function, then I'll include in in
> > ftrace as well, and that takes userspace input. The last thing I want is
> > a DoS because of printk's to the serial console because some userspace
> > app is constantly writing bad patterns to this file.
>
> Well, then, how about printk_ratelimit()? Having one too many
> asterisk will be a very rare occasion and when it happens it's
> something which can easily escape attention, so I think some form of
> whining is in order.

I do not think printk_ratelimit is appropriate here.

OK, lets compromise with WARN_ON_ONCE() ;-)

-- Steve

2008-12-17 16:36:49

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Thu, 2008-12-18 at 01:33 +0900, Tejun Heo wrote:
> Steven Rostedt wrote:
> > On Thu, 2008-12-18 at 01:22 +0900, Tejun Heo wrote:
> >> Hello,
> >>
> >> George Spelvin wrote:
> >>> Do people think that would be, on balance, better? It would be plenty
> >>> good enough for the blacklist application.
> >> Just pass a depth parameter and trigger WARN_ON() and return -EINVAL
> >> when it exceeds ten. It's a five minute change and should be enough
> >> for kernel usages.
> >
> > If this is ever expected to be used by userspace, I would not include
> > the WARN_ON. If this is a generic function, then I'll include in in
> > ftrace as well, and that takes userspace input. The last thing I want is
> > a DoS because of printk's to the serial console because some userspace
> > app is constantly writing bad patterns to this file.
>
> Well, then, how about printk_ratelimit()? Having one too many
> asterisk will be a very rare occasion and when it happens it's
> something which can easily escape attention, so I think some form of
> whining is in order.

having the write fail with -EINVAL seems suitably whiney..

2008-12-17 16:39:45

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

> OK, lets compromise with WARN_ON_ONCE() ;-)

I don't think WARN_ON is the right way to handle bad user input.
It implies that when someone typos the result might end up in Arjan's
kerneloops.org database, which is not good.

-Andi
--
[email protected]

2008-12-17 16:46:47

by Tejun Heo

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Peter Zijlstra wrote:
> On Thu, 2008-12-18 at 01:33 +0900, Tejun Heo wrote:
>> Steven Rostedt wrote:
>>> On Thu, 2008-12-18 at 01:22 +0900, Tejun Heo wrote:
>>>> Hello,
>>>>
>>>> George Spelvin wrote:
>>>>> Do people think that would be, on balance, better? It would be plenty
>>>>> good enough for the blacklist application.
>>>> Just pass a depth parameter and trigger WARN_ON() and return -EINVAL
>>>> when it exceeds ten. It's a five minute change and should be enough
>>>> for kernel usages.
>>> If this is ever expected to be used by userspace, I would not include
>>> the WARN_ON. If this is a generic function, then I'll include in in
>>> ftrace as well, and that takes userspace input. The last thing I want is
>>> a DoS because of printk's to the serial console because some userspace
>>> app is constantly writing bad patterns to this file.
>> Well, then, how about printk_ratelimit()? Having one too many
>> asterisk will be a very rare occasion and when it happens it's
>> something which can easily escape attention, so I think some form of
>> whining is in order.
>
> having the write fail with -EINVAL seems suitably whiney..

Yeah, I suppose so but that makes return value handling weird. -EINVAL
for error, 0 for mis-match and 1 for match. Three-way returns are
usually very sucky to handle and error-prone, so I was thinking about
returning -EINVAL in the internal function and printing out the caller
address and pattern on ratelimit or once.

Thanks.

--
tejun

2008-12-17 16:55:22

by Steven Rostedt

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

On Wed, 2008-12-17 at 17:51 +0100, Andi Kleen wrote:
> > OK, lets compromise with WARN_ON_ONCE() ;-)
>
> I don't think WARN_ON is the right way to handle bad user input.
> It implies that when someone typos the result might end up in Arjan's
> kerneloops.org database, which is not good.

Since I recommended an internal helper function, we could have:

int globmatch(const char *pat, const char *str)
{
int ret;

ret = globmatch_internal(pat, str, 0);
if (ret < 0)
WARN_ON();
return ret;
}

int globmatch_user(const char *pat, const char *str)
{
return globmatch_internal(pat, str, 0);
}

This would warn on if a kernel user caused the overflow, and also gives
a function that user space input could use, that would not warn on
overflow.

-- Steve

2008-12-18 08:00:39

by George Spelvin

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Steven Rostedt <[email protected]> wrote:
> I need to look at your code (I would like a generalized glob feature for
> user input). Can you accomplish the same with using a loop instead of
> recursion?

Unfortunately, no. Matching multiple * characters inherently requires
backtracking, and backtracking requires a stack.

I could make it a finite-depth explicit stack rather than using
the C stack implicitly, but it doesn't save much on reasonable
patterns.

But if you want it for user input, I can work at making it more
robust. What are your sped needs?
0 - Runs once only, speed unimportant
1 - Runs several thousand times (counting all NxM matches), would like
it to not be too slow.
2 - Run a thousand times per second, but it's a debug feature so
a speed hit is acceptable.
3 - Common code path, maybe rearchitect to save time.
4 - Kernel fast path, every cycle counts!

There are quite a few things (with commensurate code size increases)
that can't help the big-O worst-case performance, but improve the average
case considerably.

One thing I don't do that's quite possible is to find the fixed-length
head and tail of the pattern (the parts before the first * and after the
last) and match those before dealing with the annoying middle.

That would avoid all backtracking in cases with a single *, and make a
good pre-filter before doing anything expensive in the middle.

H'm... I think there's an equivalent and more general version based on
the number of characters a pattern matches. Any pattern's length is
N characters, plus a flag indicating if that's fixed or just a minimum.

When matching a *, compute the length of the following pattern and only
try matching cases which have the right length.

If we need to do a lot of matching against a single pattern, it's possible
to preprocess it to find the fixed-length spans and their lengths.
(The current blacklist application is exactly the opposite: matching
one string against a list of patterns.)

2008-12-18 08:55:42

by George Spelvin

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

Eureka!

Never mind all of this angst; I've figured out a non-recursive way
to do it. Thanks for pushing me to think a bit harder and find it.
(I was actually looking for an example of the exponential pathology I
kept referring to when it dawned on me that it's actually impossible
without the additional expressive power of regexps.)

Somewhat abusing TeX's line-breaking terminology, consider a shell glob to
consist of a series of "boxes" and "glue" A box is a run of character
classes (of which literal characters and ? are degenerate cases). The
point is, a box has a well-defined width, the number of characters in the
string that it must match.

A *, on the other hand, is infinitely elastic "glue" between boxes that
matches an unknown number of characters.

So consider a pattern of the form box1*box2*box3*...
Assuming box1 has matched, we search forward, trying various widths of
the glue *, to find a point where box2 matches. Then we start searching
for the width of the second *. But! If we can't match the tail of the
pattern box3*... at any position to the right of the end of box2, there
is no point trying to move box2 forward and search again. Any solution
where box2*box3*... would match starting k characters later in the string
would also be a match with box2 (only) shifted k characters left, i.e.
in its original position.

So we can match boxes greedily: once we've found the leftmost place
where one matches (that is still to the right of all previous boxes),
we never need to try any other positions. Moving a box to the right
can never produce a match which doesn't exist in its previous position.

I'll rewrite it to be non-recusrive and quadratic in the worst case.

Thanks; I feel stupid for not maving looked this up, but smart for having
thought of it myself.

(For a discussion of how to get exponential backtracking in a regular
expression, see http://swtch.com/~rsc/regexp/regexp1.html)

2008-12-18 19:53:40

by Casey Dahlin

[permalink] [raw]

Subject: Re: [RFC] globmatch() helper function

George Spelvin wrote:
> Eureka!
>
> Never mind all of this angst; I've figured out a non-recursive way
> to do it. Thanks for pushing me to think a bit harder and find it.
> (I was actually looking for an example of the exponential pathology I
> kept referring to when it dawned on me that it's actually impossible
> without the additional expressive power of regexps.)
>
> Somewhat abusing TeX's line-breaking terminology, consider a shell glob to
> consist of a series of "boxes" and "glue" A box is a run of character
> classes (of which literal characters and ? are degenerate cases). The
> point is, a box has a well-defined width, the number of characters in the
> string that it must match.
>
Yours is better, but I may as well post my solution:

--code starts--
#include <stdbool.h>
#include <string.h> /* For strchr */

#ifndef __pure
/* Kernel compatibility #define */
#define __pure __attribute__((pure))
#endif

/**
* is_in_class - globmatch() helper, returns true if character is in class.
* @c: Character to be tested.
* @pat: Character class to test for inclusion in, terminated by ']'.
*
* Evaluate the "body" of a character class. Given the class "[!a-z]",
* this expects @pat to point to the a, and proceeds until the closing ].
* The caller must skip the opening bracket and the complement character.
*
* The caller must verify that a terminating ] exists; this will NOT
* stop at a trailing nul byte. Also, the first character may be ];
* that is not taken as a terminator.
*
* Comparison is done using unsigned bytes, 0...255.
*/
static bool __pure
is_in_class(unsigned char c, char const *pat)
{
/*
* Iterate over each span in the character class.
* a span is either a single character x, or a
* range x-y.
*/
do {
unsigned char c1 = *pat++, c2 = c1;

if (pat[0] == '-' && pat[1] != ']') {
c2 = pat[1];
pat += 2;
}
/* Any special action if c1 > c2? */
if (c1 <= c && c <= c2)
return true;
} while (*pat != ']');
return false;
}

/**
* given a pattern, find the minimum string length that would match it
**/
static int __pure
patsize(char const *pat)
{
int retval = 0;
int grouplen = 0;

for(;*pat;pat++) {
if (*pat == '*') {
continue;
} else if ((grouplen > 2 || (grouplen == 2 && *(pat-1) != '^' &&
*(pat-1) != '!')) && *pat == ']') {
grouplen = 0;
} else if (grouplen) {
grouplen++;
} else {
if (*pat == '[') {
grouplen++;
}
retval++;
}
}
return retval;
}

/**
* Generates the next balls-and-boxes sequence of integer pattern widths
*/
static bool
next_arrangement(int *buckets, int found, int total)
{
int i;
bool retval = false;

if (!found || total == 1)
return false;

for (i = found; i < total-1; i++) {
buckets[total-1] += buckets[i];
buckets[i] = 0;
}

for (i = total - 2; i >= 0; i--) {
if (!buckets[i])
continue;
retval = true;
buckets[i]--;
buckets[i+1]++;
if ((i < total-2) && total > 2) {
buckets[i+1] += buckets[total-1];
buckets[total-1] = 0;
}
break;
}

return retval;
}

/**
* globmatch - Shell-style pattern matching, like !fnmatch(pat, str, 0)
* @pat: Pattern to match. Metacharacters are ?, *, [ and \.
* @str: String to match. the pattern must match the entire string.
*
* Perform shell-style glob matching, returning true if the match succeeds.
* Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT
treat / or
* leading . specially; it isn't actually used for pathnames.
*
* This is small and simple implementation intended for device blacklists
* where the pattern is part of the kernel. It recurses on each * in
* the pattern (although the stack frame is small), so shouldn't be
* fed pathological patterns with many *s.
*/
static bool __pure
globmatch(char const *in_pat, char const *in_str)
{
int star_width[10] = { 0,0,0,0,0,0,0,0,0,0 };
int stars_found = 0;
int stars_total = 0;
char const *str = in_str;
char const *pat = in_pat;
int i;

char const *loc;
for (loc = pat; *loc; stars_total += (*(loc++) == '*'));
if (stars_total > 10)
return false;

for (loc = str; *loc; loc++, star_width[0]++);
star_width[0] -= patsize(pat);

if (star_width[stars_total-1] < 0)
return false;

/*
* Loop over each token (character or class) in pat, matching
* it against the remaining unmatched tail of str. Return false
* on mismatch, or true after matching the trailing nul bytes.
*
*/

repeat: do {
char const *end;
bool inv;
int i;

/*
* (To handle * properly, which may match zero bytes, each case is
* required to increment the str pointer itself.
*/
switch (pat[0]) {
case '?':
if (!*str++)
goto rearrange;
break;
case '*':
if (pat[1] == '\0') /* Optimize trailing * case */
return true;
for (i = 0; *str && i < star_width[stars_found];
i++, str++);
stars_found++;
break;
case '[':
/* Character class */
/* Is it an inverted character class? */
inv = (pat[1] == '^' || pat[1] == '!');
/* If no terminating ], interpret as plain text. */
end = strchr(pat+2+inv, ']');
if (!end)
goto def;
pat += 1 + inv;
if (is_in_class(*str++, pat) == inv)
goto rearrange;
pat = end;
break;
case '\\':
pat++;
/* FALLLTHROUGH */
default:
def:
if (*pat != *str++)
goto rearrange;
break;
}
} while (*pat++);
return true;

rearrange:
if (next_arrangement(star_width, stars_found, stars_total)) {
stars_found = 0;
str = in_str;
pat = in_pat;
goto repeat;
}

return false;
}

/* Test code */
#include <stdio.h>
#include <stdlib.h>

static void
test(char const *pat, char const *str, bool expected)
{
bool match = globmatch(pat, str);
printf("\"%s\" vs. \"%s\": %s %s\n", pat, str, match ? "match" :
"mismatch",
match == expected ? "OK" : "*** ERROR ***");
/*if (match != expected)
exit(1);*/
}

int
main(void)
{
test("a", "a", true);
test("a", "b", false);
test("a", "aa", false);
test("a", "", false);
test("", "", true);
test("", "a", false);
test("?", "a", true);
test("?", "aa", false);
test("??", "a", false);
test("?x?", "axb", true);
test("?x?", "abx", false);
test("?x?", "xab", false);
test("*??", "a", false);
test("*??", "ab", true);
test("*??", "abc", true);
test("??*", "a", false);
test("??*", "ab", true);
test("??*", "abc", true);
test("?*?", "a", false);
test("?*?", "ab", true);
test("?*?", "abc", true);
test("*b", "b", true);
test("*b", "ab", true);
test("*b", "aab", true);
test("*bc", "abbc", true);
test("*bc", "bc", true);
test("*bc", "bbc", true);
test("*abcd*", "abcabcabcabcdefg", true);
test("*abcd*", "abcabcabcabcefg", false);
test("*abcd*q*r*", "abcabcabcabcdefgqabrcd", true);
test("[a]", "a", true);
test("[^a]", "a", false);
test("[!a]", "a", false);
test("[!a]", "b", true);
test("[ab]", "a", true);
test("[ab]", "b", true);
test("[ab]", "c", false);
test("[^ab]", "c", true);
test("[a-c]", "b", true);
test("[a-c]", "d", false);
test("[]a-ceg-ik[]", "a", true);
test("[]a-ceg-ik[]", "]", true);
test("[]a-ceg-ik[]", "[", true);
test("[]a-ceg-ik[]", "h", true);
test("[]a-ceg-ik[]", "f", false);
test("[!]a-ceg-ik[]", "h", false);
test("[!]a-ceg-ik[]", "]", false);
test("[!]a-ceg-ik[]", "f", true);

return 0;
}