2021-04-22 19:20:24

by Aditya Srivastava

[permalink] [raw]
Subject: [RFC] scripts: kernel-doc: reduce repeated regex expressions into variables

There are some regex expressions in the kernel-doc script, which are used
repeatedly in the script.

Reduce such expressions into variables, which can be used everywhere.

A quick manual check found that no errors and warnings were added/removed
in this process.

Suggested-by: Jonathan Corbet <[email protected]>
Signed-off-by: Aditya Srivastava <[email protected]>
---
scripts/kernel-doc | 89 ++++++++++++++++++++++++++--------------------
1 file changed, 50 insertions(+), 39 deletions(-)

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 2a85d34fdcd0..579c9fdd275f 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -406,6 +406,7 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
my $doc_inline_end = '^\s*\*/\s*$';
my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
+my $pointer_function = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};

my %parameterdescs;
my %parameterdesc_start_lines;
@@ -694,7 +695,7 @@ sub output_function_man(%) {
$post = ");";
}
$type = $args{'parametertypes'}{$parameter};
- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$pointer_function/) {
# pointer-to-function
print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" . $post . "\"\n";
} else {
@@ -974,7 +975,7 @@ sub output_function_rst(%) {
$count++;
$type = $args{'parametertypes'}{$parameter};

- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$pointer_function/) {
# pointer-to-function
print $1 . $parameter . ") (" . $2 . ")";
} else {
@@ -1210,8 +1211,14 @@ sub dump_struct($$) {
my $decl_type;
my $members;
my $type = qr{struct|union};
+ my $packed = qr{__packed};
+ my $aligned = qr{__aligned};
+ my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
+ my $cacheline_aligned = qr{____cacheline_aligned};
+ my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
# For capturing struct/union definition body, i.e. "{members*}qualifiers*"
- my $definition_body = qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
+ my $definition_body = qr{\{(.*)\}(?:\s*(?:$packed|$aligned|$cacheline_aligned_in_smp|$cacheline_aligned|$attribute))*};
+ my $struct_members = qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};

if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
$decl_type = $1;
@@ -1235,27 +1242,27 @@ sub dump_struct($$) {
# strip comments:
$members =~ s/\/\*.*?\*\///gos;
# strip attributes
- $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
- $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
- $members =~ s/\s*__packed\s*/ /gos;
+ $members =~ s/\s*$attribute/ /gi;
+ $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;
+ $members =~ s/\s*$packed\s*/ /gos;
$members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
- $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
- $members =~ s/\s*____cacheline_aligned/ /gos;
+ $members =~ s/\s*$cacheline_aligned_in_smp/ /gos;
+ $members =~ s/\s*$cacheline_aligned/ /gos;

+ my $args = qr{([^,)]+)};
# replace DECLARE_BITMAP
$members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
- $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
+ $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
# replace DECLARE_HASHTABLE
- $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
+ $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
# replace DECLARE_KFIFO
- $members =~ s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
+ $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2 \*$1/gos;
# replace DECLARE_KFIFO_PTR
- $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
-
+ $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
my $declaration = $members;

# Split nested struct/union elements as newer ones
- while ($members =~ m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
+ while ($members =~ m/$struct_members/) {
my $newmember;
my $maintype = $1;
my $ids = $4;
@@ -1315,7 +1322,7 @@ sub dump_struct($$) {
}
}
}
- $members =~ s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
+ $members =~ s/$struct_members/$newmember/;
}

# Ignore other nested elements, like enums
@@ -1555,8 +1562,9 @@ sub create_parameterlist($$$$) {
my $param;

# temporarily replace commas inside function pointer definition
- while ($args =~ /(\([^\),]+),/) {
- $args =~ s/(\([^\),]+),/$1#/g;
+ my $arg_expr = qr{\([^\),]+};
+ while ($args =~ /$arg_expr,/) {
+ $args =~ s/($arg_expr),/$1#/g;
}

foreach my $arg (split($splitter, $args)) {
@@ -1808,8 +1816,11 @@ sub dump_function($$) {
# - parport_register_device (function pointer parameters)
# - atomic_set (macro)
# - pci_match_device, __copy_to_user (long return type)
+ my $name = qr{[a-zA-Z0-9_~:]+};
+ my $prototype_end1 = qr{\(([^\(]*)\)};
+ my $prototype_end2 = qr{\(([^\{]*)\)};

- if ($define && $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s+/) {
+ if ($define && $prototype =~ m/^()($name)\s+/) {
# This is an object-like macro, it has no return type and no parameter
# list.
# Function-like macros are not allowed to have spaces between
@@ -1817,23 +1828,23 @@ sub dump_function($$) {
$return_type = $1;
$declaration_name = $2;
$noret = 1;
- } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
+ } elsif ($prototype =~ m/^()($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^(\w+)\s+($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^(\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^(\w+\s+\w+)\s+($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
+ $prototype =~ m/^()($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+)\s+($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
+ $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*($name)\s*$prototype_end2/) {
$return_type = $1;
$declaration_name = $2;
my $args = $3;
@@ -2110,12 +2121,12 @@ sub process_name($$) {
} elsif (/$doc_decl/o) {
$identifier = $1;
my $is_kernel_comment = 0;
- my $decl_start = qr{\s*\*};
+ my $decl_start = qr{$doc_com};
# test for pointer declaration type, foo * bar() - desc
my $fn_type = qr{\w+\s*\*\s*};
my $parenthesis = qr{\(\w*\)};
my $decl_end = qr{[-:].*};
- if (/^$decl_start\s*([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
+ if (/^$decl_start([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
$identifier = $1;
}
if ($identifier =~ m/^(struct|union|enum|typedef)\b\s*(\S*)/) {
@@ -2125,8 +2136,8 @@ sub process_name($$) {
}
# Look for foo() or static void foo() - description; or misspelt
# identifier
- elsif (/^$decl_start\s*$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
- /^$decl_start\s*$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
+ elsif (/^$decl_start$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
+ /^$decl_start$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
$identifier = $1;
$decl_type = 'function';
$identifier =~ s/^define\s+//;
--
2.17.1


2021-04-23 12:25:00

by Aditya Srivastava

[permalink] [raw]
Subject: Re: [RFC] scripts: kernel-doc: reduce repeated regex expressions into variables

On 23/4/21 1:03 am, Lukas Bulwahn wrote:
> Aditya Srivastava <[email protected]> schrieb am Do., 22. Apr. 2021,
> 21:18:
>
>> There are some regex expressions in the kernel-doc script, which are used
>> repeatedly in the script.
>>
>> Reduce such expressions into variables, which can be used everywhere.
>>
>> A quick manual check found that no errors and warnings were added/removed
>> in this process.
>>
>> Suggested-by: Jonathan Corbet <[email protected]>
>> Signed-off-by: Aditya Srivastava <[email protected]>
>> ---
>> scripts/kernel-doc | 89 ++++++++++++++++++++++++++--------------------
>> 1 file changed, 50 insertions(+), 39 deletions(-)
>>
>> diff --git a/scripts/kernel-doc b/scripts/kernel-doc
>> index 2a85d34fdcd0..579c9fdd275f 100755
>> --- a/scripts/kernel-doc
>> +++ b/scripts/kernel-doc
>> @@ -406,6 +406,7 @@ my $doc_inline_sect =
>> '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
>> my $doc_inline_end = '^\s*\*/\s*$';
>> my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
>> my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
>> +my $pointer_function = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
>>
>> my %parameterdescs;
>> my %parameterdesc_start_lines;
>> @@ -694,7 +695,7 @@ sub output_function_man(%) {
>> $post = ");";
>> }
>> $type = $args{'parametertypes'}{$parameter};
>> - if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
>> + if ($type =~ m/$pointer_function/) {
>> # pointer-to-function
>> print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" .
>> $post . "\"\n";
>> } else {
>> @@ -974,7 +975,7 @@ sub output_function_rst(%) {
>> $count++;
>> $type = $args{'parametertypes'}{$parameter};
>>
>> - if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
>> + if ($type =~ m/$pointer_function/) {
>> # pointer-to-function
>> print $1 . $parameter . ") (" . $2 . ")";
>> } else {
>> @@ -1210,8 +1211,14 @@ sub dump_struct($$) {
>> my $decl_type;
>> my $members;
>> my $type = qr{struct|union};
>> + my $packed = qr{__packed};
>> + my $aligned = qr{__aligned};
>> + my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
>> + my $cacheline_aligned = qr{____cacheline_aligned};
>> + my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
>> # For capturing struct/union definition body, i.e.
>> "{members*}qualifiers*"
>> - my $definition_body =
>> qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
>> + my $definition_body =
>> qr{\{(.*)\}(?:\s*(?:$packed|$aligned|$cacheline_aligned_in_smp|$cacheline_aligned|$attribute))*};
>> + my $struct_members =
>> qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};
>>
>> if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
>> $decl_type = $1;
>> @@ -1235,27 +1242,27 @@ sub dump_struct($$) {
>> # strip comments:
>> $members =~ s/\/\*.*?\*\///gos;
>> # strip attributes
>> - $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
>> - $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
>> - $members =~ s/\s*__packed\s*/ /gos;
>> + $members =~ s/\s*$attribute/ /gi;
>> + $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;
>> + $members =~ s/\s*$packed\s*/ /gos;
>> $members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
>> - $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
>> - $members =~ s/\s*____cacheline_aligned/ /gos;
>> + $members =~ s/\s*$cacheline_aligned_in_smp/ /gos;
>> + $members =~ s/\s*$cacheline_aligned/ /gos;
>>
>> + my $args = qr{([^,)]+)};
>> # replace DECLARE_BITMAP
>> $members =~
>> s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1,
>> __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
>> - $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned
>> long $1\[BITS_TO_LONGS($2)\]/gos;
>> + $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long
>> $1\[BITS_TO_LONGS($2)\]/gos;
>> # replace DECLARE_HASHTABLE
>> - $members =~
>> s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2)
>> - 1)\]/gos;
>> + $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long
>> $1\[1 << (($2) - 1)\]/gos;
>> # replace DECLARE_KFIFO
>> - $members =~
>> s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
>> + $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2
>> \*$1/gos;
>> # replace DECLARE_KFIFO_PTR
>> - $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2
>> \*$1/gos;
>> -
>> + $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
>> my $declaration = $members;
>>
>> # Split nested struct/union elements as newer ones
>> - while ($members =~
>> m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
>> + while ($members =~ m/$struct_members/) {
>> my $newmember;
>> my $maintype = $1;
>> my $ids = $4;
>> @@ -1315,7 +1322,7 @@ sub dump_struct($$) {
>> }
>> }
>> }
>> - $members =~
>> s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
>> + $members =~ s/$struct_members/$newmember/;
>> }
>>
>> # Ignore other nested elements, like enums
>> @@ -1555,8 +1562,9 @@ sub create_parameterlist($$$$) {
>> my $param;
>>
>> # temporarily replace commas inside function pointer definition
>> - while ($args =~ /(\([^\),]+),/) {
>> - $args =~ s/(\([^\),]+),/$1#/g;
>> + my $arg_expr = qr{\([^\),]+};
>> + while ($args =~ /$arg_expr,/) {
>> + $args =~ s/($arg_expr),/$1#/g;
>> }
>>
>> foreach my $arg (split($splitter, $args)) {
>> @@ -1808,8 +1816,11 @@ sub dump_function($$) {
>> # - parport_register_device (function pointer parameters)
>> # - atomic_set (macro)
>> # - pci_match_device, __copy_to_user (long return type)
>> + my $name = qr{[a-zA-Z0-9_~:]+};
>> + my $prototype_end1 = qr{\(([^\(]*)\)};
>> + my $prototype_end2 = qr{\(([^\{]*)\)};
>>
>
> Why do you need end1 and end2 here?
>

Thanks for pointing out, Lukas. I am looking into the possibility of
combining these expressions, and testing against the files.
Please let me know if there are any more improvements possible :)

Thanks
Aditya

2021-04-23 13:24:15

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [RFC] scripts: kernel-doc: reduce repeated regex expressions into variables

On Fri, Apr 23, 2021 at 12:48:39AM +0530, Aditya Srivastava wrote:
> +my $pointer_function = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};

Is that a pointer-to-function? Or as people who write C usually call it,
a function pointer? Wouldn't it be better to call it $function_pointer?

> @@ -1210,8 +1211,14 @@ sub dump_struct($$) {
> my $decl_type;
> my $members;
> my $type = qr{struct|union};
> + my $packed = qr{__packed};
> + my $aligned = qr{__aligned};
> + my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
> + my $cacheline_aligned = qr{____cacheline_aligned};

I don't think those four definitions actually simplify anything.

> + my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;

... whereas this one definitely does.

> - $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
> - $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
> - $members =~ s/\s*__packed\s*/ /gos;
> + $members =~ s/\s*$attribute/ /gi;
> + $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;

Maybe put the \s*\([^;]*\) into $aligned? Then it becomes a useful
abstraction.

> - } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
> + } elsif ($prototype =~ m/^()($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^(\w+)\s+($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^(\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^(\w+\s+\w+)\s+($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
> + $prototype =~ m/^()($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+)\s+($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
> + $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*($name)\s*$prototype_end2/) {

This is probably the best patch I've seen so far this year.

Now, can we go further? For example:
$prototype_end = $prototype_end1|$prototype_end2
That would let us cut the number of lines here in half.

Can we create a definition for a variable number of \w and \s and '*'
in the return type? In fact, can we define a regex that matches a type?
So this would become:

> + } elsif ($prototype =~ m/^($type)\s*($name)\s*$prototype_end/) {

2021-04-24 11:58:48

by Aditya Srivastava

[permalink] [raw]
Subject: Re: [RFC] scripts: kernel-doc: reduce repeated regex expressions into variables

On 23/4/21 6:51 pm, Matthew Wilcox wrote:
> On Fri, Apr 23, 2021 at 12:48:39AM +0530, Aditya Srivastava wrote:
>> +my $pointer_function = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
>
> Is that a pointer-to-function? Or as people who write C usually call it,
> a function pointer? Wouldn't it be better to call it $function_pointer?
>
Will do it.

>> @@ -1210,8 +1211,14 @@ sub dump_struct($$) {
>> my $decl_type;
>> my $members;
>> my $type = qr{struct|union};
>> + my $packed = qr{__packed};
>> + my $aligned = qr{__aligned};
>> + my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
>> + my $cacheline_aligned = qr{____cacheline_aligned};
>
> I don't think those four definitions actually simplify anything.
>
>> + my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
>
> ... whereas this one definitely does.
>
>> - $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
>> - $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
>> - $members =~ s/\s*__packed\s*/ /gos;
>> + $members =~ s/\s*$attribute/ /gi;
>> + $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;
>
> Maybe put the \s*\([^;]*\) into $aligned? Then it becomes a useful
> abstraction.

Actually, I had made these variables as they were repeated here and at
- my $definition_body =
qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
+ my $definition_body =
qr{\{(.*)\}(?:\s*(?:$packed|$aligned|$cacheline_aligned_in_smp|$cacheline_aligned|$attribute))*};

So, defining them at a place might help.

What do you think?

>
>> - } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
>> - $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
>> - $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
>> + } elsif ($prototype =~ m/^()($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^(\w+)\s+($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^(\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^(\w+\s+\w+)\s+($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end1/ ||
>> + $prototype =~ m/^()($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+)\s+($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*($name)\s*$prototype_end2/ ||
>> + $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*($name)\s*$prototype_end2/) {
>
> This is probably the best patch I've seen so far this year.
>
> Now, can we go further? For example:
> $prototype_end = $prototype_end1|$prototype_end2
> That would let us cut the number of lines here in half.
> > Can we create a definition for a variable number of \w and \s and '*'
> in the return type? In fact, can we define a regex that matches a type?
> So this would become:
>
>> + } elsif ($prototype =~ m/^($type)\s*($name)\s*$prototype_end/) {
>

I have been able to reduce these expressions furthermore. Will send a
v2 in few..

Thanks
Aditya

2021-04-24 12:51:07

by Aditya Srivastava

[permalink] [raw]
Subject: [RFC v2] scripts: kernel-doc: reduce repeated regex expressions into variables

There are some regex expressions in the kernel-doc script, which are used
repeatedly in the script.

Reduce such expressions into variables, which can be used everywhere.

A quick manual check found that no errors and warnings were added/removed
in this process.

Suggested-by: Jonathan Corbet <[email protected]>
Signed-off-by: Aditya Srivastava <[email protected]>
---
Changes in v2:
- Rename $pointer_function to $function_pointer
- Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
- Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end

scripts/kernel-doc | 80 +++++++++++++++++++++++-----------------------
1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 2a85d34fdcd0..fe7f51be44e0 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -406,6 +406,7 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
my $doc_inline_end = '^\s*\*/\s*$';
my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
+my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};

my %parameterdescs;
my %parameterdesc_start_lines;
@@ -694,7 +695,7 @@ sub output_function_man(%) {
$post = ");";
}
$type = $args{'parametertypes'}{$parameter};
- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$function_pointer/) {
# pointer-to-function
print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" . $post . "\"\n";
} else {
@@ -974,7 +975,7 @@ sub output_function_rst(%) {
$count++;
$type = $args{'parametertypes'}{$parameter};

- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$function_pointer/) {
# pointer-to-function
print $1 . $parameter . ") (" . $2 . ")";
} else {
@@ -1210,8 +1211,14 @@ sub dump_struct($$) {
my $decl_type;
my $members;
my $type = qr{struct|union};
+ my $packed = qr{__packed};
+ my $aligned = qr{__aligned};
+ my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
+ my $cacheline_aligned = qr{____cacheline_aligned};
+ my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
# For capturing struct/union definition body, i.e. "{members*}qualifiers*"
- my $definition_body = qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
+ my $definition_body = qr{\{(.*)\}(?:\s*(?:$packed|$aligned|$cacheline_aligned_in_smp|$cacheline_aligned|$attribute))*};
+ my $struct_members = qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};

if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
$decl_type = $1;
@@ -1235,27 +1242,27 @@ sub dump_struct($$) {
# strip comments:
$members =~ s/\/\*.*?\*\///gos;
# strip attributes
- $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
- $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
- $members =~ s/\s*__packed\s*/ /gos;
+ $members =~ s/\s*$attribute/ /gi;
+ $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;
+ $members =~ s/\s*$packed\s*/ /gos;
$members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
- $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
- $members =~ s/\s*____cacheline_aligned/ /gos;
+ $members =~ s/\s*$cacheline_aligned_in_smp/ /gos;
+ $members =~ s/\s*$cacheline_aligned/ /gos;

+ my $args = qr{([^,)]+)};
# replace DECLARE_BITMAP
$members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
- $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
+ $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
# replace DECLARE_HASHTABLE
- $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
+ $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
# replace DECLARE_KFIFO
- $members =~ s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
+ $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2 \*$1/gos;
# replace DECLARE_KFIFO_PTR
- $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
-
+ $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
my $declaration = $members;

# Split nested struct/union elements as newer ones
- while ($members =~ m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
+ while ($members =~ m/$struct_members/) {
my $newmember;
my $maintype = $1;
my $ids = $4;
@@ -1315,7 +1322,7 @@ sub dump_struct($$) {
}
}
}
- $members =~ s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
+ $members =~ s/$struct_members/$newmember/;
}

# Ignore other nested elements, like enums
@@ -1555,8 +1562,9 @@ sub create_parameterlist($$$$) {
my $param;

# temporarily replace commas inside function pointer definition
- while ($args =~ /(\([^\),]+),/) {
- $args =~ s/(\([^\),]+),/$1#/g;
+ my $arg_expr = qr{\([^\),]+};
+ while ($args =~ /$arg_expr,/) {
+ $args =~ s/($arg_expr),/$1#/g;
}

foreach my $arg (split($splitter, $args)) {
@@ -1808,8 +1816,14 @@ sub dump_function($$) {
# - parport_register_device (function pointer parameters)
# - atomic_set (macro)
# - pci_match_device, __copy_to_user (long return type)
-
- if ($define && $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s+/) {
+ my $name = qr{[a-zA-Z0-9_~:]+};
+ my $prototype_end1 = qr{[^\(]*};
+ my $prototype_end2 = qr{[^\{]*};
+ my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};
+ my $type1 = qr{[\w\s]+};
+ my $type2 = qr{$type1\*+};
+
+ if ($define && $prototype =~ m/^()($name)\s+/) {
# This is an object-like macro, it has no return type and no parameter
# list.
# Function-like macros are not allowed to have spaces between
@@ -1817,23 +1831,9 @@ sub dump_function($$) {
$return_type = $1;
$declaration_name = $2;
$noret = 1;
- } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
+ } elsif ($prototype =~ m/^()($name)\s*$prototype_end/ ||
+ $prototype =~ m/^($type1)\s+($name)\s*$prototype_end/ ||
+ $prototype =~ m/^($type2)+\s*($name)\s*$prototype_end/) {
$return_type = $1;
$declaration_name = $2;
my $args = $3;
@@ -2110,12 +2110,12 @@ sub process_name($$) {
} elsif (/$doc_decl/o) {
$identifier = $1;
my $is_kernel_comment = 0;
- my $decl_start = qr{\s*\*};
+ my $decl_start = qr{$doc_com};
# test for pointer declaration type, foo * bar() - desc
my $fn_type = qr{\w+\s*\*\s*};
my $parenthesis = qr{\(\w*\)};
my $decl_end = qr{[-:].*};
- if (/^$decl_start\s*([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
+ if (/^$decl_start([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
$identifier = $1;
}
if ($identifier =~ m/^(struct|union|enum|typedef)\b\s*(\S*)/) {
@@ -2125,8 +2125,8 @@ sub process_name($$) {
}
# Look for foo() or static void foo() - description; or misspelt
# identifier
- elsif (/^$decl_start\s*$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
- /^$decl_start\s*$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
+ elsif (/^$decl_start$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
+ /^$decl_start$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
$identifier = $1;
$decl_type = 'function';
$identifier =~ s/^define\s+//;
--
2.17.1

2021-04-26 17:34:54

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [RFC] scripts: kernel-doc: reduce repeated regex expressions into variables

On Sat, Apr 24, 2021 at 05:27:34PM +0530, Aditya Srivastava wrote:
> On 23/4/21 6:51 pm, Matthew Wilcox wrote:
> > On Fri, Apr 23, 2021 at 12:48:39AM +0530, Aditya Srivastava wrote:
> >> +my $pointer_function = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
> >
> > Is that a pointer-to-function? Or as people who write C usually call it,
> > a function pointer? Wouldn't it be better to call it $function_pointer?
> >
> Will do it.
>
> >> @@ -1210,8 +1211,14 @@ sub dump_struct($$) {
> >> my $decl_type;
> >> my $members;
> >> my $type = qr{struct|union};
> >> + my $packed = qr{__packed};
> >> + my $aligned = qr{__aligned};
> >> + my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
> >> + my $cacheline_aligned = qr{____cacheline_aligned};
> >
> > I don't think those four definitions actually simplify anything.
> >
> >> + my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
> >
> > ... whereas this one definitely does.
> >
> >> - $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
> >> - $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
> >> - $members =~ s/\s*__packed\s*/ /gos;
> >> + $members =~ s/\s*$attribute/ /gi;
> >> + $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;
> >
> > Maybe put the \s*\([^;]*\) into $aligned? Then it becomes a useful
> > abstraction.
>
> Actually, I had made these variables as they were repeated here and at
> - my $definition_body =
> qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
> + my $definition_body =
> qr{\{(.*)\}(?:\s*(?:$packed|$aligned|$cacheline_aligned_in_smp|$cacheline_aligned|$attribute))*};
>
> So, defining them at a place might help.
>
> What do you think?

I don't think that seeing $packed is any easier to read than __packed.
Indeed, I think it's harder, because now I have to look up what $packed
is defined as.

Defining a variable, say

$decorations = qr{__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\))}
(i didn't count brackets to be sure i got that right)

would be helpful because then we could say:

my $definition_body = qr{\{(.*)\}...$decorations...

and have a fighting chance of understanding what it means.

Now, this other place we use it, we do the =~ operation a number of times.
Is there a way to use the $decorations variable to do the same thing
with a single operation?

2021-04-27 16:12:09

by Jonathan Corbet

[permalink] [raw]
Subject: Re: [RFC v2] scripts: kernel-doc: reduce repeated regex expressions into variables

Aditya Srivastava <[email protected]> writes:

> There are some regex expressions in the kernel-doc script, which are used
> repeatedly in the script.
>
> Reduce such expressions into variables, which can be used everywhere.
>
> A quick manual check found that no errors and warnings were added/removed
> in this process.
>
> Suggested-by: Jonathan Corbet <[email protected]>
> Signed-off-by: Aditya Srivastava <[email protected]>

So my comments pretty much mirror Willy's ... there is a lot to really
like here mixed with some stuff that's not as helpful.

> Changes in v2:
> - Rename $pointer_function to $function_pointer
> - Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
> - Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end
>
> scripts/kernel-doc | 80 +++++++++++++++++++++++-----------------------
> 1 file changed, 40 insertions(+), 40 deletions(-)
>
> diff --git a/scripts/kernel-doc b/scripts/kernel-doc
> index 2a85d34fdcd0..fe7f51be44e0 100755
> --- a/scripts/kernel-doc
> +++ b/scripts/kernel-doc
> @@ -406,6 +406,7 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
> my $doc_inline_end = '^\s*\*/\s*$';
> my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
> my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
> +my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
>
> my %parameterdescs;
> my %parameterdesc_start_lines;
> @@ -694,7 +695,7 @@ sub output_function_man(%) {
> $post = ");";
> }
> $type = $args{'parametertypes'}{$parameter};
> - if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
> + if ($type =~ m/$function_pointer/) {

This kind of change is wonderful. The more that we can coalesce this
regex mess and reuse it throughout the script, the more maintainable it
will be.

> # pointer-to-function
> print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" . $post . "\"\n";
> } else {
> @@ -974,7 +975,7 @@ sub output_function_rst(%) {
> $count++;
> $type = $args{'parametertypes'}{$parameter};
>
> - if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
> + if ($type =~ m/$function_pointer/) {
> # pointer-to-function
> print $1 . $parameter . ") (" . $2 . ")";
> } else {
> @@ -1210,8 +1211,14 @@ sub dump_struct($$) {
> my $decl_type;
> my $members;
> my $type = qr{struct|union};
> + my $packed = qr{__packed};
> + my $aligned = qr{__aligned};
> + my $cacheline_aligned_in_smp = qr{____cacheline_aligned_in_smp};
> + my $cacheline_aligned = qr{____cacheline_aligned};
> + my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
> # For capturing struct/union definition body, i.e. "{members*}qualifiers*"
> - my $definition_body = qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
> + my $definition_body = qr{\{(.*)\}(?:\s*(?:$packed|$aligned|$cacheline_aligned_in_smp|$cacheline_aligned|$attribute))*};
> + my $struct_members = qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};

This is a bit less helpful - variables like $aligned don't do much for
us. That can be seen just below:

>
> if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
> $decl_type = $1;
> @@ -1235,27 +1242,27 @@ sub dump_struct($$) {
> # strip comments:
> $members =~ s/\/\*.*?\*\///gos;
> # strip attributes
> - $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
> - $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
> - $members =~ s/\s*__packed\s*/ /gos;
> + $members =~ s/\s*$attribute/ /gi;
> + $members =~ s/\s*$aligned\s*\([^;]*\)/ /gos;
> + $members =~ s/\s*$packed\s*/ /gos;

The use of the variables here doesn't really make those expressions more
readable.

> $members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
> - $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
> - $members =~ s/\s*____cacheline_aligned/ /gos;
> + $members =~ s/\s*$cacheline_aligned_in_smp/ /gos;
> + $members =~ s/\s*$cacheline_aligned/ /gos;
>
> + my $args = qr{([^,)]+)};
> # replace DECLARE_BITMAP
> $members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
> - $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
> + $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;

Here too ... this is the kind of stuff that makes me glad that Colorado
is a legal-weed state, and the new version, while better, doesn't change
that basic fact.

> # replace DECLARE_HASHTABLE
> - $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
> + $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
> # replace DECLARE_KFIFO
> - $members =~ s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
> + $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2 \*$1/gos;
> # replace DECLARE_KFIFO_PTR
> - $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
> -
> + $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
> my $declaration = $members;
>
> # Split nested struct/union elements as newer ones
> - while ($members =~ m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
> + while ($members =~ m/$struct_members/) {

...but this is great.

> my $newmember;
> my $maintype = $1;
> my $ids = $4;
> @@ -1315,7 +1322,7 @@ sub dump_struct($$) {
> }
> }
> }
> - $members =~ s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
> + $members =~ s/$struct_members/$newmember/;
> }
>
> # Ignore other nested elements, like enums
> @@ -1555,8 +1562,9 @@ sub create_parameterlist($$$$) {
> my $param;
>
> # temporarily replace commas inside function pointer definition
> - while ($args =~ /(\([^\),]+),/) {
> - $args =~ s/(\([^\),]+),/$1#/g;
> + my $arg_expr = qr{\([^\),]+};
> + while ($args =~ /$arg_expr,/) {
> + $args =~ s/($arg_expr),/$1#/g;
> }
>
> foreach my $arg (split($splitter, $args)) {
> @@ -1808,8 +1816,14 @@ sub dump_function($$) {
> # - parport_register_device (function pointer parameters)
> # - atomic_set (macro)
> # - pci_match_device, __copy_to_user (long return type)
> -
> - if ($define && $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s+/) {
> + my $name = qr{[a-zA-Z0-9_~:]+};
> + my $prototype_end1 = qr{[^\(]*};
> + my $prototype_end2 = qr{[^\{]*};
> + my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};
> + my $type1 = qr{[\w\s]+};
> + my $type2 = qr{$type1\*+};
> +
> + if ($define && $prototype =~ m/^()($name)\s+/) {
> # This is an object-like macro, it has no return type and no parameter
> # list.
> # Function-like macros are not allowed to have spaces between
> @@ -1817,23 +1831,9 @@ sub dump_function($$) {
> $return_type = $1;
> $declaration_name = $2;
> $noret = 1;
> - } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
> + } elsif ($prototype =~ m/^()($name)\s*$prototype_end/ ||
> + $prototype =~ m/^($type1)\s+($name)\s*$prototype_end/ ||
> + $prototype =~ m/^($type2)+\s*($name)\s*$prototype_end/) {

It's hard not to be happy about something like that - this is a definite
step toward clarifying this code.

I think I'll stop here; hopefully I've gotten my point across. I really
like where this work is heading; focusing just a bit more on pulling the
regexes together and making the whole thing more readable would be
wonderful.

Thanks,

jon

2021-04-27 16:58:22

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [RFC v2] scripts: kernel-doc: reduce repeated regex expressions into variables

On Tue, Apr 27, 2021 at 09:55:35AM -0600, Jonathan Corbet wrote:
> The use of the variables here doesn't really make those expressions more
> readable.
>
> > $members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
> > - $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
> > - $members =~ s/\s*____cacheline_aligned/ /gos;
> > + $members =~ s/\s*$cacheline_aligned_in_smp/ /gos;
> > + $members =~ s/\s*$cacheline_aligned/ /gos;
> >
> > + my $args = qr{([^,)]+)};
> > # replace DECLARE_BITMAP
> > $members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
> > - $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
> > + $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
>
> Here too ... this is the kind of stuff that makes me glad that Colorado
> is a legal-weed state, and the new version, while better, doesn't change
> that basic fact.

I'm going to have to disagree with you on this one (I agree with you on all
the others). I find this much easier to read ...

"DECLARE_BITMAP followed by any amount of whitespace, literal open bracket,
an argument, literal comma, whitespace, another argument, literal close bracket"

Before, I get to "DECLARE_BITMAP followed by any amount of whitespace,
then some line noise".

Obviously I'm less experienced at reading regexes than you are, but this
simplification really does help me.

> I think I'll stop here; hopefully I've gotten my point across. I really
> like where this work is heading; focusing just a bit more on pulling the
> regexes together and making the whole thing more readable would be
> wonderful.

Amen.

2021-04-29 06:39:33

by Aditya Srivastava

[permalink] [raw]
Subject: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

There are some regex expressions in the kernel-doc script, which are used
repeatedly in the script.

Reduce such expressions into variables, which can be used everywhere.

A quick manual check found that no errors and warnings were added/removed
in this process.

Suggested-by: Jonathan Corbet <[email protected]>
Signed-off-by: Aditya Srivastava <[email protected]>
---
Changes in v3:
- Remove variables for separate qualifiers in "sub dump_struct"
- Make a common variable for all the qualifiers
- Make $attribute global variable to use it at "sub check_sections" as well

Changes in v2:
- Rename $pointer_function to $function_pointer
- Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
- Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end

scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
1 file changed, 34 insertions(+), 37 deletions(-)

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 2a85d34fdcd0..721005a02e64 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -406,6 +406,8 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
my $doc_inline_end = '^\s*\*/\s*$';
my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
+my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
+my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;

my %parameterdescs;
my %parameterdesc_start_lines;
@@ -694,7 +696,7 @@ sub output_function_man(%) {
$post = ");";
}
$type = $args{'parametertypes'}{$parameter};
- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$function_pointer/) {
# pointer-to-function
print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" . $post . "\"\n";
} else {
@@ -974,7 +976,7 @@ sub output_function_rst(%) {
$count++;
$type = $args{'parametertypes'}{$parameter};

- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$function_pointer/) {
# pointer-to-function
print $1 . $parameter . ") (" . $2 . ")";
} else {
@@ -1211,7 +1213,9 @@ sub dump_struct($$) {
my $members;
my $type = qr{struct|union};
# For capturing struct/union definition body, i.e. "{members*}qualifiers*"
- my $definition_body = qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
+ my $qualifiers = qr{$attribute|__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned};
+ my $definition_body = qr{\{(.*)\}\s*$qualifiers*};
+ my $struct_members = qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};

if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
$decl_type = $1;
@@ -1235,27 +1239,27 @@ sub dump_struct($$) {
# strip comments:
$members =~ s/\/\*.*?\*\///gos;
# strip attributes
- $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
+ $members =~ s/\s*$attribute/ /gi;
$members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
$members =~ s/\s*__packed\s*/ /gos;
$members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
$members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
$members =~ s/\s*____cacheline_aligned/ /gos;

+ my $args = qr{([^,)]+)};
# replace DECLARE_BITMAP
$members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
- $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
+ $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
# replace DECLARE_HASHTABLE
- $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
+ $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
# replace DECLARE_KFIFO
- $members =~ s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
+ $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2 \*$1/gos;
# replace DECLARE_KFIFO_PTR
- $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
-
+ $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
my $declaration = $members;

# Split nested struct/union elements as newer ones
- while ($members =~ m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
+ while ($members =~ m/$struct_members/) {
my $newmember;
my $maintype = $1;
my $ids = $4;
@@ -1315,7 +1319,7 @@ sub dump_struct($$) {
}
}
}
- $members =~ s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
+ $members =~ s/$struct_members/$newmember/;
}

# Ignore other nested elements, like enums
@@ -1555,8 +1559,9 @@ sub create_parameterlist($$$$) {
my $param;

# temporarily replace commas inside function pointer definition
- while ($args =~ /(\([^\),]+),/) {
- $args =~ s/(\([^\),]+),/$1#/g;
+ my $arg_expr = qr{\([^\),]+};
+ while ($args =~ /$arg_expr,/) {
+ $args =~ s/($arg_expr),/$1#/g;
}

foreach my $arg (split($splitter, $args)) {
@@ -1707,7 +1712,7 @@ sub check_sections($$$$$) {
foreach $px (0 .. $#prms) {
$prm_clean = $prms[$px];
$prm_clean =~ s/\[.*\]//;
- $prm_clean =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i;
+ $prm_clean =~ s/$attribute//i;
# ignore array size in a parameter string;
# however, the original param string may contain
# spaces, e.g.: addr[6 + 2]
@@ -1808,8 +1813,14 @@ sub dump_function($$) {
# - parport_register_device (function pointer parameters)
# - atomic_set (macro)
# - pci_match_device, __copy_to_user (long return type)
-
- if ($define && $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s+/) {
+ my $name = qr{[a-zA-Z0-9_~:]+};
+ my $prototype_end1 = qr{[^\(]*};
+ my $prototype_end2 = qr{[^\{]*};
+ my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};
+ my $type1 = qr{[\w\s]+};
+ my $type2 = qr{$type1\*+};
+
+ if ($define && $prototype =~ m/^()($name)\s+/) {
# This is an object-like macro, it has no return type and no parameter
# list.
# Function-like macros are not allowed to have spaces between
@@ -1817,23 +1828,9 @@ sub dump_function($$) {
$return_type = $1;
$declaration_name = $2;
$noret = 1;
- } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
+ } elsif ($prototype =~ m/^()($name)\s*$prototype_end/ ||
+ $prototype =~ m/^($type1)\s+($name)\s*$prototype_end/ ||
+ $prototype =~ m/^($type2)+\s*($name)\s*$prototype_end/) {
$return_type = $1;
$declaration_name = $2;
my $args = $3;
@@ -2110,12 +2107,12 @@ sub process_name($$) {
} elsif (/$doc_decl/o) {
$identifier = $1;
my $is_kernel_comment = 0;
- my $decl_start = qr{\s*\*};
+ my $decl_start = qr{$doc_com};
# test for pointer declaration type, foo * bar() - desc
my $fn_type = qr{\w+\s*\*\s*};
my $parenthesis = qr{\(\w*\)};
my $decl_end = qr{[-:].*};
- if (/^$decl_start\s*([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
+ if (/^$decl_start([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
$identifier = $1;
}
if ($identifier =~ m/^(struct|union|enum|typedef)\b\s*(\S*)/) {
@@ -2125,8 +2122,8 @@ sub process_name($$) {
}
# Look for foo() or static void foo() - description; or misspelt
# identifier
- elsif (/^$decl_start\s*$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
- /^$decl_start\s*$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
+ elsif (/^$decl_start$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
+ /^$decl_start$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
$identifier = $1;
$decl_type = 'function';
$identifier =~ s/^define\s+//;
--
2.17.1

2021-04-29 23:42:05

by Jonathan Corbet

[permalink] [raw]
Subject: Re: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

Aditya Srivastava <[email protected]> writes:

> There are some regex expressions in the kernel-doc script, which are used
> repeatedly in the script.
>
> Reduce such expressions into variables, which can be used everywhere.
>
> A quick manual check found that no errors and warnings were added/removed
> in this process.
>
> Suggested-by: Jonathan Corbet <[email protected]>
> Signed-off-by: Aditya Srivastava <[email protected]>
> ---
> Changes in v3:
> - Remove variables for separate qualifiers in "sub dump_struct"
> - Make a common variable for all the qualifiers
> - Make $attribute global variable to use it at "sub check_sections" as well
>
> Changes in v2:
> - Rename $pointer_function to $function_pointer
> - Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
> - Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end
>
> scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
> 1 file changed, 34 insertions(+), 37 deletions(-)

So this looks good but ... it adds a warning to the build:

/stuff/k/git/kernel/Documentation/driver-api/media/v4l2-controls:823: ./include/media/v4l2-ctrls.h:964: WARNING: Invalid C declaration: Expected identifier in nested name. [error at 6]
const * v4l2_ctrl_get_menu (u32 id)
------^

So it looks like something isn't being parsed quite identically?

Thanks,

jon

2021-04-30 02:05:38

by Joe Perches

[permalink] [raw]
Subject: Re: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

On Thu, 2021-04-29 at 17:39 -0600, Jonathan Corbet wrote:
> Aditya Srivastava <[email protected]> writes:
>
> > There are some regex expressions in the kernel-doc script, which are used
> > repeatedly in the script.
> >
> > Reduce such expressions into variables, which can be used everywhere.
> >
> > A quick manual check found that no errors and warnings were added/removed
> > in this process.
> >
> > Suggested-by: Jonathan Corbet <[email protected]>
> > Signed-off-by: Aditya Srivastava <[email protected]>
> > ---
> > Changes in v3:
> > - Remove variables for separate qualifiers in "sub dump_struct"
> > - Make a common variable for all the qualifiers
> > - Make $attribute global variable to use it at "sub check_sections" as well
> >
> > Changes in v2:
> > - Rename $pointer_function to $function_pointer
> > - Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
> > - Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end
> >
> > ?scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
> > ?1 file changed, 34 insertions(+), 37 deletions(-)
>
> So this looks good but ... it adds a warning to the build:
>
> /stuff/k/git/kernel/Documentation/driver-api/media/v4l2-controls:823: ./include/media/v4l2-ctrls.h:964: WARNING: Invalid C declaration: Expected identifier in nested name. [error at 6]
> ??const * v4l2_ctrl_get_menu (u32 id)
> ??------^
>
> So it looks like something isn't being parsed quite identically?

Perhaps a few of the regexes from checkpatch could be used or
maybe a linux specific perl module produced.


2021-05-01 09:34:21

by Aditya Srivastava

[permalink] [raw]
Subject: Re: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

On 30/4/21 5:09 am, Jonathan Corbet wrote:
> Aditya Srivastava <[email protected]> writes:
>
>> There are some regex expressions in the kernel-doc script, which are used
>> repeatedly in the script.
>>
>> Reduce such expressions into variables, which can be used everywhere.
>>
>> A quick manual check found that no errors and warnings were added/removed
>> in this process.
>>
>> Suggested-by: Jonathan Corbet <[email protected]>
>> Signed-off-by: Aditya Srivastava <[email protected]>
>> ---
>> Changes in v3:
>> - Remove variables for separate qualifiers in "sub dump_struct"
>> - Make a common variable for all the qualifiers
>> - Make $attribute global variable to use it at "sub check_sections" as well
>>
>> Changes in v2:
>> - Rename $pointer_function to $function_pointer
>> - Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
>> - Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end
>>
>> scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
>> 1 file changed, 34 insertions(+), 37 deletions(-)
>
> So this looks good but ... it adds a warning to the build:
>
> /stuff/k/git/kernel/Documentation/driver-api/media/v4l2-controls:823: ./include/media/v4l2-ctrls.h:964: WARNING: Invalid C declaration: Expected identifier in nested name. [error at 6]
> const * v4l2_ctrl_get_menu (u32 id)
> ------^
>
> So it looks like something isn't being parsed quite identically?
>

Hi Jonathan!
I could not reproduce this error..
Can you suggest me how can I reproduce this error?
I ran kernel-doc -none {$file} over the tree.

Probably, this is not a kernel-doc error

Thanks
Aditya

2021-05-01 15:12:29

by Jonathan Corbet

[permalink] [raw]
Subject: Re: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

Aditya Srivastava <[email protected]> writes:

> On 30/4/21 5:09 am, Jonathan Corbet wrote:
>> Aditya Srivastava <[email protected]> writes:
>>
>>> There are some regex expressions in the kernel-doc script, which are used
>>> repeatedly in the script.
>>>
>>> Reduce such expressions into variables, which can be used everywhere.
>>>
>>> A quick manual check found that no errors and warnings were added/removed
>>> in this process.
>>>
>>> Suggested-by: Jonathan Corbet <[email protected]>
>>> Signed-off-by: Aditya Srivastava <[email protected]>
>>> ---
>>> Changes in v3:
>>> - Remove variables for separate qualifiers in "sub dump_struct"
>>> - Make a common variable for all the qualifiers
>>> - Make $attribute global variable to use it at "sub check_sections" as well
>>>
>>> Changes in v2:
>>> - Rename $pointer_function to $function_pointer
>>> - Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
>>> - Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end
>>>
>>> scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
>>> 1 file changed, 34 insertions(+), 37 deletions(-)
>>
>> So this looks good but ... it adds a warning to the build:
>>
>> /stuff/k/git/kernel/Documentation/driver-api/media/v4l2-controls:823: ./include/media/v4l2-ctrls.h:964: WARNING: Invalid C declaration: Expected identifier in nested name. [error at 6]
>> const * v4l2_ctrl_get_menu (u32 id)
>> ------^
>>
>> So it looks like something isn't being parsed quite identically?
>>
>
> Hi Jonathan!
> I could not reproduce this error..
> Can you suggest me how can I reproduce this error?
> I ran kernel-doc -none {$file} over the tree.
>
> Probably, this is not a kernel-doc error

It's a Sphinx error; run "make htmldocs" to see it. That said, the
error itself should be enough to point at where the problem is.

Thanks,

jon

2021-05-01 15:47:56

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

On Thu, Apr 29, 2021 at 12:07:29PM +0530, Aditya Srivastava wrote:
> + my $name = qr{[a-zA-Z0-9_~:]+};
> + my $prototype_end1 = qr{[^\(]*};
> + my $prototype_end2 = qr{[^\{]*};
> + my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};

Would this be better written as:

my $prototype_end = qr{\([^\(\{]*\)}

And now that I look at the whole thing, doesn't this fail to parse
a function declared as:

int f(void (*g)(long));

(that is, f takes a single argument, which is a pointer to a function
which takes a long argument and returns void)

Still, I don't think this was parsed correctly before, so it's not an
argument against this patch, just something to take care of later.

> + my $type1 = qr{[\w\s]+};
> + my $type2 = qr{$type1\*+};
> +
> + if ($define && $prototype =~ m/^()($name)\s+/) {
> # This is an object-like macro, it has no return type and no parameter
> # list.
> # Function-like macros are not allowed to have spaces between
> @@ -1817,23 +1828,9 @@ sub dump_function($$) {
> $return_type = $1;
> $declaration_name = $2;
> $noret = 1;
> - } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
> + } elsif ($prototype =~ m/^()($name)\s*$prototype_end/ ||
> + $prototype =~ m/^($type1)\s+($name)\s*$prototype_end/ ||
> + $prototype =~ m/^($type2)+\s*($name)\s*$prototype_end/) {
> $return_type = $1;
> $declaration_name = $2;
> my $args = $3;

2021-05-14 22:53:45

by Aditya Srivastava

[permalink] [raw]
Subject: [RFC v4] scripts: kernel-doc: reduce repeated regex expressions into variables

There are some regex expressions in the kernel-doc script, which are used
repeatedly in the script.

Reduce such expressions into variables, which can be used everywhere.

A quick manual check found that no errors and warnings were added/removed
in this process.

Suggested-by: Jonathan Corbet <[email protected]>
Signed-off-by: Aditya Srivastava <[email protected]>
---
Changes in v4:
- Fix htmldocs warning at function parsing, involving repeated $type2 identifiers capture
- Re-tested against all files in kernel tree

Changes in v3:
- Remove variables for separate qualifiers in "sub dump_struct"
- Make a common variable for all the qualifiers
- Make $attribute global variable to use it at "sub check_sections" as well

Changes in v2:
- Rename $pointer_function to $function_pointer
- Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
- Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end

scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
1 file changed, 34 insertions(+), 37 deletions(-)

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 4840e748fca8..7c4a6a507ac4 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -406,6 +406,8 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
my $doc_inline_end = '^\s*\*/\s*$';
my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
+my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
+my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;

my %parameterdescs;
my %parameterdesc_start_lines;
@@ -694,7 +696,7 @@ sub output_function_man(%) {
$post = ");";
}
$type = $args{'parametertypes'}{$parameter};
- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$function_pointer/) {
# pointer-to-function
print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" . $post . "\"\n";
} else {
@@ -974,7 +976,7 @@ sub output_function_rst(%) {
$count++;
$type = $args{'parametertypes'}{$parameter};

- if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
+ if ($type =~ m/$function_pointer/) {
# pointer-to-function
print $1 . $parameter . ") (" . $2 . ")";
} else {
@@ -1211,7 +1213,9 @@ sub dump_struct($$) {
my $members;
my $type = qr{struct|union};
# For capturing struct/union definition body, i.e. "{members*}qualifiers*"
- my $definition_body = qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
+ my $qualifiers = qr{$attribute|__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned};
+ my $definition_body = qr{\{(.*)\}\s*$qualifiers*};
+ my $struct_members = qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};

if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
$decl_type = $1;
@@ -1235,27 +1239,27 @@ sub dump_struct($$) {
# strip comments:
$members =~ s/\/\*.*?\*\///gos;
# strip attributes
- $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
+ $members =~ s/\s*$attribute/ /gi;
$members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
$members =~ s/\s*__packed\s*/ /gos;
$members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
$members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
$members =~ s/\s*____cacheline_aligned/ /gos;

+ my $args = qr{([^,)]+)};
# replace DECLARE_BITMAP
$members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
- $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
+ $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
# replace DECLARE_HASHTABLE
- $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
+ $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
# replace DECLARE_KFIFO
- $members =~ s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
+ $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2 \*$1/gos;
# replace DECLARE_KFIFO_PTR
- $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
-
+ $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
my $declaration = $members;

# Split nested struct/union elements as newer ones
- while ($members =~ m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
+ while ($members =~ m/$struct_members/) {
my $newmember;
my $maintype = $1;
my $ids = $4;
@@ -1315,7 +1319,7 @@ sub dump_struct($$) {
}
}
}
- $members =~ s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
+ $members =~ s/$struct_members/$newmember/;
}

# Ignore other nested elements, like enums
@@ -1555,8 +1559,9 @@ sub create_parameterlist($$$$) {
my $param;

# temporarily replace commas inside function pointer definition
- while ($args =~ /(\([^\),]+),/) {
- $args =~ s/(\([^\),]+),/$1#/g;
+ my $arg_expr = qr{\([^\),]+};
+ while ($args =~ /$arg_expr,/) {
+ $args =~ s/($arg_expr),/$1#/g;
}

foreach my $arg (split($splitter, $args)) {
@@ -1707,7 +1712,7 @@ sub check_sections($$$$$) {
foreach $px (0 .. $#prms) {
$prm_clean = $prms[$px];
$prm_clean =~ s/\[.*\]//;
- $prm_clean =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i;
+ $prm_clean =~ s/$attribute//i;
# ignore array size in a parameter string;
# however, the original param string may contain
# spaces, e.g.: addr[6 + 2]
@@ -1809,8 +1814,14 @@ sub dump_function($$) {
# - parport_register_device (function pointer parameters)
# - atomic_set (macro)
# - pci_match_device, __copy_to_user (long return type)
-
- if ($define && $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s+/) {
+ my $name = qr{[a-zA-Z0-9_~:]+};
+ my $prototype_end1 = qr{[^\(]*};
+ my $prototype_end2 = qr{[^\{]*};
+ my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};
+ my $type1 = qr{[\w\s]+};
+ my $type2 = qr{$type1\*+};
+
+ if ($define && $prototype =~ m/^()($name)\s+/) {
# This is an object-like macro, it has no return type and no parameter
# list.
# Function-like macros are not allowed to have spaces between
@@ -1818,23 +1829,9 @@ sub dump_function($$) {
$return_type = $1;
$declaration_name = $2;
$noret = 1;
- } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
- $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
- $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
+ } elsif ($prototype =~ m/^()($name)\s*$prototype_end/ ||
+ $prototype =~ m/^($type1)\s+($name)\s*$prototype_end/ ||
+ $prototype =~ m/^($type2+)\s*($name)\s*$prototype_end/) {
$return_type = $1;
$declaration_name = $2;
my $args = $3;
@@ -2111,12 +2108,12 @@ sub process_name($$) {
} elsif (/$doc_decl/o) {
$identifier = $1;
my $is_kernel_comment = 0;
- my $decl_start = qr{\s*\*};
+ my $decl_start = qr{$doc_com};
# test for pointer declaration type, foo * bar() - desc
my $fn_type = qr{\w+\s*\*\s*};
my $parenthesis = qr{\(\w*\)};
my $decl_end = qr{[-:].*};
- if (/^$decl_start\s*([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
+ if (/^$decl_start([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
$identifier = $1;
}
if ($identifier =~ m/^(struct|union|enum|typedef)\b\s*(\S*)/) {
@@ -2126,8 +2123,8 @@ sub process_name($$) {
}
# Look for foo() or static void foo() - description; or misspelt
# identifier
- elsif (/^$decl_start\s*$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
- /^$decl_start\s*$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
+ elsif (/^$decl_start$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
+ /^$decl_start$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
$identifier = $1;
$decl_type = 'function';
$identifier =~ s/^define\s+//;
--
2.17.1


2021-05-14 23:06:50

by Aditya Srivastava

[permalink] [raw]
Subject: Re: [RFC v4] scripts: kernel-doc: reduce repeated regex expressions into variables

On 14/5/21 8:12 pm, Aditya Srivastava wrote:
> There are some regex expressions in the kernel-doc script, which are used
> repeatedly in the script.
>
> Reduce such expressions into variables, which can be used everywhere.
>
> A quick manual check found that no errors and warnings were added/removed
> in this process.
>
> Suggested-by: Jonathan Corbet <[email protected]>
> Signed-off-by: Aditya Srivastava <[email protected]>
> ---
> Changes in v4:
> - Fix htmldocs warning at function parsing, involving repeated $type2 identifiers capture
> - Re-tested against all files in kernel tree
>
> Changes in v3:
> - Remove variables for separate qualifiers in "sub dump_struct"
> - Make a common variable for all the qualifiers
> - Make $attribute global variable to use it at "sub check_sections" as well
>
> Changes in v2:
> - Rename $pointer_function to $function_pointer
> - Combine elsif-block expressions at "sub dump_function" into lesser regex expressions
> - Combine $prototype_end1,$prototype_end2 expressions into a common $prototype_end
>
> scripts/kernel-doc | 71 ++++++++++++++++++++++------------------------
> 1 file changed, 34 insertions(+), 37 deletions(-)
>
> diff --git a/scripts/kernel-doc b/scripts/kernel-doc
> index 4840e748fca8..7c4a6a507ac4 100755
> --- a/scripts/kernel-doc
> +++ b/scripts/kernel-doc
> @@ -406,6 +406,8 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)';
> my $doc_inline_end = '^\s*\*/\s*$';
> my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$';
> my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;';
> +my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)};
> +my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i;
>
> my %parameterdescs;
> my %parameterdesc_start_lines;
> @@ -694,7 +696,7 @@ sub output_function_man(%) {
> $post = ");";
> }
> $type = $args{'parametertypes'}{$parameter};
> - if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
> + if ($type =~ m/$function_pointer/) {
> # pointer-to-function
> print ".BI \"" . $parenth . $1 . "\" " . " \") (" . $2 . ")" . $post . "\"\n";
> } else {
> @@ -974,7 +976,7 @@ sub output_function_rst(%) {
> $count++;
> $type = $args{'parametertypes'}{$parameter};
>
> - if ($type =~ m/([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)/) {
> + if ($type =~ m/$function_pointer/) {
> # pointer-to-function
> print $1 . $parameter . ") (" . $2 . ")";
> } else {
> @@ -1211,7 +1213,9 @@ sub dump_struct($$) {
> my $members;
> my $type = qr{struct|union};
> # For capturing struct/union definition body, i.e. "{members*}qualifiers*"
> - my $definition_body = qr{\{(.*)\}(?:\s*(?:__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned|__attribute__\s*\(\([a-z0-9,_\s\(\)]*\)\)))*};
> + my $qualifiers = qr{$attribute|__packed|__aligned|____cacheline_aligned_in_smp|____cacheline_aligned};
> + my $definition_body = qr{\{(.*)\}\s*$qualifiers*};
> + my $struct_members = qr{($type)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;};
>
> if ($x =~ /($type)\s+(\w+)\s*$definition_body/) {
> $decl_type = $1;
> @@ -1235,27 +1239,27 @@ sub dump_struct($$) {
> # strip comments:
> $members =~ s/\/\*.*?\*\///gos;
> # strip attributes
> - $members =~ s/\s*__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)/ /gi;
> + $members =~ s/\s*$attribute/ /gi;
> $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
> $members =~ s/\s*__packed\s*/ /gos;
> $members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
> $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
> $members =~ s/\s*____cacheline_aligned/ /gos;
>
> + my $args = qr{([^,)]+)};
> # replace DECLARE_BITMAP
> $members =~ s/__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)/DECLARE_BITMAP($1, __ETHTOOL_LINK_MODE_MASK_NBITS)/gos;
> - $members =~ s/DECLARE_BITMAP\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
> + $members =~ s/DECLARE_BITMAP\s*\($args,\s*$args\)/unsigned long $1\[BITS_TO_LONGS($2)\]/gos;
> # replace DECLARE_HASHTABLE
> - $members =~ s/DECLARE_HASHTABLE\s*\(([^,)]+),\s*([^,)]+)\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
> + $members =~ s/DECLARE_HASHTABLE\s*\($args,\s*$args\)/unsigned long $1\[1 << (($2) - 1)\]/gos;
> # replace DECLARE_KFIFO
> - $members =~ s/DECLARE_KFIFO\s*\(([^,)]+),\s*([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
> + $members =~ s/DECLARE_KFIFO\s*\($args,\s*$args,\s*$args\)/$2 \*$1/gos;
> # replace DECLARE_KFIFO_PTR
> - $members =~ s/DECLARE_KFIFO_PTR\s*\(([^,)]+),\s*([^,)]+)\)/$2 \*$1/gos;
> -
> + $members =~ s/DECLARE_KFIFO_PTR\s*\($args,\s*$args\)/$2 \*$1/gos;
> my $declaration = $members;
>
> # Split nested struct/union elements as newer ones
> - while ($members =~ m/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/) {
> + while ($members =~ m/$struct_members/) {
> my $newmember;
> my $maintype = $1;
> my $ids = $4;
> @@ -1315,7 +1319,7 @@ sub dump_struct($$) {
> }
> }
> }
> - $members =~ s/(struct|union)([^\{\};]+)\{([^\{\}]*)\}([^\{\}\;]*)\;/$newmember/;
> + $members =~ s/$struct_members/$newmember/;
> }
>
> # Ignore other nested elements, like enums
> @@ -1555,8 +1559,9 @@ sub create_parameterlist($$$$) {
> my $param;
>
> # temporarily replace commas inside function pointer definition
> - while ($args =~ /(\([^\),]+),/) {
> - $args =~ s/(\([^\),]+),/$1#/g;
> + my $arg_expr = qr{\([^\),]+};
> + while ($args =~ /$arg_expr,/) {
> + $args =~ s/($arg_expr),/$1#/g;
> }
>
> foreach my $arg (split($splitter, $args)) {
> @@ -1707,7 +1712,7 @@ sub check_sections($$$$$) {
> foreach $px (0 .. $#prms) {
> $prm_clean = $prms[$px];
> $prm_clean =~ s/\[.*\]//;
> - $prm_clean =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i;
> + $prm_clean =~ s/$attribute//i;
> # ignore array size in a parameter string;
> # however, the original param string may contain
> # spaces, e.g.: addr[6 + 2]
> @@ -1809,8 +1814,14 @@ sub dump_function($$) {
> # - parport_register_device (function pointer parameters)
> # - atomic_set (macro)
> # - pci_match_device, __copy_to_user (long return type)
> -
> - if ($define && $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s+/) {
> + my $name = qr{[a-zA-Z0-9_~:]+};
> + my $prototype_end1 = qr{[^\(]*};
> + my $prototype_end2 = qr{[^\{]*};
> + my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};
> + my $type1 = qr{[\w\s]+};
> + my $type2 = qr{$type1\*+};
> +
> + if ($define && $prototype =~ m/^()($name)\s+/) {
> # This is an object-like macro, it has no return type and no parameter
> # list.
> # Function-like macros are not allowed to have spaces between
> @@ -1818,23 +1829,9 @@ sub dump_function($$) {
> $return_type = $1;
> $declaration_name = $2;
> $noret = 1;
> - } elsif ($prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\(]*)\)/ ||
> - $prototype =~ m/^()([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+)\s+([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s+\w+\s+\w+\s*\*+)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/ ||
> - $prototype =~ m/^(\w+\s+\w+\s*\*+\s*\w+\s*\*+\s*)\s*([a-zA-Z0-9_~:]+)\s*\(([^\{]*)\)/) {
> + } elsif ($prototype =~ m/^()($name)\s*$prototype_end/ ||
> + $prototype =~ m/^($type1)\s+($name)\s*$prototype_end/ ||
> + $prototype =~ m/^($type2+)\s*($name)\s*$prototype_end/) {
> $return_type = $1;
> $declaration_name = $2;
> my $args = $3;
> @@ -2111,12 +2108,12 @@ sub process_name($$) {
> } elsif (/$doc_decl/o) {
> $identifier = $1;
> my $is_kernel_comment = 0;
> - my $decl_start = qr{\s*\*};
> + my $decl_start = qr{$doc_com};
> # test for pointer declaration type, foo * bar() - desc
> my $fn_type = qr{\w+\s*\*\s*};
> my $parenthesis = qr{\(\w*\)};
> my $decl_end = qr{[-:].*};
> - if (/^$decl_start\s*([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
> + if (/^$decl_start([\w\s]+?)$parenthesis?\s*$decl_end?$/) {
> $identifier = $1;
> }
> if ($identifier =~ m/^(struct|union|enum|typedef)\b\s*(\S*)/) {
> @@ -2126,8 +2123,8 @@ sub process_name($$) {
> }
> # Look for foo() or static void foo() - description; or misspelt
> # identifier
> - elsif (/^$decl_start\s*$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
> - /^$decl_start\s*$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
> + elsif (/^$decl_start$fn_type?(\w+)\s*$parenthesis?\s*$decl_end?$/ ||
> + /^$decl_start$fn_type?(\w+.*)$parenthesis?\s*$decl_end$/) {
> $identifier = $1;
> $decl_type = 'function';
> $identifier =~ s/^define\s+//;
>

Hi Jonathan!
The warning you mentioned was not showing to me on running "make
htmldocs", for some reason.. As a result, I haven't been able to test
the patch for this warning.. However, I understood the reason for the
error.
It was in this line:
> + $prototype =~ m/^($type2)+\s*($name)\s*$prototype_end/) {

Here, $1 was taking only the last captured value, instead of all the
occurrences, as was desired by me.

Just for reference, these were the warnings which I was getting:
https://github.com/AdityaSrivast/kernel-tasks/blob/master/random/kernel-doc/htmldocs_msgs

Thanks
Aditya

2021-05-15 01:45:52

by Aditya Srivastava

[permalink] [raw]
Subject: Re: [RFC v3] scripts: kernel-doc: reduce repeated regex expressions into variables

On 1/5/21 9:13 pm, Matthew Wilcox wrote:
> On Thu, Apr 29, 2021 at 12:07:29PM +0530, Aditya Srivastava wrote:
>> + my $name = qr{[a-zA-Z0-9_~:]+};
>> + my $prototype_end1 = qr{[^\(]*};
>> + my $prototype_end2 = qr{[^\{]*};
>> + my $prototype_end = qr{\(($prototype_end1|$prototype_end2)\)};
>
> Would this be better written as:
>
> my $prototype_end = qr{\([^\(\{]*\)}
>

Hi Matthew
I have actually tried this earlier, but it does not work as expected,
probably because of greedy matching. I have produced the list of
warning differences before and after over the files, when using this
regex:
https://github.com/AdityaSrivast/kernel-tasks/blob/master/random/kernel-doc/diff_on_alt_protend


> And now that I look at the whole thing, doesn't this fail to parse
> a function declared as:
>
> int f(void (*g)(long));
>
> (that is, f takes a single argument, which is a pointer to a function
> which takes a long argument and returns void)
>

I think this will match against:
$prototype =~ m/^($type1)\s+($name)\s*$prototype_end/

Thanks
Aditya



2021-05-18 21:14:32

by Jonathan Corbet

[permalink] [raw]
Subject: Re: [RFC v4] scripts: kernel-doc: reduce repeated regex expressions into variables

Aditya Srivastava <[email protected]> writes:

> There are some regex expressions in the kernel-doc script, which are used
> repeatedly in the script.
>
> Reduce such expressions into variables, which can be used everywhere.
>
> A quick manual check found that no errors and warnings were added/removed
> in this process.
>
> Suggested-by: Jonathan Corbet <[email protected]>
> Signed-off-by: Aditya Srivastava <[email protected]>
> ---
> Changes in v4:
> - Fix htmldocs warning at function parsing, involving repeated $type2 identifiers capture
> - Re-tested against all files in kernel tree

Applied, thanks for stickint with this.

jon