#!/usr/bin/env perl # # Copyright (c) 2021-2022 Eric Sunshine # # This tool scans shell scripts for test definitions and checks those tests for # problems, such as broken &&-chains, which might hide bugs in the tests # themselves or in behaviors being exercised by the tests. # # Input arguments are pathnames of shell scripts containing test definitions, # or globs referencing a collection of scripts. For each problem discovered, # the pathname of the script containing the test is printed along with the test # name and the test body with a `?!FOO?!` annotation at the location of each # detected problem, where "FOO" is a tag such as "AMP" which indicates a broken # &&-chain. Returns zero if no problems are discovered, otherwise non-zero. use warnings; use strict; use Config; use File::Glob; use Getopt::Long; my $jobs = -1; my $show_stats; my $emit_all; # Lexer tokenizes POSIX shell scripts. It is roughly modeled after section 2.3 # "Token Recognition" of POSIX chapter 2 "Shell Command Language". Although # similar to lexical analyzers for other languages, this one differs in a few # substantial ways due to quirks of the shell command language. # # For instance, in many languages, newline is just whitespace like space or # TAB, but in shell a newline is a command separator, thus a distinct lexical # token. A newline is significant and returned as a distinct token even at the # end of a shell comment. # # In other languages, `1+2` would typically be scanned as three tokens # (`1`, `+`, and `2`), but in shell it is a single token. However, the similar # `1 + 2`, which embeds whitepace, is scanned as three token in shell, as well. # In shell, several characters with special meaning lose that meaning when not # surrounded by whitespace. For instance, the negation operator `!` is special # when standing alone surrounded by whitespace; whereas in `foo!uucp` it is # just a plain character in the longer token "foo!uucp". In many other # languages, `"string"/foo:'string'` might be scanned as five tokens ("string", # `/`, `foo`, `:`, and 'string'), but in shell, it is just a single token. # # The lexical analyzer for the shell command language is also somewhat unusual # in that it recursively invokes the parser to handle the body of `$(...)` # expressions which can contain arbitrary shell code. Such expressions may be # encountered both inside and outside of double-quoted strings. # # The lexical analyzer is responsible for consuming shell here-doc bodies which # extend from the line following a `< $parser, buff => $s, heretags => [] } => $class; } sub scan_heredoc_tag { my $self = shift @_; ${$self->{buff}} =~ /\G(-?)/gc; my $indented = $1; my $tag = $self->scan_token(); $tag =~ s/['"\\]//g; push(@{$self->{heretags}}, $indented ? "\t$tag" : "$tag"); return "<<$indented$tag"; } sub scan_op { my ($self, $c) = @_; my $b = $self->{buff}; return $c unless $$b =~ /\G(.)/sgc; my $cc = $c . $1; return scan_heredoc_tag($self) if $cc eq '<<'; return $cc if $cc =~ /^(?:&&|\|\||>>|;;|<&|>&|<>|>\|)$/; pos($$b)--; return $c; } sub scan_sqstring { my $self = shift @_; ${$self->{buff}} =~ /\G([^']*'|.*\z)/sgc; return "'" . $1; } sub scan_dqstring { my $self = shift @_; my $b = $self->{buff}; my $s = '"'; while (1) { # slurp up non-special characters $s .= $1 if $$b =~ /\G([^"\$\\]+)/gc; # handle special characters last unless $$b =~ /\G(.)/sgc; my $c = $1; $s .= '"', last if $c eq '"'; $s .= '$' . $self->scan_dollar(), next if $c eq '$'; if ($c eq '\\') { $s .= '\\', last unless $$b =~ /\G(.)/sgc; $c = $1; next if $c eq "\n"; # line splice # backslash escapes only $, `, ", \ in dq-string $s .= '\\' unless $c =~ /^[\$`"\\]$/; $s .= $c; next; } die("internal error scanning dq-string '$c'\n"); } return $s; } sub scan_balanced { my ($self, $c1, $c2) = @_; my $b = $self->{buff}; my $depth = 1; my $s = $c1; while ($$b =~ /\G([^\Q$c1$c2\E]*(?:[\Q$c1$c2\E]|\z))/gc) { $s .= $1; $depth++, next if $s =~ /\Q$c1\E$/; $depth--; last if $depth == 0; } return $s; } sub scan_subst { my $self = shift @_; my @tokens = $self->{parser}->parse(qr/^\)$/); $self->{parser}->next_token(); # closing ")" return @tokens; } sub scan_dollar { my $self = shift @_; my $b = $self->{buff}; return $self->scan_balanced('(', ')') if $$b =~ /\G\((?=\()/gc; # $((...)) return '(' . join(' ', $self->scan_subst()) . ')' if $$b =~ /\G\(/gc; # $(...) return $self->scan_balanced('{', '}') if $$b =~ /\G\{/gc; # ${...} return $1 if $$b =~ /\G(\w+)/gc; # $var return $1 if $$b =~ /\G([@*#?$!0-9-])/gc; # $*, $1, $$, etc. return ''; } sub swallow_heredocs { my $self = shift @_; my $b = $self->{buff}; my $tags = $self->{heretags}; while (my $tag = shift @$tags) { my $indent = $tag =~ s/^\t// ? '\\s*' : ''; $$b =~ /(?:\G|\n)$indent\Q$tag\E(?:\n|\z)/gc; } } sub scan_token { my $self = shift @_; my $b = $self->{buff}; my $token = ''; RESTART: $$b =~ /\G[ \t]+/gc; # skip whitespace (but not newline) return "\n" if $$b =~ /\G#[^\n]*(?:\n|\z)/gc; # comment while (1) { # slurp up non-special characters $token .= $1 if $$b =~ /\G([^\\;&|<>(){}'"\$\s]+)/gc; # handle special characters last unless $$b =~ /\G(.)/sgc; my $c = $1; last if $c =~ /^[ \t]$/; # whitespace ends token pos($$b)--, last if length($token) && $c =~ /^[;&|<>(){}\n]$/; $token .= $self->scan_sqstring(), next if $c eq "'"; $token .= $self->scan_dqstring(), next if $c eq '"'; $token .= $c . $self->scan_dollar(), next if $c eq '$'; $self->swallow_heredocs(), $token = $c, last if $c eq "\n"; $token = $self->scan_op($c), last if $c =~ /^[;&|<>]$/; $token = $c, last if $c =~ /^[(){}]$/; if ($c eq '\\') { $token .= '\\', last unless $$b =~ /\G(.)/sgc; $c = $1; next if $c eq "\n" && length($token); # line splice goto RESTART if $c eq "\n"; # line splice $token .= '\\' . $c; next; } die("internal error scanning character '$c'\n"); } return length($token) ? $token : undef; } # ShellParser parses POSIX shell scripts (with minor extensions for Bash). It # is a recursive descent parser very roughly modeled after section 2.10 "Shell # Grammar" of POSIX chapter 2 "Shell Command Language". package ShellParser; sub new { my ($class, $s) = @_; my $self = bless { buff => [], stop => [], output => [] } => $class; $self->{lexer} = Lexer->new($self, $s); return $self; } sub next_token { my $self = shift @_; return pop(@{$self->{buff}}) if @{$self->{buff}}; return $self->{lexer}->scan_token(); } sub untoken { my $self = shift @_; push(@{$self->{buff}}, @_); } sub peek { my $self = shift @_; my $token = $self->next_token(); return undef unless defined($token); $self->untoken($token); return $token; } sub stop_at { my ($self, $token) = @_; return 1 unless defined($token); my $stop = ${$self->{stop}}[-1] if @{$self->{stop}}; return defined($stop) && $token =~ $stop; } sub expect { my ($self, $expect) = @_; my $token = $self->next_token(); return $token if defined($token) && $token eq $expect; push(@{$self->{output}}, "?!ERR?! expected '$expect' but found '" . (defined($token) ? $token : "") . "'\n"); $self->untoken($token) if defined($token); return (); } sub optional_newlines { my $self = shift @_; my @tokens; while (my $token = $self->peek()) { last unless $token eq "\n"; push(@tokens, $self->next_token()); } return @tokens; } sub parse_group { my $self = shift @_; return ($self->parse(qr/^}$/), $self->expect('}')); } sub parse_subshell { my $self = shift @_; return ($self->parse(qr/^\)$/), $self->expect(')')); } sub parse_case_pattern { my $self = shift @_; my @tokens; while (defined(my $token = $self->next_token())) { push(@tokens, $token); last if $token eq ')'; } return @tokens; } sub parse_case { my $self = shift @_; my @tokens; push(@tokens, $self->next_token(), # subject $self->optional_newlines(), $self->expect('in'), $self->optional_newlines()); while (1) { my $token = $self->peek(); last unless defined($token) && $token ne 'esac'; push(@tokens, $self->parse_case_pattern(), $self->optional_newlines(), $self->parse(qr/^(?:;;|esac)$/)); # item body $token = $self->peek(); last unless defined($token) && $token ne 'esac'; push(@tokens, $self->expect(';;'), $self->optional_newlines()); } push(@tokens, $self->expect('esac')); return @tokens; } sub parse_for { my $self = shift @_; my @tokens; push(@tokens, $self->next_token(), # variable $self->optional_newlines()); my $token = $self->peek(); if (defined($token) && $token eq 'in') { push(@tokens, $self->expect('in'), $self->optional_newlines()); } push(@tokens, $self->parse(qr/^do$/), # items $self->expect('do'), $self->optional_newlines(), $self->parse_loop_body(), $self->expect('done')); return @tokens; } sub parse_if { my $self = shift @_; my @tokens; while (1) { push(@tokens, $self->parse(qr/^then$/), # if/elif condition $self->expect('then'), $self->optional_newlines(), $self->parse(qr/^(?:elif|else|fi)$/)); # if/elif body my $token = $self->peek(); last unless defined($token) && $token eq 'elif'; push(@tokens, $self->expect('elif')); } my $token = $self->peek(); if (defined($token) && $token eq 'else') { push(@tokens, $self->expect('else'), $self->optional_newlines(), $self->parse(qr/^fi$/)); # else body } push(@tokens, $self->expect('fi')); return @tokens; } sub parse_loop_body { my $self = shift @_; return $self->parse(qr/^done$/); } sub parse_loop { my $self = shift @_; return ($self->parse(qr/^do$/), # condition $self->expect('do'), $self->optional_newlines(), $self->parse_loop_body(), $self->expect('done')); } sub parse_func { my $self = shift @_; return ($self->expect('('), $self->expect(')'), $self->optional_newlines(), $self->parse_cmd()); # body } sub parse_bash_array_assignment { my $self = shift @_; my @tokens = $self->expect('('); while (defined(my $token = $self->next_token())) { push(@tokens, $token); last if $token eq ')'; } return @tokens; } my %compound = ( '{' => \&parse_group, '(' => \&parse_subshell, 'case' => \&parse_case, 'for' => \&parse_for, 'if' => \&parse_if, 'until' => \&parse_loop, 'while' => \&parse_loop); sub parse_cmd { my $self = shift @_; my $cmd = $self->next_token(); return () unless defined($cmd); return $cmd if $cmd eq "\n"; my $token; my @tokens = $cmd; if ($cmd eq '!') { push(@tokens, $self->parse_cmd()); return @tokens; } elsif (my $f = $compound{$cmd}) { push(@tokens, $self->$f()); } elsif (defined($token = $self->peek()) && $token eq '(') { if ($cmd !~ /\w=$/) { push(@tokens, $self->parse_func()); return @tokens; } $tokens[-1] .= join(' ', $self->parse_bash_array_assignment()); } while (defined(my $token = $self->next_token())) { $self->untoken($token), last if $self->stop_at($token); push(@tokens, $token); last if $token =~ /^(?:[;&\n|]|&&|\|\|)$/; } push(@tokens, $self->next_token()) if $tokens[-1] ne "\n" && defined($token = $self->peek()) && $token eq "\n"; return @tokens; } sub accumulate { my ($self, $tokens, $cmd) = @_; push(@$tokens, @$cmd); } sub parse { my ($self, $stop) = @_; push(@{$self->{stop}}, $stop); goto DONE if $self->stop_at($self->peek()); my @tokens; while (my @cmd = $self->parse_cmd()) { $self->accumulate(\@tokens, \@cmd); last if $self->stop_at($self->peek()); } DONE: pop(@{$self->{stop}}); return @tokens; } # TestParser is a subclass of ShellParser which, beyond parsing shell script # code, is also imbued with semantic knowledge of test construction, and checks # tests for common problems (such as broken &&-chains) which might hide bugs in # the tests themselves or in behaviors being exercised by the tests. As such, # TestParser is only called upon to parse test bodies, not the top-level # scripts in which the tests are defined. package TestParser; use base 'ShellParser'; sub find_non_nl { my $tokens = shift @_; my $n = shift @_; $n = $#$tokens if !defined($n); $n-- while $n >= 0 && $$tokens[$n] eq "\n"; return $n; } sub ends_with { my ($tokens, $needles) = @_; my $n = find_non_nl($tokens); for my $needle (reverse(@$needles)) { return undef if $n < 0; $n = find_non_nl($tokens, $n), next if $needle eq "\n"; return undef if $$tokens[$n] !~ $needle; $n--; } return 1; } sub match_ending { my ($tokens, $endings) = @_; for my $needles (@$endings) { next if @$tokens < scalar(grep {$_ ne "\n"} @$needles); return 1 if ends_with($tokens, $needles); } return undef; } sub parse_loop_body { my $self = shift @_; my @tokens = $self->SUPER::parse_loop_body(@_); # did loop signal failure via "|| return" or "|| exit"? return @tokens if !@tokens || grep(/^(?:return|exit|\$\?)$/, @tokens); # did loop upstream of a pipe signal failure via "|| echo 'impossible # text'" as the final command in the loop body? return @tokens if ends_with(\@tokens, [qr/^\|\|$/, "\n", qr/^echo$/, qr/^.+$/]); # flag missing "return/exit" handling explicit failure in loop body my $n = find_non_nl(\@tokens); splice(@tokens, $n + 1, 0, '?!LOOP?!'); return @tokens; } my @safe_endings = ( [qr/^(?:&&|\|\||\||&)$/], [qr/^(?:exit|return)$/, qr/^(?:\d+|\$\?)$/], [qr/^(?:exit|return)$/, qr/^(?:\d+|\$\?)$/, qr/^;$/], [qr/^(?:exit|return|continue)$/], [qr/^(?:exit|return|continue)$/, qr/^;$/]); sub accumulate { my ($self, $tokens, $cmd) = @_; goto DONE unless @$tokens; goto DONE if @$cmd == 1 && $$cmd[0] eq "\n"; # did previous command end with "&&", "|", "|| return" or similar? goto DONE if match_ending($tokens, \@safe_endings); # if this command handles "$?" specially, then okay for previous # command to be missing "&&" for my $token (@$cmd) { goto DONE if $token =~ /\$\?/; } # if this command is "false", "return 1", or "exit 1" (which signal # failure explicitly), then okay for all preceding commands to be # missing "&&" if ($$cmd[0] =~ /^(?:false|return|exit)$/) { @$tokens = grep(!/^\?!AMP\?!$/, @$tokens); goto DONE; } # flag missing "&&" at end of previous command my $n = find_non_nl($tokens); splice(@$tokens, $n + 1, 0, '?!AMP?!') unless $n < 0; DONE: $self->SUPER::accumulate($tokens, $cmd); } # ScriptParser is a subclass of ShellParser which identifies individual test # definitions within test scripts, and passes each test body through TestParser # to identify possible problems. ShellParser detects test definitions not only # at the top-level of test scripts but also within compound commands such as # loops and function definitions. package ScriptParser; use base 'ShellParser'; sub new { my $class = shift @_; my $self = $class->SUPER::new(@_); $self->{ntests} = 0; return $self; } # extract the raw content of a token, which may be a single string or a # composition of multiple strings and non-string character runs; for instance, # `"test body"` unwraps to `test body`; `word"a b"42'c d'` to `worda b42c d` sub unwrap { my $token = @_ ? shift @_ : $_; # simple case: 'sqstring' or "dqstring" return $token if $token =~ s/^'([^']*)'$/$1/; return $token if $token =~ s/^"([^"]*)"$/$1/; # composite case my ($s, $q, $escaped); while (1) { # slurp up non-special characters $s .= $1 if $token =~ /\G([^\\'"]*)/gc; # handle special characters last unless $token =~ /\G(.)/sgc; my $c = $1; $q = undef, next if defined($q) && $c eq $q; $q = $c, next if !defined($q) && $c =~ /^['"]$/; if ($c eq '\\') { last unless $token =~ /\G(.)/sgc; $c = $1; $s .= '\\' if $c eq "\n"; # preserve line splice } $s .= $c; } return $s } sub check_test { my $self = shift @_; my ($title, $body) = map(unwrap, @_); $self->{ntests}++; my $parser = TestParser->new(\$body); my @tokens = $parser->parse(); return unless $emit_all || grep(/\?![^?]+\?!/, @tokens); my $checked = join(' ', @tokens); $checked =~ s/^\n//; $checked =~ s/^ //mg; $checked =~ s/ $//mg; $checked .= "\n" unless $checked =~ /\n$/; push(@{$self->{output}}, "# chainlint: $title\n$checked"); } sub parse_cmd { my $self = shift @_; my @tokens = $self->SUPER::parse_cmd(); return @tokens unless @tokens && $tokens[0] =~ /^test_expect_(?:success|failure)$/; my $n = $#tokens; $n-- while $n >= 0 && $tokens[$n] =~ /^(?:[;&\n|]|&&|\|\|)$/; $self->check_test($tokens[1], $tokens[2]) if $n == 2; # title body $self->check_test($tokens[2], $tokens[3]) if $n > 2; # prereq title body return @tokens; } # main contains high-level functionality for processing command-line switches, # feeding input test scripts to ScriptParser, and reporting results. package main; my $getnow = sub { return time(); }; my $interval = sub { return time() - shift; }; if (eval {require Time::HiRes; Time::HiRes->import(); 1;}) { $getnow = sub { return [Time::HiRes::gettimeofday()]; }; $interval = sub { return Time::HiRes::tv_interval(shift); }; } sub ncores { # Windows return $ENV{NUMBER_OF_PROCESSORS} if exists($ENV{NUMBER_OF_PROCESSORS}); # Linux / MSYS2 / Cygwin / WSL do { local @ARGV='/proc/cpuinfo'; return scalar(grep(/^processor\s*:/, <>)); } if -r '/proc/cpuinfo'; # macOS & BSD return qx/sysctl -n hw.ncpu/ if $^O =~ /(?:^darwin$|bsd)/; return 1; } sub show_stats { my ($start_time, $stats) = @_; my $walltime = $interval->($start_time); my ($usertime) = times(); my ($total_workers, $total_scripts, $total_tests, $total_errs) = (0, 0, 0, 0); for (@$stats) { my ($worker, $nscripts, $ntests, $nerrs) = @$_; print(STDERR "worker $worker: $nscripts scripts, $ntests tests, $nerrs errors\n"); $total_workers++; $total_scripts += $nscripts; $total_tests += $ntests; $total_errs += $nerrs; } printf(STDERR "total: %d workers, %d scripts, %d tests, %d errors, %.2fs/%.2fs (wall/user)\n", $total_workers, $total_scripts, $total_tests, $total_errs, $walltime, $usertime); } sub check_script { my ($id, $next_script, $emit) = @_; my ($nscripts, $ntests, $nerrs) = (0, 0, 0); while (my $path = $next_script->()) { $nscripts++; my $fh; unless (open($fh, "<", $path)) { $emit->("?!ERR?! $path: $!\n"); next; } my $s = do { local $/; <$fh> }; close($fh); my $parser = ScriptParser->new(\$s); 1 while $parser->parse_cmd(); if (@{$parser->{output}}) { my $s = join('', @{$parser->{output}}); $emit->("# chainlint: $path\n" . $s); $nerrs += () = $s =~ /\?![^?]+\?!/g; } $ntests += $parser->{ntests}; } return [$id, $nscripts, $ntests, $nerrs]; } sub exit_code { my $stats = shift @_; for (@$stats) { my ($worker, $nscripts, $ntests, $nerrs) = @$_; return 1 if $nerrs; } return 0; } Getopt::Long::Configure(qw{bundling}); GetOptions( "emit-all!" => \$emit_all, "jobs|j=i" => \$jobs, "stats|show-stats!" => \$show_stats) or die("option error\n"); $jobs = ncores() if $jobs < 1; my $start_time = $getnow->(); my @stats; my @scripts; push(@scripts, File::Glob::bsd_glob($_)) for (@ARGV); unless (@scripts) { show_stats($start_time, \@stats) if $show_stats; exit; } unless ($Config{useithreads} && eval { require threads; threads->import(); require Thread::Queue; Thread::Queue->import(); 1; }) { push(@stats, check_script(1, sub { shift(@scripts); }, sub { print(@_); })); show_stats($start_time, \@stats) if $show_stats; exit(exit_code(\@stats)); } my $script_queue = Thread::Queue->new(); my $output_queue = Thread::Queue->new(); sub next_script { return $script_queue->dequeue(); } sub emit { $output_queue->enqueue(@_); } sub monitor { while (my $s = $output_queue->dequeue()) { print($s); } } my $mon = threads->create({'context' => 'void'}, \&monitor); threads->create({'context' => 'list'}, \&check_script, $_, \&next_script, \&emit) for 1..$jobs; $script_queue->enqueue(@scripts); $script_queue->end(); for (threads->list()) { push(@stats, $_->join()) unless $_ == $mon; } $output_queue->end(); $mon->join(); show_stats($start_time, \@stats) if $show_stats; exit(exit_code(\@stats));