nfu: Command-line numeric fu | Spencer Tipping

Skapad 3 år sedan, skriven i Perl, med 319 visningar.
URL http://kod.perl.se/view/f207dc1a Bädda in
Ladda hem koden eller visa koden i råformatVisa bredare version
  1. #!/usr/bin/env perl
  2. # nfu: Command-line numeric fu | Spencer Tipping
  3. # Licensed under the terms of the MIT source code license
  4.  
  5. use v5.14;
  6. use strict;
  7. use warnings;
  8.  
  9. use Time::HiRes qw/time/;
  10. use POSIX       qw/dup2 mkfifo :sys_wait_h/;
  11. use File::Temp  qw/tmpnam/;
  12.  
  13. use constant VERBOSE_INTERVAL => 30;
  14.  
  15. ++$|;
  16.  
  17. # Setup child capture. All we need to do is wait for child pids; there's no
  18. # formal teardown.
  19. $SIG{CHLD} = sub {
  20.   local ($!, $?);
  21.   waitpid -1, WNOHANG;
  22. };
  23.  
  24. # NB: This import is not used in nfu directly; it's here so you can use these
  25. # functions inside aggregators.
  26. use List::Util qw(first max maxstr min minstr reduce shuffle sum);
  27.  
  28. # Same for this, which is especially useful from aggregators because multiple
  29. # values create multiple output rows, not multiple columns on the same output
  30. # row.
  31. sub row {join "\t", map s/\n//gr, @_}
  32.  
  33. # Order-preserving unique values for strings. This is just too useful not to
  34. # provide.
  35. sub uniq {
  36.   my %seen;
  37.   my @order;
  38.   $seen{$_}++ or push @order, $_ for @_;
  39.   @order;
  40. }
  41.  
  42. sub frequencies {
  43.   my %freqs;
  44.   ++$freqs{$_} for @_;
  45.   %freqs;
  46. }
  47.  
  48. sub mean {scalar @_ && sum(@_) / @_}
  49.  
  50. # JSON support (if available)
  51. our $json;
  52. if (eval {require JSON}) {
  53.   JSON->import;
  54.   no warnings qw(uninitialized);
  55.   $json = JSON->new->allow_nonref->utf8(1);
  56. } elsif (eval {require JSON::PP}) {
  57.   JSON::PP->import;
  58.   no warnings qw(uninitialized);
  59.   $json = JSON::PP->new->allow_nonref->utf8(1);
  60. } else {
  61.   print STDERR "note: no JSON support detected (try 'cpan install JSON')\n";
  62.   print STDERR "nfu will soon have its own JSON parser rather than using ";
  63.   print STDERR "a native library for this. Sorry for the inconvenience.";
  64. }
  65.  
  66. # These are callable from evaled code
  67. sub expand_filename_shorthands;
  68. sub read_file {
  69.   open my $fh, expand_filename_shorthands $_[0], 1;
  70.   my $result = join '', <$fh>;
  71.   close $fh;
  72.   $result;
  73. }
  74.  
  75. sub read_lines {
  76.   open my $fh, expand_filename_shorthands $_[0], 1;
  77.   my @result;
  78.   chomp, push @result, $_ for <$fh>;
  79.   close $fh;
  80.   @result;
  81. }
  82.  
  83. sub json_encode {$json->encode(@_)}
  84. sub json_decode {$json->decode(@_)}
  85.  
  86. sub je {json_encode @_}
  87. sub jd {json_decode @_}
  88.  
  89. # File functions
  90. sub expand_filename_shorthands {
  91.   my ($f, $always_make_a_command) = @_;
  92.   my $result;
  93.   if (-e $f) {
  94.     # It's really a filename, so push it onto @ARGV. If it's compressed, run it
  95.     # through the appropriate decompressor first.
  96.     my $piped = $f =~ s/^(.*\.gz)/cat '$1' | gzip -d/ri
  97.                    =~ s/^(.*\.bz2)/cat '$1' | bzip -d/ri
  98.                    =~ s/^(.*\.xz)/cat '$1' | xz -d/ri
  99.                    =~ s/^(.*\.lzo)/cat '$1' | lzop -d/ri;
  100.     $result = $piped =~ /\|/ ? "$piped |" : $piped;
  101.   } elsif ($f =~ s/^(?:http(s?):)?\/\//"http" . ($1 || '') . ":\/\/"/e) {
  102.     # Assume a URL and curl it
  103.     $result = "curl $f |";
  104.   } elsif ($f =~ s/^sh://) {
  105.     # Execute a command and capture stdout
  106.     $result = "$f |";
  107.   } elsif ($f =~ s/^perl://) {
  108.     # Evaluate a Perl expression
  109.     $f =~ s/'/'"'"'/g;
  110.     $result = "perl -e 'print \$_, \"\\n\" for ($f)' |";
  111.   } elsif ($f =~ /(\w*@?[^:]+):(.*)$/) {
  112.     # Access file over SSH
  113.     $result = "ssh -C '$1' cat '$2' |";
  114.   } else {
  115.     return undef;
  116.   }
  117.  
  118.   $always_make_a_command && $result !~ /\|/ ? "cat '$result' |"
  119.                                             : $result;
  120. }
  121.  
  122. # Flags
  123. our $verbose    = 0;
  124. our $n_lines    = 0;
  125. our $n_bytes    = 0;
  126. our $start_time = undef;
  127.  
  128. our $verbose_command = '';
  129. our @verbose_args;
  130. our $verbose_command_formatted = undef;
  131.  
  132. our $last_verbose_report = 0;
  133. our $verbose_row         = 0;
  134.  
  135. # Call it like this:
  136. # while (<>) {
  137. #   be_verbose_as_appropriate length;
  138. #   ...
  139. # }
  140. sub be_verbose_as_appropriate {
  141.   return unless $verbose;
  142.   my ($record_length) = @_;
  143.   $n_lines += !!$record_length;
  144.   $n_bytes += $record_length;
  145.   my $now = time;
  146.   return unless $record_length == 0
  147.              || ($now - $last_verbose_report) * 1000 > VERBOSE_INTERVAL;
  148.  
  149.   $last_verbose_report = $now;
  150.   $verbose_command_formatted //= join ' ', $verbose_command, @verbose_args;
  151.   $start_time //= $now;
  152.   my $runtime = $now - $start_time || 1;
  153.   printf STDERR "\033[%d;1H\033[K%10dl %8.1fl/s %10dk %8.1fkB/s  %s",
  154.                 $verbose_row,
  155.                 $n_lines,
  156.                 $n_lines / $runtime,
  157.                 $n_bytes / 1024,
  158.                 $n_bytes / 1024 / $runtime,
  159.                 $verbose_command_formatted;
  160. }
  161.  
  162. END {
  163.   be_verbose_as_appropriate 0;
  164.   print STDERR "\n" if $verbose;
  165. }
  166.  
  167. # This variable will keep track of any state accumulated from --use or --run
  168. # arguments. This is required for --pmap to work correctly.
  169. my @uses;
  170.  
  171. sub quote_self {join ' ', $0,
  172.                 map {/[^-\w]/ ? "'" . s/'/'\\''/gr . "'" : $_} @_}
  173.  
  174. my %explosions = (
  175.   a => '--average',
  176.   A => '--aggregate',
  177.   c => '--count',
  178.   C => '--uncount',
  179.   D => '--drop',
  180.   E => '--every',
  181.   f => '--fields',
  182.   F => '--fieldsplit',
  183.   g => '--group',
  184.   G => '--rgroup',
  185.   i => '--index',
  186.   I => '--indexouter',
  187.   k => '--keep',
  188.   K => '--remove',
  189.   l => '--log',
  190.   L => '--exp',
  191.   m => '--map',
  192.   M => '--pmap',
  193.   n => '--number',
  194.   o => '--order',
  195.   O => '--rorder',
  196.   p => '--plot',
  197.   P => '--poll',
  198.   q => '--quant',
  199.   s => '--sum',
  200.   S => '--delta',
  201.   T => '--take',
  202.   V => '--variance',
  203.   w => '--with',
  204. );
  205.  
  206. # Minimum number of required arguments for each function. Numeric arguments are
  207. # automatically forwarded, so are always optional.
  208. my %arity = (
  209.   average    => 0,
  210.   aggregate  => 1,
  211.   count      => 0,
  212.   uncount    => 0,
  213.   delta      => 0,
  214.   drop       => 0,
  215.   every      => 1,
  216.   fields     => 0,
  217.   fieldsplit => 1,
  218.   fold       => 1,
  219.   group      => 0,
  220.   rgroup     => 0,
  221.   index      => 2,
  222.   indexouter => 2,
  223.   keep       => 1,
  224.   log        => 0,
  225.   exp        => 0,
  226.   map        => 1,
  227.   pmap       => 1,
  228.   number     => 0,
  229.   order      => 0,
  230.   rorder     => 0,
  231.   plot       => 1,
  232.   poll       => 2,
  233.   sum        => 0,
  234.   quant      => 1,
  235.   remove     => 1,
  236.   repeat     => 2,
  237.   sample     => 1,
  238.   take       => 0,
  239.   variance   => 0,
  240.   with       => 1,
  241.  
  242.   # Commands with no shorthands
  243.   append     => 1,
  244.   prepend    => 1,
  245.   tee        => 1,
  246.   duplicate  => 2,
  247.   partition  => 2,
  248.   splot      => 1,
  249.   sd         => 0,
  250.   mplot      => 1,
  251.   preview    => 0,
  252.   pipe       => 1,
  253. );
  254.  
  255. my %usages = (
  256.   average    => 'window size (0 for full average) -- running average',
  257.   aggregate  => 'aggregator fn',
  258.   count      => 'counts by first column value; like uniq -c',
  259.   uncount    => 'the opposite of --count; repeats each row N times',
  260.   delta      => 'value -> difference from last value',
  261.   drop       => 'number of records to drop',
  262.   every      => 'n (returns every nth row)',
  263.   fields     => 'string of digits, each a zero-indexed column selector',
  264.   fieldsplit => 'regexp to use for splitting',
  265.   fold       => 'function that returns true when line should be folded',
  266.   group      => 'sorts ascending, takes optional column list',
  267.   rgroup     => 'sorts descending, takes optional column list',
  268.   index      => 'field index, pseudofile to join against',
  269.   indexouter => 'field index, pseudofile to join against',
  270.   keep       => 'row filter fn',
  271.   log        => 'optional base (default e)',
  272.   exp        => 'optional base (default e)',
  273.   map        => 'row map fn',
  274.   pmap       => 'row map fn (executed multiple times in parallel)',
  275.   number     => 'prepends line number to each line',
  276.   order      => 'sorts ascending by general numeric value',
  277.   rorder     => 'sorts descending by general numeric value',
  278.   plot       => 'gnuplot arguments',
  279.   poll       => 'interval in seconds, command whose output to collect',
  280.   sum        => 'value -> total += value',
  281.   quant      => 'number to round to',
  282.   remove     => 'inverted row filter fn',
  283.   repeat     => 'repeat count, pseudofile to repeat',
  284.   sample     => 'row selection probability in [0, 1]',
  285.   take       => 'n to take first n, +n to take last n',
  286.   variance   => 'running variance',
  287.   with       => 'pseudofile to join column-wise onto input',
  288.  
  289.   append     => 'pseudofile; appends its contents to current stream',
  290.   prepend    => 'pseudofile; prepends its contents to current stream',
  291.   tee        => 'shell command; duplicates data to stdin of command',
  292.   duplicate  => 'two shell commands as separate arguments',
  293.   partition  => 'partition id fn, shell command (using {})',
  294.   splot      => 'gnuplot arguments',
  295.   sd         => 'running standard deviation',
  296.   mplot      => 'gnuplot arguments per column, separated by ;',
  297.   preview    => '',
  298.   pipe       => 'shell command to pipe through',
  299. );
  300.  
  301. my %env_docs = (
  302.   NFU_SORT_BUFFER      => 'default 256M; size of in-memory sort for -g and -o',
  303.   NFU_SORT_PARALLEL    => 'default 4; number of concurrent sorts to run',
  304.   NFU_SORT_COMPRESS    => 'default none; compression program for sort tempfiles',
  305.   NFU_NO_PAGER         => 'if set, nfu will not use "less" to preview stdout',
  306.   NFU_PMAP_PARALLELISM => 'number of subprocesses for -M',
  307.   NFU_MAX_FILEHANDLES  => 'default 256; maximum filehandles for --partition',
  308. );
  309.  
  310. my %gnuplot_aliases = (
  311.   '%l' => ' with lines',
  312.   '%d' => ' with dots',
  313.   '%i' => ' with impulses',
  314.   '%u' => ' using ',
  315.   '%t' => ' title ',
  316. );
  317.  
  318. sub expand_gnuplot_options {
  319.   my @transformed_opts;
  320.   for my $opt (@_) {
  321.     $opt =~ s/$_/$gnuplot_aliases{$_}/g for keys %gnuplot_aliases;
  322.     push @transformed_opts, $opt;
  323.   }
  324.   @transformed_opts;
  325. }
  326.  
  327. sub expand_eval_shorthands {
  328.   $_[0] =~ s/%(\d+)/\$_[$1]/gr
  329.         =~ s/([a-zA-Z0-9_\)\}\]?\$])\.([\$_a-zA-Z][-_\w?\$]*)/$1\->{'$2'}/gr;
  330. }
  331.  
  332. sub compile_eval_into_function {
  333.   my ($code, $name) = @_;
  334.   $code = expand_eval_shorthands $code;
  335.   eval "sub {\n$code\n}"
  336.     or die "failed to compile $name function: $@\n  (code was $code)";
  337. }
  338.  
  339. sub stateless_unary_fn {
  340.   my ($name, $f) = @_;
  341.   my $arity = $arity{$name};
  342.   ($name, sub {
  343.     my @columns = split //, (@_ > $arity ? shift : undef) // '0';
  344.     while (<>) {
  345.       be_verbose_as_appropriate length;
  346.       chomp;
  347.       my @fs = split /\t/;
  348.       $fs[$_] = $f->($fs[$_], @_) for @columns;
  349.       print row(@fs), "\n";
  350.     }
  351.   });
  352. }
  353.  
  354. sub stateful_unary_fn {
  355.   my ($name, $setup, $f) = @_;
  356.   my $arity = $arity{$name};
  357.   ($name, sub {
  358.     my @columns = split //, (@_ > $arity ? shift : undef) // '0';
  359.     my %states;
  360.     $states{$_} = $setup->(@_) for @columns;
  361.     while (<>) {
  362.       be_verbose_as_appropriate length;
  363.       chomp;
  364.       my @fs = split /\t/;
  365.       $fs[$_] = $f->($fs[$_], $states{$_}, @_) for @columns;
  366.       print row(@fs), "\n";
  367.     }
  368.   });
  369. }
  370.  
  371. sub exec_with_stdin {
  372.   open(my $fh, '|' . join(' ', map {"'$_'"} @_)) or die "failed to exec @_";
  373.   be_verbose_as_appropriate(length), print $fh $_ while <>;
  374.   close $fh;
  375. }
  376.  
  377. sub exec_with_diamond {
  378.   if ($verbose || grep /\|/, @ARGV) {
  379.     # Arguments are specified in filenames and involve processes, so use perl
  380.     # to forward data.
  381.     exec_with_stdin @_;
  382.   } else {
  383.     # Faster option: just exec the program in-place. This avoids a layer of
  384.     # interprocess piping. Assume filenames follow arguments.
  385.     exec @_, @ARGV or die "failed to exec @_ @ARGV";
  386.   }
  387. }
  388.  
  389. sub sort_options {
  390.   my ($column_spec) = @_;
  391.   my @columns       = split //, $column_spec // '';
  392.   return '-S', $ENV{NFU_SORT_BUFFER} || '256M',
  393.          '--parallel=' . ($ENV{NFU_SORT_PARALLEL} || 4),
  394.          (@columns
  395.            ? ('-t', "\t",
  396.               map {('-k', sprintf "%d,%d", $_ + 1, $_ + 1)} @columns)
  397.            : ()),
  398.          ($ENV{NFU_SORT_COMPRESS}
  399.            ? ("--compress-program=$ENV{NFU_SORT_COMPRESS}")
  400.            : ());
  401. }
  402.  
  403. sub sort_cmd {join ' ', 'sort', sort_options, @_}
  404.  
  405. sub fifo_for {
  406.   my ($file, @transforms) = @_;
  407.   my $fifo_name = tmpnam;
  408.  
  409.   mkfifo $fifo_name, 0700 or die "failed to create fifo: $!";
  410.  
  411.   return $fifo_name if fork;
  412.  
  413.   my $command = expand_filename_shorthands($file, 1)
  414.               . join '', map {"$_ |"} @transforms;
  415.   open my $into_fifo, '>', $fifo_name
  416.     or die "failed to open fifo $fifo_name for writing: $!";
  417.   open my $from_file, $command
  418.     or die "failed to open file/command $command for reading: $!";
  419.  
  420.   be_verbose_as_appropriate(length), $into_fifo->print($_) while <$from_file>;
  421.   close $into_fifo;
  422.   close $from_file;
  423.  
  424.   unlink $fifo_name or warn "failed to unlink temporary fifo $fifo_name: $!";
  425.   exit 0;
  426. }
  427.  
  428. my %functions = (
  429.   group  => sub {exec_with_diamond 'sort', sort_options @_},
  430.   rgroup => sub {exec_with_diamond 'sort', '-r', sort_options @_},
  431.   order  => sub {exec_with_diamond 'sort', '-g', sort_options @_},
  432.   rorder => sub {exec_with_diamond 'sort', '-rg', sort_options @_},
  433.  
  434.   count => sub {
  435.     # Same behavior as uniq -c, but delimits counts with \t; also takes an
  436.     # optional series of columns to uniq by, rather than using the whole row.
  437.     my @columns = split //, shift // '';
  438.     my $last;
  439.     my @last;
  440.     my $count = -1;
  441.  
  442.     while (<>) {
  443.       be_verbose_as_appropriate length;
  444.       chomp;
  445.  
  446.       my @xs = split /\t/;
  447.       @xs   = @xs[@columns] if @columns;
  448.       $last = $_, @last = @xs unless ++$count;
  449.  
  450.       for (my $i = 0; $i < max scalar(@xs), scalar(@last); ++$i) {
  451.         if (!defined $xs[$i] || !defined $last[$i] || $xs[$i] ne $last[$i]) {
  452.           print "$count\t$last\n";
  453.           $count = 0;
  454.           @last  = @xs;
  455.           $last  = $_;
  456.           last;
  457.         }
  458.       }
  459.     }
  460.  
  461.     ++$count;
  462.     print "$count\t$last\n";
  463.   },
  464.  
  465.   uncount => sub {
  466.     while (<>) {
  467.       be_verbose_as_appropriate length;
  468.       my ($n, $line) = split /\t/, $_, 2;
  469.       $line //= "\n";
  470.       print $line for 1..$n;
  471.     }
  472.   },
  473.  
  474.   index => sub {
  475.     # Inner join by appending joined fields to the end.
  476.     my ($field_index, $join_file) = @_;
  477.     ++$field_index;
  478.  
  479.     my $sorted_index = fifo_for $join_file, sort_cmd "-t '\t' -k1b,1";
  480.     my $command = sort_cmd "-t '\t' -k ${field_index}b,$field_index" .
  481.                   "| join -t '\t' -1 $field_index - '$sorted_index'";
  482.  
  483.     open my $to_join, "| $command" or die "failed to exec $command: $!";
  484.     be_verbose_as_appropriate(length), print $to_join $_ while <>;
  485.     close $to_join;
  486.   },
  487.  
  488.   indexouter => sub {
  489.     # Outer left join by appending joined fields to the end.
  490.     my ($field_index, $join_file) = @_;
  491.     ++$field_index;
  492.  
  493.     my $sorted_index = fifo_for $join_file, sort_cmd "-t '\t' -k 1b,1";
  494.     my $command = sort_cmd "-t '\t' -k ${field_index}b,$field_index" .
  495.                   "| join -a 1 -t '\t' -1 $field_index - '$sorted_index'";
  496.  
  497.     open my $to_join, "| $command" or die "failed to exec $command: $!";
  498.     be_verbose_as_appropriate(length), print $to_join $_ while <>;
  499.     close $to_join;
  500.   },
  501.  
  502.   with => sub {
  503.     # Like 'paste'. Joins lines with \t.
  504.     my ($f) = @_;
  505.     open my $fh, expand_filename_shorthands $f, 1
  506.       or die "failed to open --with pseudofile $f: $!";
  507.     my ($part1, $part2);
  508.     while (defined($part1 = <>) and defined($part2 = <$fh>)) {
  509.       be_verbose_as_appropriate length($part1) + length($part2);
  510.       chomp $part1;
  511.       chomp $part2;
  512.       print $part1, "\t", $part2, "\n";
  513.     }
  514.     close $fh;
  515.   },
  516.  
  517.   repeat => sub {
  518.     my ($n, $f) = @_;
  519.     my $count = 0;
  520.     while (!$n || $count++ < $n) {
  521.       open my $fh, expand_filename_shorthands $f, 1
  522.         or die "failed to open --repeat pseudofile $f: $!";
  523.       be_verbose_as_appropriate(length), print while <$fh>;
  524.       close $fh;
  525.     }
  526.   },
  527.  
  528.   stateful_unary_fn('average',
  529.     sub {my ($size, $n, $total) = ($_[0] // 0, 0, 0);
  530.          [$size, $n, $total, []]},
  531.     sub {
  532.       my ($x, $state) = @_;
  533.       my ($size, $n, $total, $window) = @$state;
  534.       $total += $x;
  535.       ++$n;
  536.       my $v = $total / ($n > $size && $size ? $size : $n);
  537.       $total -= shift @$window if $size and push(@$window, $x) >= $size;
  538.       $$state[1] = $n;
  539.       $$state[2] = $total;
  540.       $v;
  541.     }),
  542.  
  543.   aggregate => sub {
  544.     my $f = compile_eval_into_function $_[0], 'aggregate function';
  545.     my @columns;
  546.     while (my $line = <>) {
  547.       be_verbose_as_appropriate length $line;
  548.       chomp $line;
  549.       my @fields = split /\t/, $line;
  550.  
  551.       # Two cases here. If the new record is compatible with the most recent
  552.       # existing one, or there aren't any existing ones, then group it and
  553.       # don't call the aggregator yet.
  554.       #
  555.       # If we see a change, then call the aggregator and empty out the group.
  556.       #
  557.       # Note that the aggregator function is called on columns, not rows.
  558.  
  559.       my $n = @columns && @{$columns[0]};
  560.       if (!$n or $fields[0] eq ${$columns[0]}[0]) {
  561.         $columns[$_][$n] = $fields[$_] for 0 .. $#fields;
  562.       } else {
  563.         $_ = ${$columns[0]}[0];
  564.         print $_, "\n" for $f->(@columns);
  565.         @columns = ();
  566.         $columns[$_][0] = $fields[$_] for 0 .. $#fields;
  567.       }
  568.     }
  569.     if (@columns) {
  570.       $_ = ${$columns[0]}[0];
  571.       print $_, "\n" for $f->(@columns);
  572.     }
  573.   },
  574.  
  575.   fold => sub {
  576.     my $f = compile_eval_into_function $_[0], 'fold function';
  577.     my @saved;
  578.     while (my $line = <>) {
  579.       be_verbose_as_appropriate length $line;
  580.       chomp $line;
  581.       if ($f->(split /\t/, $line)) {
  582.         push @saved, $line;
  583.       } else {
  584.         print row(@saved), "\n" if @saved;
  585.         @saved = ($line);
  586.       }
  587.     }
  588.     print row(@saved), "\n" if @saved;
  589.   },
  590.  
  591.   stateless_unary_fn('log', sub {
  592.     my ($x, $base) = @_;
  593.     my $log = log $x;
  594.     $log /= log $base if defined $base;
  595.     $log;
  596.   }),
  597.  
  598.   stateless_unary_fn('exp', sub {
  599.     my ($x, $base) = @_;
  600.     defined $base ? $base ** $x : exp $x;
  601.   }),
  602.  
  603.   stateless_unary_fn('quant', sub {
  604.     my ($x, $quantum) = @_;
  605.     $quantum ||= 1.0;
  606.     my $sign = $x < 0 ? -1 : 1;
  607.     int(abs($x) / $quantum + 0.5) * $quantum * $sign;
  608.   }),
  609.  
  610.   # Note: this needs to be stdin; otherwise "nfu -p %l filename" will fail
  611.   # (since exec_with_diamond trieds to pass filename straight into gnuplot).
  612.   plot => sub {
  613.     exec_with_stdin 'gnuplot',
  614.                     '-e',
  615.                     'plot "-" ' . join(' ', expand_gnuplot_options @_),
  616.                     '-persist';
  617.   },
  618.  
  619.   splot => sub {
  620.     exec_with_stdin 'gnuplot',
  621.                     '-e',
  622.                     'splot "-" ' . join(' ', expand_gnuplot_options @_),
  623.                     '-persist';
  624.   },
  625.  
  626.   mplot => sub {
  627.     my @gnuplot_options = split /;/, join ' ', expand_gnuplot_options @_;
  628.     my $fname = tmpnam;
  629.     open my $fh, '>', $fname or die "failed to open tempfile for mplot: $!";
  630.     $fh->print($_) while <>;
  631.     close $fh;
  632.     system 'gnuplot', '-e',
  633.            'plot ' . join(',', map("\"$fname\" $_", @gnuplot_options)),
  634.            '-persist';
  635.     unlink $fname or warn "failed to unlink mplot tempfile $fname: $!";
  636.   },
  637.  
  638.   poll => sub {
  639.     my ($sleep, $command) = @_;
  640.     die "usage: --poll sleep-amount 'command ...'"
  641.       unless defined $sleep and defined $command;
  642.     system($command), sleep $sleep while 1;
  643.   },
  644.  
  645.   stateful_unary_fn('delta',
  646.     sub {[0]},
  647.     sub {my ($x, $state) = @_;
  648.          my $v = $x - $$state[0];
  649.          $$state[0] = $x;
  650.          $v}),
  651.  
  652.   stateful_unary_fn('sum',
  653.     sub {[0]},
  654.     sub {my ($x, $state) = @_;
  655.          $$state[0] += $x}),
  656.  
  657.   stateful_unary_fn('variance',
  658.     sub {[0, 0, 0]},
  659.     sub {my ($x, $state) = @_;
  660.          $$state[0] += $x;
  661.          $$state[1] += $x * $x;
  662.          $$state[2]++;
  663.          my ($sx, $sx2, $count) = @$state;
  664.          ($sx2 - ($sx * $sx / $count)) / ($count - 1 || 1)}),
  665.  
  666.   stateful_unary_fn('sd',
  667.     sub {[0, 0, 0]},
  668.     sub {my ($x, $state) = @_;
  669.          $$state[0] += $x;
  670.          $$state[1] += $x * $x;
  671.          $$state[2]++;
  672.          my ($sx, $sx2, $count) = @$state;
  673.          sqrt(($sx2 - ($sx * $sx / $count)) / ($count - 1 || 1))}),
  674.  
  675.   take => sub {
  676.     if ($_[0] =~ s/^\+//) {
  677.       # Take last n, so we need a line queue
  678.       my @q;
  679.       my $i = 0;
  680.       be_verbose_as_appropriate(length), $q[$i++ % $_[0]] = $_ while <>;
  681.       print for @q[$i % $_[0] .. $#q];
  682.       print for @q[0 .. $i % $_[0] - 1];
  683.     } else {
  684.       my $n = $_[0] // 1;
  685.       while (<>) {
  686.         be_verbose_as_appropriate length;
  687.         last if --$n < 0;
  688.         print;
  689.       }
  690.     }
  691.   },
  692.  
  693.   sample => sub {
  694.     while (<>) {
  695.       be_verbose_as_appropriate length;
  696.       print if rand() < $_[0];
  697.     }
  698.   },
  699.  
  700.   drop => sub {
  701.     my $n = $_[0] // 1;
  702.     while (<>) {
  703.       be_verbose_as_appropriate length;
  704.       last if --$n <= 0;
  705.     }
  706.     be_verbose_as_appropriate(length), print while <>;
  707.   },
  708.  
  709.   map => sub {
  710.     my $f = compile_eval_into_function $_[0], 'map function';
  711.     while (my $line = <>) {
  712.       be_verbose_as_appropriate length $line;
  713.       chomp $line;
  714.       my @xs = split /\t/, $line;
  715.       print "$_\n" for $f->(@xs);
  716.     }
  717.   },
  718.  
  719.   pmap => sub {
  720.     my @fhs;
  721.     my $wbits = '';
  722.     my $wout  = '';
  723.     my $i     = 0;
  724.  
  725.     for (1 .. $ENV{NFU_PMAP_PARALLELISM} // 16) {
  726.       my $mapper = quote_self @uses, '--map', $_[0];
  727.       open my $fh, "| $mapper"
  728.         or die "failed to open child process $mapper: $!";
  729.  
  730.       vec($wbits, fileno($fh), 1) = 1;
  731.       push @fhs, $fh;
  732.     }
  733.  
  734.     while (<>) {
  735.       be_verbose_as_appropriate;
  736.       select undef, $wout = $wbits, undef, undef;
  737.       ++$i until vec($wout, fileno $fhs[$i % @fhs], 1);
  738.       syswrite $fhs[$i++ % @fhs], $_;
  739.     }
  740.     close for @fhs;
  741.   },
  742.  
  743.   keep => sub {
  744.     my $f = compile_eval_into_function $_[0], 'keep function';
  745.     while (my $line = <>) {
  746.       be_verbose_as_appropriate length $line;
  747.       chomp $line;
  748.       my @xs = split /\t/, $line;
  749.       print row(@xs), "\n" if $f->(@xs);
  750.     }
  751.   },
  752.  
  753.   remove => sub {
  754.     my $f = compile_eval_into_function $_[0], 'remove function';
  755.     while (my $line = <>) {
  756.       be_verbose_as_appropriate length $line;
  757.       chomp $line;
  758.       my @xs = split /\t/, $line;
  759.       print row(@xs), "\n" unless $f->(@xs);
  760.     }
  761.   },
  762.  
  763.   every => sub {
  764.     my ($n) = @_;
  765.     my $i = 0;
  766.     while (<>) {
  767.       be_verbose_as_appropriate length;
  768.       print unless $i++ % $n;
  769.     }
  770.   },
  771.  
  772.   fields => sub {
  773.     my ($fields)   = @_;
  774.     my $everything = $fields =~ s/\.$//;
  775.     my @fs         = split //, $fields;
  776.     $everything &&= 1 + max @fs;
  777.  
  778.     while (<>) {
  779.       be_verbose_as_appropriate length;
  780.       chomp;
  781.       my @xs = split /\t/;
  782.       my @ys = @xs[@fs];
  783.       push @ys, @xs[$everything .. $#xs] if $everything;
  784.       print join("\t", map $_ // '', @ys), "\n";
  785.     }
  786.   },
  787.  
  788.   fieldsplit => sub {
  789.     my $delim = qr/$_[0]/;
  790.     while (<>) {
  791.       be_verbose_as_appropriate length;
  792.       chomp;
  793.       print join("\t", split /$delim/), "\n";
  794.     }
  795.   },
  796.  
  797.   number => sub {
  798.     my $n = 0;
  799.     while (<>) {
  800.       be_verbose_as_appropriate length;
  801.       chomp;
  802.       print row(++$n, $_), "\n";
  803.     }
  804.   },
  805.  
  806.   prepend => sub {
  807.     open my $fh, expand_filename_shorthands $_[0], 1
  808.       or die "failed to open --prepend pseudofile $_[0]: $!";
  809.     be_verbose_as_appropriate(length), print while <$fh>;
  810.     close $fh;
  811.     print while <>;
  812.   },
  813.  
  814.   append => sub {
  815.     open my $fh, expand_filename_shorthands $_[0], 1
  816.       or die "failed to open --append pseudofile $_[0]: $!";
  817.     print while <>;
  818.     be_verbose_as_appropriate(length), print while <$fh>;
  819.     close $fh;
  820.   },
  821.  
  822.   pipe => sub {
  823.     open my $fh, "| $_[0]" or die "failed to launch $_[0]: $!";
  824.     while (<>) {
  825.       be_verbose_as_appropriate length;
  826.       $fh->print($_);
  827.     }
  828.     close $fh;
  829.   },
  830.  
  831.   tee => sub {
  832.     open my $fh, "| $_[0]" or die "failed to launch $_[0]: $!";
  833.     while (<>) {
  834.       be_verbose_as_appropriate length;
  835.       $fh->print($_);
  836.       print;
  837.     }
  838.     close $fh;
  839.   },
  840.  
  841.   duplicate => sub {
  842.     open my $fh1, "| $_[0]" or die "failed to launch $_[0]: $!";
  843.     open my $fh2, "| $_[1]" or die "failed to launch $_[1]: $!";
  844.     while (<>) {
  845.       be_verbose_as_appropriate length;
  846.       $fh1->print($_);
  847.       $fh2->print($_);
  848.     }
  849.     close $fh1;
  850.     close $fh2;
  851.   },
  852.  
  853.   partition => sub {
  854.     my ($splitter, $cmd) = @_;
  855.     my %fhs;
  856.     my $f = compile_eval_into_function $splitter, 'partition function';
  857.  
  858.     my @open_partitions;
  859.     while (<>) {
  860.       be_verbose_as_appropriate length;
  861.       my $line = $_;
  862.       my $p    = $f->(split /\t/, $line);
  863.       unless (exists $fhs{$p}) {
  864.         my $cmdsub = $cmd =~ s/\{\}/$p/gr;
  865.         open $fhs{$p}, "| $cmdsub" or die "failed to launch $cmdsub: $!";
  866.         push @open_partitions, $p;
  867.       }
  868.       $fhs{$p}->print($line);
  869.       close($p = shift @open_partitions), delete $fhs{$p}
  870.         while @open_partitions > ($ENV{NFU_MAX_FILEHANDLES} // 256);
  871.     }
  872.     close for values %fhs;
  873.   },
  874.  
  875.   preview => sub {
  876.     my $have_less = !system 'which less > /dev/null';
  877.     my $have_more = !system 'which more > /dev/null';
  878.  
  879.     my $less_program = $have_less ? 'less'
  880.                      : $have_more ? 'more' : 'cat';
  881.  
  882.     exec_with_diamond $less_program;
  883.   },
  884. );
  885.  
  886. # Print usage if the user clearly doesn't know what they're doing.
  887. if (@ARGV ? $ARGV[0] =~ /^-[h?]$/ || $ARGV[0] =~ /^--(usage|help)$/
  888.           : -t STDIN) {
  889.  
  890.   # Some checks for me to make sure I'm keeping the code well-maintained
  891.   exists $functions{$_}         or die "no function for $_" for keys %usages;
  892.   exists $usages{$_}            or die "no usage for $_"    for keys %functions;
  893.   exists $arity{$_}             or die "no arity for $_"    for keys %usages;
  894.   exists $usages{$_ =~ s/--//r} or die "no usage for $_"
  895.     for values %explosions, keys %usages;
  896.  
  897.   print STDERR "usage: nfu [prefix-commands...] [input-files...] commands...\n";
  898.   print STDERR "where prefix commands are:\n\n";
  899.  
  900.   print STDERR "  documentation (not used with normal commands):\n";
  901.   print STDERR "     --explain           <other-options>\n";
  902.   print STDERR "     --expand-pseudofile <filename>\n";
  903.   print STDERR "     --expand-code       <code>\n";
  904.   print STDERR "     --expand-gnuplot    <gnuplot options>\n";
  905.  
  906.   print STDERR "\n  pipeline modifiers:\n";
  907.   print STDERR "     --quote  -- quotes args: eval \$(nfu --quote ...)\n";
  908.   print STDERR "     --use    <file.pl>\n";
  909.   print STDERR "     --run    <perl code>\n";
  910.  
  911.   print STDERR "\nand each command is one of the following:\n\n";
  912.  
  913.   my $len = 1 + max map length, keys %usages;
  914.   my %short_lookup;
  915.   $short_lookup{$explosions{$_} =~ s/^--//r} = $_ for keys %explosions;
  916.  
  917.   for my $cmd (sort keys %usages) {
  918.     my $short = $short_lookup{$cmd};
  919.     $short = defined $short ? "-$short|" : '   ';
  920.     printf STDERR "  %s--%-${len}s(%d) %s\n",
  921.                   $short,
  922.                   $cmd,
  923.                   $arity{$cmd},
  924.                   $usages{$cmd} ? $arity{$cmd} ? "<$usages{$cmd}>"
  925.                                                : "-- $usages{$cmd}" : '';
  926.   }
  927.  
  928.   print STDERR "\ngnuplot expansions:\n\n";
  929.   printf STDERR "  %2s -> '%s'\n", $_, $gnuplot_aliases{$_}
  930.   for sort keys %gnuplot_aliases;
  931.  
  932.   my $env_len = 1 + max map length, keys %env_docs;
  933.   print STDERR "\nenvironment variables:\n\n";
  934.   printf STDERR "  %-${env_len}s %s\n", $_, $env_docs{$_}
  935.   for sort keys %env_docs;
  936.  
  937.   print STDERR "\n";
  938.   print STDERR "see https://github.com/spencertipping/nfu for documentation\n";
  939.   print STDERR "\n";
  940.  
  941.   exit 1;
  942. }
  943.  
  944. if (@ARGV && $ARGV[0] eq '--quote') {
  945.   # Quote all other arguments so a shell will parse them correctly.
  946.   shift @ARGV;
  947.   print quote_self(@ARGV), "\n";
  948.   exit 0;
  949. }
  950.  
  951. if (@ARGV && $ARGV[0] =~ /^--expand/) {
  952.   my ($command, $x, @others) = @ARGV;
  953.   if ($command =~ /-pseudofile$/) {
  954.     print expand_filename_shorthands($x) // '<invalid/nonexistent>', "\n";
  955.   } elsif ($command =~ /-code$/) {
  956.     print expand_eval_shorthands($x), "\n";
  957.   } elsif ($command =~ /-gnuplot$/) {
  958.     print expand_gnuplot_options($x), "\n";
  959.   } else {
  960.     print STDERR "unknown command: $command\n";
  961.     exit 1;
  962.   }
  963.   exit 0;
  964. }
  965.  
  966. sub explode {
  967.   return $_[0] unless $_[0] =~ s/^-([^-])/$1/;
  968.   map {$explosions{$_} // $_} grep length, split /([-+.\d]*),?/, $_[0];
  969. }
  970.  
  971. my @exploded;
  972. while (@ARGV) {
  973.   # Load 'use' modules before any forking.
  974.   if ($ARGV[0] =~ /^--(use|run)$/) {
  975.     my $option = shift @ARGV;
  976.     my $x      = shift @ARGV;
  977.  
  978.     push @uses, $option, $x;
  979.     if ($option eq '--run') {
  980.       eval $x;
  981.       die "failed to run $x: $@" if $@;
  982.     } else {
  983.       do $x;
  984.       die "failed to use $x: $@" if $@;
  985.     }
  986.   } elsif (!$verbose && $ARGV[0] =~ /^--?v(erbose)?$/) {
  987.     shift @ARGV;
  988.     print STDERR "\033[2J";
  989.     $verbose = 1;
  990.   } else {
  991.     push @exploded, explode shift @ARGV;
  992.   }
  993. }
  994.  
  995. my $reader  = undef;
  996. my @parsed  = ();
  997. my $explain = 0;
  998.  
  999. @ARGV = ();
  1000.  
  1001. # First parse through all of the options, pull out stray files, and replace
  1002. # @ARGV. This enables <> in the worker subs. (The "right way" to do this would
  1003. # be to chain the commands' inputs; then I could remove the branch in the
  1004. # for-loop below. But I'm too lazy.)
  1005. while (@exploded) {
  1006.   (my $command = shift @exploded) =~ s/^--//;
  1007.  
  1008.   if (defined(my $arity = $arity{$command})) {
  1009.     my @args;
  1010.     push @args, shift @exploded while @exploded
  1011.                                    && (--$arity >= 0
  1012.                                        || $exploded[0] =~ /^[-+]?\d+/);
  1013.     push @parsed, [$command, @args];
  1014.   } elsif ($command eq 'explain') {
  1015.     $explain = 1;
  1016.   } else {
  1017.     my $f = expand_filename_shorthands $command;
  1018.     die "nonexistent pseudofile: $command" unless defined $f;
  1019.     push @ARGV, $f;
  1020.   }
  1021. }
  1022.  
  1023. # Open output in an interactive previewer if...
  1024. push @parsed, ['preview'] if !$ENV{NFU_NO_PAGER}    # we can page
  1025.                           && (!-t STDIN || @ARGV)   # not interacting for input
  1026.                           && -t STDOUT;             # interacting for output
  1027.  
  1028. if ($explain) {
  1029.   # Explain what we would have done with the given command line.
  1030.   printf "file\t%s\n", $_ for @ARGV;
  1031.   printf "--%s\t%s\n", ${$_}[0], join "\t",
  1032.                                  map "'$_'", @{$_}[1 .. $#$_] for @parsed;
  1033. } elsif (@parsed) {
  1034.   # Note: the loop below uses pipe/fork/dup2 instead of a more idiomatic Open2
  1035.   # call. I don't have a good reason for this other than to figure out how the
  1036.   # low-level stuff worked.
  1037.   for (my $i = 0; $i < @parsed; ++$i) {
  1038.     my ($command, @args) = @{$parsed[$i]};
  1039.  
  1040.     # Here's where things get fun. The question right now is, "do we need to
  1041.     # fork, or can we run in-process?" -- i.e. are we in the middle, or at the
  1042.     # end? When we're in the middle, we want to redirect STDOUT to the pipe's
  1043.     # writer and fork; otherwise we run in-process and write directly to the
  1044.     # existing STDOUT.
  1045.     ++$verbose_row;
  1046.     if ($i < @parsed - 1) {
  1047.       # We're in the middle, so allocate a pipe and fork.
  1048.       pipe my($new_reader), my($writer);
  1049.       $verbose_command = $command;
  1050.       @verbose_args    = @args;
  1051.       unless (fork) {
  1052.         # We're the child, so do STDOUT redirection.
  1053.         close $new_reader or die "failed to close pipe reader: $!";
  1054.         dup2(fileno($reader), 0) or die "failed to dup input: $!"
  1055.           if defined $reader;
  1056.         dup2(fileno($writer), 1) or die "failed to dup stdout: $!";
  1057.  
  1058.         close $reader or die "failed to close reader: $!" if defined $reader;
  1059.         close $writer or die "failed to close writer: $!";
  1060.  
  1061.         # The function here may never return.
  1062.         $functions{$command}->(@args);
  1063.         exit;
  1064.       } else {
  1065.         close $writer or die "failed to close pipe writer: $!";
  1066.         $reader = $new_reader;
  1067.       }
  1068.     } else {
  1069.       # We've hit the end of the chain. Preserve stdout, redirect stdin from
  1070.       # current reader.
  1071.       dup2(fileno($reader), 0) or die "failed to dup input: $!"
  1072.         if defined $reader;
  1073.       $verbose_command = $command;
  1074.       @verbose_args    = @args;
  1075.       $functions{$command}->(@args);
  1076.     }
  1077.  
  1078.     # Prevent <> from reading files after the first iteration (this is such a
  1079.     # hack).
  1080.     @ARGV = ();
  1081.   }
  1082. } else {
  1083.   # Behave like cat, which is useful for auto-decompressing things.
  1084.   be_verbose_as_appropriate(length), print while <>;
  1085. }

Svara på "nfu: Command-line numeric fu | Spencer Tipping"

Här kan du skriva ett svar till kodsnutten ovan