#! /local/bin/perl -w
use strict;
use warnings;
use Term::ANSIColor;

### HELP
# Print the usage text and stop when the first argument asks for help.
# @ARGV is tested first so that running the script without arguments
# does not match against an undefined value.
if (@ARGV && $ARGV[0] =~ /^\-(h|\?|\-help)$/) {
    print <<'END_HELP';

*******************
* TeXcount.pl

Count words in TeX and LaTeX files, ignoring macros, tables, formulae, etc.

Syntax: TeXcount.pl [options] files

Options:
  -h, -?, --help   Produce this help
  -v               Verbose: print parsed words, mark formulae
  -vv              More verbose: also print ignored text
  -vvv             Even more verbose: include comments and options
  -vvvv            Extremely verbose: for debugging, print state

The script counts words as either words in the text, words in headers/titles
or words in floats (figure/table captions). Macro options (i.e. \macro[...])
are ignored; macro parameters (i.e. \macro{...}) are counted or ignored
depending on the macro, but by default counted. Begin-end groups are by
default ignored and treated as 'floats', though some (e.g. center) are
counted.

Mathematical formulae are not counted as words, but are instead counted
separately with separate counts for inlined formulae and displayed formulae.
Similarly, the number of headers and the number of 'floats' are counted.
Note that 'float' is used here to describe anything defined in a begin-end
group unless explicitly recognized as text or mathematics.

The verbose options (-v, -vv, -vvv and -vvvv) produces output indicating how
the text has been interpreted. Check this to ensure that words in the text
has been interpreted as such, whereas mathematical formulae and
text/non-text in begin-end groups have been correctly interpreted.

Hint: Use 'less -r' instead of just 'less' to view output: the '-r' option
makes less treat text formating codes properly.

END_HELP
    exit;
}

# Macros whose parameters are excluded from the count; the value is the
# number of parameter units to skip after the macro.
my %TeXexcl = (
    '\documentclass' => 1, '\documentstyle' => 1, '\usepackage' => 1,
    '\ref'           => 1, '\cite'          => 1, '\label'      => 1,
    '\eqlabel'       => 1, '\eqref'         => 1,
    '\hspace'        => 1, '\vspace'        => 1,
    '\input'         => 1,
);

# Macros for headers; the value is the parse status given to the parameter
# (2 = header text).
my %TeXheader = (
    '\title'     => 2, '\part'          => 2, '\chapter'       => 2,
    '\section'   => 2, '\subsection'    => 2, '\subsubsection' => 2,
    '\paragraph' => 2, '\subparagraph'  => 2,
);

# Begin-End groups. A value of 1 means the contents are parsed as text; a
# value above 3 is the index of the counter to increment (6 = inline math,
# 7 = displayed math) and the contents are then excluded. Groups not listed
# here are treated as floats.
my %TeXgroup = (
    'center'    => 1, 'abstract' => 1, 'quote'      => 1,
    'quotation' => 1, 'verse'    => 1, 'itemize'    => 1,
    'theorem'   => 1, 'lemma'    => 1, 'definition' => 1, 'corollary' => 1,
    'math'      => 6, 'displaymath' => 7,
    'equation'  => 7, 'eqnarray' => 7, 'array'      => 7,
);

# In floats: include only specific macros
my %TeXfloatinc = ('\caption' => 1);

# Count labels, indexed like the arrays returned by new_count()
my @countlabel = (
    'Files', 'Words', 'Header words',
    'Float words', 'Headers', 'Floats',
    'Math inlines', 'Math displayed',
);

# Styles: one colour table per verbosity level; each level extends the
# previous one.
my @STYLES = ();
my %STYLE;
$STYLES[0] = {};
$STYLES[1] = {
    'word1'    => 'blue', 'word2' => 'bold blue', 'word3' => 'blue',
    'grouping' => 'red',  'mathgroup' => 'magenta',
};
$STYLES[2] = {
    %{ $STYLES[1] },
    'word0'   => 'yellow', 'word-1' => 'yellow',
    'command' => 'green',
    'ignore'  => 'yellow',
};
$STYLES[3] = {
    %{ $STYLES[2] },
    'comment' => 'yellow', 'option' => 'yellow',
};
$STYLES[4] = { %{ $STYLES[3] } };

# Options and states
my $verbose   = 0;
my $blankline = 0;
my %verbosity_of = ('-v' => 1, '-vv' => 2, '-vvv' => 3, '-vvvv' => 4);

# Main loop: arguments are handled in order, so a verbosity option applies
# to the files that follow it.
my $totalcount = new_count();
foreach my $file (@ARGV) {
    if (exists $verbosity_of{$file}) {
        $verbose = $verbosity_of{$file};
        next;
    }
    %STYLE = %{ $STYLES[$verbose] };
    my $tex = TeXfile($file);
    parse($tex);
    print "\nFILE: " . $file . "\n";
    print_count($tex->{'count'});
    print "\n";
    add_count($totalcount, $tex->{'count'});
}
if ($totalcount->[0] > 1) {
    print "\nSUM:\n";
    print_count($totalcount);
}

#########
######### Subroutines
#########

# Create the parser state for one file.
# Returns a hash reference with the keys:
#   line  - the text not yet parsed
#   next  - the most recently read token
#   type  - the type code of that token
#   style - the style used when that token is printed ('-' = already printed)
#   eof   - true once all text has been consumed
#   count - counter array (see new_count) with the file counter set to 1
# A file that cannot be read is parsed as empty text.
sub TeXfile {
    my $filename = shift @_;
    my $text     = read_file($filename);
    my %TeX      = ();
    $TeX{'line'}  = defined $text ? $text : '';
    $TeX{'next'}  = undef;
    $TeX{'type'}  = undef;
    $TeX{'style'} = undef;
    $TeX{'eof'}   = 0;
    $TeX{'count'} = new_count();
    $TeX{'count'}[0]++;
    return \%TeX;
}

# Set the print style of the current token, unless the token has already
# been flushed (style '-').
sub set_style {
    my ($tex, $style) = @_;
    my $current = $tex->{'style'};
    return if ($current && $current eq '-');
    $tex->{'style'} = $style;
    return;
}

# Set the style of the current token and print it immediately.
sub flush_style {
    my ($tex, $style) = @_;
    set_style($tex, $style);
    flush_next($tex);
    return;
}

# Print a line break unless at least $blank line breaks have just been
# printed.
sub line_return {
    my $blank = shift @_;
    if ($blank > $blankline) {
        print "\n";
        $blankline++;
    }
    return;
}

# Print $text in the colour that the current verbosity level assigns to the
# style name $style. Nothing is printed when not verbose, or when the style
# has no colour at this level.
sub print_style {
    my ($text, $style) = @_;
    return unless ($verbose > 0 && defined $text && defined $style);
    my $colour = $STYLE{$style};
    return unless $colour;
    if ($colour ne '-') {
        print colored($text, $colour);
    }
    $blankline = -1;
    return;
}

# Read a whole file and return its contents as one string.
# Returns undef (after a warning on STDERR) if the file cannot be opened.
sub read_file {
    my $filename = shift @_;
    my $fh;
    if (!open($fh, '<', $filename)) {
        warn "Warning: cannot open file " . $filename . ": $!\n";
        return;
    }
    if ($verbose) {
        line_return(1);
        print "File " . $filename . " opened.\n";
        $blankline = 0;
    }
    my @text = <$fh>;
    close($fh);
    return join('', @text);
}

# Print the previous token, then read and return the next token that is
# neither a comment nor a line break. Returns undef at end of file.
# Tokens are tested with defined() so that the word "0" is not dropped.
sub next_token {
    my $tex = shift @_;
    if (defined $tex->{'next'}) {
        print_style($tex->{'next'} . ' ', $tex->{'style'});
    }
    $tex->{'style'} = undef;
    while (defined(my $next = get_next_token($tex))) {
        my $type = $tex->{'type'};
        if ($type == 0) {
            print_style($next, 'comment');
        }
        elsif ($type == 9) {
            if ($verbose) { line_return(1); }
        }
        else {
            return $next;
        }
    }
    return;
}

# Print the current token now and mark it as printed (style '-') so that it
# is neither restyled nor printed again.
sub flush_next {
    my $tex = shift @_;
    if (defined $tex->{'next'}) {
        print_style($tex->{'next'} . ' ', $tex->{'style'});
    }
    $tex->{'style'} = '-';
    return;
}

# Remove the next raw token from the text and return it; the token type is
# stored in $tex->{'type'}. The patterns are tried in order. Sets
# $tex->{'eof'} and returns undef when nothing is left.
# Type codes: 0 = comment, 9 = line break, 1 = word, 2 = punctuation or
# symbol, 3 = macro, 4 = brace, 5 = bracket, 6 = dollar sign(s),
# 999 = anything else.
sub get_next_token {
    my $tex      = shift @_;
    my @patterns = (
        ['\%[^\n]*',                                                0],
        ['\n',                                                      9],
        ['\w+(\.\w+)+\.',                                           1],
        ['\w+([\-\']\w+)*',                                         1],
        ['[\"\'\`:\.,\(\)\[\]!\+\-\*=/\^\_\@\<\>\~\#\&]',           2],
        ['\\\\([a-zA-Z_]+|\\\\|[\^\'\`\&!\[\]\(\)\%])',             3],
        ['[\{\}]',                                                  4],
        ['[\[\]]',                                                  5],
        ['\$\$',                                                    6],
        ['\$',                                                      6],
        ['.',                                                       999],
        ['[^\s]+',                                                  999],
    );
    foreach my $pattern (@patterns) {
        my $token = get_token($tex, $pattern->[0], $pattern->[1]);
        return $token if defined $token;
    }
    $tex->{'eof'} = 1;
    return;
}

# Try to remove a token matching $regexp, plus any spacing that follows it,
# from the start of the text. On success the token and $type are stored in
# $tex and the token is returned; otherwise undef is returned.
sub get_token {
    my ($tex, $regexp, $type) = @_;
    if ($tex->{'line'} =~ s/^($regexp)[ \t\r\f]*//) {
        my $token = $1;
        $tex->{'next'} = $token;
        $tex->{'type'} = $type;
        return $token;
    }
    return;
}

# Return a reference to a new array of zeroed counters:
# files, text words, header words, float words,
# headers, floats, math-inline, math-display.
sub new_count {
    my @count = (0, 0, 0, 0, 0, 0, 0, 0);
    return \@count;
}

# Increment counter number $type when $type is positive. The remaining
# parameters are accepted but not used.
sub count_word {
    my ($count, $type, $word, $style, $verb) = @_;
    if ($type > 0) { $count->[$type]++; }
    return;
}

# Print every counter except the file counter, one labelled line each.
sub print_count {
    my $count = shift @_;
    foreach my $i (1 .. 7) {
        print $countlabel[$i] . ': ' . $count->[$i] . "\n";
    }
    return;
}

# Add the counters in $b to the counters in $a.
sub add_count {
    my ($a, $b) = @_;
    foreach my $i (0 .. 7) {
        $a->[$i] += $b->[$i];
    }
    return;
}

# Parse all the text held in $tex as ordinary text, updating $tex->{'count'}.
sub parse {
    my ($tex) = @_;
    while (!$tex->{'eof'}) {
        parse_unit($tex, 1);
    }
    return;
}

# Parse one unit: a single token (with whatever it encloses) when $end is
# not given, or all tokens up to and including the token $end.
# Status:
#    0 = exclude
#    1 = text
#    2 = header text
#    3 = float text
#   -1 = float (exclude)
sub parse_unit {
    my ($tex, $status, $end) = @_;
    my $count = $tex->{'count'};
    if ($verbose > 3) {
        flush_next($tex);
        print $status . ':';
        if (defined $end) { print $end . ':'; }
    }
    while (defined(my $next = next_token($tex))) {
        set_style($tex, 'ignore');
        if (defined $end && $end eq $next) {
            # end of unit
            return;
        }
        my $type = $tex->{'type'};
        if ($type == 1) {
            # word
            if ($status > 0) {
                $count->[$status]++;
                set_style($tex, 'word' . $status);
            }
        }
        elsif ($next eq '{') {
            # {...}
            parse_unit($tex, $status, '}');
        }
        elsif ($type == 3) {
            # macro call
            parse_macro($tex, $status, $next);
        }
        elsif ($next eq '$') {
            # math inline
            parse_math($tex, $status, 6, $next);
        }
        elsif ($next eq '$$') {
            # math display
            parse_math($tex, $status, 7, $next);
        }
        if (!defined $end) { return; }
    }
    # The text ran out before $end was found. Report it once per file and
    # let the caller finish with the counts gathered so far.
    if (defined $end && !$tex->{'eof_reported'}) {
        $tex->{'eof_reported'} = 1;
        warn "Warning: End of file while waiting for " . $end . "\n";
    }
    return;
}

# Handle the macro $macro that has just been read: skip or parse its
# options and parameters according to the macro tables.
sub parse_macro {
    my ($tex, $status, $macro) = @_;
    set_style($tex, 'command');
    if ($macro eq '\begin') {
        # \begin...\end
        parse_group($tex, $status);
    }
    elsif ($macro eq '\(') {
        # math inline
        parse_math($tex, $status, 6, '\)');
    }
    elsif ($macro eq '\[') {
        # math display
        parse_math($tex, $status, 7, '\]');
    }
    elsif (my $nparam = $TeXexcl{$macro}) {
        # macro: exclude options and parameters
        gobble_option($tex);
        foreach (1 .. $nparam) {
            parse_unit($tex, 0);
        }
    }
    elsif ($macro eq '\def') {
        # ignore \def...: skip to the opening brace, then exclude the body
        $tex->{'line'} =~ s/^[^\{]*\{/\{/;
        parse_unit($tex, 0);
    }
    elsif ($macro eq '\newcommand' || $macro eq '\renewcommand') {
        # ignore command (re)definition
        parse_unit($tex, 0);
        gobble_option($tex);
        parse_unit($tex, 0);
    }
    else {
        gobble_option($tex);
    }
    my $header_status = $TeXheader{$macro};
    if ($status > 0 && $header_status) {
        # headers
        $tex->{'count'}[4]++;
        parse_unit($tex, $header_status);
    }
    if ($status == -1 && $TeXfloatinc{$macro}) {
        # text included from float
        parse_unit($tex, 3);
    }
    return;
}

# Handle a begin-end group after '\begin' has been read: read the group
# type, count the group, parse the contents up to '\end' and read the group
# type that follows it.
sub parse_group {
    my ($tex, $status) = @_;
    my $count = $tex->{'count'};
    my $group;
    flush_style($tex, 'grouping');
    gobble_option($tex);
    if ($tex->{'line'} =~ s/^\{(\w+)\*?\}[ \t\r\f]*//) {
        # gobble group type
        $group = $1;
        print_style('{' . $group . '}', 'grouping');
    }
    else {
        print "\nWarning: BEGIN group without type.\n";
    }
    my $substat = 0;
    if ($status != 0) {
        # find group status; unknown or missing types are floats (-1)
        $substat = (defined $group && $TeXgroup{$group}) || -1;
    }
    if ($substat == -1) { $count->[5]++; }
    if ($substat > 3) {
        # count item, exclude contents
        $count->[$substat]++;
        $substat = 0;
    }
    parse_unit($tex, $substat, '\end');
    # The starred form is accepted here as well, to match the \begin case.
    if ($tex->{'line'} =~ s/^\{(\w+)\*?\}[ \t\r\f]*//) {
        # gobble group type
        my $end_group = $1;
        flush_style($tex, 'grouping');
        print_style('{' . $end_group . '}', 'grouping');
    }
    else {
        print "\nWarning: END group without type.\n";
    }
    return;
}

# Handle a formula whose opening token has just been read: increment
# counter number $index when inside counted text, then exclude everything
# up to the closing token $end.
sub parse_math {
    my ($tex, $status, $index, $end) = @_;
    if ($status > 0) { $tex->{'count'}[$index]++; }
    set_style($tex, 'mathgroup');
    parse_unit($tex, 0, $end);
    set_style($tex, 'mathgroup');
    return;
}

# Print the current token, then remove a single-word option ([...])
# from the start of the text if there is one.
sub gobble_option {
    my $tex = shift @_;
    flush_next($tex);
    if ($tex->{'line'} =~ s/^(\[\w+\])//) {
        print_style($1, 'option');
    }
    return;
}