#! /local/bin/perl -w
use strict;
use warnings;
use locale;
use Term::ANSIColor;

my $versionnumber=1.4;
my $versiondate="2006 Jan 02";

### HELP
# An empty argument list is treated as a request for help.
if (!@ARGV || $ARGV[0]=~/^(\-(h|\?|\-help)|\/\?)$/) {
  print '
***************************************************************
*   TeXcount.pl
*

Count words in TeX and LaTeX files, ignoring macros, tables,
formulae, etc.

Syntax: TeXcount.pl [options] files

Options:
  -v          Verbose: print parsed words, mark formulae
  -vv         More verbose: also print ignored text
  -vvv        Even more verbose: include comments and options
  -showstate  Show internal states (with verbose)
  -nc, -nocol No colours (colours require ANSI)
  --version   Print version number
  -h, -?, --help, /?  Produce this help

The script counts words as either words in the text, words in
headers/titles or words in floats (figure/table captions).
Macro options (i.e. \macro[...]) are ignored; macro parameters
(i.e. \macro{...}) are counted or ignored depending on the
macro, but by default counted. Begin-end groups are by default
ignored and treated as \'floats\', though some (e.g. center) are
counted.

Mathematical formulae are not counted as words, but are instead
counted separately with separate counts for inlined formulae
and displayed formulae. Similarly, the number of headers and
the number of \'floats\' are counted. Note that \'float\' is used
here to describe anything defined in a begin-end group unless
explicitly recognized as text or mathematics.

The verbose options (-v, -vv, -vvv, showstate) produces output
indicating how the text has been interpreted. Check this to
ensure that words in the text has been interpreted as such,
whereas mathematical formulae and text/non-text in begin-end
groups have been correctly interpreted.

Unix hint: Use \'less -r\' instead of just \'less\' to view output:
the \'-r\' option makes less treat text formating codes properly.

Windows hint: If your Windows interprets ANSI colour codes, lucky you!
Otherwise, use the -nocol (or -nc) option with the verbose options
or the output will be riddled with colour codes.
';
  exit;
}

if ($ARGV[0]=~/^\-\-version$/) {
  print "Version ".$versionnumber.", ".$versiondate.'.';
  exit;
}

### How many tokens to exclude from count after macro
# Macro name and first N tokens (or {...}) are
# ignored. Options [...] right after macro name are
# also ignored, as are options [...] between and after
# the excluded tokens.
my %TeXexcl=('\documentclass'=>1,'\documentstyle'=>1,
    '\newcommand'=>2,'\renewcommand'=>2,
    '\newtheorem'=>2,
    '\usepackage'=>1,
    '\parbox'=>1,
    '\raisebox'=>1,
    '\framebox'=>1,
    '\ref'=>1,
    '\label'=>1,
    '\cite'=>1,
    '\citep'=>1,
    '\citet'=>1,
    '\eqlabel'=>1,
    '\eqref'=>1,
    '\hspace'=>1,
    '\vspace'=>1,
    '\input'=>1,
    '\bibliographystyle'=>1);

### Macros for headers
# Macros that identify headers: i.e. following token or
# {...} is counted as header. The =>2 indicates transition to
# state 2 which is used within headers.
my %TeXheader=('\title'=>2,'\part'=>2,'\chapter'=>2,
    '\section'=>2,'\subsection'=>2,'\subsubsection'=>2,
    '\paragraph'=>2,'\subparagraph'=>2);

### Begin-End groups
# Identified as begin-end groups, and define =>state. The
# states used corresponds to the elements of the count array, and
# are:
#    1: Text, word included in text count
#    2: Header, words included in header count
#    3: Float caption, words included in float caption count
#    6: Inline mathematics, words not counted
#    7: Displayed mathematics, words not counted
#   -1: Float, not included, but looks for captions
#
#    4 and 5 are used to count number of headers and floats
#    and are not used as states.
#
# By default, groups not specified are parsed as floats (-1) and only
# caption text is counted, and counted as float caption rather than
# as words in text.
#
my %TeXgroup=('document'=>1,'center'=>1,
    'abstract'=>1,'quote'=>1,
    'quotation'=>1,'verse'=>1,'itemize'=>1,
    'theorem'=>1,'lemma'=>1,'definition'=>1,'corollary'=>1,
    'example'=>1,
    'math'=>6,'displaymath'=>7,
    'equation'=>7,'eqnarray'=>7,'array'=>7,
    'figure'=>-1,'float'=>-1,'tabular'=>-1);

### In floats: include only specific macros
# Macros used to identify caption text within floats.
my %TeXfloatinc=('\caption'=>1);

### Count labels
# Labels used to describe the counts
my @countlabel=('Files','Words in text','Words in headers',
    'Words in float captions','Number of headers',
    'Number of floats',
    'Number of math inlines',
    'Number of math displayed');

# Styles
# One style table per verbosity level; each level inherits the one below.
# Only attribute names that Term::ANSIColor accepts may be used here: an
# unknown name makes colored() die. The former 'grey', 'orange' and 'pink'
# entries were not valid attributes and have been replaced.
my @STYLES=();
my %STYLE;
$STYLES[0]={};
$STYLES[1]={'word1'=>'blue','word2'=>'bold blue','word3'=>'blue',
    'grouping'=>'red','mathgroup'=>'magenta','prefix'=>'cyan'};
$STYLES[2]={%{$STYLES[1]},
    'word0'=>'yellow','word-1'=>'yellow',
    'command'=>'green','exclgroup'=>'yellow','exclmath'=>'yellow',
    'ignore'=>'yellow','prefix'=>'cyan'};
$STYLES[3]={%{$STYLES[2]},
    'comment'=>'yellow','option'=>'yellow','prefix'=>'cyan'};
$STYLES[4]={%{$STYLES[3]}};

# Options and states
my $verbose=0;
my $showstates=0;
my $blankline=0;

my $totalcount=new_count();

# Arguments are handled in order: an option only affects the files that
# follow it on the command line.
foreach my $file (@ARGV) {
  if ($file eq '-v') {$verbose=1; next;}
  if ($file eq '-vv') {$verbose=2; next;}
  if ($file eq '-vvv') {$verbose=3; next;}
  if ($file eq '-vvvv') {$verbose=3; $showstates=1; next;}
  if ($file eq '-showstate') {$showstates=1; next;}
  if ($file=~/^\-(nocol|nc)$/) {
    $ENV{'ANSI_COLORS_DISABLED'} = 1;
    next;
  }
  %STYLE=%{$STYLES[$verbose]};
  my $tex=TeXfile($file);
  # read_file has already reported the problem; do not parse or count
  # a file that could not be read.
  if (!defined $tex->{'line'}) { next; }
  parse($tex);
  print "\nFILE: ".$file."\n";
  print_count($tex->{'count'});
  print "\n";
  add_count($totalcount,$tex->{'count'});
}

if (${$totalcount}[0]>1) {
  print "\nSUM:\n";
  print_count($totalcount);
}

#########
######### Subroutines
#########

# Build the parser state for one file.
# Returns a hash reference with the keys:
#   line   - remaining unparsed text (undef if the file could not be read)
#   next   - most recently read token
#   type   - numeric type of that token (see get_next_token)
#   style  - style name the token will be printed with, or '-' once flushed
#   prefix - state prefix printed before the token (used with -showstate)
#   eof    - set to 1 when the text is exhausted
#   count  - count array (see new_count) with the file count set to 1
sub TeXfile {
  my $filename=shift @_;
  my %TeX=();
  $TeX{'line'}=read_file($filename);
  $TeX{'next'}=undef;
  $TeX{'type'}=undef;
  $TeX{'style'}=undef;
  $TeX{'prefix'}=undef;
  $TeX{'eof'}=0;
  $TeX{'count'}=new_count();
  $TeX{'count'}[0]++;
  return \%TeX;
}

# Set the style of the current token, unless the token has already been
# flushed (style '-').
sub set_style {
  my ($tex,$style)=@_;
  if (!(($tex->{'style'}) && ($tex->{'style'} eq '-'))) {$tex->{'style'}=$style;}
}

# Set the style of the current token and print it immediately.
sub flush_style {
  my ($tex,$style)=@_;
  set_style($tex,$style);
  flush_next($tex);
}

# Print a newline unless at least $blank blank lines were just printed.
sub line_return {
  my $blank=shift @_;
  if ($blank>$blankline) {
    print "\n";
    $blankline++;
  }
}

# Print $text coloured according to the named style of the current
# verbosity level, optionally preceded by $prefix. Prints nothing when not
# verbose or when the style is not defined at this verbosity level.
sub print_style {
  my ($text,$style,$prefix)=@_;
  (($verbose>0) && (defined $text) && (defined $style)) || return;
  ($style=$STYLE{$style}) || return;
  if (($style) && !($style eq '-')) {
    if ($prefix) { print_style($prefix,'prefix'); }
    print Term::ANSIColor::colored($text,$style);
  }
  $blankline=-1;
}

# Read a whole file and return its contents as one string.
# Returns undef (after a warning on STDERR) if the file cannot be opened.
sub read_file {
  my $filename=shift @_;
  my $fh;
  if (!open($fh,'<',$filename)) {
    warn "Error: cannot open file ".$filename.": ".$!."\n";
    return;
  }
  if ($verbose) {
    line_return(1);
    print "File ".$filename." opened.\n";
    $blankline=0;
  }
  my @text=();
  while (my $line=<$fh>) {
    push @text,$line;
  }
  close($fh);
  return join('',@text);
}

# Print the pending token (if any), then return the next significant
# token. Comments (type 0) are printed and skipped, newlines (type 9) are
# skipped. Returns undef at end of text.
sub next_token {
  my $tex=shift @_;
  my ($next,$type);
  if (defined $tex->{'next'}) {print_style($tex->{'next'}.' ',$tex->{'style'});}
  $tex->{'style'}=undef;
  while (defined ($next=get_next_token($tex))) {
    $type=$tex->{'type'};
    if ($type==0) {
      print_style($next,'comment');
    } elsif ($type==9) {
      if ($verbose) {line_return(1);}
    } else {
      return $next;
    }
  }
  return $next;
}

# Print the pending token with its style and state prefix, and mark it as
# flushed so that later set_style calls do not restyle it.
sub flush_next {
  my $tex=shift @_;
  if (defined $tex->{'next'}) {
    print_style($tex->{'next'}.' ',$tex->{'style'},$tex->{'prefix'});
    $tex->{'prefix'}=undef;
  }
  $tex->{'style'}='-';
}

# Remove and return the next raw token from the text, trying the token
# patterns in priority order. Token types:
#   0 comment, 9 newline, 1 word, 2 punctuation, 3 macro,
#   4 brace, 5 bracket, 6 math delimiter, 999 anything else.
# Sets the eof flag and returns undef when no pattern matches.
sub get_next_token {
  my $tex=shift @_;
  my @patterns=(
    ['\%[^\n]*',0],
    ['\n',9],
    ['\w+(\.\w+)+\.',1],
    ['\w+([\-\']\w+)*',1],
    ['[\"\'\`:\.,\(\)\[\]!\+\-\*=/\^\_\@\<\>\~\#\&]',2],
    ['\\\\([a-zA-Z_]+|\\\\|[\^\'\`\&!\[\]\(\)\%])',3],
    ['[\{\}]',4],
    ['[\[\]]',5],
    ['\$\$',6],
    ['\$',6],
    ['.',999],
    ['[^\s]+',999],
  );
  foreach my $pattern (@patterns) {
    my $next=get_token($tex,$pattern->[0],$pattern->[1]);
    (defined $next) && return $next;
  }
  $tex->{'eof'}=1;
  return undef;
}

# Try to strip one token matching $regexp (plus trailing blanks, but not
# newlines) from the start of the text. On success record it as the
# current token with the given type and return it; otherwise return undef.
sub get_token {
  my ($tex,$regexp,$type)=@_;
  if ( $tex->{'line'} =~ s/^($regexp)[ \t\r\f]*// ) {
    $tex->{'next'}=$1;
    $tex->{'type'}=$type;
    return $1;
  }
  return undef;
}

# Return a reference to a fresh count array:
# files, text words, header words, float words,
# headers, floats, math-inline, math-display.
sub new_count {
  my @count=(0,0,0,0,0,0,0,0);
  return \@count;
}

# Increment the counter with index $type when $type is positive.
# The remaining parameters are accepted but have no effect.
sub count_word {
  my ($count,$type,$word,$style,$verb)=@_;
  ($word) || ($word="");
  ($style) || ($style=0);
  ($verb) || ($verb=0);
  if ($type>0) {${$count}[$type]++;}
}

# Print every count except the file count, one labelled line each.
sub print_count {
  my $count=shift @_;
  for (my $i=1;$i<8;$i++) {
    print $countlabel[$i].': '.${$count}[$i]."\n";
  }
}

# Add the counts in $b to the counts in $a, element by element.
sub add_count {
  my ($a,$b)=@_;
  for (my $i=0;$i<8;$i++) {
    ${$a}[$i]+=${$b}[$i];
  }
}

# Parse a whole file as text (status 1), one unit at a time.
sub parse {
  my ($tex)=@_;
  while (!($tex->{'eof'})) {
    parse_unit($tex,1);
  }
}

# Parse one unit: a single token (with whatever it introduces) when $end
# is undefined, otherwise everything up to and including the token $end.
# Status:
#    0 = exclude
#    1 = text
#    2 = header text
#    3 = float text
#   -1 = float (exclude)
sub parse_unit {
  my ($tex,$status,$end)=@_;
  my $count=$tex->{'count'};
  my $substat;
  if ($showstates) {
    flush_next($tex);
    if (defined $end) {
      $tex->{'prefix'}=$status.'-'.$end.':';
    } else {
      $tex->{'prefix'}=$status.':';
    }
  }
  while (defined (my $next=next_token($tex))) {
    # parse next token; or tokens until match with $end
    set_style($tex,"ignore");
    if (($end) && ($end eq $next)) {
      # end of unit
      return;
    }
    if ($tex->{'type'}==1) {
      # word
      if ($status>0) {
        ${$count}[$status]++;
        set_style($tex,'word'.$status);
      }
    } elsif ($next eq '{') {
      # {...}
      parse_unit($tex,$status,'}');
    } elsif ($tex->{'type'}==3) {
      # macro call
      set_style($tex,'command');
      if ($next eq '\begin') {
        # \begin...\end
        parse_begin_end($tex,$status);
      } elsif ($next eq '\(') {
        # math inline
        parse_math($tex,$status,6,'\)');
      } elsif ($next eq '\[') {
        # math display
        parse_math($tex,$status,7,'\]');
      } elsif (my $i=$TeXexcl{$next}) {
        # macro: exclude options
        gobble_option($tex);
        for (;$i-->0;) {
          parse_unit($tex,0);
          gobble_option($tex);
        }
      } elsif ($next eq '\def') {
        # ignore \def...: skip everything up to the opening brace
        if ($tex->{'line'} =~ s/^([^\{]*)\{/\{/) {
          my $defhead=$1;
          flush_next($tex);
          print_style($defhead.' ','ignore');
        } else {
          flush_next($tex);
        }
        parse_unit($tex,0);
      } else {
        gobble_option($tex);
      }
      if (($status>0) && ($substat=$TeXheader{$next})) {
        # headers
        ${$count}[4]++;
        parse_unit($tex,$substat);
      }
      if (($status==-1) && ($substat=$TeXfloatinc{$next})) {
        # text included from float
        parse_unit($tex,3);
      }
    } elsif ($next eq '$') {
      # math inline
      parse_math($tex,$status,6,$next);
    } elsif ($next eq '$$') {
      # math display
      parse_math($tex,$status,7,$next);
    }
    if (!defined $end) {return;}
  }
  # The loop only ends here when the text ran out. If a closing token was
  # still expected the input is unbalanced: report it once per file.
  if ((defined $end) && !($tex->{'eof_reported'})) {
    $tex->{'eof_reported'}=1;
    warn "Warning: End of file while waiting for ".$end."\n";
  }
}

# Handle a \begin{group}...\end{group} construct; the \begin token has
# just been read. The group name decides how the contents are parsed
# (see %TeXgroup); unknown groups are treated as floats.
sub parse_begin_end {
  my ($tex,$status)=@_;
  my $count=$tex->{'count'};
  my $localstyle=($status>0) ? 'grouping' : 'exclgroup';
  flush_style($tex,$localstyle);
  gobble_option($tex);
  my $groupname;
  if ($tex->{'line'} =~ s/^\{(\w+)(\*?)\}[ \t\r\f]*//) {
    # gobble group type; a trailing * does not change the group type
    $groupname=$1;
    print_style('{'.$1.$2.'}',$localstyle);
  } else {
    print "\nWarning: BEGIN group without type.\n";
  }
  my $substat;
  if ($status==0) {
    $substat=0;
  } else {
    # find group status (or set to -1=float)
    $substat=((defined $groupname) && $TeXgroup{$groupname}) || -1;
  }
  if (($status>0) && ($substat==-1)) {
    ${$count}[5]++;
  }
  if ($substat>3) {
    # count item, exclude contents
    ${$count}[$substat]++;
    $substat=0;
  }
  parse_unit($tex,$substat,'\end');
  # The starred form must be accepted here as well, otherwise the
  # leftover {name*} is parsed as ordinary text and its name is counted.
  if ($tex->{'line'} =~ s/^\{(\w+)(\*?)\}[ \t\r\f]*//) {
    # gobble group type
    my $endtext='{'.$1.$2.'}';
    flush_style($tex,$localstyle);
    print_style($endtext,$localstyle);
  } else {
    print "\nWarning: END group without type.\n";
  }
}

# Handle a mathematical formula whose opening delimiter has just been
# read: count it in counter $index when inside counted text, then skip
# its contents up to the closing delimiter $end.
sub parse_math {
  my ($tex,$status,$index,$end)=@_;
  my $localstyle;
  if ($status>0) {
    ${$tex->{'count'}}[$index]++;
    $localstyle='mathgroup';
  } else {
    $localstyle='exclmath';
  }
  set_style($tex,$localstyle);
  parse_unit($tex,0,$end);
  set_style($tex,$localstyle);
}

# Print the pending token, then strip a macro option [...] from the start
# of the text if one follows directly.
sub gobble_option {
  my $tex=shift @_;
  flush_next($tex);
  if ($tex->{'line'} =~ s/^(\[(\w|[,\-\s])+\])//) {
    print_style($1,'option');
  }
}