#! /local/bin/perl -w
use strict;
use locale;

my $versionnumber=1.5;
my $versiondate="2006 Jan 09";

### How many tokens to exclude from count after macro
# Macro name and first N tokens (or {...}) are
# ignored. Options [...] right after macro name are
# also ignored, as are options [...] between and after
# the excluded tokens.
#
# For macros not specified here, the default value is 0: i.e.
# no tokens are excluded, but [...] options are. Some macros
# have been included here with =>0 for completeness and as a
# reminder.
#
# Keys are looked up with the macro token exactly as it is read from
# the document, i.e. including the leading backslash.
my %TeXexcl=(
    # Document set-up
    '\documentclass'=>1, '\documentstyle'=>1, '\usepackage'=>1,
    '\hyphenation'=>1,
    '\pagestyle'=>1, '\thispagestyle'=>1, '\pagenumbering'=>1,
    '\markboth'=>1, '\markright'=>1,
    # Definitions
    '\newcommand'=>2, '\renewcommand'=>2,
    '\newenvironment'=>3, '\renewenvironment'=>3,
    '\newtheorem'=>2,
    # Boxes and rules
    '\parbox'=>1, '\marginpar'=>0, '\makebox'=>0, '\raisebox'=>1,
    '\framebox'=>0,
    '\newsavebox'=>1, '\sbox'=>1, '\savebox'=>2, '\usebox'=>1,
    '\rule'=>2,
    # References and citations
    '\label'=>1, '\ref'=>1, '\pageref'=>1, '\bibitem'=>1,
    '\cite'=>1, '\citep'=>1, '\citet'=>1, '\nocite'=>1,
    '\eqlabel'=>1, '\eqref'=>1,
    # Spacing
    '\hspace'=>1, '\vspace'=>1, '\addvspace'=>1,
    # File inclusion and bibliography
    '\input'=>1, '\include'=>1, '\includeonly'=>1,
    '\bibliographystyle'=>1, '\bibliography'=>1,
    # Lengths
    '\newlength'=>1, '\setlength'=>2, '\addtolength'=>2,
    '\settodepth'=>2, '\settoheight'=>2, '\settowidth'=>2,
    # Counters
    '\newcounter'=>1, '\setcounter'=>2, '\addtocounter'=>2,
    '\stepcounter'=>1, '\refstepcounter'=>1, '\usecounter'=>1,
    '\alph'=>1, '\arabic'=>1, '\fnsymbol'=>1, '\roman'=>1, '\value'=>1,
    # Miscellaneous
    '\newfont'=>2,
    '\cline'=>1, '\multicolumn'=>3,
    '\typeout'=>1, '\typein'=>1,
    # Entries of the form 'begin<group>' (no backslash on purpose)
    'beginlist'=>2, 'beginminipage'=>1, 'begintabular'=>1,
    'beginthebibliography'=>1, 'beginlrbox'=>1
    );

### Macros for headers
# Macros that identify headers: i.e. following token or
# {...} is counted as header. The =>2 indicates transition to
# state 2 which is used within headers.
my %TeXheader = map { ($_ => 2) } qw(
    \title \part \chapter
    \section \subsection \subsubsection
    \paragraph \subparagraph
);

### Begin-End groups
# Maps the name of a \begin{...}...\end{...} group to the parsing state
# used for its contents. The states correspond to the elements of the
# count array:
#    0: Not included
#    1: Text, words included in text count
#    2: Header, words included in header count
#    3: Float caption, words included in float caption count
#    6: Inline mathematics, words not counted
#    7: Displayed mathematics, words not counted
#   -1: Float, not included, but looks for captions
#
# Elements 4 and 5 of the count array hold the number of headers and
# the number of floats; they are never used as states.
#
# A group that is not listed here is looked up without success by the
# \begin handling in parse_unit, which then keeps the state of the
# surrounding text.
#
# Environments that can only occur inside math mode need not be listed
# here, and should not be: it is not clear whether they would belong to
# inlined or displayed mathematics.
my %TeXgroup = (
    # Contents counted as text
    ( map { ($_ => 1) } qw(
        document letter
        center flushleft flushright
        abstract quote quotation verse
        minipage verbatim
        description enumerate itemize list
        theorem lemma definition corollary
        example
    ) ),
    # Contents not included
    ( map { ($_ => 0) } qw(
        titlepage tabbing tabular thebibliography lrbox
    ) ),
    # Mathematics: inline, then displayed
    'math' => 6,
    ( map { ($_ => 7) } qw(displaymath equation eqnarray) ),
    # Floats: only captions are looked for
    ( map { ($_ => -1) } qw(figure float picture table) ),
);

### In floats: include only specific macros
# Macros used to identify caption text within floats.
my %TeXfloatinc=('\caption'=>1);

### Math delimiters
# Maps an opening math delimiter to [index in the count array that is
# incremented, closing delimiter that ends the formula].
my %TeXmath=(
    '\(' => [6,'\)'],
    '\[' => [7,'\]'],
    '$'  => [6,'$'],
    '$$' => [7,'$$'],
    );

### Token rules
# Each rule is [regular expression, token type]. The rules are tried in
# this order by get_next_token and the first one that matches at the
# start of the remaining text wins, so the order is significant.
# Types: 0=comment, 9=line break, 1=word, 2=punctuation/symbol,
# 3=macro, 4=brace, 5=bracket, 6=math delimiter, 999=anything else.
my @TeXtoken=(
    ['\%[^\n]*',0],
    ['\n',9],
    ['\w+(\.\w+)+\.',1],
    ['\w+([\-\']\w+)*',1],
    ['[\"\'\`:\.,\(\)\[\]!\+\-\*=/\^\_\@\<\>\~\#\&]',2],
    ['\\\\([a-zA-Z_]+|\\\\|[\^\'\`\&!\[\]\(\)\%])',3],
    ['[\{\}]',4],
    ['[\[\]]',5],
    ['\$\$',6],
    ['\$',6],
    ['.',999],
    ['[^\s]+',999],
    );

### Count labels
# Labels used to describe the counts
my @countlabel=(
    'Files',
    'Words in text',
    'Words in headers',
    'Words in float captions',
    'Number of headers',
    'Number of floats',
    'Number of math inlines',
    'Number of math displayed',
    );

# Styles
# $STYLES[n] is the style table used at verbosity level n: it maps a
# style name to a colour description. Only styles present in the table
# of the current verbosity level are printed by print_style.
my @STYLES=();
my %STYLE;
$STYLES[0]={};
$STYLES[1]={
    'word1'=>'blue','word2'=>'bold blue','word3'=>'blue',
    'grouping'=>'red','mathgroup'=>'magenta','state'=>'cyan underline'};
$STYLES[2]={
    %{$STYLES[1]},
    'word0'=>'yellow','word-1'=>'yellow',
    'command'=>'green','exclgroup'=>'yellow','exclmath'=>'yellow',
    'ignore'=>'cyan'};
$STYLES[3]={
    %{$STYLES[2]},
    'comment'=>'yellow','option'=>'yellow'};
$STYLES[4]={%{$STYLES[3]}};

# Options and states
my $verbose=0;
my $showstates=0;
my $htmlstyle=0;
my $blankline=0;

## HELP
# An empty argument list is tested with !@ARGV: defined(@array) is a
# fatal error in current versions of Perl.
if (!@ARGV || $ARGV[0]=~/^(\-(h|\?|\-help)|\/\?)$/) {
    print_help();
    exit;
}

#VERSION
if ($ARGV[0]=~/^\-\-version$/) {
    print "Version ".$versionnumber.", ".$versiondate.'.';
    exit;
}

# PROCESS ARGUMENTS AND FILES
# Each argument is wildcard-expanded on its own (for shells that do not
# expand wildcards), so that file names containing blanks are not split
# into several names. Arguments without wildcard characters, e.g. the
# options, are passed through unchanged.
require File::Glob;
my $totalcount=new_count();
foreach my $file (map { File::Glob::bsd_glob($_) } @ARGV) {
    if ($file eq '-v') {$verbose=1; next;}
    if ($file eq '-vv') {$verbose=2; next;}
    if ($file eq '-vvv') {$verbose=3; next;}
    if ($file eq '-vvvv') {$verbose=3; $showstates=1; next;}
    if ($file =~ /^\-showstate/ ) {$showstates=1; next;}
    if ($file =~ /^\-html/ ) {
        $ENV{'ANSI_COLORS_DISABLED'} = 1;
        $htmlstyle = $file eq '-htmlcore' ? 1 : 2;
        if ($htmlstyle>1) {html_head();}
        next;
    }
    if ($file=~/^\-(nocol|nc$)/) {
        $ENV{'ANSI_COLORS_DISABLED'} = 1;
        next;
    }
    %STYLE=%{$STYLES[$verbose]};
    my $tex=TeXfile($file);
    # A file that cannot be read has already been reported by read_file
    # and is left out of the counts.
    next unless defined $tex;
    parse($tex);
    print "\n";
    print_count($tex->{'count'},'FILE: '.$file);
    print "\n";
    add_count($totalcount,$tex->{'count'});
}
if (${$totalcount}[0]>1) {
    formatprint("Total\n",'h1');
    print_count($totalcount,'SUM');
}
if ($htmlstyle>1) {
    html_tail();
}

#########
######### Subroutines
#########

# Read the named file and wrap its contents in a parser object (see
# TeXcode). Returns undef if the file cannot be read.
sub TeXfile {
    my $filename=shift @_;
    my $texcode=read_file($filename);
    return undef unless defined $texcode;
    return TeXcode($texcode);
}

# Create the parser object for a piece of TeX code: a hash reference
# holding the unparsed text ('line'), the latest token and its type
# ('next', 'type'), pending print information ('style', 'printstate'),
# the end-of-text flag ('eof') and the count array ('count'), in which
# the number of files is initialised to one.
sub TeXcode {
    my $texcode=shift @_;
    my %TeX=(
        'line'       => $texcode,
        'next'       => undef,
        'type'       => undef,
        'style'      => undef,
        'printstate' => undef,
        'eof'        => 0,
        'count'      => new_count(),
        );
    $TeX{'count'}[0]++;
    return \%TeX;
}

# Set the style the current token will be printed with, unless the
# token has already been flushed (style '-').
sub set_style {
    my ($tex,$style)=@_;
    my $current=$tex->{'style'};
    return if defined $current && $current eq '-';
    $tex->{'style'}=$style;
}

# Set the style of the current token and print it right away.
sub flush_style {
    my ($tex,$style)=@_;
    set_style($tex,$style);
    flush_next($tex);
}

# Print a line break unless at least $blank line breaks have been
# printed since the last styled text.
sub line_return {
    my $blank=shift @_;
    if ($blank>$blankline) {
        # NOTE(review): the tag inside this string was missing from the
        # reviewed source; <br> is a reconstruction.
        if ($htmlstyle) {print "<br>\n";}
        else {print "\n";}
        $blankline++;
    }
}

# Print $text in the given style, followed by the state marker $state
# if one is given. Nothing is printed when not verbose or when the style
# is not part of the style table of the current verbosity level.
sub print_style {
    my ($text,$style,$state)=@_;
    return unless ($verbose>0) && (defined $text) && (defined $style);
    my $colour=$STYLE{$style};
    return unless $colour;
    return if $colour eq '-';
    if ($htmlstyle) {
        # NOTE(review): the tags inside this string were missing from the
        # reviewed source; the span/class markup is a reconstruction.
        print "<span class='".$style."'>".$text."</span>";
    } else {
        #print Term::ANSIColor::colored($text,$colour);
        print $text;
    }
    if ($state) {
        print_style($state,'state');
    }
    $blankline=-1;
}

# Print $text; in HTML mode, and if $tag is given, wrap it in the HTML
# element $tag with optional class $class. A trailing line break in
# $text is printed after the closing tag.
sub formatprint {
    my ($text,$tag,$class)=@_;
    my $break=($text=~s/\n$//);
    if ($htmlstyle && defined $tag) {
        print '<'.$tag;
        if ($class) {print " class='".$class."'";}
        # The closing tag was missing from the reviewed source, which
        # left every element unclosed.
        print '>'.$text.'</'.$tag.'>';
    } else {
        print $text;
    }
    if ($break) {print "\n";}
}

# Return the contents of the named file as one string. If the file
# cannot be opened, a warning is written to STDERR and undef is
# returned.
sub read_file {
    my $filename=shift @_;
    my $fh;
    if (!open($fh,'<',$filename)) {
        print STDERR "Warning: cannot open file ".$filename.": ".$!."\n";
        return undef;
    }
    if ($verbose) {
        line_return(1);
        formatprint("File: ".$filename."\n",'h2');
        $blankline=0;
    }
    my $text=do { local $/; <$fh> };
    close($fh);
    return defined $text ? $text : '';
}

# Print the previous token (if it has not been flushed already) and
# return the next token that is neither a comment nor a line break.
# Returns undef at the end of the text.
sub next_token {
    my $tex=shift @_;
    if (defined $tex->{'next'}) {
        print_style($tex->{'next'}.' ',$tex->{'style'});
    }
    $tex->{'style'}=undef;
    while (defined (my $next=get_next_token($tex))) {
        my $type=$tex->{'type'};
        if ($type==0) {
            print_style($next,'comment');
        } elsif ($type==9) {
            if ($verbose) {line_return(1);}
        } else {
            return $next;
        }
    }
    return undef;
}

# Print the current token now, together with any pending state marker,
# and mark it as printed by setting its style to '-'.
sub flush_next {
    my $tex=shift @_;
    if (defined $tex->{'next'}) {
        print_style($tex->{'next'}.' ',$tex->{'style'},$tex->{'printstate'});
    }
    $tex->{'printstate'}=undef;
    $tex->{'style'}='-';
}

# Remove the next token of any kind from the text and return it, trying
# the rules of @TeXtoken in order. When no rule matches, the end of the
# text has been reached: the 'eof' flag is set and undef is returned.
sub get_next_token {
    my $tex=shift @_;
    foreach my $rule (@TeXtoken) {
        my $next=get_token($tex,$rule->[0],$rule->[1]);
        return $next if defined $next;
    }
    $tex->{'eof'}=1;
    return undef;
}

# If the text starts with a match for $regexp, remove the match and any
# blanks (but not line breaks) following it, record it as the current
# token with type $type and return it. Otherwise return undef.
sub get_token {
    my ($tex,$regexp,$type)=@_;
    if ( $tex->{'line'} =~ s/^($regexp)[ \t\r\f]*// ) {
        $tex->{'next'}=$1;
        $tex->{'type'}=$type;
        return $1;
    }
    return undef;
}

# Return a reference to a new count array with all counts zero. The
# elements are: files, text words, header words, float words, headers,
# floats, math-inline, math-display.
sub new_count {
    my @count=(0,0,0,0,0,0,0,0);
    return \@count;
}

# Increment element $type of the count array if $type is positive. The
# remaining parameters are accepted but not used; nothing in this file
# calls this routine.
sub count_word {
    my ($count,$type,$word,$style,$verb)=@_;
    if ($type>0) {${$count}[$type]++;}
}

# Print all counts except the number of files, each with its label,
# preceded by $header if one is given.
sub print_count {
    my ($count,$header)=@_;
    # NOTE(review): the tags of the two strings printed in HTML mode were
    # missing from the reviewed source; the <dl> element (matching the
    # dt/dd items below) is a reconstruction.
    if ($htmlstyle) {print "<dl class='count'>\n";}
    if (defined $header) {
        formatprint($header."\n",'dt','header');
    }
    foreach my $i (1..7) {
        formatprint($countlabel[$i].': ','dt');
        formatprint(${$count}[$i]."\n",'dd');
    }
    if ($htmlstyle) {print "</dl>\n";}
}

# Add the counts of $b to the counts of $a, element by element.
sub add_count {
    my ($a,$b)=@_;
    foreach my $i (0..7) {
        ${$a}[$i]+=${$b}[$i];
    }
}

# Parse the whole text of $tex as text (state 1), updating its counts.
sub parse {
    my ($tex)=@_;
    # NOTE(review): the tags of the two strings printed in HTML mode were
    # missing from the reviewed source; the markup is a reconstruction.
    if ($htmlstyle && $verbose) {print "<div class='parse'><p>\n";}
    while (!($tex->{'eof'})) {
        parse_unit($tex,1);
    }
    if ($htmlstyle && $verbose) {print "</p></div>\n";}
}

# Parse one unit of the text in state $status. If $end is given, tokens
# are parsed until the token $end is met; otherwise a single token, or
# a single {...} group or macro with what belongs to it, is parsed.
# Status:
#    0 = exclude
#    1 = text
#    2 = header text
#    3 = float text
#   -1 = float (exclude)
sub parse_unit {
    my ($tex,$status,$end)=@_;
    my $count=$tex->{'count'};
    my $substat;
    if ($showstates) {
        $tex->{'printstate'}=defined $end
            ? ':'.$status.'-'.$end.':'
            : ':'.$status.':';
        flush_next($tex);
    }
    while (defined (my $next=next_token($tex))) {
        # parse next token; or tokens until match with $end
        set_style($tex,'ignore');
        if (defined $end && $end eq $next) {
            # end of unit
            return;
        }
        my $type=$tex->{'type'};
        if ($type==1) {
            # word
            if ($status>0) {
                ${$count}[$status]++;
                set_style($tex,'word'.$status);
            }
        } elsif ($next eq '{') {
            # {...}
            parse_unit($tex,$status,'}');
        } elsif ($type==3) {
            # macro call
            set_style($tex,'command');
            if ($next eq '\begin') {
                # \begin...\end
                parse_begin_end($tex,$status);
            } elsif ($TeXmath{$next}) {
                # \(...\) or \[...\]
                parse_math($tex,$status,$next);
            } elsif (defined (my $i=$TeXexcl{$next})) {
                # macro: exclude options and parameters
                gobble_macro_parms($tex,$i);
            } elsif ($next eq '\def') {
                # ignore \def...: skip to the {...} and exclude it
                my $defhead='';
                if ($tex->{'line'} =~ s/^([^\{]*)\{/\{/) {$defhead=$1;}
                flush_next($tex);
                print_style($defhead.' ','ignore');
                parse_unit($tex,0);
            } else {
                gobble_option($tex);
            }
            if (($status>0) && ($substat=$TeXheader{$next})) {
                # headers
                ${$count}[4]++;
                gobble_macro_modifier($tex);
                gobble_option($tex);
                parse_unit($tex,$substat);
            }
            if (($status==-1) && $TeXfloatinc{$next}) {
                # text included from float
                parse_unit($tex,3);
            }
        } elsif ($TeXmath{$next}) {
            # $...$ or $$...$$
            parse_math($tex,$status,$next);
        }
        if (!defined $end) {return;}
    }
    # The loop is only left this way when the text has run out.
    if (defined $end) {
        print "\nWarning: End of file while waiting for ".$end.".\n";
    }
}

# Parse a \begin{group}...\end{group}; the \begin token has just been
# read in state $status. The state used for the contents is taken from
# %TeXgroup; a group that is not listed there keeps the state $status.
sub parse_begin_end {
    my ($tex,$status)=@_;
    my $count=$tex->{'count'};
    my $localstyle=$status>0 ? 'grouping' : 'exclgroup';
    flush_style($tex,$localstyle);
    gobble_option($tex);
    my $groupname;
    if ($tex->{'line'} =~ s/^\{(\w+)\*?\}[ \t\r\f]*//) {
        # gobble group type
        $groupname=$1;
        print_style('{'.$groupname.'}',$localstyle);
    } else {
        print "\nWarning: BEGIN group without type.\n";
    }
    # find group status (or leave unchanged); tested with defined since
    # 0 is a valid state
    my $substat=defined $groupname ? $TeXgroup{$groupname} : undef;
    if (!defined $substat) {$substat=$status;}
    if ($status<=0 && $status<$substat) {$substat=$status;}
    if (($status>0) && ($substat==-1)) {
        # Count float
        ${$count}[5]++;
    }
    if ($status>0 and $substat>3) {
        # count item, exclude contents
        ${$count}[$substat]++;
        $substat=0;
    }
    parse_unit($tex,$substat,'\end');
    # The starred form is accepted here just as it is after \begin.
    if ($tex->{'line'} =~ s/^\{(\w+)\*?\}[ \t\r\f]*//) {
        # gobble group type
        my $endname=$1;
        flush_style($tex,$localstyle);
        print_style('{'.$endname.'}',$localstyle);
    } else {
        print "\nWarning: END group without type.\n";
    }
}

# Parse a formula whose opening delimiter $delim (a key of %TeXmath)
# has just been read in state $status: the formula is counted if the
# state is a counting one and its contents are excluded.
sub parse_math {
    my ($tex,$status,$delim)=@_;
    my ($index,$close)=@{$TeXmath{$delim}};
    my $localstyle=$status>0 ? 'mathgroup' : 'exclmath';
    if ($status>0) {${$tex->{'count'}}[$index]++;}
    set_style($tex,$localstyle);
    parse_unit($tex,0,$close);
    set_style($tex,$localstyle);
}

# Print the current token, then remove one [...] option from the start
# of the text if there is one. Returns the option, or undef.
sub gobble_option {
    my $tex=shift @_;
    flush_next($tex);
    if ($tex->{'line'} =~ s/^(\[(\w|[,\-\s])+\])//) {
        print_style($1,'option');
        return $1;
    }
    return undef;
}

# Remove all [...] options from the start of the text.
sub gobble_options {
    while (gobble_option(@_)) {}
}

# Print the current token, then remove a macro modifier (a star) from
# the start of the text if there is one. Returns the modifier, or undef.
sub gobble_macro_modifier {
    my $tex=shift @_;
    flush_next($tex);
    if ($tex->{'line'} =~ s/^(\*)//) {
        print_style($1,'option');
        return $1;
    }
    return undef;
}

# Exclude the options and the first $i parameters of a macro: each
# parameter is parsed as one unit in state 0.
sub gobble_macro_parms {
    my ($tex,$i)=@_;
    if ($i>0) {gobble_macro_modifier($tex);}
    gobble_options($tex);
    for (;$i-->0;) {
        parse_unit($tex,0);
        gobble_options($tex);
    }
}

### HTML context

# Print the start of the HTML page.
# NOTE(review): all markup of this string was missing from the reviewed
# source, in which only the text "LaTeX word count" was left; this is a
# minimal reconstruction and any style sheet the page once had is not
# restored.
sub html_head {
    print '<html>
<head>
<title>LaTeX word count</title>
</head>
<body>
<h1>LaTeX word count</h1>
';
}

# Print the end of the HTML page.
# NOTE(review): the reviewed source printed an empty string here; the
# closing tags are a reconstruction.
sub html_tail {
    print '</body>
</html>
';
}

### HELP

# Print the help text.
sub print_help {
    print <<'END_OF_HELP';

***************************************************************
*   TeXcount.pl
*

Count words in TeX and LaTeX files, ignoring macros, tables,
formulae, etc.

Syntax: TeXcount.pl [options] files

Options:
  -v            Verbose: print parsed words, mark formulae
  -vv           More verbose: also print ignored text
  -vvv          Even more verbose: include comments and options
  -showstate    Show internal states (with verbose)
  -nc, -nocol   No colours (colours require ANSI)
  -html         Output in HTML format
  -htmlcore     Only HTML body contents
  --version     Print version number
  -h, -?, --help, /?   Produce this help

The script counts words as either words in the text, words in
headers/titles or words in floats (figure/table captions). Macro
options (i.e. \macro[...]) are ignored; macro parameters (i.e.
\macro{...}) are counted or ignored depending on the macro, but by
default counted. Begin-end groups known to be 'floats' (e.g. figure,
table) are ignored except for their captions and some groups (e.g.
tabular) are ignored altogether, whereas groups not known to the
script are counted like the surrounding text.

Mathematical formulae are not counted as words, but are instead
counted separately with separate counts for inlined formulae and
displayed formulae. Similarly, the number of headers and the number
of 'floats' are counted. Note that 'float' is used here to describe
anything defined in a begin-end group which is not text or
mathematics: e.g. tables and figures.

The verbose options (-v, -vv, -vvv, -showstate) produce output
indicating how the text has been interpreted. Check this to ensure
that words in the text have been interpreted as such, whereas
mathematical formulae and text/non-text in begin-end groups have been
correctly interpreted.

Unix hint: Use 'less -r' instead of just 'less' to view output: the
'-r' option makes less treat text formatting codes properly.

Windows hint: If your Windows interprets ANSI colour codes, lucky
you! Otherwise, use the -nocol (or -nc) option with the verbose
options or the output will be riddled with colour codes.

END_OF_HELP
}