#! /local/bin/perl -w use strict; use Term::ANSIColor; use POSIX qw(locale_h); use locale; setlocale(LC_CTYPE,"no_NO"); my $versionnumber="1.9"; my $versiondate="2008 Jan 10"; ### How many tokens to exclude from count after macro # Macro name and first N tokens (or {...}) are # ignored. Options [...] right after macro name are # also ignored, as are options [...] between and after # the excluded tokens. # # For macros not specified here, the default value is 0: i.e. # no tokens are excluded, but [...] options are. Some macros # have been included here with =>0 for completeness and as a # reminder. my %TeXexcl=('\documentclass'=>1,'\documentstyle'=>1,'\usepackage'=>1, '\hyphenation'=>1, '\pagestyle'=>1,'\thispagestyle'=>1, '\pagenumbering'=>1,'\markboth'=>1, '\markright'=>1, '\newcommand'=>2,'\renewcommand'=>2,'\newenvironment'=>3, 'renewenvironment'=>3, '\newfont'=>2,'\newtheorem'=>2,'\bibliographystyle'=>1, '\bibliography'=>1, '\parbox'=>1, '\marginpar'=>0,'\makebox'=>0, '\raisebox'=>1, '\framebox'=>0, '\newsavebox'=>1, '\sbox'=>1, '\savebox'=>2, '\usebox'=>1,'\rule'=>2, '\footnote'=>1,'\label'=>1, '\ref'=>1, '\pageref'=>1, '\bibitem'=>1, '\cite'=>1, '\citep'=>1, '\citet'=>1, '\citeauthor'=>1, '\citealt'=>1, '\nocite'=>1, '\eqlabel'=>1, '\eqref'=>1,'\hspace'=>1, '\vspace'=>1, '\addvspace'=>1, '\input'=>1, '\include'=>1, '\includeonly'=>1,'\includegraphics'=>1, '\newlength'=>1, '\setlength'=>2, '\addtolength'=>2,'\settodepth'=>2, '\settoheight'=>2, '\settowidth'=>2,'\newcounter'=>1, '\setcounter'=>2, '\addtocounter'=>2,'\stepcounter'=>1, '\refstepcounter'=>1, '\usecounter'=>1, '\alph'=>1, '\arabic'=>1, '\fnsymbol'=>1, '\roman'=>1, '\value'=>1, '\cline'=>1, '\multicolumn'=>3,'\typeout'=>1, '\typein'=>1, 'beginlist'=>2, 'beginminipage'=>1, 'begintabular'=>1, 'beginthebibliography'=>1,'beginlrbox'=>1 ); ### Macros that should be counted as one or more words # Macros that represent text may be declared here. The value gives # the number of words the macro represents. my %TeXmacroword=('\LaTeX'=>1); ### Macros for headers # Macros that identify headers: i.e. following token or # {...} is counted as header. The =>2 indicates transition to # state 2 which is used within headers. my %TeXheader=('\title'=>2,'\part'=>2,'\chapter'=>2, '\section'=>2,'\subsection'=>2,'\subsubsection'=>2, '\paragraph'=>2,'\subparagraph'=>2); ### Macros that are counted within the preamble # The preamble is the text between \documentclass and \begin{document}. # Text and macros in the preamble is ignored unless specified here. The # value is the status (1=text, 2=header, etc.) they should be interpreted as. # Note that only the first unit (token or {...} block) is counted. my %TeXpreamble=('\title'=>2); ### Begin-End groups # Identified as begin-end groups, and define =>state. The # states used corresponds to the elements of the count array, and # are: # 0: Not included # 1: Text, word included in text count # 2: Header, words included in header count # 3: Float caption, words included in float caption count # 6: Inline mathematics, words not counted # 7: Displayed mathematics, words not counted # -1: Float, not included, but looks for captions # # 4 and 5 are used to count number of headers and floats # and are not used as states. # # By default, groups not specified are parsed as floats (-1) and only # caption text is counted, and counted as float caption rather than # as words in text. # # Note that some environments may only exist within math-mode, and # therefore need not be defined here: in fact, they should not as it # is not clear if they will be in inlined or displayed math. # my %TeXgroup=('document'=>1,'letter'=>1,'titlepage'=>0, 'center'=>1,'flushleft'=>1,'flushright'=>1, 'abstract'=>1,'quote'=>1,'quotation'=>1,'verse'=>1,'minipage'=>1,'verbatim'=>1, 'description'=>1,'enumerate'=>1,'itemize'=>1,'list'=>1, 'theorem'=>1,'lemma'=>1,'definition'=>1,'corollary'=>1,'example'=>1, 'math'=>6,'displaymath'=>7,'equation'=>7,'eqnarray'=>7, 'figure'=>-1,'float'=>-1,'picture'=>-1,'table'=>-1, 'tabbing'=>0,'tabular'=>0,'thebibliography'=>0,'lrbox'=>0); ### In floats: include only specific macros # Macros used to identify caption text within floats. my %TeXfloatinc=('\caption'=>1); ### Count labels # Labels used to describe the counts my @countlabel=('Files','Words in text','Words in headers', 'Words in float captions','Number of headers','Number of floats', 'Number of math inlines','Number of math displayed'); # Styles my @STYLES=(); my %STYLE; $STYLES[0]={}; $STYLES[1]={'word1'=>'blue','word2'=>'bold blue','word3'=>'blue', 'grouping'=>'red','mathgroup'=>'magenta','state'=>'cyan underline'}; $STYLES[2]={%{$STYLES[1]}, 'word0'=>'yellow','word-1'=>'yellow', 'command'=>'green','exclgroup'=>'yellow','exclmath'=>'yellow', 'ignore'=>'cyan'}; $STYLES[3]={%{$STYLES[2]}, 'comment'=>'yellow','option'=>'yellow'}; $STYLES[4]={%{$STYLES[3]}}; # Options and states my $verbose=0; my $showstates=0; my $htmlstyle=0; my $blankline=0; ## SHORT HELP AND INFO if (!defined @ARGV) { print_version(); print_syntax(); print_reference(); exit; } elsif ($ARGV[0]=~/^(\-(h|\?|\-help)|\/\?)$/) { print_help(); exit; } elsif ($ARGV[0]=~/^\-\-version$/) { print_version(); exit; } elsif ($ARGV[0]=~/^\-\-licence$/) { print_licence(); exit; } # PROCESS ARGUMENTS AND FILES my $totalcount=new_count(); foreach my $file (<@ARGV>) { if ($file eq '-v') {$verbose=1; next;} if ($file eq '-vv') {$verbose=2; next;} if ($file eq '-vvv') {$verbose=3; next;} if ($file eq '-vvvv') {$verbose=3; $showstates=1; next;} if ($file =~ /^\-showstate/ ){$showstates=1; next;} if ($file =~ /^\-html/ ) { $ENV{'ANSI_COLORS_DISABLED'} = 1; $htmlstyle = $file eq '-htmlcore' ? 1 : 2; if ($htmlstyle>1) {html_head();} next; } if ($file=~/^\-(nocol|nc$)/) { $ENV{'ANSI_COLORS_DISABLED'} = 1; next; } %STYLE=%{$STYLES[$verbose]}; my $tex=TeXfile($file); if (!defined $tex) { print "File not found or not readable: ".$file."\n"; next; } parse($tex); print "\n"; print_count($tex->{'count'},'FILE: '.$file); print "\n"; add_count($totalcount,$tex->{'count'}); } if (${$totalcount}[0]>1) { formatprint('Total','h1'); print_count($totalcount,'SUM'); } if ($htmlstyle>1) { html_tail(); } ######### ######### Subroutines ######### sub TeXfile { my $filename=shift @_; my $file=read_file($filename) || return undef; return TeXcode($file); } sub TeXcode { my $texcode=shift @_; my %TeX=(); $TeX{'line'}=$texcode; $TeX{'next'}=undef; $TeX{'type'}=undef; $TeX{'style'}=undef; $TeX{'printstate'}=undef; $TeX{'eof'}=0; $TeX{'count'}=new_count(); $TeX{'count'}[0]++; return \%TeX; } sub set_style { my ($tex,$style)=@_; if (!(($tex->{'style'}) && ($tex->{'style'} eq '-'))) {$tex->{'style'}=$style;} } sub flush_style { my ($tex,$style)=@_; set_style($tex,$style); flush_next($tex); } sub line_return { my $blank=shift @_; if ($blank>$blankline) { if ($htmlstyle) {print "
\n";} else {print "\n";} $blankline++; } } sub print_style { my ($text,$style,$state)=@_; (($verbose>0) && (defined $text) && (defined $style)) || return; my $colour; ($colour=$STYLE{$style}) || return; if (($colour) && !($colour eq '-')) { if ($htmlstyle) { print "".$text.""; } else { print Term::ANSIColor::colored($text,$colour); #print $text; } if ($state) { print_style($state,'state'); } $blankline=-1; } } sub formatprint { my ($text,$tag,$class)=@_; my $break=($text=~s/\n$//); if ($htmlstyle && defined $tag) { print '<'.$tag; if ($class) {print " class='".$class."'";} print '>'.$text.''; } else { print $text; } if ($break) {print "\n";} } sub read_file { my $filename=shift @_; open(FH,"<".$filename."") || return undef; if ($verbose) { #line_return(1); formatprint("File: ".$filename."\n",'h2'); $blankline=0; } my @text=(); while (my $line=) { push @text,$line; } close(FH); return join('',@text); } sub next_token { my $tex=shift @_; my ($next,$type); if (defined $tex->{'next'}) {print_style($tex->{'next'}.' ',$tex->{'style'});} $tex->{'style'}=undef; while (defined ($next=get_next_token($tex))) { $type=$tex->{'type'}; if ($type==0) { print_style($next,'comment'); } elsif ($type==9) { if ($verbose) {line_return(1);} } else { return $next; } } return $next; } sub flush_next { my $tex=shift @_; if (defined $tex->{'next'}) { print_style($tex->{'next'}.' ',$tex->{'style'},$tex->{'printstate'}); } $tex->{'printstate'}=undef; $tex->{'style'}='-'; } sub get_next_token { my $tex=shift @_; my $next; (defined ($next=get_token($tex,'\%[^\n]*',0))) && return $next; (defined ($next=get_token($tex,'\n',9))) && return $next; (defined ($next=get_token($tex,'\\\\[\{\}]',1))) && return $next; (defined ($next=get_token($tex,'\w+(\.\w+)+\.',1))) && return $next; (defined ($next=get_token($tex,'\w+([\-\']\w+)*',1))) && return $next; (defined ($next=get_token($tex,'[\"\'\`:\.,\(\)\[\]!\+\-\*=/\^\_\@\<\>\~\#\&]',2))) && return $next; (defined ($next=get_token($tex,'\\\\([a-zA-Z_]+|[^a-zA-Z_])',3))) && return $next; (defined ($next=get_token($tex,'[\{\}]',4))) && return $next; (defined ($next=get_token($tex,'[\[\]]',5))) && return $next; (defined ($next=get_token($tex,'\$\$',6))) && return $next; (defined ($next=get_token($tex,'\$',6))) && return $next; (defined ($next=get_token($tex,'.',999))) && return $next; (defined ($next=get_token($tex,'[^\s]+',999))) && return $next; $tex->{'eof'}=1; return undef; } sub get_token { my ($tex,$regexp,$type)=@_; if (!defined $regexp) {print "ERROR in get_token: undefined regex.";} if (!defined $tex->{'line'}) {print "ERROR in get_token: undefined tex-line. ".$tex->{'next'}."\n";} if ( $tex->{'line'} =~ s/^($regexp)[ \t\r\f]*// ) { $tex->{'next'}=$1; $tex->{'type'}=$type; return $1; } return undef; } sub new_count { my @count=(0,0,0,0,0,0,0,0); # files, text words, header words, float words, # headers, floats, math-inline, math-display; return \@count; } sub count_word { my ($count,$type,$word,$style,$verb)=@_; ($word) || ($word=""); ($style) || ($style=0); ($verb) || ($verb=0); if ($type>0) {${$count}[$type]++;} if ($verb >= $verbose) { } } sub print_count { my ($count,$header)=@_; if ($htmlstyle) {print "
\n";} if (defined $header) { formatprint($header."\n",'dt','header'); } for (my $i=1;$i<8;$i++) { formatprint($countlabel[$i].': ','dt'); formatprint(${$count}[$i]."\n",'dd'); } if ($htmlstyle) {print "
\n";} } sub add_count { my ($a,$b)=@_; for (my $i=0;$i<8;$i++) { ${$a}[$i]+=${$b}[$i]; } } sub parse { my ($tex)=@_; if ($htmlstyle && $verbose) {print "

\n";} while (!($tex->{'eof'})) { parse_unit($tex,1); } if ($htmlstyle && $verbose) {print "

\n";} } sub parse_unit { # Status: # 0 = exclude # 1 = text # 2 = header text # 3 = float text # -1 = float (exclude) # -9 = preamble (between \documentclass and \begin{document}) my ($tex,$status,$end)=@_; my $count=$tex->{'count'}; my $substat; if ($showstates) { if (defined $end) { $tex->{'printstate'}=':'.$status.':'.$end.':'; } else { $tex->{'printstate'}=':'.$status.':'; } flush_next($tex); } while (defined (my $next=next_token($tex))) { # parse next token; or tokens until match with $end set_style($tex,"ignore"); if (($end) && ($end eq $next)) { # end of unit return; } elsif (!defined $next) { exit "ERROR: End of file while waiting for ".$end."\n"; } if ($tex->{'type'}==1) { # word if ($status>0) { ${$count}[$status]++; set_style($tex,'word'.$status); } } elsif ($tex->{'type'}==9) { #print "\n"; } elsif ($next eq '{') { # {...} parse_unit($tex,$status,'}'); } elsif ($tex->{'type'}==3) { # macro call set_style($tex,'command'); if ($next eq '\documentclass') { # starts preamble gobble_option($tex); gobble_macro_parms($tex,1); while (!($tex->{'eof'})) { parse_unit($tex,-9); } } elsif ($next eq '\begin') { # \begin...\end if ($status==-9 && $tex->{'line'}=~/^\{document\}/) { $status=1; } my $localstyle=$status>0 ? 'grouping' : 'exclgroup'; flush_style($tex,$localstyle); gobble_option($tex); if ($tex->{'line'} =~ s/^\{(\w+)\*?\}[ \t\r\f]*//) { # gobble group type print_style('{'.$1.'}',$localstyle); $next='begin'.$1; if (defined (my $i=$TeXexcl{$next})) { gobble_macro_parms($tex,$i); } } else { print "\nWarning: BEGIN group without type.\n"; } # find group status (or leave unchanged) defined ($substat=$TeXgroup{$1}) || ($substat=$status); if ($status<=0 && $status<$substat) {$substat=$status;} if (($status>0) && ($substat==-1)) { # Count float ${$count}[5]++; } if ($status>0 and $substat>3) { # count item, exclude contents ${$count}[$substat]++; $substat=0; } parse_unit($tex,$substat,'\end'); if ($tex->{'line'} =~ s/^\{(\w+)\}[ \t\r\f]*//) { # gobble group type flush_style($tex,$localstyle); print_style('{'.$1.'}',$localstyle); } else { print "\nWarning: END group without type.\n"; } } elsif ($next eq '\(') { # math inline my $localstyle=$status>0 ? 'mathgroup' : 'exclmath'; if ($status>0) {${$count}[6]++;} set_style($tex,$localstyle); parse_unit($tex,0,'\)'); set_style($tex,$localstyle); } elsif ($next eq '\[') { # math display my $localstyle=$status>0 ? 'mathgroup' : 'exclmath'; if ($status>0) {${$count}[7]++;} set_style($tex,$localstyle); parse_unit($tex,0,'\]'); set_style($tex,$localstyle); } elsif (defined (my $i=$TeXexcl{$next})) { # macro: exclude options gobble_macro_parms($tex,$i); } elsif (defined ($i=$TeXmacroword{$next})) { # count macro as word (or a given number of words) if ($status>0) { ${$count}[$status]+=$i; set_style($tex,'word'.$status); } } elsif ($next eq '\def') { # ignore \def... $tex->{'line'} =~ s/^([^\{]*)\{/\{/; flush_next($tex); print_style($1.' ','ignore'); parse_unit($tex,0); } elsif ($next =~ /^\\[^\w\_]/) { } else { gobble_option($tex); } if (($status>0 || ($status==-9 && defined $TeXpreamble{$next})) && ($substat=$TeXheader{$next})) { # headers ${$count}[4]++; gobble_macro_modifier($tex); gobble_option($tex); parse_unit($tex,$substat); } elsif ($status==-9 && defined ($substat=$TeXpreamble{$next})) { parse_unit($tex,$substat); } if (($status==-1) && ($substat=$TeXfloatinc{$next})) { # text included from float parse_unit($tex,3); } } elsif ($next eq '$') { # math inline my $localstyle=$status>0 ? 'mathgroup' : 'exclmath'; if ($status>0) {${$count}[6]++;} set_style($tex,$localstyle); parse_unit($tex,0,$next); set_style($tex,$localstyle); } elsif ($next eq '$$') { # math display my $localstyle=$status>0 ? 'mathgroup' : 'exclmath'; if ($status>0) {${$count}[7]++;} set_style($tex,$localstyle); parse_unit($tex,0,$next); set_style($tex,$localstyle); } if (!defined $end) {return;} } } sub gobble_option { my $tex=shift @_; flush_next($tex); if ($tex->{'line'} =~ s/^(\[(\w|[,\-\s\~\.\:\;\+\?\*\_\=])*\])//) { print_style($1,'option'); return $1; } return undef; } sub gobble_options { while (gobble_option(@_)) {} } sub gobble_macro_modifier { my $tex=shift @_; flush_next($tex); if ($tex->{'line'} =~ s/^\*//) { print_style($1,'option'); return $1; } return undef; } sub gobble_macro_parms { my ($tex,$i)=@_; if ($i>0) {gobble_macro_modifier($tex);} gobble_options($tex); for (;$i-->0;) { parse_unit($tex,0); gobble_options($tex); } } ### HTML context sub html_head { print '

LaTeX word count

'; } sub html_tail { print ''; } ### HELP AND INFO sub print_version { print "TeXcount version ".$versionnumber.", ".$versiondate.'.'; } sub print_syntax { print ' Syntax: TeXcount.pl [options] files Options: -v Verbose: print parsed words, mark formulae -vv More verbose: also print ignored text -vvv Even more verbose: include comments and options -showstate Show internal states (with verbose) -nc, -nocol No colours (colours require ANSI) -html Output in HTML format -htmlcore Only HTML body contents -h, -?, --help, /? Help --version Print version number --licence Licence information '; } sub print_help { print ' *************************************************************** * TeXcount.pl '.$versionnumber.', '.$versiondate.' * Count words in TeX and LaTeX files, ignoring macros, tables, formulae, etc. '; print_syntax(); print ' The script counts words as either words in the text, words in headers/titles or words in floats (figure/table captions). Macro options (i.e. \marco[...]) are ignored; macro parameters (i.e. \macro{...}) are counted or ignored depending on the macro, but by default counted. Begin-end groups are by default ignored and treated as \'floats\', though some (e.g. center) are counted. Unless -nocol (or -nc) has been specified, the output will be colour coded. Counted text is coloured blue with headers are in bold and in HTML output caption text is italicised. Mathematical formulae are not counted as words, but are instead counted separately with separate counts for inlined formulae and displayed formulae. Similarly, the number of headers and the number of \'floats\' are counted. Note that \'float\' is used here to describe anything defined in a begin-end group unless explicitly recognized as text or mathematics. The verbose options (-v, -vv, -vvv, showstate) produces output indicating how the text has been interpreted. Check this to ensure that words in the text has been interpreted as such, whereas mathematical formulae and text/non-text in begin-end groups have been correctly interpreted. Unix hint: Use \'less -r\' instead of just \'less\' to view output: the \'-r\' option makes less treat text formating codes properly. Windows hint: If your Windows interprets ANSI colour codes, lucky you! Otherwise, use the -nocol (or -nc) option with the verbose options or the output will be riddled with colour codes. Instead, you can use -html to produce HTML code, write this to file and view with your favourite browser. '; print_reference(); } sub print_reference { print ' The TeXcount script is copyright of Einar Andreas Rødland (2008) and published under the LaTeX Project Public Licence. For more information about the script, e.g. news, updates, help, usage tips, known issues and short-comings, go to http://folk.uio.no/einarro/Comp/texwordcount.html or go to http://folk.uio.no/einarro/Services/texcount.html to access the script as a web service. Feedback such as problems or errors can be reported to einarro@ifi.uio.no. '; } sub print_licence { print 'TeXcount version '.$versionnumber.' Copyright 2008 Einar Andreas Rødland The TeXcount script is published under the LaTeX Project Public Licence (LPPL) http://www.latex-project.org/lppl.txt which grants you, the user, the right to use, modify and distribute the script. However, if the script is modified, you must change its name or use other technical means to avoid confusion. The script has LPPL status "maintained" with Einar Andreas Rødland being the current maintainer. '; }