) {
push @text,$line;
}
close(FH);
return join('',@text);
}
# Return the next significant token from $tex, or undef when the input is
# exhausted.  Before reading, the previously returned token (if any) is
# echoed through print_style using the style recorded for it.
# Tokens of type 0 are printed in the 'comment' style and skipped; tokens of
# type 9 are skipped as well, calling line_return(1) when $verbose is set.
sub next_token {
  my ($tex) = @_;
  if (defined $tex->{'next'}) {
    print_style($tex->{'next'}.' ', $tex->{'style'});
  }
  $tex->{'style'} = undef;
  my $token;
  while (defined($token = get_next_token($tex))) {
    my $kind = $tex->{'type'};
    if ($kind == 0) {
      # comment: echo it and keep looking
      print_style($token, 'comment');
      next;
    }
    if ($kind == 9) {
      # line break: nothing to return, optionally reflect it in the output
      line_return(1) if $verbose;
      next;
    }
    # anything else is handed to the caller
    last;
  }
  return $token;
}
# Echo the pending token (if one is stored in $tex->{'next'}) through
# print_style together with its style and the current print state, then
# clear the print state and reset the style to '-'.
sub flush_next {
  my ($tex) = @_;
  my $pending = $tex->{'next'};
  print_style($pending.' ', $tex->{'style'}, $tex->{'printstate'})
    if defined $pending;
  $tex->{'printstate'} = undef;
  $tex->{'style'} = '-';
}
# Read the next raw token from $tex by trying a fixed, ordered list of
# patterns through get_token; the first pattern that matches decides the
# token and its type.  When nothing matches, $tex->{'eof'} is set and undef
# is returned.
sub get_next_token {
  my ($tex) = @_;
  # [pattern, type] pairs, tried strictly in this order.
  my @rules = (
    ['\%[^\n]*', 0],                  # '%' up to the end of the line
    ['\n', 9],                        # line break
    ['\w+(\.\w+)+\.', 1],             # dotted word ending in a dot
    ['\w+([\-\']\w+)*', 1],           # word, may contain - or ' inside
    ['[\"\'\`:\.,\(\)\[\]!\+\-\*=/\^\_\@\<\>\~\#\&]', 2],   # one punctuation character
    ['\\\\([a-zA-Z_]+|\\\\|[\^\'\`\&!\[\]\(\)\%])', 3],     # backslash + name or symbol
    ['[\{\}]', 4],                    # brace
    ['[\[\]]', 5],                    # bracket (also covered by the type 2 class above)
    ['\$\$', 6],                      # double dollar
    ['\$', 6],                        # single dollar
    ['.', 999],                       # any other single character
    ['[^\s]+', 999],                  # run of non-whitespace
  );
  for my $rule (@rules) {
    my $token = get_token($tex, @{$rule});
    return $token if defined $token;
  }
  $tex->{'eof'} = 1;
  return undef;
}
# Try to strip a token matching $regexp from the start of $tex->{'line'}.
# On success the token plus any following spaces, tabs, carriage returns or
# form feeds are removed from the line, the token and $type are stored in
# $tex->{'next'} and $tex->{'type'}, and the token is returned.
# On failure nothing is modified and undef is returned.
sub get_token {
  my ($tex, $regexp, $type) = @_;
  return undef unless $tex->{'line'} =~ s/^($regexp)[ \t\r\f]*//;
  my $matched = $1;
  $tex->{'next'} = $matched;
  $tex->{'type'} = $type;
  return $matched;
}
# Create a fresh counter: a reference to an array of eight zeros.
# Slot order: files, text words, header words, float words,
# headers, floats, math-inline, math-display.
sub new_count {
  return [ (0) x 8 ];
}
# Increment slot $type of the counter array referenced by $count when
# $type is positive.  $word, $style and $verb are defaulted ("" / 0 / 0)
# but not otherwise used; the comparison of $verb with $verbose guards an
# empty block.
sub count_word {
  my ($count, $type, $word, $style, $verb) = @_;
  $word  ||= "";
  $style ||= 0;
  $verb  ||= 0;
  $count->[$type]++ if $type > 0;
  if ($verb >= $verbose) {
    # no action is taken in this branch
  }
}
# Print the counts held in the array referenced by $count, optionally
# preceded by $header.  Slots 1 to 7 are printed as label/value pairs via
# formatprint, using the labels in @countlabel; slot 0 is not printed.
# When $htmlstyle is set, extra line breaks are printed before and after.
sub print_count {
  my ($count, $header) = @_;
  print "\n" if $htmlstyle;
  formatprint($header."\n", 'dt', 'header') if defined $header;
  foreach my $slot (1 .. 7) {
    formatprint($countlabel[$slot].': ', 'dt');
    formatprint($count->[$slot]."\n", 'dd');
  }
  if ($htmlstyle) {print "
\n";}
}
# Add the eight slots of the counter referenced by the second argument to
# the corresponding slots of the counter referenced by the first argument.
# The first counter is modified in place.
sub add_count {
  my ($total, $extra) = @_;
  foreach my $slot (0 .. 7) {
    $total->[$slot] += $extra->[$slot];
  }
}
# Parse the whole of $tex: call parse_unit with status 1 repeatedly until
# $tex->{'eof'} is set.  When both $htmlstyle and $verbose are set, extra
# line breaks are printed before and after the parse.
sub parse {
  my ($tex) = @_;
  print "\n" if $htmlstyle && $verbose;
  parse_unit($tex, 1) until $tex->{'eof'};
  if ($htmlstyle && $verbose) {print "
\n";}
}
# Parse one unit of TeX from $tex and update the counters in $tex->{'count'}.
#
# Parameters:
#   $tex    - parser state hash; this sub uses its 'count', 'line', 'type'
#             and 'printstate' entries
#   $status - how the contents of the unit are treated:
#                0 = exclude
#                1 = text
#                2 = header text
#                3 = float text
#               -1 = float (exclude)
#   $end    - optional closing token.  When given, tokens are consumed until
#             a token equal to $end is met; when omitted, a single token
#             (plus whatever that token pulls in) is consumed.
#
# Counter slots updated here: [$status] for words, [4] headers, [5] floats,
# [6] inline maths, [7] displayed maths; a group whose %TeXgroup value is
# above 3 is counted in the slot of that value.
#
# If the input ends while $end is still awaited, a message is written to
# STDERR and the sub returns.
sub parse_unit {
  my ($tex,$status,$end)=@_;
  my $count=$tex->{'count'};
  my $substat;
  if ($showstates) {
    # tag the output with the state (and the awaited closing token, if any)
    if (defined $end) {
      $tex->{'printstate'}=':'.$status.'-'.$end.':';
    } else {
      $tex->{'printstate'}=':'.$status.':';
    }
    flush_next($tex);
  }
  while (defined (my $next=next_token($tex))) {
    # parse next token; or tokens until match with $end
    set_style($tex,"ignore");
    if (($end) && ($end eq $next)) {
      # end of unit
      return;
    }
    if ($tex->{'type'}==1) {
      # word
      if ($status>0) {
        ${$count}[$status]++;
        set_style($tex,'word'.$status);
      }
    } elsif ($tex->{'type'}==9) {
      #print "\n";
    } elsif ($next eq '{') {
      # {...}
      parse_unit($tex,$status,'}');
    } elsif ($tex->{'type'}==3) {
      # macro call
      set_style($tex,'command');
      if ($next eq '\begin') {
        # \begin...\end
        my $localstyle=$status>0 ? 'grouping' : 'exclgroup';
        flush_style($tex,$localstyle);
        gobble_option($tex);
        # Keep the group name in its own variable: $1 must not be read
        # after a failed match, where it would hold a stale capture.
        my $groupname;
        if ($tex->{'line'} =~ s/^\{(\w+)\*?\}[ \t\r\f]*//) {
          # gobble group type
          $groupname=$1;
          print_style('{'.$groupname.'}',$localstyle);
        } else {
          print "\nWarning: BEGIN group without type.\n";
        }
        # find group status (or leave unchanged)
        $substat=defined $groupname ? $TeXgroup{$groupname} : undef;
        ($substat) || ($substat=$status);
        if ($status<=0 && $status<$substat) {$substat=$status;}
        if (($status>0) && ($substat==-1)) {
          # Count float
          ${$count}[5]++;
        }
        if ($status>0 and $substat>3) {
          # count item, exclude contents
          ${$count}[$substat]++;
          $substat=0;
        }
        parse_unit($tex,$substat,'\end');
        if ($tex->{'line'} =~ s/^\{(\w+)\}[ \t\r\f]*//) {
          # gobble group type
          flush_style($tex,$localstyle);
          print_style('{'.$1.'}',$localstyle);
        } else {
          print "\nWarning: END group without type.\n";
        }
      } elsif ($next eq '\(') {
        # math inline
        _parse_unit_math($tex,$status,6,'\)');
      } elsif ($next eq '\[') {
        # math display
        _parse_unit_math($tex,$status,7,'\]');
      } elsif (defined (my $i=$TeXexcl{$next})) {
        # macro: exclude options
        gobble_macro_parms($tex,$i);
      } elsif ($next eq '\def') {
        # ignore \def...
        # Everything up to the first '{' is skipped; echo it only if the
        # substitution actually matched, never a stale $1.
        my $skipped='';
        if ($tex->{'line'} =~ s/^([^\{]*)\{/\{/) {
          $skipped=$1;
        }
        flush_next($tex);
        print_style($skipped.' ','ignore');
        parse_unit($tex,0);
      } else {
        gobble_option($tex);
      }
      if (($status>0) && ($substat=$TeXheader{$next})) {
        # headers
        ${$count}[4]++;
        gobble_macro_modifier($tex);
        gobble_option($tex);
        parse_unit($tex,$substat);
      }
      if (($status==-1) && ($substat=$TeXfloatinc{$next})) {
        # text included from float
        parse_unit($tex,3);
      }
    } elsif ($next eq '$') {
      # math inline
      _parse_unit_math($tex,$status,6,$next);
    } elsif ($next eq '$$') {
      # math display
      _parse_unit_math($tex,$status,7,$next);
    }
    if (!defined $end) {return;}
  }
  # The loop only ends when next_token runs out of input.  If a closing
  # token was still awaited, the unit was never terminated: report it.
  if (defined $end) {
    warn "ERROR: End of file while waiting for ".$end."\n";
  }
}

# Helper for parse_unit: handle a maths group whose opening token has just
# been read.  When $status>0 the group is counted in counter slot $slot.
# The contents are then parsed with status 0 (words not counted) up to the
# closing token $close; the style is set before and after that parse.
sub _parse_unit_math {
  my ($tex,$status,$slot,$close)=@_;
  my $localstyle=$status>0 ? 'mathgroup' : 'exclmath';
  if ($status>0) {${$tex->{'count'}}[$slot]++;}
  set_style($tex,$localstyle);
  parse_unit($tex,0,$close);
  set_style($tex,$localstyle);
}
# Flush the pending token, then strip one bracketed option from the start
# of $tex->{'line'}.  The option may contain only word characters, commas,
# hyphens and whitespace.  A stripped option is echoed in the 'option'
# style and returned; undef is returned when there is no such option.
sub gobble_option {
  my ($tex) = @_;
  flush_next($tex);
  return undef unless $tex->{'line'} =~ s/^(\[(\w|[,\-\s])+\])//;
  my $option = $1;
  print_style($option, 'option');
  return $option;
}
# Strip consecutive options by calling gobble_option with the same
# arguments until it no longer returns a true value.
sub gobble_options {
  my @args = @_;
  while (1) {
    last unless gobble_option(@args);
  }
}
# Flush the pending token, then strip a '*' modifier from the start of
# $tex->{'line'} if one is present.  A stripped modifier is echoed in the
# 'option' style and returned; undef is returned when there is none.
sub gobble_macro_modifier {
  my ($tex) = @_;
  flush_next($tex);
  # The star is captured explicitly: without a capture group $1 is not set
  # by this match, so nothing would be echoed or returned.
  if ($tex->{'line'} =~ s/^(\*)//) {
    my $modifier = $1;
    print_style($modifier, 'option');
    return $modifier;
  }
  return undef;
}
# Skip the parameters of a macro: an optional modifier (only looked for
# when the parameter count is positive), any options, and then the given
# number of units, each parsed with status 0 and followed by any options.
sub gobble_macro_parms {
  my ($tex, $remaining) = @_;
  gobble_macro_modifier($tex) if $remaining > 0;
  gobble_options($tex);
  while ($remaining-- > 0) {
    parse_unit($tex, 0);
    gobble_options($tex);
  }
}
### HTML context
# Print the fixed text that opens the HTML output to the currently
# selected output handle.
sub html_head {
  my $head = '
LaTeX word count
';
  print $head;
}
# Print the fixed text that closes the HTML output (an empty string here)
# to the currently selected output handle.
sub html_tail {
  my $tail = '';
  print $tail;
}
### HELP
# Print the usage and help text to the currently selected output handle.
# The text is a single-quoted literal, so backslashes other than \' are
# printed as written.
sub print_help {
  print '
***************************************************************
* TeXcount.pl
*
Count words in TeX and LaTeX files, ignoring macros, tables,
formulae, etc.
Syntax: TeXcount.pl [options] files
Options:
-v Verbose: print parsed words, mark formulae
-vv More verbose: also print ignored text
-vvv Even more verbose: include comments and options
-showstate Show internal states (with verbose)
-nc, -nocol No colours (colours require ANSI)
-html Output in HTML format
-htmlcore Only HTML body contents
--version Print version number
-h, -?, --help, /? Produce this help
The script counts words as either words in the text, words in
headers/titles or words in floats (figure/table captions).
Macro options (i.e. \macro[...]) are ignored; macro parameters
(i.e. \macro{...}) are counted or ignored depending on the
macro, but by default counted. Begin-end groups are by default
ignored and treated as \'floats\', though some (e.g. center) are
counted.
Mathematical formulae are not counted as words, but are instead
counted separately with separate counts for inlined formulae
and displayed formulae. Similarly, the number of headers and
the number of \'floats\' are counted. Note that \'float\' is
used here to describe anything defined in a begin-end group
which is not text or mathematics: e.g. tables and figures.
The verbose options (-v, -vv, -vvv, showstate) produce output
indicating how the text has been interpreted. Check this to
ensure that words in the text have been interpreted as such,
whereas mathematical formulae and text/non-text in begin-end
groups have been correctly interpreted.
Unix hint: Use \'less -r\' instead of just \'less\' to view output:
the \'-r\' option makes less treat text formatting codes properly.
Windows hint: If your Windows interprets ANSI colour codes, lucky
you! Otherwise, use the -nocol (or -nc) option with the verbose
options or the output will be riddled with colour codes.
';
}