#!/usr/bin/perl

# A simple script that performs a wordcount over all ARCHER files with the file extension
# .xml in a directory.
# Also provides a comparison with wordcounts for version 3.1 of the corpus
#
# Usage: perl wordcount.pl /path/to/ARCHER-directory/
#
#
# For questions about the script, please contact hoffmann@uni-trier.de

require "find.pl";
use FindBin '$Bin';

$/ = \x04;

# open filehandles
open (WC1,">wordcounts_comparison.txt") || die "can't open error.txt";
open (WC2,">wordcounts.txt") || die "can't open error.txt";
open (FREQ,">freqlist.txt") || die "can't open error.txt";

$totalWordcount = 0;

# main loop - goes through all files in a directory hierarchy
for (@ARGV){
        &find($_);
}

# output total number of words to the shell
print STDERR "Total number of words: $totalWordcount\n\n";


# output information about number of turns
foreach $key (sort {$theWords{$b} <=> $theWords{$a} } keys %theWords) {
	print FREQ "$key\t$theWords{$key}\n";
}

# close filehandles again
close (WC1);
close (WC2);
close (FREQ);

exit(0);


# subroutine that does the actual work
sub wanted {
       
	$myPath=$name;
	$isAfile = $myPath=~/\/([^\/]+\.xml)\Z/o;
    
	if ($isAfile) {
		
		# keep user informed
		print STDERR "doing $myPath\n";

		$filename = $1;
		
		# open file and put its content into variable $file
		open (H, "<$myPath") || die "can't open $myPath";	
		$file=<H>;
		close (H);
		
		# loop that isolates individual ARCHER files and performs wordcount routines
		while ($file=~m/<file id="([^"]+)" *\/>.*?<\/body>/gs) {
			$text = $&;
			$id = $1;
			$id=~s/\.xml\Z//;
			if ($text=~m/<\/teiHeader>/) {
				$header = $` . $&;
				$body = $';
				
				# get rid of elements that are marked as having been deleted in the original text
				$body=~s/<del *[^>]*>.*?<\/del>//gs;
				
				$body=~s/<[^>]+>//gs;
				$body=~s/\&quot\;//gs;
				
				$body=~s/\n/ /gs;
				$body=~s/ +/ /gs;
				
				undef @words;
				$countWords = 0;
				@words=split(/ /, $body);
				
				foreach $word (@words) {
					$word=~s/\t//g;
					next unless $word=~m/\S/;
					next if ($word eq "&apos;");

					next if ($word eq "&lt;");
					next if ($word eq "&gt;");
					
					if ($word=~m/\A(\W+)\Z/) {
						if ($word eq "à") {
							$countWords++;
							$totalWordcount++;
							$theWords{$word}++;
						}
					} else {
						$countWords++;
						$totalWordcount++;
						
						$word=~s/&apos;/'/g;
						$word=~s/\A[(."']+//;
						$word=~s/[."',?);:!]+\Z//;
						
						$word = lc($word);
						$theWords{$word}++;
					}					
				}
				
								
				if ($header=~m/<category xml\:id="g_(\w)">/) {
					$genre = $1;
				}
				
				if ($header=~m/<category xml\:id="p_(\w)">/) {
					$period = $1;
				}
				
				if ($header=~m/<category xml\:id="v_(\w)">/) {
					$variety = $1;
				}
				
				
				if ($header=~m/<measure quantity="(\d+) ?" unit="words" \/>/) {
					$originalCount = $1;
				} else {
					warn "No wordcount found in text $id!";
					$originalCount = 0;
				}
				
				$difference = $originalCount - $countWords;
							
				print WC1 "$id\t$countWords\t$originalCount\t$difference\t$genre\t$period\t$variety\n";
				print WC2 "$id\t$countWords\n";
				
								
			} else {
				warn "No header/body found for text $myPath";
				exit(0);
			}	
		}		
	}
}
