=head1 NAME
	WORDS

=head1 SYNOPSIS

				input
	 my $words = new WORDS(@input); # input document
	 			canonicalize
	 $error = $words -> lower('PROPER' or 'FULL');
	 			parse and remove punctuation & trash
	 $numwds = $words -> parse('FULL'or'PROPER','ALL'or'ALPHAS');
	 			drop words outside length range (<=3, >=25)
	 $numwds = $words -> range('3','25');
	 $numwds = $words -> commons();
	 $numchg = $words -> disposses;
	 $numchg = $words -> deplural;
	 $numchg = $words -> unpast;
	 $error = $words -> freq;
	 $error = $words -> pairs;

	 my @words = $words->get('ARRAY');
	 my %wrds = $words->get('FREQ');
	 my %pairs = $words->get('PAIRS');

	 @words = $words->get('PAST');
	 @words = $words->get('PLURAL');

	
=head1 EXAMPLE

	require WORDS;

	#	read in document and initialize objects

	open (IN,"<$filename") || die "Can't open input file $filename, $!\n";
	my @input = <IN>;

	my $words = new WORDS(@input);

	#  Canonicalize the words (make them lower case)

	$error = $words -> lower('PROPER'); 
											 FULL = lowercase everything
											 other option is 'PROPER' which preserves
                                  capitals on words not at the start of a
											 sentence.

	#	Parse words into an array

	$numwds = $words -> parse('FULL','ALL');

			The options are:

			<FULL / PROPER> - parse on all word boundaries,(FULL) or try to
									preserve multi-word words like New York that are
									charactized as Proper nouns. This will fail when
									given a sequence like "did you go to New York George?"
									and glue all three words together.

			<ALL / ALPHAS> - This flag is a trash collector. ALL says keep
								  everything, ALPHAS says drop any "word" that contains
								  non-alpha characters, except for hyphen and apostrophe.
		  parse will, in all cases, remove punctuation. This is important. If
		  you wish to canonicalize the words, preserving proper nouns, do it
		  first, before parsing.

	#  drop words shorter than a minimum or longer than a maximum

	$numwds = $words -> range('3','25'); # drop words of 3 or fewer letters
                                       # or greater equal to 25


	 #  drop common words from the hash (a, and, the, or, but, that, is, are,)

	 $numwds = $words -> commons(); # if you give an array, it will be used

	# dispossess the words (remove trailing 's)

	 $numchg = $words -> disposses;

	 # deplural the words

	 $numchg = $words -> deplural;

	 # un-past the words

	 $numchg = $words -> unpast;

	 # build a hash by frequency

	 $error = $words -> freq;

	 # build a hash of pairs

	 $error = $words -> pairs;


	 #	retrieve lists and print

	 my %wrds = $words->get('FREQ');

	 my $word;
	 foreach $word (sort( wordcmp keys(%wrds))) {
		print "word: $word : $wrds{$word}\n";
	 }
	 sub wordcmp {
		$wrds{$b} <=> $wrds{$a};
	 }

	 %pairs = $words->get('PAIRS');


	 foreach $pair (sort( paircmp keys(%pairs))) {
		print "pair: $pair : $pairs{$pair}\n";
	 }
	 sub paircmp {
		$pairs{$b} <=> $pairs{$a};
	 }



=cut

package WORDS;
use strict;

#####################################################
##  Construct the object                           ##
#####################################################

sub new {
	my $proto = shift;
	my $class = ref($proto) || $proto;  # use as object or class
	my $self = {};

	my $error;

	$self->{INPUT} = '';
	$self->{ARRAY} = [];
	$self->{NUMWDS} = '0';
	$self->{FREQ}  = {};
	$self->{PAIRS} = {};

#		Read in the array, put it in a variable, and tuck it away

	foreach (@_){ #	Clean up extra spaces
		s/^\s*//g;
		s/\s*$//g;
		s/\s{2,}/ /g;
		s/\s/ /g;
	}
	$self->{INPUT} = join ' ' , @_;

	bless ($self, $class);
	return $self;
}


#####################################################
##  Get an array of words                          ##
#####################################################

sub get {
	my $self = shift;
	my $option = shift;
	my $error;

	if ($option eq 'ARRAY') {
		return @{$self->{ARRAY}};
	}
	elsif ($option eq 'PLURAL') {
		 my @temp = '';

		 foreach (@{$self->{ARRAY}}){
			 push(@temp,$_) if /s$/;
		 }
		 return @temp;
	}
	elsif ($option eq 'PAST') {
		 my @temp = '';

		 foreach (@{$self->{ARRAY}}){
			 push(@temp,$_) if /[svrt]ed$/;
		 }
		 return @temp;
	}
	elsif ($option eq 'CAPS') {
		 my @temp = '';

		 foreach (@{$self->{ARRAY}}){
			 push(@temp,$_) if /^[A-Z]/;
		 }
		 return @temp;
	}
	elsif ($option eq 'FREQ') {
		return %{$self->{FREQ}};
	}
	elsif ($option eq 'PAIRS') {
		return %{$self->{PAIRS}};
	}
	elsif ($option eq 'NUMWDS') {
		return $self->{NUMWDS};
	}
	else {$error = 1; }

	return $error;
}
#####################################################
##  Parse an array of lines into an array of words ##
#####################################################

sub parse {
	my $self = shift;
	my $propers = shift;
	my $realwords = shift;
	my $input = $self->{INPUT};

	if ($propers eq 'PROPER') { # allow proper multiple word names (New York)
		$input =~ s/\s([DM]r?s?)\.\s?/ $1%/g; # protect Dr., Mr., Mrs., Ms.
		$input =~ s/\s(St)\.\s?/ $1%/g; # protect St.
		$input =~ s/([A-Z]\w*)\s([A-Z]\w*)/$1%$2/g;
		$input =~ s/([A-Z][\w%]*)\s([A-Z][\w%]*)/$1%$2/g;
		$input =~ s/([A-Z][\w%]*)\s([A-Z][\w%]*)/$1%$2/g;
	}
	if ($realwords eq 'ALPHAS') {
		 $input =~ s/\s[^-'a-zA-Z]+\s/ /g; # remove non-words
		 $input =~ s/\s\w*[^-'\w%]+\w*\s/ /g; # remove words containing bogus char
	}

	$input =~ s/[.,:;"()]/ /g; # remove periods, comas, colons, etc...
	$input =~ s/\s'|'\s/ /g; # remove single quotes
	$input =~ s/^\s*//;
	$input =~ s/\s*$//;

	@{$self->{ARRAY}} = split(/\s+/,$input);

	foreach (@{$self->{ARRAY}} ) {
		s/%/ /g;
	}

	$self->{NUMWDS} = $#{$self->{ARRAY}};

	return $self->{NUMWDS} ;
}


#####################################################
##  Canonicalize the words                         ##
#####################################################

#	Do this *before* parsing, parsing will lose info about
#	which words begin a sentence.

sub lower {
	my $self = shift;
	my $option = shift;
	my $error;
	my $input = $self->{INPUT};

	if ($option eq 'ALL') {
		$input =~ tr/A-Z/a-z/;
	}
	elsif ($option eq 'PROPER') {
		$input =~ s/([A-Z]{2,})/\L$1/g; # lowercase all caps words.
		#	protect word series like Mr. Jackson
		$input =~ s/\s([A-Z]\S*)\s([A-Z])/ >$1 >$2/g;
		$input =~ s/([.?!]\s?[A-Z])/\L$1/g; # lowercase first letter in sentence.
		$input =~ s/(\s{2,}[A-Z])/\L$1/g; # lowercase first letter in paragraph.
		$input =~ s/(^\s*[A-Z])/\L$1/g; # lowercase first letter in paragraph.
		#	remove protection
		$input =~ s/>//g;
	}
	else { $error = 1; }

	$self->{INPUT} = $input;

	return $error;
}

#####################################################
##  Drop the words outside a length range          ##
#####################################################

sub range {
	my $self = shift;
	my $min = shift;
	my $max = shift;

	my @temp ;

	foreach (@{$self->{ARRAY}}){
		if (length($_) > $min && length($_) < $max) {
			push(@temp,$_);
		}
	}

	@{$self->{ARRAY}} = @temp;
	$self->{NUMWDS} = $#{$self->{ARRAY}};

	return $self->{NUMWDS} ;
}

#####################################################
##  Drop common words from the array              ##
#####################################################

#	If no array of targets is given, use internally
#	defined array for drops

sub commons {
	my $self = shift;

	my @commons;

	if (@_) {@commons = @_;}
	else { @commons = (qw/the a and of but if in was to at that there be on this
	it when is had have has /);}

	my %is_common;
   for (@commons) { $is_common{$_} = 1; }

	my @temp ;

	foreach (@{$self->{ARRAY}}){
		if (! defined $is_common{$_} ) {
			push(@temp,$_);
		}
	}
	undef %is_common;

	@{$self->{ARRAY}} = @temp;
	$self->{NUMWDS} = $#{$self->{ARRAY}};

	return $self->{NUMWDS} ;
}


#####################################################
##  Disposses the words (remove 's)                ##
#####################################################

sub disposses {
	my $self = shift;

	my $num = 0;

	foreach (@{$self->{ARRAY}}){
		$num += s/'s$//;
	}

	return $num;
}

#####################################################
##  Deplural the words                             ##
#####################################################

sub deplural {
	my $self = shift;

	my $num = 0;

	foreach (@{$self->{ARRAY}}){
		next if ! /s$/;
		  $num += s/([^aeious])s$/$1/; # singular cons-plural
		  $num += s/^([^aeiou])ies$/$1ie/; # 3 letter -ie words
		  $num += s/elves$/elf/; # -elf
		  $num += s/warves$/warf/; # -warf
		  $num += s/wives$/wife/; # -wife
		  $num += s/yches$/yche/; # -yche greek root words
		  $num += s/([sc][sh])es$/$1/; # -ches and -sses
		  $num += s/phes$/phe/; # catastrophes
		  $num += s/xes$/x/; # boxes, foxes, etc
		  $num += s/([^aio])es$/$1e/; # common case, root end in 'e'
		  $num += s/movies$/movie/; # -movies
		  $num += s/species$/specie/; # -species
		  $num += s/ies$/y/; # -ies
	}

	return $num;
}

#####################################################
##  Convert past tense to present                  ##
#####################################################

sub unpast {
	my $self = shift;

	my $num = 0;

	 foreach (@{$self->{ARRAY}}){
		  next if ! /[svrt]ed$/;

		  if (/hundred$/ ||
				/fred$/    ||
				/unmoved$/    ||
				/kindred$/    ||
				/sacred$/    ||
				/shred$/ ) { next;}
		  $num +=$num +=  s/tted$/t/;
		  $num += s/rred$/r/;
		  $num += s/([rl][vs])ed/$1e/;
		  $num += s/([mnr]or)ed$/$1/;
		  $num += s/([^aeiom][aiou][r])ed$/$1e/;
		  $num += s/([aeio][aeiou][r])ed$/$1/;
		  $num += s/([^aeiou][svrt])ed$/$1/;
		  $num += s/ered$/er/;
		  $num += s/([aeiou][aeiou][t])ed$/$1/;
		  $num += s/([^svc])ited$/$1it/;
		  $num += s/([svc])ited$/$1ite/;
		  $num += s/([aeiou]et)ed$/$1/;
		  $num += s/(ret)ed$/$1/;
		  $num += s/([aeiou][svt])ed$/$1e/;
   }

	return $num;
}

#####################################################
##  Create frequency hash                          ##
#####################################################

sub freq {
	my $self = shift;

	my $error;

	 foreach (@{$self->{ARRAY}}){
		  $self->{FREQ}{$_}++;
	 }

	return $error;
}

#####################################################
##  Create pairs frequency hash                    ##
#####################################################

sub pairs {
	my $self = shift;

	my $error;
	my $last;

	 foreach (@{$self->{ARRAY}}){
		  if (defined $last) {
				$self->{PAIRS}{$last . " " . $_}++;
		  }
		  $last = $_;
	 }

	return $error;
}


1;
