#!/usr/bin/perl # # Usage 1: (train for which features to get) # perl ngrams.pl [-o] [-n] [-w] [-s featuresfile to save] [-c listofconvfiles (or conv1 ... convN)] [-e maximum_relative_entropy] [-f minimum_frequency] [-x maximum_number_of_n_for_which_to_get_ngrams] # # Usage 2: (get features) # perl ngrams.pl [-r featuresfile to read] [-c listofconvfiles (or conv1 ... convN)] # # Usage 1 creates file called # Usage 2 creates files called conv1_textfeatures_.dat # each line of which is of the form : : ... : # first line is % # first feature is assumed to be 'not any of the other features in featuresfile' # # -o : add a feature for unigrams by itself # -n : collapse all noise entries into one lexical type # -w : only use words, don't include noise entries # -s featfile : save features in featfile # -c convfile[s] : conversation files to process (or a file with the names of these files) # -e MRE : only consider those features that have the relative entropy of the class distribution (on utterances which have those features) below MRE # -f MF: : only consider those features that occur at least MF times # -x N : consider as features 1-grams, 2-grams, .. up to N-grams $proj = "tmp"; $MAXRELENT = 0.75; # relative entropy (in training) $MINNUMENT = 5; # minimum number of times an ngram should appear in training before we can use it $MAXLEN = 2; # maximum n for which we are going to find n-grams $do_unigram_1 = 0; # if set to 1, add features for words in moves with only one word $do_one_noise = 0; # if set to 1, all NOISE features are converted to the same type. $do_wordsonly = 0; $ext = ""; # only used in testing @convfiles = (); @convs = (); $MODE = 0; # 0 for training, 1 for testing %labelmap = {}; @labels=('instruct','explain','align','check','query-yn','query-w','acknowledge','clarify','reply-y','reply-n','reply-w','ready','uncodable'); for (my $i=0; $i<=$#labels; $i++) {$labelmap{$labels[$i]} = $i+1;} for (my $i=0; $i<=$#ARGV; $i++) { if (($ARGV[$i] eq "-s") | ($ARGV[$i] eq "-r")) { $proj = $ARGV[$i+1]; if ($ARGV[$i] eq "-r") {$MODE=1;} } elsif ($ARGV[$i] eq "-e") { $MAXRELENT = $ARGV[$i+1]; } elsif ($ARGV[$i] eq "-f") { $MINNUMENT = $ARGV[$i+1]; } elsif ($ARGV[$i] eq "-x") { $MAXLEN = $ARGV[$i+1]; } elsif ($ARGV[$i] eq "-o") { $do_unigram_1 = 1; } elsif ($ARGV[$i] eq "-n") { $do_one_noise = 1; } elsif ($ARGV[$i] eq "-w") { $do_wordsonly = 1; } elsif ($ARGV[$i] eq "-t") { if ($#ARGV >= ($i+1)) { $ext = $ARGV[$i+1]; } } elsif ($ARGV[$i] eq "-c") { my $j=$i+1; while (($j<=$#ARGV) & (substr($ARGV[$j],0,1) ne "-")) { $convfiles[1+$#convfiles] = $ARGV[$j]; $j++; } if (($#convfiles>0) | (-e ($dirr.$convfiles[0].".moves.normpitch"))) { @convs = @convfiles; } else { open (TMP,$convfiles[0]); while () { @tmp = split /[ \n\t\r]+/; foreach $a(@tmp) { if (length($a) > 0) { $convs[1+$#convs] = $a; } } } } } } $write = ""; if ($MODE==0) {$write=">";} open (FEAT,$write.$proj); my $count = 2; my %featind = {}; # Features index (indexing starts at 2, so that any the first index (1) effectively means 'none of the rest') $featind{"NOTINTRAINING"} = 1; # $featind{"NUMWORDS"} = 2; if ($MODE==1) { while (my $line = ) { if ($line =~ /\[([\w\W]+)\]/) { $textfeat = $1; $featind{$1} = $count; $count++; } } } my @classes = (); my @moves = (); my %labelswhere = (); # $labelswhere{$blah} is an array of labels of moves where the features $blah appears (only used in training mode) foreach $conv (@convs) { # print "$conv\n"; open (IN,$conv.".moves.normpitch"); if (1==$MODE) { $outfilename = "$conv"."_text.dat"; if (length($ext)>0) { $outfilename = "$conv"."_text_".$ext.".dat"; } print "$outfilename\n"; open (FEATOUT, ">$outfilename") ; # print FEATOUT "% ", $count-1, "\n"; } while ($line = ) { if ($line =~ "0)) { # print "adding $classthismove for the feature <$f> in the DA <<$wordsthismove>> to an empty array, getting the one below\n"; $labelswhere{$f} = [$classthismove]; } else { # printarray(@{$labelswhere{$f}}); # print "adding $classthismove for the feature <$f> in the DA <<$wordsthismove>> to the array above, getting the one below\n"; push (@{$labelswhere{$f}}, $classthismove); } # printarray(@{$labelswhere{$f}}); # print "\n\n"; } } elsif (1==$MODE) # if testing { print FEATOUT $labelmap{$classthismove}, " "; my %featsthismovecount = (); $featsthismovecount{"NOTINTRAINING"} = 0; foreach $f(@featsthismove) { if (exists($featind{$f})) { if (exists ($featsthismovecount{$f})) { $featsthismovecount{$f}++; # if this is a feature, add to its count } else { $featsthismovecount{$f} = 1; } } elsif ((! ($f=~/\_/)) & (substr($f,length($f)-1,1) ne "1")) { $featsthismovecount{"NOTINTRAINING"}++; # if this a unigram that's not a feature add to the count of the "not-seen-before" words } } foreach $f (keys (%featsthismovecount)) { if ($featsthismovecount{$f}>0) { print FEATOUT $featind{$f},":",$featsthismovecount{$f}, " "; } } # print FEATOUT " % <$wordsthismove> "; # foreach $f (keys (%featsthismovecount)) # { # print FEATOUT " [$f]:",$featsthismovecount{$f} if ($featsthismovecount{$f}>0); # } print FEATOUT " \n"; } } } close (IN); close(FEATOUT) if (1==$MODE); } if (0==$MODE) { # @classes has the class labels foreach $feature (keys (%labelswhere)) { @a = @{$labelswhere{$feature}}; $numentries = $#a+1; @classesoffeatures = @a; @classesoffeatures = map {$labelmap{$_}} @a; $relent = getentropy (@classesoffeatures); if (($relent<$MAXRELENT) & ($numentries >= $MINNUMENT)) { print FEAT " $numentries $relent # [$feature]\n"; } } close (FEAT); } ########################################## sub move2feat { my ($thismove, $MAXLEN) = @_; # thismove is the string of words in this move @parts = split /[\n\t\r ]+/, $thismove; @parts = grep {length($_)>0} @parts; my @featsthismove = (); # features found in this move # remove stammering? (not implemented) # replace NOISE type by common NOISE if ($do_one_noise <= 1) { for ($j=0; $j<=$#parts; $j++) { $wd = $parts[$j]; if ($wd =~ "NOISE") { $parts[$j] = "NOISE"; } } } @oldparts = @parts; if ($#parts>=1) { @wordparts = (); for ($j=0; $j<=$#parts; $j++) { if ($do_wordsonly == 1) { if ( ($parts[$j] !~ "NOISE") & ($parts[$j] !~ "SILENCE")) { $wordparts[1+$#wordparts] = $parts[$j]; } } else { $wordparts[1+$#wordparts] = $parts[$j]; } } if ($#wordparts < 0) { $wordparts[0] = $parts[$#parts]; } @parts = @wordparts; } # now have @parts and @uniqparts containing the words (excluding NOISE, SILENCE) and unique words in each move # find unique elements of move @uniqparts = uniq (@parts); @featsthismove = @uniqparts; # throw in unigrams .. but only do that for unigrams in the unique part # throw in n-grams, for n from bigrams upto and including MAXLEN, # and this time use all parts, not uniqparts for ($k=2; $k<=$MAXLEN; $k++) { for ($j=0; $j<=($#parts-$k+1); $j++) { $wd = $parts[$j]; for (my $i_=1; $i_<=$k-1; $i_++) { $wd = $wd."_".($parts[$j+$i_]); } $featsthismove[1+$#featsthismove] = $wd; } } ## throw in unigram_1 if move has just one element if (($do_unigram_1) & ($#parts==0)) { $wd = $parts[0]."_1"; $featsthismove[1+$#featsthismove] = $wd; } # print "---> $thismove \n--->"; # printarray(@featsthismove); # print "\n"; return @featsthismove; } ################################## sub getentropy # assumes getting array of integers # returns relative entropy { my @a = @_; return 0 if ($#a<0); my $ent = 0; my @count = [0,0,0,0,0,0,0,0,0,0,0,0,0,0]; # actually 14 of these, not 13, as first one isnt used (indices start at 1) foreach $b(@a) { $count[$b]++; } for (my $i=1; $i<=13; $i++) { $b = $count[$i]; $p = $b/ ($#a+1); if ($p>0) { $ent -= $p * log($p);} } return $ent / log(13); } sub getcount { my @a = @_; my %count = {}; foreach $b(@a) { if (exists($count{$b})) {$count{$b} = $count{$b}+1;} else {$count{$b}=1;} } return \%count; } sub uniq { my @parts = @_; my %uniqinmove = {}; my @uniqparts = (); for (my $j=0; $j<=$#parts; $j++) { if (exists($uniqinmove{$parts[$j]})) { $uniqinmove{$parts[$j]}++; } else { $uniqinmove{$parts[$j]} = 1; $uniqparts[1+$#uniqparts] = $parts[$j]; } # print "$parts[$j] --> ",$uniqinmove{$parts[$j]},"\n"; } return @uniqparts; } sub printarray2string { my @a = @_; my $s = "["; if ($#a < 0) { $s = "[]\n"; } else { for (my $i=0; $i<=$#a; $i++) { # print "[", $i, " : ", $a[$i], "] "; $s = $s.$a[$i]; if ($i<$#a) {$s = $s." ";} } $s = $s."]"; } return ($s); } sub printarray # print elements of this array { my @a = @_; if ($#a < 0) { print "[]\n"; } else { for (my $i=0; $i<=$#a; $i++) { print "[", $i, " : ", $a[$i], "] "; # print $a[$i]," "; } print "\n"; } }