#!/usr/local/bin/perl -w # # QUICK AND DIRTY SCRIPT! BEWARE HARDCODED PATHS! # # Given a AMI transcript file in NXT xml segment/word format # convert to .ref list format. The ref format consists of # "session-channel_1_start_end word0 word1 ... wordn" where session is # a unique meeting ID, channel is a channel number starting at 1, and # start and end are times in milliseconds padded to 7 characters. For # example, ES2002a-2_1_2620635_2623385 # # The NXT format consists of two files, a segment file giving the # start and end time of segments and a list of word ids within the # segment, and a word file consisting of the text of the words and the # start/end times of the words (presumably from a forced alignment). # # This version uses the "word" files rather than the "subword" files. # It discards disfluency markers (isloated -), adds a final - to words # marked as truncated, and does some other random clean up. # # There's a simple word mapping file (e.g. to convert British to American). # For an example, see "ami.map". # # The dictionary is only used to handle hyphenation. # # The exact logic is: # If a word exists in Mapping, use the mapped version. # If the word is hyphenated "aaa-bbb": # If "aaabbb" is in dict, return "aaabbb" # Otherwise, return "aaa bbb" # # The script requires XML::Parser, available from CPAN. # use strict; use FileHandle; use Getopt::Std; use XML::Parser; my(%Mappings, %Dictionary, %Words, @Words, $TopDir, $SegmentDir, $WordDir, $Tag); my(%CurWord, $CurText, %CurSeg, @CurWords, $Session); use vars qw($opt_d $opt_h $opt_m); # CHANGE THIS TO POINT TO AMI CORPUS ANNOTATIONS $TopDir = '/n/bourbon/xd/drspeech/data/ami/amicorpus/annotations'; $SegmentDir = "$TopDir/segments"; $WordDir = "$TopDir/words"; $opt_h = 0; getopts("d:hm:") || usage(); usage() if $opt_h; $Tag = shift or usage(); if ($Tag =~ /^(.*)\.[A-Z]$/) { $Session = $1; } else { print STDERR "\nUnexpected AMI tag '$Tag' (you need the channel, e.g. IS1007a.A)\n\n"; exit; } if (defined($opt_m)) { read_mappings(); } if (defined($opt_d)) { read_dictionary(); } ReadWords(); nxt2ref(); sub ReadWords { my($p, $id, $windex); $p = new XML::Parser(Handlers => { Start => \&StartTagWords, End => \&EndTagWords, Char => \&CharWords}); $p->parsefile("$WordDir/$Tag.words.xml"); # Convert %Words to @Words foreach $id (keys %Words) { if ($id =~ /^$Tag\.words([0-9]+)$/) { $windex = $1; $Words[$windex] = $Words{$id}; } else { print STDERR "\nUnexpected id '$id'\n\n"; exit; } } } sub StartTagWords { my($expat, $elem, %attr) = @_; if ($elem ne 'w' && $elem ne 'gap' && $elem ne 'nite:root' && $elem ne 'vocalsound' && $elem ne 'disfmarker') { print STDERR "\nUnexpected element '$elem'.\n\n"; exit; } if ($elem eq 'w' || $elem eq 'vocalsound') { %CurWord = %attr; $CurText = ''; } } # Vocal sound laugh => [laugh] # Vocal sound, any other => @reject@ # Punctuation is not included # Truncated words => word- # Gaps are ignored (is there a gap in the SRI output?) # Disfluency markers (isolated -) are ignored. sub EndTagWords { my($expat, $elem) = @_; if ($elem eq 'vocalsound' && $CurWord{type} eq 'laugh') { $Words{$CurWord{'nite:id'}} = '[laugh]'; $CurText = ''; } elsif ($elem eq 'vocalsound') { $Words{$CurWord{'nite:id'}} = '@reject@'; $CurText = ''; } elsif ($elem eq 'w' && exists($CurWord{punc}) && $CurWord{punc} eq 'true') { # Nothing. Do not output punctuation $CurText = ''; } elsif ($elem eq 'w' && exists($CurWord{trunc}) && $CurWord{trunc} eq 'true') { # Insert fragments with a trailing -, since some occur in the dict. $Words{$CurWord{'nite:id'}} = $CurText . '-'; $CurText = ''; } elsif ($elem eq 'w') { $Words{$CurWord{'nite:id'}} = $CurText; $CurText = ''; } elsif ($elem eq 'gap') { # Nothing $CurText = ''; } elsif ($elem eq 'nite:root') { # Nothing $CurText = ''; } elsif ($elem eq 'disfmarker') { if ($CurText !~ /^\s*$/) { print STDERR "\nGot unexpected text '$CurText' in disfmarker.\n\n"; exit; } $CurText = ''; } else { print STDERR "\nGot unexpected end tag in tag $elem.\n\n"; exit; } } sub CharWords { my($expat, $string) = @_; $CurText .= $string; } sub nxt2ref { my($p); $p = new XML::Parser(Handlers => { Start => \&StartTagSeg, End => \&EndTagSeg, Char => \&CharSeg}); $p->parsefile("$SegmentDir/$Tag.segments.xml"); } ############################################################ # # Segment handlers. # sub StartTagSeg { my($expat, $elem, %attr) = @_; my($id, $ii, $start, $end); if ($elem ne 'segment' && $elem ne 'nite:child' && $elem ne 'nite:root') { print STDERR "\nUnexpected element '$elem' in segment.\n\n"; exit; } if ($elem eq 'segment') { %CurSeg = %attr; @CurWords = (); } elsif ($elem eq 'nite:child') { if ($attr{href} =~ /^$Tag\.words\.xml\#id\($Tag\.words([0-9]+)\)$/) { $id = $1; if (defined($Words[$id])) { push(@CurWords, $Words[$id]); } } elsif ($attr{href} =~ /^$Tag\.words\.xml\#id\($Tag\.words([0-9]+)\)\.\.id\($Tag\.words([0-9]+)\)$/) { $start = $1; $end = $2; for ($ii = $start; $ii <= $end; $ii++) { if (defined($Words[$ii])) { push(@CurWords, $Words[$ii]); } } } else { print STDERR "\nUnexpected nite:child href '$attr{href}'\n\n"; exit; } } } sub EndTagSeg { my($expat, $elem) = @_; my($words, $start, $end); if ($elem ne 'segment' && $elem ne 'nite:child' && $elem ne 'nite:root') { print STDERR "\nUnexpected end tag '$elem' in segment.\n\n"; exit; } if ($elem eq 'segment') { # Yeah, join/split/join isn't too efficient. $words = canonicalize_words(join(" ", @CurWords)); if (defined($words) && $words !~ /^\s*$/) { # Darn it, sometimes they use transcriber_start and sometimes they use starttime $start = $CurSeg{transcriber_start} || $CurSeg{starttime}; $end = $CurSeg{transcriber_end} || $CurSeg{endtime}; printf("%s-%d_1_%07d_%07d $words\n", $Session, $CurSeg{channel}, int($start*1000), int($end*1000)); } @CurWords = ''; } } sub testcurseg { my($str) = @_; if (!exists($CurSeg{$str}) || !defined($CurSeg{$str})) { print STDERR "\n'$str' undefined in segment\n\n"; foreach my $key (keys %CurSeg) { print STDERR "$key = $CurSeg{$key}\n"; } exit; } } sub CharSeg { my($expat, $string) = @_; if ($string !~ /^\s*$/) { print STDERR "\nUnexpected text in segment file: '$string'\n\n"; exit; } } sub canonicalize_words { my($words) = @_; my($word, $out, @out); $words = lc($words); $words =~ s/\([^\)]*\)/ \@reject\@ /g; foreach $word (split(/\s+/, $words)) { push(@out, canonicalize_word($word)); } $out = join(" ", @out); $out =~ s/\s\s+/ /g; # Many cases of "it 's", "Play-Doh 's", etc. Remove the space bewteen 's and previous. # Leave the case alone where the previous word also has a '. $out =~ s/([a-z-]+) \'s\b/$1\'s/g; $out =~ s/([a-z-]+) \'ll\b/$1\'ll/g; return $out; } sub canonicalize_word { my($word) = @_; my($first, $rest, $try); if (exists($Mappings{$word})) { return $Mappings{$word}; } # If hypenated, remove the first hyphen, canonicalize, and # see if it's in the dictionary. If so, use it. If not, replace # hyphen with a space, yielding two words. Canonicalize each, # a return both. NOTE: This means that canonicalize_word can, # in fact, return more than one word. # # This will probably interact badly with acronyms. if (defined($opt_d) && $word =~ /^([^\-]+)-(.+)$/) { $first = $1; $rest = $2; $try = canonicalize_word("$first$rest"); if (exists($Dictionary{$try})) { return $try; } return canonicalize_word($first) . " " . canonicalize_word($rest); } # Some versions of transcripts have isolated punctuation. Remove it. $word =~ s/^\s*[.,?!\$\#]+\s*$//; # If an acronym, handle it if ($word =~ /_/) { $word =~ s/_/. /g; # Add a ' if needed $word =~ s/\. ([^.]+)$/\.\'$1/; # Take out extra if it was there. There should be a cleaner way. $word =~ s/\'\'/\'/g; } return $word; } sub read_mappings { my($fh, $line); $fh = new FileHandle($opt_m) or die "Couldn't open mapping file '$opt_m': $!, stopped"; while ($line = <$fh>) { if ($line =~ /^\s*(.*?)\s*::\s*(.*?)\s*$/) { $Mappings{lc($1)} = lc($2); } elsif ($line !~ /^\s*\#/ && $line !~ /^\s*$/) { print STDERR "Unexpected line '$line' in mapping file '$opt_m'\n"; exit(-1); } } $fh->close(); } sub read_dictionary { my($fh, $line); $fh = new FileHandle($opt_d) or die "Couldn't open dictionary file '$opt_d': $!, stopped"; while ($line = <$fh>) { chop($line); $Dictionary{$line} = 1; } $fh->close(); } sub usage { print STDERR <<'EndOfUsage'; Usage: nxt2ref.pl -m ami_dev.map -d meetings_2004-2.vocab ES2012c.D -m ami.map Mapping file. -d meetings_2004-2.vocab Dictionary (only used for hyphenation rules). EndOfUsage exit(0); }