This object contains the event-based parsing code for FASTA format reports.
BEGIN {
    # Element name => mode name ('result', 'hit' or 'hsp').
    %MODEMAP = (
        'FastaOutput' => 'result',
        'Hit'         => 'hit',
        'Hsp'         => 'hsp',
    );

    # Element name => field name. Hsp_*, Hit_* and FastaOutput_* entries map
    # to a plain string; Parameters_* and Statistics_* entries (added below)
    # map to a one-pair hash reference instead.
    %MAPPING = (
        # per-HSP fields
        'Hsp_bit-score'   => 'bits',
        'Hsp_score'       => 'score',
        'Hsp_sw-score'    => 'swscore',
        'Hsp_evalue'      => 'evalue',
        'Hsp_query-from'  => 'querystart',
        'Hsp_query-to'    => 'queryend',
        'Hsp_hit-from'    => 'hitstart',
        'Hsp_hit-to'      => 'hitend',
        'Hsp_positive'    => 'conserved',
        'Hsp_identity'    => 'identical',
        'Hsp_gaps'        => 'gaps',
        'Hsp_hitgaps'     => 'hitgaps',
        'Hsp_querygaps'   => 'querygaps',
        'Hsp_qseq'        => 'queryseq',
        'Hsp_hseq'        => 'hitseq',
        'Hsp_midline'     => 'homolseq',
        'Hsp_align-len'   => 'hsplen',
        'Hsp_query-frame' => 'queryframe',
        'Hsp_hit-frame'   => 'hitframe',

        # per-hit fields
        'Hit_id'        => 'hitname',
        'Hit_len'       => 'hitlen',
        'Hit_accession' => 'hitacc',
        'Hit_def'       => 'hitdesc',
        'Hit_signif'    => 'hitsignif',
        'Hit_score'     => 'hitscore',

        # report-level fields
        'FastaOutput_program'   => 'programname',
        'FastaOutput_version'   => 'programver',
        'FastaOutput_query-def' => 'queryname',
        'FastaOutput_query-len' => 'querylen',
        'FastaOutput_db'        => 'dbname',
        'FastaOutput_db-len'    => 'dbsize',
        'FastaOutput_db-let'    => 'dblets',
    );

    # Parameters_<suffix> => { 'param' => <name> }
    my %param_name_for = (
        'matrix'      => 'matrix',
        'expect'      => 'expect',
        'include'     => 'include',
        'sc-match'    => 'match',
        'sc-mismatch' => 'mismatch',
        'gap-open'    => 'gapopen',
        'gap-ext'     => 'gapext',
        'word-size'   => 'wordsize',
        'ktup'        => 'ktup',
        'filter'      => 'filter',
    );
    for my $suffix ( keys %param_name_for ) {
        $MAPPING{"Parameters_$suffix"} = { 'param' => $param_name_for{$suffix} };
    }

    # Statistics_<suffix> => { 'stat' => <name> }
    my %stat_name_for = (
        'db-num'    => 'dbentries',
        'db-len'    => 'dbletters',
        'hsp-len'   => 'hsplength',
        'eff-space' => 'effectivespace',
        'kappa'     => 'kappa',
        'lambda'    => 'lambda',
        'entropy'   => 'entropy',
    );
    for my $suffix ( keys %stat_name_for ) {
        $MAPPING{"Statistics_$suffix"} = { 'stat' => $stat_name_for{$suffix} };
    }
} |
# next_result
#
# Reads lines via $self->_readline and turns one FASTA-family report into a
# stream of start_element / element / characters / end_element calls on
# $self, bracketed by start_document / end_document.
#
# Args    : none (invoked as a method on the parser object)
# Returns : the value of $self->end_document() once a report has been
#           consumed (trailer line seen, next report header seen, or input
#           exhausted after a header was seen);
#           undef when the first significant line is not recognised;
#           nothing when the input is exhausted before any header is seen.
sub next_result {
    my ($self) = @_;

    my $seentop = 0;    # set once the report header line has been handled
    my @hit_signifs;    # one [ last column, second-to-last column ] pair per
                        # row of the "The best scores are:" table

    # A line starting a new report. The first form is matched
    # case-insensitively, the second is not; both capture the program in $1.
    my $search_header_re =
      qr/(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i;
    my $compare_header_re = qr/(\S+) compares a/;

    $self->start_document();

    while ( defined( $_ = $self->_readline ) ) {

        # Whitespace-only lines matter only while collecting alignment text.
        next if ( !$self->in_element('hsp') && /^\s+$/ );

        if ( $_ =~ $search_header_re || $_ =~ $compare_header_re ) {
            if ($seentop) {

                # Header of the following report: hand it back and finish
                # the current one.
                $self->_pushback($_);
                $self->end_element( { 'Name' => 'FastaOutput' } );
                return $self->end_document();
            }
            $self->{'_reporttype'} = $1;
            $self->start_element( { 'Name' => 'FastaOutput' } );
            $seentop = 1;
            $self->element(
                {
                    'Name' => 'FastaOutput_program',
                    'Data' => $self->{'_reporttype'}
                }
            );

            # The line after the header is searched for "version <token>".
            $_ = $self->_readline();
            my ($version) = defined $_ ? /version\s+(\S+)/ : ();
            $version = '' unless defined $version;
            $self->element(
                {
                    'Name' => 'FastaOutput_version',
                    'Data' => $version
                }
            );

            # Scan forward to the ">query" (or "vs ...") line; the line just
            # before it is expected to hold "<name>: <length> aa|nt".
            my $last;
            while ( defined( $_ = $self->_readline() ) ) {
                if ( /\s+>(.+)/ || /^\s*vs\s+/ ) {
                    my $querydef = $1;
                    if ( defined $last
                        && $last =~ /(\S+)[:,]\s*(\d+)\s+(aa|nt)/ )
                    {
                        my ( $name, $qlen, $type ) = ( $1, $2, $3 );

                        # A generic 'FASTA' program name is refined using
                        # the residue type of the query.
                        if (   $self->{'_reporttype'}
                            && $self->{'_reporttype'} eq 'FASTA' )
                        {
                            if ( $type eq 'nt' ) {
                                $self->{'_reporttype'} = 'FASTN';
                            }
                            elsif ( $type eq 'aa' ) {
                                $self->{'_reporttype'} = 'FASTP';
                            }
                        }
                        $self->element(
                            {
                                'Name' => 'FastaOutput_query-def',
                                'Data' => $querydef || $name
                            }
                        );
                        $self->element(
                            {
                                'Name' => 'FastaOutput_query-len',
                                'Data' => $qlen
                            }
                        );
                    }
                    else {
                        $self->element(
                            {
                                'Name' => 'FastaOutput_query-def',
                                'Data' => $querydef
                            }
                        );
                        $self->warn("unable to find and set query length");
                    }
                    last;
                }
                $last = $_;
            }

            # The database name follows "vs" on the previous line, the
            # current line, or the next one (which is consumed either way).
            if (   ( defined $last && $last =~ /^\s*vs\s+(\S+)/ )
                || ( defined $_ && /^\s*vs\s+(\S+)/ )
                || ( defined( $_ = $self->_readline() ) && /^\s*vs\s+(\S+)/ ) )
            {
                $self->element(
                    {
                        'Name' => 'FastaOutput_db',
                        'Data' => $1
                    }
                );
            }
        }
        elsif (/(\d+) residues in\s+(\d+)\s+sequences/) {
            my ( $residues, $sequences ) = ( $1, $2 );
            $self->element(
                { 'Name' => 'FastaOutput_db-let', 'Data' => $residues } );
            $self->element(
                { 'Name' => 'FastaOutput_db-len', 'Data' => $sequences } );
            $self->element(
                { 'Name' => 'Statistics_db-len', 'Data' => $residues } );
            $self->element(
                { 'Name' => 'Statistics_db-num', 'Data' => $sequences } );
        }
        elsif (/Lambda=\s+(\S+)/) {
            $self->element(
                { 'Name' => 'Statistics_lambda', 'Data' => $1 } );
        }
        elsif (/^\s*(Smith-Waterman).+?(\S+)\s*matrix/) {

            # The non-greedy ".+?" lets (\S+) capture the whole token before
            # "matrix"; a greedy ".+" leaves it only the token's last
            # character.
            my ( $program, $matrix ) = ( $1, $2 );
            $self->element(
                { 'Name' => 'Parameters_matrix', 'Data' => $matrix } );
            $self->{'_reporttype'} = $program;
            $self->element(
                {
                    'Name' => 'FastaOutput_program',
                    'Data' => $self->{'_reporttype'}
                }
            );
        }
        elsif (/The best scores are:/) {

            # Summary table: keep the last two whitespace-separated columns
            # of every row up to the next whitespace-only line.
            while ( defined( $_ = $self->_readline() )
                && !/^\s+$/ )
            {
                my @line = split;
                push @hit_signifs, [ pop @line, pop @line ];
            }
        }
        elsif (/^\s*([T]?FAST[XYAF]).+,\s*(\S+)\s*matrix.+ktup:\s*(\d+)/) {
            my ( $program, $matrix, $ktup ) = ( $1, $2, $3 );
            $self->element(
                { 'Name' => 'Parameters_matrix', 'Data' => $matrix } );
            $self->element(
                { 'Name' => 'Parameters_ktup', 'Data' => $ktup } );

            # Keep an already refined FASTP/FASTN program name.
            $self->{'_reporttype'} = $program
              if ( $self->{'_reporttype'} !~ /FAST[PN]/i );
            $self->element(
                {
                    'Name' => 'FastaOutput_program',
                    'Data' => $self->{'_reporttype'}
                }
            );
        }
        elsif (/gap\-pen:\s+([\-\+]?\d+)\/\s+([\-\+]?\d+).+width:\s+(\d+)/) {
            my ( $gapopen, $gapext, $width ) = ( $1, $2, $3 );
            $self->element(
                { 'Name' => 'Parameters_gap-open', 'Data' => $gapopen } );
            $self->element(
                { 'Name' => 'Parameters_gap-ext', 'Data' => $gapext } );
            $self->element(
                { 'Name' => 'Parameters_word-size', 'Data' => $width } );
        }
        elsif (/^>>(.+) \((\d+)\s*(aa|nt)\)$/) {
            my ( $hitline, $hitlen ) = ( $1, $2 );

            # A ">>" line opens a new hit; close whatever is still open.
            if ( $self->in_element('hsp') ) {
                $self->end_element( { 'Name' => 'Hsp' } );
            }
            if ( $self->in_element('hit') ) {
                $self->end_element( { 'Name' => 'Hit' } );
            }
            $self->start_element( { 'Name' => 'Hit' } );
            $self->element( { 'Name' => 'Hit_len', 'Data' => $hitlen } );

            my ( $id, $desc ) = split( /\s+/, $hitline, 2 );
            $self->element( { 'Name' => 'Hit_id', 'Data' => $id } );

            # Pair this hit with the next unused row of the summary table.
            my $v = shift @hit_signifs;
            if ( defined $v ) {
                $self->element(
                    { 'Name' => 'Hit_signif', 'Data' => $v->[0] } );
                $self->element(
                    { 'Name' => 'Hit_score', 'Data' => $v->[1] } );
            }

            # Accession: last '|'-separated field of the id, minus any
            # trailing ".<digits>".
            my @pieces = split( /\|/, $id );
            my $acc = pop @pieces;
            $acc =~ s/\.\d+$//;
            $self->element( { 'Name' => 'Hit_accession', 'Data' => $acc } );
            $self->element( { 'Name' => 'Hit_def',       'Data' => $desc } );

            $self->start_element( { 'Name' => 'Hsp' } );

            # Next line: "... Z-score: <s> bits: <b> E(): <e>".
            $_ = $self->_readline();
            my ( $score, $bits, $e ) =
              (/Z-score:\s*(\S+)\s*bits:\s*(\S+)\s+E\(\):\s*(\S+)/);
            $self->element( { 'Name' => 'Hsp_score',     'Data' => $score } );
            $self->element( { 'Name' => 'Hsp_evalue',    'Data' => $e } );
            $self->element( { 'Name' => 'Hsp_bit-score', 'Data' => $bits } );

            # Line after that: Smith-Waterman score, identity percentages,
            # overlap length and coordinates.
            $_ = $self->_readline();
            if (/Smith-Waterman score:\s*(\d+)/) {
                $self->element(
                    { 'Name' => 'Hsp_sw-score', 'Data' => $1 } );
            }
            if (
/(\d+\.\d+)\%\s*identity\s*\((\d+\.\d+)\%\s*ungapped\)\s*in\s*(\d+)\s+(aa|nt)\s+overlap\s*\((\d+)\-(\d+):(\d+)\-(\d+)\)/
              )
            {
                my ( $identper, $gapper, $len, $querystart,
                    $queryend, $hitstart, $hitend )
                  = ( $1, $2, $3, $5, $6, $7, $8 );

                # Percentages are converted to counts, rounded up.
                # NOTE(review): Hsp_gaps is derived from the "ungapped"
                # percentage and Hsp_positive reuses the identity count --
                # kept as-is, confirm this is intended.
                my $ident = POSIX::ceil( ( $identper / 100 ) * $len );
                my $gaps  = POSIX::ceil( ( $gapper / 100 ) * $len );
                $self->element(
                    { 'Name' => 'Hsp_gaps', 'Data' => $gaps } );
                $self->element(
                    { 'Name' => 'Hsp_identity', 'Data' => $ident } );
                $self->element(
                    { 'Name' => 'Hsp_positive', 'Data' => $ident } );
                $self->element(
                    { 'Name' => 'Hsp_align-len', 'Data' => $len } );
                $self->element(
                    { 'Name' => 'Hsp_query-from', 'Data' => $querystart } );
                $self->element(
                    { 'Name' => 'Hsp_query-to', 'Data' => $queryend } );
                $self->element(
                    { 'Name' => 'Hsp_hit-from', 'Data' => $hitstart } );
                $self->element(
                    { 'Name' => 'Hsp_hit-to', 'Data' => $hitend } );
            }
            else {
                $self->warn("unable to parse FASTA score line: $_");
            }
        }
        elsif (/\d+\s*residues\s*in\s*\d+\s*query\s*sequences/) {

            # Trailer of the report: close open elements, skip the footer.
            if ( $self->in_element('hsp') ) {
                $self->end_element( { 'Name' => 'Hsp' } );
            }
            if ( $self->in_element('hit') ) {
                $self->end_element( { 'Name' => 'Hit' } );
            }
            while ( defined( $_ = $self->_readline() ) ) {
                last if (/^Function used was/);
                if ( $_ =~ $search_header_re || $_ =~ $compare_header_re ) {

                    # Header of the next report: give it back and stop.
                    # Without the 'last' the loop would read the same line
                    # again straight away (assuming _pushback feeds
                    # _readline, as the header branch above relies on).
                    $self->_pushback($_);
                    last;
                }
            }
            $self->end_element( { 'Name' => 'FastaOutput' } );
            return $self->end_document();
        }
        elsif ( $self->in_element('hsp') ) {

            # Alignment text. Lines are consumed in groups; within a group
            # position 1 fills $data[0] (emitted as Hsp_qseq), position 2
            # fills $data[1] (Hsp_midline) and position 3 fills $data[2]
            # (Hsp_hseq).
            my @data  = ( '', '', '' );
            my $count = 0;
            my $len   = 0;    # width of the "<label><spaces>" prefix
            while ( defined($_) ) {
                chomp;
                if (/residues in \d+\s+query\s+sequences/) {
                    $self->_pushback($_);
                    last;
                }
                if ( $count == 0 ) {

                    # first line of the group carries nothing we keep
                }
                elsif ( $count == 1 || $count == 3 ) {
                    if (/^(\S+\s+)(\S+)/) {
                        $len = length($1);
                        $data[ $count - 1 ] = $2;
                    }
                    elsif (/^\s+\d+/) {

                        # indented line starting with digits: does not
                        # advance the position within the group
                        $count--;
                    }
                    elsif ( /^\s+/ || length($_) == 0 ) {

                        # blank or indented line: nothing to record
                    }
                    else {
                        $self->warn(
                            "Unrecognized alignment line ($count) $_");
                    }
                }
                elsif ( $count == 2 ) {

                    # Middle line: text starts at the same column as the
                    # sequence on the line above.
                    if ( length($_) >= $len ) {
                        $data[ $count - 1 ] = substr( $_, $len );
                    }
                }
                last if ( $count++ >= 5 );
                $_ = $self->_readline();
            }
            if ( length( $data[0] ) > 0 ) {
                $self->characters(
                    { 'Name' => 'Hsp_qseq', 'Data' => $data[0] } );
                $self->characters(
                    { 'Name' => 'Hsp_midline', 'Data' => $data[1] } );
                $self->characters(
                    { 'Name' => 'Hsp_hseq', 'Data' => $data[2] } );
            }
        }
        else {
            if ( !$seentop ) {

                # NOTE(review): the offending line is echoed to the
                # currently selected output handle -- looks like leftover
                # debugging, left unchanged here.
                print;
                $self->warn("unrecognized FASTA Family report file!");
                return undef;
            }
        }
    }

    # Input ran out. If a report was in progress, close it the same way the
    # trailer branch does so that the parsed data is not dropped.
    if ($seentop) {
        if ( $self->in_element('hsp') ) {
            $self->end_element( { 'Name' => 'Hsp' } );
        }
        if ( $self->in_element('hit') ) {
            $self->end_element( { 'Name' => 'Hit' } );
        }
        $self->end_element( { 'Name' => 'FastaOutput' } );
        return $self->end_document();
    }
    return;
} |
The rest of the documentation details each of the object methods.
Internal methods are usually prefixed with an underscore (_).