BEGIN {
%MODEMAP = (
'BlastOutput' => 'result',
'Iteration' => 'iteration',
'Hit' => 'hit',
'Hsp' => 'hsp'
);
%MAPPING = (
'Hsp_bit-score' => 'HSP-bits',
'Hsp_score' => 'HSP-score',
'Hsp_evalue' => 'HSP-evalue',
'Hsp_n', => 'HSP-n',
'Hsp_pvalue' => 'HSP-pvalue',
'Hsp_query-from' => 'HSP-query_start',
'Hsp_query-to' => 'HSP-query_end',
'Hsp_hit-from' => 'HSP-hit_start',
'Hsp_hit-to' => 'HSP-hit_end',
'Hsp_positive' => 'HSP-conserved',
'Hsp_identity' => 'HSP-identical',
'Hsp_gaps' => 'HSP-hsp_gaps',
'Hsp_hitgaps' => 'HSP-hit_gaps',
'Hsp_querygaps' => 'HSP-query_gaps',
'Hsp_qseq' => 'HSP-query_seq',
'Hsp_hseq' => 'HSP-hit_seq',
'Hsp_midline' => 'HSP-homology_seq',
'Hsp_align-len' => 'HSP-hsp_length',
'Hsp_query-frame' => 'HSP-query_frame',
'Hsp_hit-frame' => 'HSP-hit_frame',
'Hsp_links' => 'HSP-links',
'Hsp_group' => 'HSP-hsp_group',
'Hsp_features' => 'HSP-hit_features',
'Hit_id' => 'HIT-name',
'Hit_len' => 'HIT-length',
'Hit_accession' => 'HIT-accession',
'Hit_def' => 'HIT-description',
'Hit_signif' => 'HIT-significance',
'Hit_score' => 'HIT-score',
'Hit_bits' => 'HIT-bits',
'Iteration_iter-num' => 'ITERATION-number',
'Iteration_converged' => 'ITERATION-converged',
'BlastOutput_program' => 'RESULT-algorithm_name',
'BlastOutput_version' => 'RESULT-algorithm_version',
'BlastOutput_algorithm-reference' => 'RESULT-algorithm_reference',
'BlastOutput_rid' => 'RESULT-rid',
'BlastOutput_query-def' => 'RESULT-query_name',
'BlastOutput_query-len' => 'RESULT-query_length',
'BlastOutput_query-acc' => 'RESULT-query_accession',
'BlastOutput_query-gi' => 'RESULT-query_gi',
'BlastOutput_querydesc' => 'RESULT-query_description',
'BlastOutput_db' => 'RESULT-database_name',
'BlastOutput_db-len' => 'RESULT-database_entries',
'BlastOutput_db-let' => 'RESULT-database_letters',
'BlastOutput_inclusion-threshold' => 'RESULT-inclusion_threshold',
'Parameters_matrix' => { 'RESULT-parameters' => 'matrix' },
'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
'Parameters_include' => { 'RESULT-parameters' => 'include' },
'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
'Statistics_db-len' => { 'RESULT-statistics' => 'dbentries' },
'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
'Statistics_hsp-len' =>
{ 'RESULT-statistics' => 'effective_hsplength' },
'Statistics_query-len' => { 'RESULT-statistics' => 'querylength' },
'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace' },
'Statistics_eff-spaceused' =>
{ 'RESULT-statistics' => 'effectivespaceused' },
'Statistics_eff-dblen' =>
{ 'RESULT-statistics' => 'effectivedblength' },
'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' },
'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' },
'Statistics_entropy' => { 'RESULT-statistics' => 'entropy' },
'Statistics_gapped_kappa' => { 'RESULT-statistics' => 'kappa_gapped' },
'Statistics_gapped_lambda' =>
{ 'RESULT-statistics' => 'lambda_gapped' },
'Statistics_gapped_entropy' =>
{ 'RESULT-statistics' => 'entropy_gapped' },
'Statistics_framewindow' =>
{ 'RESULT-statistics' => 'frameshiftwindow' },
'Statistics_decay' => { 'RESULT-statistics' => 'decayconst' },
'Statistics_hit_to_db' => { 'RESULT-statistics' => 'Hits_to_DB' },
'Statistics_num_suc_extensions' =>
{ 'RESULT-statistics' => 'num_successful_extensions' },
'Statistics_length_adjustment' => { 'RESULT-statistics' => 'length_adjustment' },
'Statistics_number_of_hsps_better_than_expect_value_cutoff_without_gapping' =>
{ 'RESULT-statistics' => 'number_of_hsps_better_than_expect_value_cutoff_without_gapping' },
'Statistics_number_of_hsps_gapped' => { 'RESULT-statistics' => 'number_of_hsps_gapped' },
'Statistics_number_of_hsps_successfully_gapped' => { 'RESULT-statistics' => 'number_of_hsps_successfully_gapped' },
'Statistics_DFA_states' => { 'RESULT-statistics' => 'num_dfa_states' },
'Statistics_DFA_size' => { 'RESULT-statistics' => 'dfa_size' },
'Statistics_noprocessors' =>
{ 'RESULT-statistics' => 'no_of_processors' },
'Statistics_neighbortime' =>
{ 'RESULT-statistics' => 'neighborhood_generate_time' },
'Statistics_starttime' => { 'RESULT-statistics' => 'start_time' },
'Statistics_endtime' => { 'RESULT-statistics' => 'end_time' },
);
for my $frame ( 0 .. 3 ) {
for my $strand ( '+', '-' ) {
for my $ind (
qw(length efflength E S W T X X_gapped E2
E2_gapped S2)
)
{
$MAPPING{"Statistics_frame$strand$frame\_$ind"} =
{ 'RESULT-statistics' => "Frame$strand$frame\_$ind" };
}
for my $val (qw(lambda kappa entropy )) {
for my $type (qw(used computed gapped)) {
my $key = "Statistics_frame$strand$frame\_$val\_$type";
my $val =
{ 'RESULT-statistics' =>
"Frame$strand$frame\_$val\_$type" };
$MAPPING{$key} = $val;
}
}
}
}
for my $stats (
qw(T A X1 X2 X3 S1 S2 X1_bits X2_bits X3_bits
S1_bits S2_bits num_extensions
num_successful_extensions
seqs_better_than_cutoff
posted_date
search_cputime total_cputime
search_actualtime total_actualtime
no_of_processors ctxfactor)
)
{
my $key = "Statistics_$stats";
my $val = { 'RESULT-statistics' => $stats };
$MAPPING{$key} = $val;
}
for my $param (
qw(span span1 span2 links warnings notes hspsepsmax
hspsepqmax topcomboN topcomboE postsw cpus wordmask
filter sort_by_pvalue sort_by_count sort_by_highscore
sort_by_totalscore sort_by_subjectlength noseqs gi qtype
qres V B Z Y M N)
)
{
my $key = "Parameters_$param";
my $val = { 'RESULT-parameters' => $param };
$MAPPING{$key} = $val;
}
$DEFAULT_BLAST_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
$MAX_HSP_OVERLAP = 2; $DEFAULTREPORTTYPE = 'BLASTP';
} |
sub next_result
{ my ($self) = @_;
my $v = $self->verbose;
my $data = '';
my $flavor = '';
$self->{'_seentop'} = 0; $self->{'_seentop'} = 0;
my ( $reporttype, $seenquery, $reportline, $reportversion );
my ( $seeniteration, $found_again );
my $incl_threshold = $self->inclusion_threshold;
my $bl2seq_fix;
$self->start_document(); my (@hit_signifs);
my $gapped_stats = 0; local $_ = "\n"; PARSER:
while ( defined( $_ = $self->_readline ) ) {
next if (/^\s+$/); next if (/CPU time:/);
next if (/^>\s*$/);
next if (/\Q[*]+\s+No hits found\s+[*]+\E/);
if (
/^((?:\S+?)?BLAST[NPX]?)\s+(.+)$/i || /^(P?GENEWISE|HFRAME|SWN|TSWN)\s+(.+)/i )
{
($reporttype, $reportversion) = ($1, $2);
if ($reportversion && $reportversion =~ m{WashU$}) {
$self->{'_wublast'}++;
}
$self->debug("blast.pm: Start of new report: $reporttype, $reportversion\n");
if ( $self->{'_seentop'} ) {
$self->_pushback($_);
last PARSER;
}
$self->_start_blastoutput;
if ($reporttype =~ /RPS-BLAST/) {
$reporttype .= '(BLASTP)'; }
$reportline = $_; $self->element(
{
'Name' => 'BlastOutput_program',
'Data' => $reporttype
}
);
$self->element(
{
'Name' => 'BlastOutput_version',
'Data' => $reportversion
}
);
$self->element(
{
'Name' => 'BlastOutput_inclusion-threshold',
'Data' => $incl_threshold
}
);
}
elsif(/^Reference:\s+(.*)$/) {
my $algorithm_reference = "$1\n";
$_ = $self->_readline;
while($_ !~ /^$/ && $_ !~ /^RID:/ && $_ !~ /^Database:/ && $_ !~ /^Query=/) {
$algorithm_reference .= "$_";
$_ = $self->_readline;
}
$self->_pushback($_);
$self->element(
{
'Name' => 'BlastOutput_algorithm-reference',
'Data' => $algorithm_reference
}
);
}
elsif(/^RID:\s+(.*)$/) {
my $rid = $1;
$self->element(
{
'Name' => 'BlastOutput_rid',
'Data' => $rid
}
);
}
elsif (/^(Searching|Results from round)/) {
next unless $1 =~ /Results from round/;
$self->debug("blast.pm: Possible psi blast iterations found...\n");
$self->in_element('hsp')
&& $self->end_element( { 'Name' => 'Hsp' } );
$self->in_element('hit')
&& $self->end_element( { 'Name' => 'Hit' } );
if ( defined $seeniteration ) {
$self->within_element('iteration')
&& $self->end_element( { 'Name' => 'Iteration' } );
$self->start_element( { 'Name' => 'Iteration' } );
}
else {
$self->start_element( { 'Name' => 'Iteration' } );
}
$seeniteration = 1;
}
elsif (/^Query=\s*(.*)$/) {
my $q = $1;
$self->debug("blast.pm: Query= found...$_\n");
my $size = 0;
if ( defined $seenquery ) {
$self->_pushback($_);
$self->_pushback($reportline) if $reportline;
last PARSER;
}
if ( !defined $reporttype ) {
$self->_start_blastoutput;
if ( defined $seeniteration ) {
$self->in_element('iteration')
&& $self->end_element( { 'Name' => 'Iteration' } );
$self->start_element( { 'Name' => 'Iteration' } );
}
else {
$self->start_element( { 'Name' => 'Iteration' } );
}
$seeniteration = 1;
}
$seenquery = $q;
$_ = $self->_readline;
while ( defined($_) ) {
if (/^Database:/) {
$self->_pushback($_);
last;
}
if ( /\((\-?[\d,]+)\s+letters.*\)/ || /^Length=(\-?[\d,]+)/ ) {
$size = $1;
$size =~ s/,//g;
last;
}
else {
$q .= ($q =~ /\w$/ && $_ =~ /^\w/) ? " $_" : $_;
$q =~ s/\s+/ /g; $q =~ s/^ | $//g;
}
$_ = $self->_readline;
}
chomp($q);
my ( $nm, $desc ) = split( /\s+/, $q, 2 );
$self->element(
{
'Name' => 'BlastOutput_query-def',
'Data' => $nm
}
) if $nm;
$self->element(
{
'Name' => 'BlastOutput_query-len',
'Data' => $size
}
);
defined $desc && $desc =~ s/\s+$//;
$self->element(
{
'Name' => 'BlastOutput_querydesc',
'Data' => $desc
}
);
my ( $gi, $acc, $version ) = $self->_get_seq_identifiers($nm);
$version = defined($version) && length($version) ? ".$version" : "";
$self->element(
{
'Name' => 'BlastOutput_query-acc',
'Data' => "$acc$version"
}
) if $acc;
}
elsif (/^>Unfiltered[+-]1$/) {
while($_ !~ /^Database:/) {
$self->debug("Bypassing features line: $_");
$_ = $self->_readline;
}
$self->_pushback($_);
}
elsif (/Sequences producing significant alignments:/) {
$self->debug("blast.pm: Processing NCBI-BLAST descriptions\n");
$flavor = 'ncbi';
if ( !$self->in_element('iteration') ) {
$self->start_element( { 'Name' => 'Iteration' } );
}
$self->element(
{
'Name' => 'BlastOutput_db-len',
'Data' => $self->{'_blsdb_length'}
}
) if $self->{'_blsdb_length'};
$self->element(
{
'Name' => 'BlastOutput_db-let',
'Data' => $self->{'_blsdb_letters'}
}
) if $self->{'_blsdb_letters'};
$self->element(
{
'Name' => 'BlastOutput_db',
'Data' => $self->{'_blsdb'}
}
) if $self->{'_blsdb_letters'};
my $h_regex;
my $seen_block;
DESCLINE:
while ( defined( my $descline = $self->_readline() ) ) {
if ($descline =~ m{^\s*$}) {
last DESCLINE if $seen_block;
next DESCLINE;
}
$seen_block++;
if ($descline =~ /^(\S+)\s+Begin:\s\d+\s+End:\s+\d+/xms) {
my ($id, $nextline) = ($1, $self->_readline);
$nextline =~ s{^!}{};
$descline = "$id $nextline";
}
if ($descline =~ /(?<!cor) # negative lookahead (\d*\.?(?:[\+\-eE]+)?\d+) # number (float or scientific notation) \s+ # space (\d*\.?(?:[\+\-eE]+)?\d+) # number (float or scientific notation) \s*$/xms) {
my ( $score, $evalue ) = ($1, $2);
$evalue =~ s/^e/1e/i;
my @line = split ' ',$descline;
pop @line, pop @line;
if ($3) { pop @line }
push @hit_signifs, [ $evalue, $score, shift @line, join( ' ', @line ) ];
} elsif ($descline =~ /^CONVERGED/i) {
$self->element(
{
'Name' => 'Iteration_converged',
'Data' => 1
}
);
} else {
$self->_pushback($descline); last DESCLINE;
}
}
}
elsif (/Sequences producing High-scoring Segment Pairs:/) {
$self->debug("blast.pm: Processing WU-BLAST descriptions\n");
$_ = $self->_readline();
$flavor = 'wu';
if ( !$self->in_element('iteration') ) {
$self->start_element( { 'Name' => 'Iteration' } );
}
while ( defined( $_ = $self->_readline() )
&& !/^\s+$/ )
{
my @line = split;
pop @line;
push @hit_signifs,
[ pop @line, pop @line, shift @line, join( ' ', @line ) ];
}
}
elsif (/^Database:\s*(.+)$/) {
$self->debug("blast.pm: Database: $1\n");
my $db = $1;
while ( defined( $_ = $self->_readline ) ) {
if (
/^\s+(\-?[\d\,]+|\S+)\s+sequences\; \s +(\-?[\d,]+|\S+)\s+ total\s+letters/ox ) { my ( $s, $l ) = ( $1, $2 ); $s =~ s/,//g;
$l =~ s/,//g;
$self->element(
{
'Name' => 'BlastOutput_db-len',
'Data' => $s
}
);
$self->element(
{
'Name' => 'BlastOutput_db-let',
'Data' => $l
}
);
$self->{'_blsdb'} = $db;
$self->{'_blsdb_length'} = $s;
$self->{'_blsdb_letters'} = $l;
last;
}
else {
chomp;
$db .= $_;
}
}
$self->element(
{
'Name' => 'BlastOutput_db',
'Data' => $db
}
);
}
elsif (/^>\s*(\S+)\s*(.*)?/) {
chomp;
$self->debug("blast.pm: Hit: $1\n");
$self->in_element('hsp')
&& $self->end_element( { 'Name' => 'Hsp' } );
$self->in_element('hit')
&& $self->end_element( { 'Name' => 'Hit' } );
if ( !$self->within_element('result') ) {
$self->_start_blastoutput;
$self->start_element( { 'Name' => 'Iteration' } );
}
elsif ( !$self->within_element('iteration') ) {
$self->start_element( { 'Name' => 'Iteration' } );
}
$self->start_element( { 'Name' => 'Hit' } );
my $id = $1;
my $restofline = $2;
$self->debug("Starting a hit: $1 $2\n");
$self->element(
{
'Name' => 'Hit_id',
'Data' => $id
}
);
my ($gi, $acc, $version ) = $self->_get_seq_identifiers($id);
$self->element(
{
'Name' => 'Hit_accession',
'Data' => $acc
}
);
HITTABLE:
while (my $v = shift @hit_signifs) {
my $tableid = $v->[2];
if ($tableid !~ m{\Q$id\E}) {
$self->debug("Hit table ID $tableid doesn't match current hit id $id, checking next hit table entry...\n");
next HITTABLE;
} else {
last HITTABLE;
}
}
while ( defined( $_ = $self->_readline() ) ) {
next if (/^\s+$/);
chomp;
if (/Length\s*=\s*([\d,]+)/) {
my $l = $1;
$l =~ s/\,//g;
$self->element(
{
'Name' => 'Hit_len',
'Data' => $l
}
);
last;
}
else {
if ($restofline !~ /\s$/) { s/^\s(?!\s)/\x01/; }
$restofline .= $_;
}
}
$restofline =~ s/\s+/ /g;
$self->element(
{
'Name' => 'Hit_def',
'Data' => $restofline
}
);
}
elsif (/\s+(Plus|Minus) Strand HSPs:/i) {
next;
}
elsif (
( $self->in_element('hit') || $self->in_element('hsp') )
&& m/Score\s*=\s*(\S+)\s*bits\s* # Bit score (?:\((\d+)\))?, # Raw score \s+Log\-Length\sScore\s*=\s*(\d+) # Log-Length score /ox ) { $self->in_element('hsp') && $self->end_element( { 'Name' => 'Hsp' } ); $self->start_element( { 'Name' => 'Hsp' } );
$self->debug( "Got paracel genewise HSP score=$1\n");
my ( $bits, $score, $evalue ) = ( $1, $2, $3 );
$evalue =~ s/^e/1e/i;
$self->element(
{
'Name' => 'Hsp_score',
'Data' => $score
}
);
$self->element(
{
'Name' => 'Hsp_bit-score',
'Data' => $bits
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => $evalue
}
);
}
elsif (
( $self->in_element('hit') || $self->in_element('hsp') )
&& m/Score\s*=\s*([^,\s]+), # Raw score \s*Expect\s*=\s*([^,\s]+), # E-value \s*P(?:\(\S+\))?\s*=\s*([^,\s]+) # P-value /ox ) { $self->in_element('hsp') && $self->end_element( { 'Name' => 'Hsp' } ); $self->start_element( { 'Name' => 'Hsp' } );
$self->debug( "Got paracel hframe HSP score=$1\n");
my ( $score, $evalue, $pvalue ) = ( $1, $2, $3 );
$evalue = "1$evalue" if $evalue =~ /^e/;
$pvalue = "1$pvalue" if $pvalue =~ /^e/;
$self->element(
{
'Name' => 'Hsp_score',
'Data' => $score
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => $evalue
}
);
$self->element(
{
'Name' => 'Hsp_pvalue',
'Data' => $pvalue
}
);
}
elsif (
( $self->in_element('hit') || $self->in_element('hsp') )
&& m/Score\s*=\s*(\S+)\s* # Bit score \(([\d\.]+)\s*bits\), # Raw score \s*Expect\s*=\s*([^,\s]+), # E-value \s*(?:Sum)?\s* # SUM P(?:\(\d+\))?\s*=\s*([^,\s]+) # P-value (?:\s*,\s+Group\s*\=\s*(\d+))? # HSP Group /ox ) { # wu-blast HSP parse $self->in_element('hsp') && $self->end_element( { 'Name' => 'Hsp' } ); $self->start_element( { 'Name' => 'Hsp' } );
my ( $score, $bits, $evalue, $pvalue, $group ) =
( $1, $2, $3, $4, $5 );
$evalue =~ s/^e/1e/i;
$pvalue =~ s/^e/1e/i;
$self->element(
{
'Name' => 'Hsp_score',
'Data' => $score
}
);
$self->element(
{
'Name' => 'Hsp_bit-score',
'Data' => $bits
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => $evalue
}
);
$self->element(
{
'Name' => 'Hsp_pvalue',
'Data' => $pvalue
}
);
if ( defined $group ) {
$self->element(
{
'Name' => 'Hsp_group',
'Data' => $group
}
);
}
}
elsif (
( $self->in_element('hit') || $self->in_element('hsp') )
&& m/^\sFeatures\s\w+\sthis\spart/xmso # If the line begins with "Features in/flanking this part of subject sequence:" ) { $self->in_element('hsp') && $self->end_element( { 'Name' => 'Hsp' } ); my $featline;
$_ = $self->_readline;
while($_ !~ /^\s*$/) {
chomp;
$featline .= $_;
$_ = $self->_readline;
}
$self->_pushback($_);
$featline =~ s{(?:^\s+|\s+^)}{}g;
$featline =~ s{\n}{;}g;
$self->start_element( { 'Name' => 'Hsp' } );
$self->element(
{
'Name' => 'Hsp_features',
'Data' => $featline
}
);
$self->{'_seen_hsp_features'} = 1;
}
elsif (
( $self->in_element('hit') || $self->in_element('hsp') )
&& m/Score\s*=\s*(\S+)\s*bits\s* # Bit score (?:\((\d+)\))?, # Missing for BLAT pseudo-BLAST fmt \s*Expect(?:\((\d+\+?)\))?\s*=\s*([^,\s]+) # E-value /ox ) { # parse NCBI blast HSP if( !$self->{'_seen_hsp_features'} ) { $self->in_element('hsp') && $self->end_element( { 'Name' => 'Hsp' } ); $self->start_element( { 'Name' => 'Hsp' } );
}
my ( $bits, $score, $n, $evalue ) = ( $1, $2, $3, $4 );
$evalue =~ s/^e/1e/i;
$self->element(
{
'Name' => 'Hsp_score',
'Data' => $score
}
);
$self->element(
{
'Name' => 'Hsp_bit-score',
'Data' => $bits
}
);
$self->element(
{
'Name' => 'Hsp_evalue',
'Data' => $evalue
}
);
$self->element(
{
'Name' => 'Hsp_n',
'Data' => $n
}
) if defined $n;
$score = '' unless defined $score; $self->debug("Got NCBI HSP score=$score, evalue $evalue\n");
}
elsif (
$self->in_element('hsp')
&& m/Identities\s*=\s*(\d+)\s*\/\s*(\d+)\s*[\d\%\(\)]+\s* (?:,\s*Positives\s*=\s*(\d+)\/(\d+)\s*[\d\%\(\)]+\s*)? # pos only valid for Protein alignments (?:\,\s*Gaps\s*=\s*(\d+)\/(\d+))? # Gaps /oxi ) { $self->element( { 'Name' => 'Hsp_identity', 'Data' => $1 } ); $self->element(
{
'Name' => 'Hsp_align-len',
'Data' => $2
}
);
if ( defined $3 ) {
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => $3
}
);
}
else {
$self->element(
{
'Name' => 'Hsp_positive',
'Data' => $1
}
);
}
if ( defined $6 ) {
$self->element(
{
'Name' => 'Hsp_gaps',
'Data' => $5
}
);
}
$self->{'_Query'} = { 'begin' => 0, 'end' => 0 };
$self->{'_Sbjct'} = { 'begin' => 0, 'end' => 0 };
if (/(Frame\s*=\s*.+)$/) {
$self->_pushback($1);
}
}
elsif ( $self->in_element('hsp')
&& /Strand\s*=\s*(Plus|Minus)\s*\/\s*(Plus|Minus)/i )
{
if (!defined($reporttype)) {
$self->{'_reporttype'} = $reporttype = 'BLASTN';
$bl2seq_fix = 1; }
next;
}
elsif ( $self->in_element('hsp')
&& /Links\s*=\s*(\S+)/ox )
{
$self->element(
{
'Name' => 'Hsp_links',
'Data' => $1
}
);
}
elsif ( $self->in_element('hsp')
&& /Frame\s*=\s*([\+\-][1-3])\s*(\/\s*([\+\-][1-3]))?/ )
{
unless ( defined $reporttype ) {
$bl2seq_fix = 1;
if ( $1 && $2 ) { $reporttype = 'TBLASTX' }
else {
$reporttype = 'BLASTX';
}
$self->{'_reporttype'} = $reporttype;
}
my ( $queryframe, $hitframe );
if ( $reporttype eq 'TBLASTX' ) {
( $queryframe, $hitframe ) = ( $1, $2 );
$hitframe =~ s/\/\s*//g;
}
elsif ( $reporttype eq 'TBLASTN' || $reporttype eq 'PSITBLASTN') {
( $hitframe, $queryframe ) = ( $1, 0 );
}
elsif ( $reporttype eq 'BLASTX' || $reporttype eq 'RPS-BLAST(BLASTP)') {
( $queryframe, $hitframe ) = ( $1, 0 );
if ($reporttype eq 'RPS-BLAST(BLASTP)') {
$self->element(
{
'Name' => 'BlastOutput_program',
'Data' => 'RPS-BLAST(BLASTX)'
}
);
}
}
$self->element(
{
'Name' => 'Hsp_query-frame',
'Data' => $queryframe
}
);
$self->element(
{
'Name' => 'Hsp_hit-frame',
'Data' => $hitframe
}
);
}
elsif (/^Parameters:/
|| /^\s+Database:\s+?/
|| /^\s+Subset/
|| /^\s*Lambda/
|| /^\s*Histogram/
|| ( $self->in_element('hsp') && /WARNING|NOTE/ ) )
{
$self->debug("blast.pm: found parameters section\n ");
$self->in_element('hsp')
&& $self->end_element( { 'Name' => 'Hsp' } );
$self->in_element('hit')
&& $self->end_element( { 'Name' => 'Hit' } );
$self->_cleanup_hits(\@hit_signifs) if scalar(@hit_signifs);
$self->within_element('iteration')
&& $self->end_element( { 'Name' => 'Iteration' } );
next if /^\s+Subset/;
my $blast = (/^(\s+Database\:)|(\s*Lambda)/) ? 'ncbi' : 'wublast';
if (/^\s*Histogram/) {
$blast = 'btk';
}
my $last = '';
$self->element(
{
'Name' => 'Parameters_allowgaps',
'Data' => 'yes'
}
);
while ( defined( $_ = $self->_readline ) ) {
if (
/^((?:\S+)?BLAST[NPX]?)\s+(.+)$/i || /^(P?GENEWISE|HFRAME|SWN|TSWN)\s+(.+)/i )
{
$self->_pushback($_);
last;
}
elsif (/^Query=/) {
$self->_pushback($_);
$self->_pushback($reportline) if $reportline;
last PARSER;
}
if ( /Number of Sequences:\s+([\d\,]+)/i
|| /of sequences in database:\s+(\-?[\d,]+)/i )
{
my $c = $1;
$c =~ s/\,//g;
$self->element(
{
'Name' => 'Statistics_db-len',
'Data' => $c
}
);
}
elsif (/letters in database:\s+(\-?[\d,]+)/i) {
my $s = $1;
$s =~ s/,//g;
$self->element(
{
'Name' => 'Statistics_db-let',
'Data' => $s
}
);
}
elsif ( $blast eq 'btk' ) {
next;
}
elsif ( $blast eq 'wublast' ) {
if (/E=(\S+)/) {
$self->element(
{
'Name' => 'Parameters_expect',
'Data' => $1
}
);
}
elsif (/nogaps/) {
$self->element(
{
'Name' => 'Parameters_allowgaps',
'Data' => 'no'
}
);
}
elsif (/ctxfactor=(\S+)/) {
$self->element(
{
'Name' => 'Statistics_ctxfactor',
'Data' => $1
}
);
}
elsif (
/(postsw|links|span[12]?|warnings|notes|gi|noseqs|qres|qype)/
)
{
$self->element(
{
'Name' => "Parameters_$1",
'Data' => 'yes'
}
);
}
elsif (/(\S+)=(\S+)/) {
$self->element(
{
'Name' => "Parameters_$1",
'Data' => $2
}
);
}
elsif ( $last =~ /(Frame|Strand)\s+MatID\s+Matrix name/i ) {
my $firstgapinfo = 1;
my $frame = undef;
while ( defined($_) && !/^\s+$/ ) {
s/^\s+//;
s/\s+$//;
if ( $firstgapinfo
&& s/Q=(\d+),R=(\d+)\s+//x )
{
$firstgapinfo = 0;
$self->element(
{
'Name' => 'Parameters_gap-open',
'Data' => $1
}
);
$self->element(
{
'Name' => 'Parameters_gap-extend',
'Data' => $2
}
);
my @fields = split;
for my $type (
qw(lambda_gapped
kappa_gapped
entropy_gapped)
)
{
next if $type eq 'n/a';
if ( !@fields ) {
warn "fields is empty for $type\n";
next;
}
$self->element(
{
'Name' =>
"Statistics_frame$frame\_$type",
'Data' => shift @fields
}
);
}
}
else {
my ( $frameo, $matid, $matrix, @fields ) =
split;
if ( !defined $frame ) {
$self->element(
{
'Name' => 'Parameters_matrix',
'Data' => $matrix
}
);
$self->element(
{
'Name' => 'Statistics_lambda',
'Data' => $fields[0]
}
);
$self->element(
{
'Name' => 'Statistics_kappa',
'Data' => $fields[1]
}
);
$self->element(
{
'Name' => 'Statistics_entropy',
'Data' => $fields[2]
}
);
}
$frame = $frameo;
my $ii = 0;
for my $type (
qw(lambda_used
kappa_used
entropy_used
lambda_computed
kappa_computed
entropy_computed)
)
{
my $f = $fields[$ii];
next unless defined $f; if ( $f eq 'same' ) {
$f = $fields[ $ii - 3 ];
}
$ii++;
$self->element(
{
'Name' =>
"Statistics_frame$frame\_$type",
'Data' => $f
}
);
}
}
$_ = $self->_readline;
}
$last = $_;
}
elsif ( $last =~ /(Frame|Strand)\s+MatID\s+Length/i ) {
my $frame = undef;
while ( defined($_) && !/^\s+/ ) {
s/^\s+//;
s/\s+$//;
my @fields = split;
if ( @fields <= 3 ) {
for my $type (qw(X_gapped E2_gapped S2)) {
last unless @fields;
$self->element(
{
'Name' =>
"Statistics_frame$frame\_$type",
'Data' => shift @fields
}
);
}
}
else {
for my $type (
qw(length
efflength
E S W T X E2 S2)
)
{
$self->element(
{
'Name' =>
"Statistics_frame$frame\_$type",
'Data' => shift @fields
}
);
}
}
$_ = $self->_readline;
}
$last = $_;
}
elsif (/(\S+\s+\S+)\s+DFA:\s+(\S+)\s+\((.+)\)/) {
if ( $1 eq 'states in' ) {
$self->element(
{
'Name' => 'Statistics_DFA_states',
'Data' => "$2 $3"
}
);
}
elsif ( $1 eq 'size of' ) {
$self->element(
{
'Name' => 'Statistics_DFA_size',
'Data' => "$2 $3"
}
);
}
}
elsif (
m/^\s+Time to generate neighborhood:\s+ (\S+\s+\S+\s+\S+)/x ) { $self->element( { 'Name' => 'Statistics_neighbortime', 'Data' => $1 } ); }
elsif (/processors\s+used:\s+(\d+)/) {
$self->element(
{
'Name' => 'Statistics_noprocessors',
'Data' => $1
}
);
}
elsif (
m/^\s+(\S+)\s+cpu\s+time:\s+# cputype (\S+\s+\S+\s+\S+) # cputime \s+Elapsed:\s+(\S+)/x ) { my $cputype = lc($1); $self->element(
{
'Name' => "Statistics_$cputype\_cputime",
'Data' => $2
}
);
$self->element(
{
'Name' => "Statistics_$cputype\_actualtime",
'Data' => $3
}
);
}
elsif (/^\s+Start:/) {
my ( $junk, $start, $stime, $end, $etime ) =
split( /\s+(Start|End)\:\s+/, $_ );
chomp($stime);
$self->element(
{
'Name' => 'Statistics_starttime',
'Data' => $stime
}
);
chomp($etime);
$self->element(
{
'Name' => 'Statistics_endtime',
'Data' => $etime
}
);
}
elsif (/^\s+Database:\s+(.+)$/) {
$self->element(
{
'Name' => 'Parameters_full_dbpath',
'Data' => $1
}
);
}
elsif (/^\s+Posted:\s+(.+)/) {
my $d = $1;
chomp($d);
$self->element(
{
'Name' => 'Statistics_posted_date',
'Data' => $d
}
);
}
}
elsif ( $blast eq 'ncbi' ) {
if (m/^Matrix:\s+(.+)\s*$/oxi) { $self->element( { 'Name' => 'Parameters_matrix', 'Data' => $1 } ); }
elsif (/^Gapped/) {
$gapped_stats = 1;
}
elsif (/^Lambda/) {
$_ = $self->_readline;
s/^\s+//;
my ( $lambda, $kappa, $entropy ) = split;
if ($gapped_stats) {
$self->element(
{
'Name' => "Statistics_gapped_lambda",
'Data' => $lambda
}
);
$self->element(
{
'Name' => "Statistics_gapped_kappa",
'Data' => $kappa
}
);
$self->element(
{
'Name' => "Statistics_gapped_entropy",
'Data' => $entropy
}
);
}
else {
$self->element(
{
'Name' => "Statistics_lambda",
'Data' => $lambda
}
);
$self->element(
{
'Name' => "Statistics_kappa",
'Data' => $kappa
}
);
$self->element(
{
'Name' => "Statistics_entropy",
'Data' => $entropy
}
);
}
}
elsif (m/effective\s+search\s+space\s+used:\s+(\d+)/oxi) { $self->element( { 'Name' => 'Statistics_eff-spaceused', 'Data' => $1 } ); }
elsif (m/effective\s+search\s+space:\s+(\d+)/oxi) { $self->element( { 'Name' => 'Statistics_eff-space', 'Data' => $1 } ); }
elsif (
m/Gap\s+Penalties:\s+Existence:\s+(\d+)\, \s+Extension:\s+(\d+)/ox ) { $self->element( { 'Name' => 'Parameters_gap-open', 'Data' => $1 } ); $self->element(
{
'Name' => 'Parameters_gap-extend',
'Data' => $2
}
);
}
elsif (/effective\s+HSP\s+length:\s+(\d+)/) {
$self->element(
{
'Name' => 'Statistics_hsp-len',
'Data' => $1
}
);
}
elsif (/Number\s+of\s+HSP's\s+better\s+than\s+(\S+)\s+without\s+gapping:\s+(\d+)/) {
$self->element(
{
'Name' => 'Statistics_number_of_hsps_better_than_expect_value_cutoff_without_gapping',
'Data' => $2
}
);
}
elsif (/Number\s+of\s+HSP's\s+gapped:\s+(\d+)/) {
$self->element(
{
'Name' => 'Statistics_number_of_hsps_gapped',
'Data' => $1
}
);
}
elsif (/Number\s+of\s+HSP's\s+successfully\s+gapped:\s+(\d+)/) {
$self->element(
{
'Name' => 'Statistics_number_of_hsps_successfully_gapped',
'Data' => $1
}
);
}
elsif (/Length\s+adjustment:\s+(\d+)/) {
$self->element(
{
'Name' => 'Statistics_length_adjustment',
'Data' => $1
}
);
}
elsif (/effective\s+length\s+of\s+query:\s+([\d\,]+)/i) {
my $c = $1;
$c =~ s/\,//g;
$self->element(
{
'Name' => 'Statistics_query-len',
'Data' => $c
}
);
}
elsif (/effective\s+length\s+of\s+database:\s+([\d\,]+)/i) {
my $c = $1;
$c =~ s/\,//g;
$self->element(
{
'Name' => 'Statistics_eff-dblen',
'Data' => $c
}
);
}
elsif (
/^(T|A|X1|X2|X3|S1|S2):\s+(\d+(\.\d+)?)\s+(?:\(\s*(\d+\.\d+) bits\))?/
)
{
my $v = $2;
chomp($v);
$self->element(
{
'Name' => "Statistics_$1",
'Data' => $v
}
);
if ( defined $4 ) {
$self->element(
{
'Name' => "Statistics_$1_bits",
'Data' => $4
}
);
}
}
elsif (
m/frameshift\s+window\, \s+decay\s+const:\s+(\d+)\,\s+([\.\d]+)/x ) { $self->element( { 'Name' => 'Statistics_framewindow', 'Data' => $1 } ); $self->element(
{
'Name' => 'Statistics_decay',
'Data' => $2
}
);
}
elsif (m/^Number\s+of\s+Hits\s+to\s+DB:\s+(\S+)/ox) { $self->element( { 'Name' => 'Statistics_hit_to_db', 'Data' => $1 } ); }
elsif (m/^Number\s+of\s+extensions:\s+(\S+)/ox) { $self->element( { 'Name' => 'Statistics_num_extensions', 'Data' => $1 } ); }
elsif (
m/^Number\s+of\s+successful\s+extensions:\s+ (\S+)/ox ) { $self->element( { 'Name' => 'Statistics_num_suc_extensions', 'Data' => $1 } ); }
elsif (
m/^Number\s+of\s+sequences\s+better\s+than\s+ (\S+):\s+(\d+)/ox ) { $self->element( { 'Name' => 'Parameters_expect', 'Data' => $1 } ); $self->element(
{
'Name' => 'Statistics_seqs_better_than_cutoff',
'Data' => $2
}
);
}
elsif (/^\s+Posted\s+date:\s+(.+)/) {
my $d = $1;
chomp($d);
$self->element(
{
'Name' => 'Statistics_posted_date',
'Data' => $d
}
);
}
elsif ( !/^\s+$/ ) {
}
}
$last = $_;
}
} elsif ( $self->in_element('hsp') ) {
$self->debug("blast.pm: Processing HSP\n");
$self->{'_reporttype'} ||= $DEFAULTREPORTTYPE;
my %data = (
'Query' => '',
'Mid' => '',
'Hit' => ''
);
my $len;
for ( my $i = 0 ; defined($_) && $i < 3 ; $i++ ) {
if ( ( $i == 0 && /^\s+$/) ||
/^\s*(?:Lambda|Minus|Plus|Score)/i )
{
$self->_pushback($_) if defined $_;
$self->end_element( { 'Name' => 'Hsp' } );
last;
}
chomp;
if (/^((Query|Sbjct):?\s+(\-?\d+)?\s*)(\S+)\s+(\-?\d+)?/) {
my ( $full, $type, $start, $str, $end ) =
( $1, $2, $3, $4, $5 );
if ( $str eq '-' ) {
$i = 3 if $type eq 'Sbjct';
}
else {
$data{$type} = $str;
}
$len = length($full);
$self->{"\_$type"}->{'begin'} = $start
unless $self->{"_$type"}->{'begin'};
$self->{"\_$type"}->{'end'} = $end;
} elsif (/^((Query|Sbjct):?\s+(\-?0+)\s*)/) {
$_ = $self->_readline() for 0..1;
last;
} else {
$self->throw("no data for midline $_")
unless ( defined $_ && defined $len );
$data{'Mid'} = substr( $_, $len );
}
$_ = $self->_readline();
}
$self->characters(
{
'Name' => 'Hsp_qseq',
'Data' => $data{'Query'}
}
);
$self->characters(
{
'Name' => 'Hsp_hseq',
'Data' => $data{'Sbjct'}
}
);
$self->characters(
{
'Name' => 'Hsp_midline',
'Data' => $data{'Mid'}
}
);
}
else {
}
}
$self->debug("blast.pm: End of BlastOutput\n");
if ( $self->{'_seentop'} ) {
$self->within_element('hsp')
&& $self->end_element( { 'Name' => 'Hsp' } );
$self->within_element('hit')
&& $self->end_element( { 'Name' => 'Hit' } );
$self->_cleanup_hits(\@hit_signifs) if scalar(@hit_signifs);
$self->within_element('iteration')
&& $self->end_element( { 'Name' => 'Iteration' } );
if ($bl2seq_fix) {
$self->element(
{
'Name' => 'BlastOutput_program',
'Data' => $reporttype
}
);
}
$self->end_element( { 'Name' => 'BlastOutput' } );
}
return $self->end_document();
}
} |
sub _cleanup_hits
{ my ($self, $hits) = @_;
while ( my $v = shift @{ $hits }) {
next unless defined $v;
$self->start_element( { 'Name' => 'Hit' } );
my $id = $v->[2];
my $desc = $v->[3];
$self->element(
{
'Name' => 'Hit_id',
'Data' => $id
}
);
my ($gi, $acc, $version ) = $self->_get_seq_identifiers($id);
$self->element(
{
'Name' => 'Hit_accession',
'Data' => $acc
}
);
if ( defined $v ) {
$self->element(
{
'Name' => 'Hit_signif',
'Data' => $v->[0]
}
);
if (exists $self->{'_wublast'}) {
$self->element(
{
'Name' => 'Hit_score',
'Data' => $v->[1]
}
);
} else {
$self->element(
{
'Name' => 'Hit_bits',
'Data' => $v->[1]
}
);
}
}
$self->element(
{
'Name' => 'Hit_def',
'Data' => $desc
}
);
$self->end_element( { 'Name' => 'Hit' } );
}
}
1;
__END__
Developer Notes
---------------
The following information is added in hopes of increasing the
maintainability of this code. It runs the risk of becoming obsolete as
the code gets updated. As always, double check against the actual
source. If you find any discrepencies, please correct them.
[ This documentation added on 3 Jun 2003. ]
The logic is the brainchild of Jason Stajich, documented by Steve
Chervitz. Jason: please check it over and modify as you see fit.
Question:
Elmo wants to know: How does this module unmarshall data from the input stream?
(i.e., how does information from a raw input file get added to
the correct Bioperl object?)
Answer:
This answer is specific to SearchIO::blast, but may apply to other
SearchIO.pm subclasses as well. The following description gives the
basic idea. The actual processing is a little more complex for
certain types of data (HSP, Report Parameters).
You can think of blast::next_result() as faking a SAX XML parser,
making a non-XML document behave like its XML. The overhead to do this
is quite substantial (~650 lines of code instead of ~80 in
blastxml.pm).
0. First, add a key => value pair for the datum of interest to %MAPPING
Example:
'Foo_bar' => 'Foo-bar',
1. next_result() collects the datum of interest from the input stream,
and calls element().
Example:
$self->element({ 'Name' => 'Foo_bar',
'Data' => $foobar});
2. The element() method is a convenience method that calls start_element(),
characters(), and end_element().
3. start_element() checks to see if the event handler can handle a start_xxx(),
where xxx = the 'Name' parameter passed into element(), and calls start_xxx()
if so. Otherwise, start_element() does not do anything.
Data that will have such an event handler are defined in %MODEMAP.
Typically, there are only handler methods for the main parts of
the search result (e.g., Result, Iteration, Hit, HSP),
which have corresponding Bioperl modules. So in this example,
there was an earlier call such as $self->element({'Name'=>'Foo'})
and the Foo_bar datum is meant to ultimately go into a Foo object.
The start_foo() method in the handler will typically do any
data initialization necessary to prepare for creating a new Foo object.
Example: SearchResultEventBuilder::start_result()
4. characters() takes the value of the 'Data' key from the hashref argument in
the elements() call and saves it in a local data member:
Example:
$self->{'_last_data'} = $data->{'Data'};
5. end_element() is like start_element() in that it does the check for whether
the event handler can handle end_xxx() and if so, calls it, passing in
the data collected from all of the characters() calls that occurred
since the start_xxx() call.
If there isn't any special handler for the data type specified by 'Name', end_element() will place the data saved by characters() into another local data member that saves it in a hash with a key defined by %MAPPING. Example: $nm = $data->{'Name'}; $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
In this case, $MAPPING{$nm} is 'Foo-bar'.
end_element() finishes by resetting the local data member used by characters(). (i.e., $self->{'_last_data'} = '';)
6. When the next_result() method encounters the end of the Foo element in the input stream. It will invoke $self->end_element({'Name'=>'Foo'}). end_element() then sends all of the data in the $self->{'_values'} hash. Note that $self->{'_values'} is cleaned out during start_element(), keeping it at a resonable size.
In the event handler, the end_foo() method takes the hash from end_element() and creates a new hash containing the same data, but having keys lacking the 'Foo' prefix (e.g., 'Foo-bar' becomes '-bar'). The handler's end_foo() method then creates the Foo object, passing in this new hash as an argument.
Example: SearchResultEventBuilder::end_result()
7. Objects created from the data in the search result are managed by
the event handler which adds them to a ResultI object (using API methods
for that object). The ResultI object gets passed back to
SearchIO::end_element() when it calls end_result().
The ResultI object is then saved in an internal data member of the
SearchIO object, which returns it at the end of next_result()
by calling end_document().
(Technical Note: All objects created by end_xxx() methods in the event
handler are returned to SearchIO::end_element(), but the SearchIO object
only cares about the ResultI objects.)
(Sesame Street aficionados note: This answer was NOT given by Mr. Noodle ;-P)} |
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _