class Bio::HMMER::Report

A parser class for a search report by hmmsearch or hmmpfam program in the HMMER package.

Examples

Examples
 #for multiple reports in a single output file (example.hmmpfam)
 Bio::HMMER.reports(File.read("example.hmmpfam")) do |report|
   report.program['name']
   report.parameter['HMM file']
   report.query_info['Query sequence']
   report.hits.each do |hit|
     hit.accession
     hit.description
     hit.score
     hit.evalue
     hit.hsps.each do |hsp|
       hsp.accession
       hsp.domain
       hsp.evalue
       hsp.midline
   end
 end

References

Constants

DELIMITER

Delimiter of each entry for Bio::FlatFile support.

Attributes

domain_top_hits[R]

statistics by hmmsearch. Keys are 'Total memory', 'Satisfying E cutoff' and 'Total hits'.

histogram[R]

statistics by hmmsearch.

hits[R]
hsps[R]

Returns an Array of Bio::HMMER::Report::Hsp objects. Under special circumstances, some HSPs do not have parent Hit objects. If you want to access such HSPs, use this method.

parameter[R]

A hash contains parameters used. Valid keys are 'HMM file' and 'Sequence file'.

program[R]

A Hash contains program information used. Valid keys are 'name', 'version', 'copyright' and 'license'.

query_info[R]

A hash contains the query information. Valid keys are 'query sequence', 'Accession' and 'Description'.

statistical_detail[R]

statistics by hmmsearch. Keys are 'mu', 'lambda', 'chi-sq statistic' and 'P(chi-square)'.

total_seq_searched[R]

statistics by hmmsearch.

whole_seq_top_hits[R]

statistics by hmmsearch. Keys are 'Total memory', 'Satisfying E cutoff' and 'Total hits'.

Public Class Methods

new(data) click to toggle source

Parses a HMMER search report (by hmmpfam or hmmsearch program) and reutrns a Bio::HMMER::Report object.

Examples

hmmpfam_report = Bio::HMMER::Report.new(File.read("hmmpfam.out"))

hmmsearch_report = Bio::HMMER::Report.new(File.read("hmmsearch.out"))
# File lib/bio/appl/hmmer/report.rb, line 156
def initialize(data)

  # The input data is divided into six data fields, i.e. header,
  # query infomation, hits, HSPs, alignments and search statistics.
  # However, header and statistics data don't necessarily exist.
  subdata, is_hmmsearch = get_subdata(data)

  # if header exists, parse it
  if subdata["header"]
    @program, @parameter = parse_header_data(subdata["header"])
  else
    @program, @parameter = [{}, {}]
  end

  @query_info = parse_query_info(subdata["query"])
  @hits       = parse_hit_data(subdata["hit"])
  @hsps       = parse_hsp_data(subdata["hsp"], is_hmmsearch)

  if @hsps != []
    # split alignment subdata into an array of alignments
    aln_ary = subdata["alignment"].split(/^\S+.*?\n/).slice(1..-1)

    # append alignment information to corresponding Hsp
    aln_ary.each_with_index do |aln, i|
      @hsps[i].set_alignment(aln)
    end
  end

  # assign each Hsp object to its parent Hit
  hits_hash = {}
  @hits.each do |hit|
    hits_hash[hit.accession] = hit
  end
  @hsps.each do |hsp|
    if hits_hash.has_key?(hsp.accession)
      hits_hash[hsp.accession].append_hsp(hsp)
    end
  end

  # parse statistics (for hmmsearch)
  if is_hmmsearch
    @histogram, @statistical_detail, @total_seq_searched,          @whole_seq_top_hits, @domain_top_hits =          parse_stat_data(subdata["statistics"])
  end

end

Public Instance Methods

each() { |hit| ... } click to toggle source

Iterates each hit (Bio::HMMER::Report::Hit).

# File lib/bio/appl/hmmer/report.rb, line 206
def each
  @hits.each do |hit|
    yield hit
  end
end
Also aliased as: each_hit
each_hit()
Alias for: each

Private Instance Methods

get_subdata(data) click to toggle source

#get_subdata

# File lib/bio/appl/hmmer/report.rb, line 215
def get_subdata(data)
  subdata = {}
  header_prefix = '\Ahmm(search|pfam) - search'
  query_prefix  = '^Query (HMM|sequence): .*\nAccession: '
  hit_prefix    = '^Scores for (complete sequences|sequence family)'
  hsp_prefix    = '^Parsed for domains:'
  aln_prefix    = '^Alignments of top-scoring domains:\n'
  stat_prefix   = '^\nHistogram of all scores:'

  # if header exists, get it
  if data =~ /#{header_prefix}/
    is_hmmsearch = ($1 == "search")   # hmmsearch or hmmpfam
    subdata["header"] = data[/(\A.+?)(?=#{query_prefix})/m]
  else
    is_hmmsearch = false   # if no header, assumed to be hmmpfam
  end

  # get query, Hit and Hsp data
  subdata["query"] = data[/(#{query_prefix}.+?)(?=#{hit_prefix})/m]
  subdata["hit"]   = data[/(#{hit_prefix}.+?)(?=#{hsp_prefix})/m]
  subdata["hsp"]   = data[/(#{hsp_prefix}.+?)(?=#{aln_prefix})/m]

  # get alignment data
  if is_hmmsearch
    data =~ /#{aln_prefix}(.+?)#{stat_prefix}/m
    subdata["alignment"] = $1
  else
    data =~ /#{aln_prefix}(.+?)\/\/\n/m
    subdata["alignment"] = $1
    raise "multiple reports found" if $'.length > 0
  end
  
  # handle -A option of HMMER
  cutoff_line = '\t\[output cut off at A = \d+ top alignments\]\n\z'
  subdata["alignment"].sub!(/#{cutoff_line}/, '')
  
  # get statistics data
  subdata["statistics"] = data[/(#{stat_prefix}.+)\z/m]

  [subdata, is_hmmsearch]
end
parse_header_data(data) click to toggle source

#parse_header_data

# File lib/bio/appl/hmmer/report.rb, line 260
def parse_header_data(data)
  data =~ /\A(.+? - - -$\n)(.+? - - -$\n)\n\z/m
  program_data   = $1
  parameter_data = $2
  
  program = {}
  program['name'], program['version'], program['copyright'],        program['license'] = program_data.split(/\n/)
  
  parameter = {}
  parameter_data.each_line do |x|
    if /^(.+?):\s+(.*?)\s*$/ =~ x
      parameter[$1] = $2
    end
  end

  [program, parameter]
end
parse_hit_data(data) click to toggle source

#parse_hit_data

# File lib/bio/appl/hmmer/report.rb, line 297
def parse_hit_data(data)
  data.sub!(/.+?---\n/m, '').chop!
  hits = []
  return hits if data == "\t[no hits above thresholds]\n"
  data.each_line do |l|
    hits.push(Hit.new(l))
  end
  hits
end
parse_hsp_data(data, is_hmmsearch) click to toggle source

#parse_hsp_data

# File lib/bio/appl/hmmer/report.rb, line 310
def parse_hsp_data(data, is_hmmsearch)
  data.sub!(/.+?---\n/m, '').chop!
  hsps=[]
  return hsps if data == "\t[no hits above thresholds]\n"
  data.each_line do |l|
    hsps.push(Hsp.new(l, is_hmmsearch))
  end
  return hsps
end
parse_query_info(data) click to toggle source

#parse_query_info

# File lib/bio/appl/hmmer/report.rb, line 282
def parse_query_info(data)
  hash = {}
  data.each_line do |x|
    if /^(.+?):\s+(.*?)\s*$/ =~ x
      hash[$1] = $2
    elsif /\s+\[(.+)\]/ =~ x
      hash['comments'] = $1
    end
  end
  hash
end
parse_stat_data(data) click to toggle source

#parse_stat_data

# File lib/bio/appl/hmmer/report.rb, line 323
def parse_stat_data(data)
  data.sub!(/\nHistogram of all scores:\n(.+?)\n\n\n%/m, '')
  histogram = $1.strip

  statistical_detail = {}
  data.sub!(/(.+?)\n\n/m, '')
  $1.each_line do |l|
    statistical_detail[$1] = $2.to_f if /^\s*(.+?)\s*=\s*(\S+)/ =~ l
  end
    
  total_seq_searched = nil
  data.sub!(/(.+?)\n\n/m, '')
  $1.each_line do |l|
    total_seq_searched = $2.to_i if /^\s*(.+)\s*:\s*(\S+)/ =~ l
  end
    
  whole_seq_top_hits = {}
  data.sub!(/(.+?)\n\n/m, '')
  $1.each_line do |l|
    if /^\s*(.+?):\s*(\d+)\s*$/ =~ l
      whole_seq_top_hits[$1] = $2.to_i
    elsif /^\s*(.+?):\s*(\S+)\s*$/ =~ l
      whole_seq_top_hits[$1] = $2
    end
  end
    
  domain_top_hits = {}
  data.each_line do |l|
    if /^\s*(.+?):\s*(\d+)\s*$/ =~ l
      domain_top_hits[$1] = $2.to_i
    elsif /^\s*(.+?):\s*(\S+)\s*$/ =~ l
      domain_top_hits[$1] = $2
    end
  end

  [histogram, statistical_detail, total_seq_searched,          whole_seq_top_hits, domain_top_hits]
end