class Bio::Scf

Description

This class inherits from the SangerChromatogram superclass. It captures the information contained within an scf format chromatogram file generated by DNA sequencing. See the SangerChromatogram class for usage

Attributes

aqual[RW]

The quality of each base at each position along the length of the sequence is captured by the nqual attributes where n is one of a, c, g or t. Generally the quality will be high for the base that is called at a particular position and low for all the other bases. However at positions of poor sequence quality, more than one base may have similar top scores. By analysing the nqual attributes it may be possible to determine if the base calling was correct or not. The quality of the A base at each sequence position

comments[RW]

A hash of extra information extracted from the chromatogram file

cqual[RW]

The quality of the C base at each sequence position

gqual[RW]

The quality of the G base at each sequence position

tqual[RW]

The quality of the T base at each sequence position

Public Class Methods

new(string) click to toggle source

see SangerChromatogram class for how to create an Scf object and its usage

# File lib/bio/db/sanger_chromatogram/scf.rb, line 37
def initialize(string)
  header = string.slice(0,128)
  # read in header info
  @chromatogram_type, @samples, @sample_offset, @bases, @bases_left_clip, @bases_right_clip, @bases_offset, @comment_size, @comments_offset, @version, @sample_size, @code_set, @header_spare = header.unpack("a4 NNNNNNNN a4 NN N20")
  get_traces(string)
  get_bases_peakIndices_and_qualities(string)
  get_comments(string)
  if @comments["DYEP"]
    @dye_mobility = @comments["DYEP"]
  else
    @dye_mobility = "Unnown"
  end
end

Private Instance Methods

convert_accuracies_to_qualities() click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 181
def convert_accuracies_to_qualities
  qualities = Array.new
  for base_pos in (0..@sequence.length-1)
    case sequence.slice(base_pos,1)
    when "a"
      qualities << @aqual[base_pos]
    when "c"
      qualities << @cqual[base_pos]
    when "g"
      qualities << @gqual[base_pos]
    when "t"
      qualities << @tqual[base_pos]
    else
      qualities << 0
    end
  end
  return qualities
end
convert_deltas_to_values(trace_read) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 168
def convert_deltas_to_values(trace_read)
  p_sample = 0;
  for sample_num in (0..trace_read.size-1)
    trace_read[sample_num] = trace_read[sample_num] + p_sample
    p_sample = trace_read[sample_num];
  end
  p_sample = 0;
  for sample_num in (0..trace_read.size-1)
    trace_read[sample_num] = trace_read[sample_num] + p_sample
    p_sample = trace_read[sample_num];
  end
  return trace_read
end
get_bases_peakIndices_and_qualities(string) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 98
def get_bases_peakIndices_and_qualities(string)
  if @version == "3.00"
    # now go and get the peak index information
    offset = @bases_offset
    length = @bases * 4
    get_v3_peak_indices(string,offset,length)

    # now go and get the accuracy information
    offset += length;
    get_v3_accuracies(string,offset,length)

    # OK, now go and get the base information.
    offset += length;
    length = @bases;
    get_v3_sequence(string,offset,length)

    #combine accuracies to get quality scores
    @qualities= convert_accuracies_to_qualities
  elsif @version == "2.00"
    @peak_indices = []
    @aqual = []
    @cqual = []
    @gqual = []
    @tqual = []
    @qualities = []
    @sequence = ""
    # now go and get the base information
    offset = @bases_offset
    length = @bases * 12
    all_bases_info = string.slice(offset,length)

    (0..length-1).step(12) do |offset2|
      base_info = all_bases_info.slice(offset2,12).unpack("N C C C C a C3")
      @peak_indices << base_info[0]
      @aqual << base_info[1]
      @cqual << base_info[2]
      @gqual << base_info[3]
      @tqual << base_info[4]
      @sequence += base_info[5].downcase
      case base_info[5].downcase
      when "a"
        @qualities << base_info[1]
      when "c"
        @qualities << base_info[2]
      when "g"
        @qualities << base_info[3]
      when "t"
        @qualities << base_info[4]
      else
        @qualities << 0
      end
    end
  end
end
get_comments(string) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 199
def get_comments(string)
  @comments = Hash.new
  comment_string = string.slice(@comments_offset,@comment_size)
  comment_string.gsub!(/\0/, "")
  comment_array = comment_string.split("\n")
  comment_array.each do |comment|
    comment =~ /(\w+)=(.*)/
    @comments[$1] = $2
  end
end
get_traces(string) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 53
def get_traces(string)
  if @version == "3.00"
    # read in trace info
    offset = @sample_offset
    length = @samples * @sample_size
    # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an  unsigned short
    @sample_size == 2 ? byte = "n" : byte = "c"
    for base in ["a" , "c" , "g" , "t"]
      trace_read = string.slice(offset,length).unpack("#{byte}#{@samples}")
      # convert offsets
      for sample_num in (0..trace_read.size-1)
        if trace_read[sample_num] > 30000
          trace_read[sample_num] = trace_read[sample_num] - 65536
        end
      end
      # For 8-bit data we need to emulate a signed/unsigned
      # cast that is implicit in the C implementations.....
      if @sample_size == 1
        for sample_num in (0..trace_read.size-1)
          trace_read[sample_num] += 256 if trace_read[sample_num] < 0
        end
      end
      trace_read = convert_deltas_to_values(trace_read)
      self.instance_variable_set("@#{base}trace", trace_read)
      offset += length
    end
  elsif @version == "2.00"
    @atrace = []
    @ctrace = []
    @gtrace = []
    @ttrace = []
    # read in trace info
    offset = @sample_offset
    length = @samples * @sample_size * 4
    # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an  unsigned short
    @sample_size == 2 ? byte = "n" : byte = "c"
    trace_read = string.slice(offset,length).unpack("#{byte}#{@samples*4}")
    (0..(@samples-1)*4).step(4) do |offset2|
      @atrace << trace_read[offset2]
      @ctrace << trace_read[offset2+1]
      @gtrace << trace_read[offset2+2]
      @ttrace << trace_read[offset2+3]
    end
  end
end
get_v3_accuracies(string,offset,length) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 155
def get_v3_accuracies(string,offset,length)
  qualities   = string.slice(offset,length)
  qual_length = length/4;
  qual_offset = 0;
  for base in ["a" , "c" , "g" , "t"]
    self.instance_variable_set("@#{base}qual",qualities.slice(qual_offset,qual_length).unpack("C#{qual_length}"))
    qual_offset += qual_length
  end
end
get_v3_peak_indices(string,offset,length) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 152
def get_v3_peak_indices(string,offset,length)
  @peak_indices = string.slice(offset,length).unpack("N#{length/4}")
end
get_v3_sequence(string,offset,length) click to toggle source
# File lib/bio/db/sanger_chromatogram/scf.rb, line 164
def get_v3_sequence(string,offset,length)
  @sequence = string.slice(offset,length).unpack("a#{length}").join('').downcase
end