module Bio::Alignment::Output
Public Instance Methods
common routine for interleaved/non-interleaved phylip format
# File lib/bio/alignment.rb, line 1099
#
# Common routine for interleaved/non-interleaved phylip format.
#
# Reads alignment_length, number_of_sequences, sequence_names, each_seq
# and gap_regexp from the receiver.
#
# Options (all optional):
# :replace_space::   when true, whitespace in names is replaced by "_"
# :escape::          unless given as false/nil, the characters : ; , ( )
#                    in names are replaced by "_"
# :split::           unless given as false/nil, each name is cut at its
#                    first whitespace
# :avoid_same_name:: unless given as false/nil, names are passed through
#                    __clustal_avoid_same_name(names, 10)
# :width::           residues per line (default 60); rounded down to a
#                    multiple of 10, never less than 10
# :gap_char::        replacement for gaps and padding (default "-")
# :case::            a value whose to_s matches /lower/i or /upper/i
#                    forces that case
#
# Returns [ aln, aseqs, lines ] where aln is a one-element array holding
# the header line, aseqs holds one array of formatted lines per sequence,
# and lines is the number of lines each sequence was wrapped into.
def __output_phylip_common(options = {})
  len = self.alignment_length
  aln = [ " #{self.number_of_sequences} #{len}\n" ]

  sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  if options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) || options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) || options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end
  if !options.has_key?(:avoid_same_name) || options[:avoid_same_name]
    sn = __clustal_avoid_same_name(sn, 10)
  end

  namewidth = 10
  seqwidth = (options[:width] || 60)
  # Round down to whole groups of 10 residues. A width below 10 used to
  # become 0 here, which made the line-count division below raise
  # ZeroDivisionError, so one group of 10 is enforced as the minimum.
  seqwidth = [ seqwidth.div(10) * 10, 10 ].max
  # Every group of 10 residues gets one leading space, hence 11 per group.
  seqregexp = Regexp.new("(.{1,#{seqwidth.div(10) * 11}})")
  gchar = (options[:gap_char] || '-')

  aseqs = Array.new(self.number_of_sequences).clear
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  end

  aseqs.collect! do |s|
    snx = sn.shift
    # Name column: left-justified and truncated to namewidth characters.
    head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth]
    head2 = ' ' * namewidth
    # Pad short sequences with the gap character up to the alignment length.
    s << (gchar * (len - s.length))
    s.gsub!(/(.{1,10})/n, " \\1")
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    # Only the first line carries the name; the rest are indented.
    head += a.shift
    ret = a.collect { |x| head2 + x }
    ret.unshift(head)
    ret
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  [ aln, aseqs, lines ]
end
# File lib/bio/alignment.rb, line 873
#
# Dispatches to the output_* method that corresponds to +format+,
# forwarding any remaining arguments unchanged.
#
# Recognized formats are the symbols :clustal, :fasta, :phylip,
# :phylipnon, :msf and :molphy. Any other value raises a RuntimeError.
def output(format, *arg)
  formatters = {
    :clustal   => :output_clustal,
    :fasta     => :output_fasta,
    :phylip    => :output_phylip,
    :phylipnon => :output_phylipnon,
    :msf       => :output_msf,
    :molphy    => :output_molphy
  }
  handler = formatters[format]
  raise "Unknown format: #{format.inspect}" unless handler
  __send__(handler, *arg)
end
Generates ClustalW-formatted text
- seqs: sequences (must be an alignment object); output_clustal passes the receiver itself
- names: names of the sequences; output_clustal passes self.sequence_names
- options: options
# File lib/bio/alignment.rb, line 1045
#
# Generates ClustalW-formatted text for the receiver by handing the
# receiver, its sequence names and +options+ to __clustal_formatter.
def output_clustal(options = {})
  names = self.sequence_names
  __clustal_formatter(self, names, options)
end
Generates fasta format text and returns a string.
# File lib/bio/alignment.rb, line 1059
#
# Generates fasta format text and returns a string.
#
# Options:
# :width::           maximum residues per line (default 70); a value of 0
#                    or less writes each sequence on a single line
# :avoid_same_name:: when true, names are passed through
#                    __clustal_avoid_same_name(names, 30); otherwise only
#                    \r, \n and \x00 in names are replaced by spaces
def output_fasta(options={})
  width = options[:width] || 70

  names =
    if options[:avoid_same_name]
      __clustal_avoid_same_name(self.sequence_names, 30)
    else
      self.sequence_names.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
    end

  # nil means "do not wrap".
  wrapper = (width && width > 0) ? Regexp.new(".{1,#{width}}") : nil

  entries = self.collect do |s|
    body = wrapper ? s.to_s.gsub(wrapper, "\\0\n") : (s.to_s + "\n")
    ">#{names.shift}\n" + body
  end
  entries.join('')
end
Generates Molphy alignment format text as a string
# File lib/bio/alignment.rb, line 1151
#
# Generates Molphy alignment format text as a string.
#
# The output is a header line ("<number of sequences> <length>")
# followed, for every sequence, by its name on one line and the
# residues wrapped at :width characters (default 60).
#
# Name handling options (:replace_space, :escape, :split,
# :avoid_same_name), :gap_char (default "-") and :case are applied in
# the order shown in the code below.
def output_molphy(options = {})
  len = self.alignment_length
  header = "#{self.number_of_sequences} #{len}\n"

  # True when the option is absent or set to a truthy value.
  on_by_default = lambda { |key| !options.has_key?(key) || options[key] }

  names = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  names = names.collect { |x| x.gsub(/\s/, '_') } if options[:replace_space]
  names = names.collect { |x| x.gsub(/[\:\;\,\(\)]/, '_') } if on_by_default.call(:escape)
  names = names.collect { |x| x.split(/\s/)[0].to_s } if on_by_default.call(:split)
  names = __clustal_avoid_same_name(names, 30) if on_by_default.call(:avoid_same_name)

  seqwidth = options[:width] || 60
  chunk = Regexp.new("(.{1,#{seqwidth}})")
  gchar = options[:gap_char] || '-'

  aseqs = Array.new(len).clear
  self.each_seq { |s| aseqs << s.to_s.gsub(self.gap_regexp, gchar) }

  case options[:case].to_s
  when /lower/i then aseqs.each(&:downcase!)
  when /upper/i then aseqs.each(&:upcase!)
  end

  records = aseqs.collect do |s|
    s << (gchar * (len - s.length))
    s.gsub!(chunk, "\\1\n")
    names.shift + "\n" + s
  end
  ([ header ] + records).join('')
end
Generates msf formatted text as a string
# File lib/bio/alignment.rb, line 1193
#
# Generates msf formatted text as a string.
#
# Options (all optional):
# :avoid_same_name:: unless given as false/nil, names are passed through
#                    __clustal_avoid_same_name
# :replace_space::   unless given as false/nil, whitespace in names is
#                    replaced by "_"
# :escape::          unless given as false/nil, the characters : ; , ( )
#                    in names are replaced by "_"
# :split::           unless given as false/nil, each name is cut at its
#                    first whitespace
# :gap_char::        character written for internal gaps (default ".")
# :padding_char::    character written for leading gaps and for padding
#                    at the end (default "~")
# :case::            /lower/i selects lower case; anything else upper case
# :type::            /protein/i or /aa/i selects type "P", /na/i selects
#                    "N"; otherwise the type is derived from seqclass
# :entry_id::        name written in the header (default: "<__id__>.msf")
# :time::            object responding to strftime (default: Time.now)
def output_msf(options = {})
  len = self.seq_length

  if !options.has_key?(:avoid_same_name) || options[:avoid_same_name]
    sn = __clustal_avoid_same_name(self.sequence_names)
  else
    sn = self.sequence_names.collect do |x|
      x.to_s.gsub(/[\r\n\x00]/, ' ')
    end
  end
  if !options.has_key?(:replace_space) || options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) || options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) || options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end

  seqwidth = 50
  namewidth = [31, sn.collect { |x| x.length }.max ].min
  sep = ' ' * 2

  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char] || '.')
  pchar = (options[:padding_char] || '~')

  aseqs = Array.new(self.number_of_sequences).clear
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  aseqs.each do |s|
    # Leading gaps become padding characters; trailing gaps are dropped
    # and the sequence is then padded to the full length.
    s.sub!(/\A#{Regexp.escape(gchar)}+/) { |x| pchar * x.length }
    s.sub!(/#{Regexp.escape(gchar)}+\z/, '')
    s << (pchar * (len - s.length))
  end

  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  else # default upcase
    aseqs.each { |s| s.upcase! }
  end

  case options[:type].to_s
  when /protein/i, /aa/i
    amino = true
  when /na/i
    amino = false
  else
    if seqclass == Bio::Sequence::AA then
      amino = true
    elsif seqclass == Bio::Sequence::NA then
      amino = false
    else
      # If we can't determine the type, we assume protein.
      amino = aseqs.size
      aseqs.each { |x| amino -= 1 if /\A[acgt]\z/i =~ x }
      amino = false if amino <= 0
    end
  end

  seq_type = (amino ? 'P' : 'N')

  fn = (options[:entry_id] || self.__id__.abs.to_s + '.msf')
  dt = (options[:time] || Time.now).strftime('%B %d, %Y %H:%M')

  sums = aseqs.collect { |s| GCG::Seq.calc_checksum(s) }
  # The header checksum is the total of the per-sequence checksums reduced
  # modulo 10000. The previous code assigned the constant 10000 here and
  # threw the computed total away.
  sum = sums.inject(0) { |total, x| total + x }
  sum %= 10000

  # NOTE(review): GCG MSF files conventionally begin with "!!" before
  # "AA_/NA_MULTIPLE_ALIGNMENT"; confirm whether that prefix is expected.
  msf = [
    "#{seq_type == 'N' ? 'N' : 'A' }A_MULTIPLE_ALIGNMENT 1.0\n",
    "\n",
    "\n",
    " #{fn} MSF: #{len} Type: #{seq_type} #{dt} Check: #{sum} ..\n",
    "\n"
  ]

  sn.each do |snx|
    msf << ' Name: ' +
      sprintf('%*s', -namewidth, snx.to_s)[0, namewidth] +
      " Len: #{len} Check: #{sums.shift} Weight: 1.00\n"
  end
  msf << "\n//\n"

  aseqs.collect! do |s|
    snx = sn.shift
    head = sprintf("%*s", namewidth, snx.to_s)[0, namewidth] + sep
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    a.collect { |x| head + x }
  end

  lines = (len + seqwidth - 1).div(seqwidth)
  i = 1
  lines.times do
    msf << "\n"
    # Ruler line showing the first and last column number of this block.
    n_l = i
    n_r = [ i + seqwidth - 1, len ].min
    if n_l != n_r then
      w = [ n_r - n_l + 1 - n_l.to_s.length - n_r.to_s.length, 1 ].max
      msf << (' ' * namewidth + sep + n_l.to_s + ' ' * w + n_r.to_s + "\n")
    else
      msf << (' ' * namewidth + sep + n_l.to_s + "\n")
    end
    aseqs.each { |a| msf << a.shift }
    i += seqwidth
  end
  msf << "\n"
  msf.join('')
end
generates phylip interleaved alignment format as a string
# File lib/bio/alignment.rb, line 1082
#
# Generates phylip interleaved alignment format as a string.
#
# Takes the header, per-sequence line arrays and line count from
# __output_phylip_common and emits the lines block by block: the first
# line of every sequence, a blank line, the second line of every
# sequence, and so on. The blank line after the last block is removed.
def output_phylip(options = {})
  out, rows, nblocks = __output_phylip_common(options)
  nblocks.times do
    rows.each { |row| out.push(row.shift) }
    out.push("\n")
  end
  out.pop if out.last == "\n"
  out.join('')
end
generates Phylip3.2 (old) non-interleaved format as a string
# File lib/bio/alignment.rb, line 1093
#
# Generates Phylip3.2 (old) non-interleaved format as a string.
#
# The header line from __output_phylip_common is followed by all lines
# of the first sequence, then all lines of the second, and so on.
def output_phylipnon(options = {})
  parts = __output_phylip_common(options)
  header_lines = parts[0]
  sequence_lines = parts[1]
  header_lines.first + sequence_lines.join('')
end
to_clustal is deprecated. Instead, please use output_clustal.
# File lib/bio/alignment.rb, line 1053
#
# Deprecated alias for output_clustal: emits a deprecation warning and
# then forwards all arguments to output_clustal.
def to_clustal(*arg)
  notice = "to_clustal is deprecated. Please use output_clustal."
  warn(notice)
  output_clustal(*arg)
end
Private Instance Methods
Changes sequence names if there are conflicted names for ClustalW format.
- array: names of the sequences (array of strings)
- len: length to check (default: 30)
# File lib/bio/alignment.rb, line 930
#
# Changes sequence names if there are conflicting names for ClustalW
# format.
#
# array:: names of the sequences (each element is converted with to_s)
# len::   length to check (default: 30)
#
# Returns a new array of names; +array+ itself is not modified.
#
# Steps:
# 1. \r, \n and \x00 in every name are replaced by a space.
# 2. If __clustal_have_same_name? reports conflicts, whitespace inside the
#    first +len+ characters of each conflicting name is replaced by "_".
# 3. If conflicts remain, every name is prefixed with "<index>_".
def __clustal_avoid_same_name(array, len = 30)
  sanitized = array.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
  na = sanitized.dup

  dupidx = __clustal_have_same_name?(na, len)
  return na unless dupidx

  dupidx.each do |i|
    # Start from the sanitized name. Using the raw input here would put
    # the control characters removed in step 1 back into the result.
    s = sanitized[i]
    na[i] = s[0, len].to_s.gsub(/\s/, '_') + s[len..-1].to_s
  end

  if __clustal_have_same_name?(na, len)
    na.each_with_index do |s, i|
      na[i] = "#{i}_#{s}"
    end
  end
  na
end
Generates ClustalW-formatted text
- seqs: sequences (must be an alignment object)
- names: names of the sequences
- options: options
# File lib/bio/alignment.rb, line 963
#
# Generates ClustalW-formatted text.
#
# seqs::    sequences; must respond to seq_length, number_of_sequences,
#           each_seq, gap_regexp and (unless :match_line is given)
#           match_line
# names::   names of the sequences
# options:: hash of options, see below
#
# Options (all optional):
# :replace_space::   when true, whitespace in names is replaced by "_"
# :escape::          unless given as false/nil, the characters : ; , ( )
#                    in names are replaced by "_"
# :split::           unless given as false/nil, each name is cut at its
#                    first whitespace
# :avoid_same_name:: unless given as false/nil, names are passed through
#                    __clustal_avoid_same_name
# :gap_char::        replacement for gaps and padding (default "-")
# :type::            /protein/i or /aa/i, or /na/i; forwarded to
#                    seqs.match_line as :type => :aa / :na
# :match_line::      match line to print instead of seqs.match_line; the
#                    given string is not modified
# :case::            /lower/i or /upper/i forces that case
# :seqnos::          when true, the running residue count is appended to
#                    each sequence line
#
# Returns the formatted text as a string.
def __clustal_formatter(seqs, names, options = {})
  aln = [ "CLUSTAL (0.00) multiple sequence alignment\n\n" ]
  len = seqs.seq_length
  sn = names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  if options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) || options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) || options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end
  if !options.has_key?(:avoid_same_name) || options[:avoid_same_name]
    sn = __clustal_avoid_same_name(sn)
  end

  # Names longer than 10 characters switch to the wide name column.
  if sn.find { |x| x.length > 10 } then
    seqwidth = 50
    namewidth = 30
    sep = ' ' * 6
  else
    seqwidth = 60
    namewidth = 10
    sep = ' ' * 6
  end
  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char] || '-')

  case options[:type].to_s
  when /protein/i, /aa/i
    mopt = { :type => :aa }
  when /na/i
    mopt = { :type => :na }
  else
    mopt = {}
  end
  mline = (options[:match_line] || seqs.match_line(mopt))

  aseqs = Array.new(seqs.number_of_sequences).clear
  seqs.each_seq do |s|
    aseqs << s.to_s.gsub(seqs.gap_regexp, gchar)
  end
  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  end

  # The strings in aseqs are padded and wrapped in place below. Work on a
  # copy of the match line so that a string passed in via
  # options[:match_line] is left untouched (and may be frozen).
  aseqs << mline.dup

  aseqs.collect! do |s|
    # sn runs out before the match line, so its name column is blank.
    snx = sn.shift
    head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth] + sep
    s << (gchar * (len - s.length))
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    if options[:seqnos] and snx then
      i = 0
      a.each do |x|
        x.chomp!
        l = x.tr(gchar, '').length
        i += l
        x.concat(l > 0 ? " #{i}\n" : "\n")
      end
    end
    a.collect { |x| head + x }
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  lines.times do
    aln << "\n"
    aseqs.each { |a| aln << a.shift }
  end
  aln.join('')
end
Check whether there are same names for ClustalW format.
- array: names of the sequences (array of strings)
- len: length to check (default: 30)
# File lib/bio/alignment.rb, line 896
#
# Checks whether there are identical names for ClustalW format.
#
# array:: names of the sequences (each element is converted with to_s)
# len::   length to check (default: 30)
#
# Two names are treated as the same when their comparison keys are equal.
# The key is the part of the name before the first whitespace or \x00,
# truncated to +len+ characters, with each of : ; , ( ) replaced by "_".
#
# Returns a sorted array with the indices of all names that share a key
# with at least one other name, or false when there are none.
def __clustal_have_same_name?(array, len = 30)
  keys = array.collect do |k|
    # A character class is needed here: without the brackets the pattern
    # only matched the literal five-character sequence ":;,()".
    k.to_s.split(/[\x00\s]/)[0].to_s[0, len].gsub(/[\:\;\,\(\)]/, '_')
  end

  # Collect the indices belonging to each key.
  groups = Hash.new { |h, key| h[key] = [] }
  keys.each_with_index { |key, i| groups[key] << i }

  dupidx = groups.values.select { |indices| indices.size > 1 }.flatten.sort
  dupidx.empty? ? false : dupidx
end