class Bio::FlatFile::AutoDetect
AutoDetect automatically determines database class of given data.
Constants
- BottomRule
Special element that is always bottom priority.
- TopRule
Special element that is always top priority.
Public Class Methods
Makes a new autodetect object and adds each of the given elements to it.
# File lib/bio/io/flatfile/autodetection.rb, line 361
# Creates a fresh autodetect object and registers every given element on
# it, in argument order, by calling #add.
#
# *Arguments*: zero or more elements accepted by #add.
# *Returns*: the newly created autodetect object.
def self.[](*arg)
  arg.inject(self.new) do |detector, rule|
    detector.add(rule)
    detector
  end
end
Returns the default autodetect object, creating it on first use.
# File lib/bio/io/flatfile/autodetection.rb, line 348
# Returns the default autodetect object. It is built lazily with
# make_default the first time it is needed (or whenever the stored value
# is nil/false) and reused afterwards.
def self.default
  @default ||= self.make_default
end
sets the default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 356
# Replaces the default autodetect object with +ad+.
# Assigning nil makes the next call to default build a new one.
def self.default=(ad)
  @default = ad
end
Makes a new autodetect object populated with the default set of rules and priorities.
# Builds the autodetect object used as the default.
#
# The listing below does three things, in order:
# 1. Creates the object with self[...], passing one rule element per
#    supported format. Each element is a RuleRegexp or RuleRegexp2 (a class
#    name plus one or two regexps) or a RuleProc (candidate class names plus
#    a block). The RuleProc blocks shown return a class, false (commented as
#    "fail to determine"), or nil.
# 2. Declares relative priorities between elements with is_prior_to,
#    grouped as NCBI, EMBL/UniProt, KEGG, PDB, BLAST, Fastq and FastaFormat.
# 3. Calls a.rehash and returns a.
#
# NOTE(review): this listing appears to have had its whitespace collapsed
# during extraction (the whole method sits on three lines, and one regexp is
# split across a line break). Runs of spaces inside the regexp literals may
# have been lost -- confirm every pattern against
# lib/bio/io/flatfile/autodetection.rb before reusing it.
# File lib/bio/io/flatfile/autodetection.rb, line 368 def self.make_default a = self[ genbank = RuleRegexp[ 'Bio::GenBank', /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ], genpept = RuleRegexp[ 'Bio::GenPept', /^LOCUS .+ aa .+/ ], medline = RuleRegexp[ 'Bio::MEDLINE', /^PMID\- [0-9]+$/ ], embl = RuleRegexp[ 'Bio::EMBL', /^ID .+\; .*(DNA|RNA|XXX)\;/ ], sptr = RuleRegexp2[ 'Bio::SPTR', /^ID .+\; *PRT\;/, /^ID [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ], prosite = RuleRegexp[ 'Bio::PROSITE', /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ], transfac = RuleRegexp[ 'Bio::TRANSFAC', /^AC [-A-Za-z0-9_\.]+$/ ], aaindex = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text| if /^H [-A-Z0-9_\.]+$/ =~ text then if text =~ /^M [rc]/ then Bio::AAindex2 elsif text =~ /^I A\/L/ then Bio::AAindex1 else false #fail to determine end else nil end end, litdb = RuleRegexp[ 'Bio::LITDB', /^CODE [0-9]+$/ ], pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE', /^ENTRY .+ Pathway\s+Module\s*/ ], pathway = RuleRegexp[ 'Bio::KEGG::PATHWAY', /^ENTRY .+ Pathway\s*/ ], brite = RuleRegexp[ 'Bio::KEGG::BRITE', /^Entry [A-Z0-9]+/ ], orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY', /^ENTRY .+ KO\s*/ ], drug = RuleRegexp[ 'Bio::KEGG::DRUG', /^ENTRY .+ Drug\s*/ ], glycan = RuleRegexp[ 'Bio::KEGG::GLYCAN', /^ENTRY .+ Glycan\s*/ ], enzyme = RuleRegexp2[ 'Bio::KEGG::ENZYME', /^ENTRY EC [0-9\.]+$/, /^ENTRY .+ Enzyme\s*/ ], compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND', /^ENTRY C[A-Za-z0-9\._]+$/, /^ENTRY .+ Compound\s*/ ], reaction = RuleRegexp2[ 'Bio::KEGG::REACTION', /^ENTRY R[A-Za-z0-9\._]+$/, /^ENTRY .+ Reaction\s*/ ], genes = RuleRegexp[ 'Bio::KEGG::GENES', /^ENTRY .+ (CDS|gene|.*RNA|Contig) / ], genome = RuleRegexp[ 'Bio::KEGG::GENOME', /^ENTRY [a-z]+$/ ], fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster', 'Bio::FANTOM::MaXML::Sequence') do |text| if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text case $1 when 'clusters' Bio::FANTOM::MaXML::Cluster when 'sequences' Bio::FANTOM::MaXML::Sequence else 
nil #unknown end else nil end end, pdb = RuleRegexp[ 'Bio::PDB', /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ], het = RuleRegexp[ 'Bio::PDB::ChemicalComponent', /^RESIDUE +.+ +\d+\s*$/ ], clustal = RuleRegexp2[ 'Bio::ClustalW::Report', /^CLUSTAL .*\(.*\).*sequence +alignment/, /^CLUSTAL FORMAT for T-COFFEE/ ], gcg_msf = RuleRegexp[ 'Bio::GCG::Msf', /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ], gcg_seq = RuleRegexp[ 'Bio::GCG::Seq', /^!!(N|A)A_SEQUENCE .+/ ], blastxml = RuleRegexp[ 'Bio::Blast::Report', /\<\!DOCTYPE BlastOutput PUBLIC / ], wublast = RuleRegexp[ 'Bio::Blast::WU::Report', /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ], wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast', /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ], blast = RuleRegexp[ 'Bio::Blast::Default::Report', /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ], tblast = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast', /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ], rpsblast = RuleRegexp[ 'Bio::Blast::RPSBlast::Report', /^RPS\-BLAST.? 
+[\-\.\w]+ +\[[\-\.\w ]+\]/ ], blat = RuleRegexp[ 'Bio::Blat::Report', /^psLayout version \d+/ ], spidey = RuleRegexp[ 'Bio::Spidey::Report', /^\-\-SPIDEY version .+\-\-$/ ], hmmer = RuleRegexp[ 'Bio::HMMER::Report', /^HMMER +\d+\./ ], sim4 = RuleRegexp[ 'Bio::Sim4::Report', /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ], fastq = RuleRegexp[ 'Bio::Fastq', /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+/ ], fastaformat = RuleProc.new('Bio::FastaFormat', 'Bio::NBRF', 'Bio::FastaNumericFormat') do |text| if /^>.+$/ =~ text case text when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/ Bio::NBRF when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/ Bio::FastaFormat when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/ Bio::FastaNumericFormat else false end else nil end end ] # dependencies # NCBI genbank.is_prior_to genpept # EMBL/UniProt embl.is_prior_to sptr sptr.is_prior_to prosite prosite.is_prior_to transfac # KEGG #aaindex.is_prior_to litdb #litdb.is_prior_to brite pathway_module.is_prior_to pathway pathway.is_prior_to brite brite.is_prior_to orthology orthology.is_prior_to drug drug.is_prior_to glycan glycan.is_prior_to enzyme enzyme.is_prior_to compound compound.is_prior_to reaction reaction.is_prior_to genes genes.is_prior_to genome # PDB pdb.is_prior_to het # BLAST wublast.is_prior_to wutblast wutblast.is_prior_to blast blast.is_prior_to tblast # Fastq BottomRule.is_prior_to(fastq) fastq.is_prior_to(fastaformat) # FastaFormat BottomRule.is_prior_to(fastaformat) # for debug #debug_first = RuleDebug.new('debug_first') #a.add(debug_first) #debug_first.is_prior_to(TopRule) ## for debug #debug_last = RuleDebug.new('debug_last') #a.add(debug_last) #BottomRule.is_prior_to(debug_last) #fastaformat.is_prior_to(debug_last) a.rehash return a end
Creates a new Autodetect object
# File lib/bio/io/flatfile/autodetection.rb, line 226
# Sets up an autodetect object that contains only the two special
# elements TopRule and BottomRule.
def initialize
  # Autodetection rules, keyed by element name (see #add).
  @rules = {}
  # Cached list of elements; nil means "not computed yet".
  @elements = nil
  add(TopRule)
  add(BottomRule)
end
Public Instance Methods
Adds a new element. Returns elem.
# File lib/bio/io/flatfile/autodetection.rb, line 237
# Registers +elem+ under its name and drops the cached element list so it
# is recomputed on next use.
#
# *Arguments*: +elem+ -- an object responding to +name+.
# *Returns*: +elem+.
# *Raises*: RuntimeError ('element name conflicts') when an element is
# already registered under the same name.
def add(elem)
  key = elem.name
  raise 'element name conflicts' if @rules[key]
  @elements = nil
  @rules[key] = elem
  elem
end
Autodetect from the text. Returns a database class if succeeded. Returns nil if failed.
# File lib/bio/io/flatfile/autodetection.rb, line 305
# Asks each element, in the order given by #elements, to guess the
# database class for +text+, stopping at the first truthy answer.
#
# *Arguments*:
# * +text+ -- the text to examine.
# * +meta+ -- extra information handed unchanged to every element's guess.
# *Returns*: the first truthy guess. When no element gives one, the value
# returned is the last element's (falsy) guess, or nil if there are no
# elements.
def autodetect(text, meta = {})
  outcome = nil
  # The block value doubles as the stop condition for find, while
  # +outcome+ keeps the most recent guess for the return value.
  elements.find { |rule| outcome = rule.guess(text, meta) }
  outcome
end
Autodetects from the FlatFile object. Returns a database class if succeeded. Returns nil if failed.
# File lib/bio/io/flatfile/autodetection.rb, line 318
# Autodetects the database class from a FlatFile-like object by looking
# at its prefetched data.
#
# *Arguments*:
# * +ff+ -- object whose @stream instance variable responds to
#   prefetch_buffer and prefetch_gets (and optionally path).
# * +lines+ -- maximum number of lines to prefetch (default 31).
# *Returns*: the first truthy result of #autodetect, or nil.
def autodetect_flatfile(ff, lines = 31)
  meta = {}
  stream = ff.instance_eval { @stream }

  # Streams without a usable #path simply provide no path information.
  path = begin
    stream.path
  rescue NameError
    nil
  end

  if path
    meta[:path] = path
    # Call autodetect once with meta and without any read action.
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end

  # Read the stream line by line, retrying detection on the growing
  # prefetch buffer after every non-blank line.
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    next unless line.strip.size > 0
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end
  nil
end
Iterates over each element.
# File lib/bio/io/flatfile/autodetection.rb, line 298
# Passes every element returned by #elements to the given block, in that
# order.
def each_rule(&block) #:yields: elem
  elements.each(&block)
end
Returns current elements as an array whose order fulfills all elements' priorities.
# File lib/bio/io/flatfile/autodetection.rb, line 275
# Returns the elements as an array: the result of +tsort+, reversed in
# place. The array is cached in @elements until something resets it to nil.
def elements
  @elements ||= tsort.reverse!
end
visualizes the object (mainly for debug)
# File lib/bio/io/flatfile/autodetection.rb, line 291 def inspect "<#{self.class.to_s} " + self.elements.collect { |e| e.name.inspect }.join(' ') + ">" end
rebuilds the object and clears internal cache.
# File lib/bio/io/flatfile/autodetection.rb, line 285
# Rehashes the internal rule table and discards the cached element list,
# so the next call to #elements recomputes it.
#
# *Returns*: nil.
def rehash
  @rules.rehash
  @elements = nil
end
(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.
# File lib/bio/io/flatfile/autodetection.rb, line 253
# (required by TSort.) Yields each child (= lower priority element) of
# +elem+.
#
# * For TopRule: every registered element except TopRule itself and any
#   element that lists TopRule among its lower priority elements.
# * For BottomRule: every registered element that lists BottomRule among
#   its higher priority elements.
# * For any other element: its own lower priority elements other than
#   BottomRule, followed by BottomRule unless the element lists BottomRule
#   among its higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule
      next if rule.lower_priority_elements.index(TopRule)
      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |rule|
      yield rule unless rule == BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end
(required by TSort.) For all elements, yields each element.
# File lib/bio/io/flatfile/autodetection.rb, line 246
# (required by TSort.) Passes every registered element (the values of the
# internal rule table) to the given block.
def tsort_each_node(&block)
  @rules.each_value(&block)
end