EMBLDB
Parser class for UniProtKB/SwissProt and TrEMBL database entry.
returns contents in the CC lines.
Bio::SPTR#cc -> Hash
returns an object of contents in the TOPIC.
Bio::SPTR#cc(TOPIC) -> Array w/in Hash, Hash
returns contents of the "ALTERNATIVE PRODUCTS".
Bio::SPTR#cc('ALTERNATIVE PRODUCTS') -> Hash
{'Event' => str, 'Named isoforms' => int, 'Comment' => str, 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]} CC -!- ALTERNATIVE PRODUCTS: CC Event=Alternative splicing; Named isoforms=15; ... CC placentae isoforms. All tissues differentially splice exon 13; CC Name=A; Synonyms=no del; CC IsoId=P15529-1; Sequence=Displayed;
returns contents of the "DATABASE".
Bio::SPTR#cc('DATABASE') -> Array
[{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...] CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
returns contents of the "MASS SPECTROMETRY".
Bio::SPTR#cc('MASS SPECTROMETRY') -> Array
[{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...] CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT CC IN LIVER, KIDNEY, LUNG AND BRAIN. CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK; CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
See also www.expasy.org/sprot/userman.html#CC_line
# File lib/bio/db/embl/sptr.rb, line 775
#
# Returns the contents of the CC (comment) lines.
#
# Without +topic+ the whole parsed table is returned: a Hash that maps each
# topic name (e.g. 'FUNCTION') to an Array of the comment bodies found for it.
# With +topic+ only that topic is returned; several topics are post-processed
# by a dedicated cc_* helper, five free-text topics are joined into one
# String, 'DATABASE' is parsed into an Array of Hashes, and all remaining
# topics return the raw Array (nil when the topic is absent).
#
# The parsed table is cached in @data['CC'].
#
# Raises RuntimeError when a comment block does not look like "TOPIC: text"
# or when parsing trips over a NameError/NoMethodError.
def cc(topic = nil)
  unless @data['CC']
    cc = Hash.new
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return cc if get('CC').size == 0

    # Removing the copyright statement. A non-destructive sub is used so the
    # string handed out by fetch is never modified in place.
    cc_raw = fetch('CC').sub(/ *---.+---/, '')

    # Not any CC Lines without the copyright statement.
    return cc if cc_raw == ''

    begin
      # Keep only the text in front of the first border line.
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip
        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          body = $2
          # Re-join words hyphenated across a line break ("- " -> "-"),
          # except in front of "AND".
          body.gsub!(/- (?!AND)/, '-')
          body.strip!
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so this single clause also
      # handles it (the former separate NoMethodError clause was unreachable).
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  return @data['CC'] if topic.nil?

  entries = @data['CC'][topic]
  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(entries)
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(entries)
  when 'CATALYTIC ACTIVITY', 'CATALITIC ACTIVITY'
    # The misspelled topic is still accepted for backward compatibility.
    cc_catalytic_activity(entries)
  when 'CAUTION'
    cc_caution(entries)
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    # nil (not an exception) when the entry has no such topic.
    entries && entries.join('')
  when 'INTERACTION'
    cc_interaction(entries)
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(entries)
  when 'PATHWAY'
    cc_pathway(entries)
  when 'RNA EDITING'
    cc_rna_editing(entries)
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(entries)
  when 'WEB RESOURCE'
    cc_web_resource(entries)
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    return entries unless entries
    entries.map do |e|
      record = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop the final character (the closing period) before splitting.
      e.sub(/.$/, '').split(/;/).each do |line|
        case line
        when /NAME=(.+)/
          record['NAME'] = $1
        when /NOTE=(.+)/
          record['NOTE'] = $1
        when /WWW="(.+)"/
          record['WWW'] = $1
        when /FTP="(.+)"/
          record['FTP'] = $1
        end
      end
      record
    end
  else
    # ALLERGEN, BIOTECHNOLOGY, COFACTOR, DOMAIN, MISCELLANEOUS,
    # PHARMACEUTICAL, POLYMORPHISM, PTM, SIMILARITY, SUBUNIT,
    # TISSUE SPECIFICITY, TOXIC DOSE and any unknown topic: raw Array.
    entries
  end
end
# File lib/bio/db/embl/sptr.rb, line 1131
#
# Returns the DR (database cross-reference) data.
#
# Without +key+ the value of embl_dr is returned unchanged. With +key+ the
# rows stored under that key (or no rows when the key is absent) are each
# converted into a Hash labelling the first four columns.
def dr(key = nil)
  return embl_dr unless key

  rows = embl_dr[key] || []
  rows.map do |x|
    {
      'Accession'      => x[0],
      'Version'        => x[1],
      ' '              => x[2],
      'Molecular Type' => x[3]
    }
  end
end
returns a Hash of information in the DT lines.
hash keys: ['created', 'sequence', 'annotation']
Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is changed, and the word "annotation" is no longer used in DT lines. Despite the change, the word "annotation" is still used for keeping compatibility.
returns a String of information in the DT lines by a given key.
DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.) DT DD-MMM-YYYY (sequence version NN) DT DD-MMM-YYYY (entry version NN)
The format was changed in UniProtKB release 7.0 of 07-Feb-2006. The older format is shown below.
DT DD-MMM-YYYY (rel. NN, Created) DT DD-MMM-YYYY (rel. NN, Last sequence update) DT DD-MMM-YYYY (rel. NN, Last annotation update)
# File lib/bio/db/embl/sptr.rb, line 158
#
# Returns the DT (date) lines as a Hash with the keys 'created', 'sequence'
# and 'annotation' (first, second and third DT line respectively), or the
# single value for +key+ when one is given. The Hash is cached in @data['DT'].
def dt(key = nil)
  return dt[key] if key
  return @data['DT'] if @data['DT']

  part = self.get('DT').split(/\n/)
  dates = {}
  %w[created sequence annotation].each_with_index do |label, index|
    # Remove the two-character line tag, then the surrounding whitespace.
    dates[label] = part[index].sub(/\w{2} /,'').strip
  end
  @data['DT'] = dates
end
returns a ENTRY_NAME in the ID line.
# File lib/bio/db/embl/sptr.rb, line 99
#
# Returns the 'ENTRY_NAME' field of the ID line (delegates to id_line).
def entry_id
  field = 'ENTRY_NAME'
  id_line(field)
end
returns contents in the feature table.
sp = Bio::SPTR.new(entry) ft = sp.ft ft.class #=> Hash ft.keys.each do |feature_key| ft[feature_key].each do |feature| feature['From'] #=> '1' feature['To'] #=> '21' feature['Description'] #=> '' feature['FTId'] #=> '' feature['diff'] #=> [] feature['original'] #=> [feature_key, '1', '21', '', ''] end end
Bio::SPTR#ft -> Hash
{FEATURE_KEY => [{'From' => int, 'To' => int, 'Description' => aStr, 'FTId' => aStr, 'diff' => [original_residues, changed_residues], 'original' => aAry }],...}
returns an Array of the information about the feature_name in the feature table.
Bio::SPTR#ft(feature_name) -> Array of Hash
[{'From' => int, 'To' => int, 'Description' => str, 'FTId' => str, 'diff' => [], 'original' => aAry},...]
Col Data item ----- ----------------- 1- 2 FT 6-13 Feature name 15-20 `FROM' endpoint 22-27 `TO' endpoint 35-75 Description (>=0 per key) ----- -----------------
Note: 'FROM' and 'TO' endpoints are allowed to contain non-numerical characters such as '<', '>' or '?' (e.g. '<1', '?42').
See also www.expasy.org/sprot/userman.html#FT_line
# File lib/bio/db/embl/sptr.rb, line 1196
#
# Returns the feature table (FT lines).
#
# Without +feature_key+ a Hash is returned that maps each feature name to an
# Array of Hashes with the keys 'From' (Integer), 'To' (Integer),
# 'Description', 'FTId', 'diff' and 'original'. With +feature_key+ only the
# Array stored under that name is returned (nil when absent).
#
# 'diff' is filled for VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features:
# [original_residues, changed_residues] for "X -> Y" descriptions and
# [removed_residues, ''] for "Missing" descriptions.
#
# The result is cached in @data['FT'].
# Raises RuntimeError ("Invalid FT Lines...") when parsing fails.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  features = {}
  begin
    table = []
    get('FT').split("\n").each do |line|
      # A feature starts with its name in column 6, i.e. exactly three
      # blanks after "FT"; anything else continues the previous feature.
      if line =~ /^FT {3}\w/
        feature = line.chomp.ljust(74)
        table << [feature[ 5..12].strip, # Feature Name
                  feature[14..19].strip, # From
                  feature[21..26].strip, # To
                  feature[34..74].strip] # Description
      else
        # Non-destructive sub: sub! would append nil when nothing matches.
        table.last << line.chomp.sub(/^FT +/, '')
      end
    end

    # Joining Description lines
    table = table.map { |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0],
                   feature[1],
                   feature[2],
                   feature[3, feature.size - 3].join(" ")]
      end
      feature << (ftid || '')
    }

    table.each do |feature|
      name = feature[0]
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To'   => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (features[name] ||= []) << entry

      case name
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Copy both captures before gsub runs: gsub resets $1/$2.
          original_res = $1
          changed_res = $2
          original_res = original_res.gsub(/ /,'').strip
          changed_res = changed_res.gsub(/ /,'').strip
        when /Missing/
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  @data['FT'] = features
end
returns a String of the first gene name in the GN line.
# File lib/bio/db/embl/sptr.rb, line 438
#
# Returns the first element of gene_names.
def gene_name
  names = gene_names
  names.first
end
returns a Array of gene names in the GN line.
# File lib/bio/db/embl/sptr.rb, line 427
#
# Returns the gene names from the parsed GN data.
#
# When the first parsed GN record is a Hash, the :name value of every record
# is returned; otherwise the first record itself is returned as-is.
def gene_names
  gn # make sure @data['GN'] has been populated

  records = @data['GN']
  head = records.first
  return head unless head.instance_of?(Hash)

  records.map { |record| record[:name] }
end
returns gene names in the GN line.
New UniProt/SwissProt format:
Bio::SPTR#gn -> [ <gene record>* ]
where <gene record> is:
{ :name => '...', :synonyms => [ 's1', 's2', ... ], :loci => [ 'l1', 'l2', ... ], :orfs => [ 'o1', 'o2', ... ] }
Old format:
Bio::SPTR#gn -> Array # AND
Bio::SPTR#gn -> Array # OR
# File lib/bio/db/embl/sptr.rb, line 351
#
# Returns the parsed GN (gene name) lines, caching them in @data['GN'].
#
# The text is handed to gn_uniprot_parser when it contains one of the
# "Name=", "ORFNames=", "OrderedLocusNames=" or "Synonyms=" tokens, and to
# gn_old_parser otherwise.
def gn
  return @data['GN'] if @data['GN']

  raw = fetch('GN')
  tokens = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
  @data['GN'] =
    if tokens.any? { |token| token =~ raw }
      gn_uniprot_parser
    else
      gn_old_parser
    end
  @data['GN']
end
Bio::SPTR#hi -> Array of Hash ({'Category' => str, 'Keywords' => [str, ...], 'Keyword' => str})
# File lib/bio/db/embl/sptr.rb, line 691
#
# Returns the parsed HI lines as an Array of Hashes, cached in @data['HI'].
#
# The text is split on ". "; each piece is split into a category (before
# ": ") and a "; "-separated keyword list. The last keyword, with a trailing
# period removed, is stored under 'Keyword'; the keywords in front of it are
# stored under 'Keywords'.
def hi
  return @data['HI'] if @data['HI']

  entries = @data['HI'] = []
  fetch('HI').split(/\. /).each do |hlist|
    category, keywords = hlist.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    entries << {'Category' => category, 'Keywords' => keywords, 'Keyword' => keyword}
  end
  entries
end
returns a Hash of the ID line.
returns a content (Int or String) of the ID line by a given key. Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
ID P53_HUMAN Reviewed; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
ID P53_HUMAN STANDARD; PRT; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
# File lib/bio/db/embl/sptr.rb, line 74
#
# Returns the ID line as a Hash with the keys 'ENTRY_NAME', 'DATA_CLASS',
# 'MOLECULE_TYPE' and 'SEQUENCE_LENGTH', or the single value for +key+ when
# one is given. The Hash is cached in @data['ID'].
#
# Two layouts are recognised by looking at the fifth blank-separated field:
# when it is "AA." the line has no molecule type (layout used after
# UniProtKB release 9.0 of 31-Oct-2006, http://www.uniprot.org/docs/sp_news.htm);
# otherwise the fourth field is the molecule type and the fifth the length.
def id_line(key = nil)
  return id_line[key] if key
  return @data['ID'] if @data['ID']

  part = @orig['ID'].split(/ +/)
  without_molecule_type = (part[4].to_s.chomp == 'AA.')
  molecule_type, length_field =
    if without_molecule_type
      [nil, part[3]]
    else
      [part[3].sub(/;/,''), part[4]]
    end

  @data['ID'] = {
    'ENTRY_NAME'      => part[1],
    'DATA_CLASS'      => part[2].sub(/;/,''),
    'MOLECULE_TYPE'   => molecule_type,
    'SEQUENCE_LENGTH' => length_field.to_i
  }
end
returns a MOLECULE_TYPE in the ID line.
A short-cut for Bio::SPTR#id_line('MOLECULE_TYPE').
# File lib/bio/db/embl/sptr.rb, line 109
#
# Returns the 'MOLECULE_TYPE' field of the ID line (delegates to id_line).
def molecule
  field = 'MOLECULE_TYPE'
  id_line(field)
end
OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line
# File lib/bio/db/embl/sptr.rb, line 521
#
# Returns the parsed OH lines as an Array of Hashes with the keys
# 'NCBI_TaxID' (String of digits) and 'HostName' (String, or nil when no
# text follows the tax id). The result is cached in @data['OH'].
#
# Raises ArgumentError when a piece of the OH text has no "NCBI_TaxID=N;".
def oh
  return @data['OH'] if @data['OH']

  hosts = fetch('OH').split("\. ").map { |x|
    unless x =~ /NCBI_TaxID=(\d+);/
      raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end
    taxid = $1

    host_name = x[/NCBI_TaxID=\d+; (.+)/, 1]
    # Drop the period that ends the line.
    host_name.sub!(/\.$/, '') if host_name

    {'NCBI_TaxID' => taxid, 'HostName' => host_name}
  }
  @data['OH'] = hosts
  @data['OH']
end
returns an Array of Hashes, or a String of the OS line when an index is given.
Bio::EMBLDB#os -> Array
[{'name' => '(Human)', 'os' => 'Homo sapiens'}, {'name' => '(Rat)', 'os' => 'Rattus norveticus'}]
Bio::SPTR#os[0] -> Hash
{'name' => "(Human)", 'os' => 'Homo sapiens'}
Bio::SPTR#os['name'] -> "(Human)"
Bio::SPTR#os(0) -> "Homo sapiens (Human)"
OS Genus species (name). OS Genus species (name0) (name1). OS Genus species (name0) (name1). OS Genus species (name0), G s0 (name0), and G s (name0) (name1). OS Homo sapiens (Human), and Rattus norvegicus (Rat) OS Hippotis sp. Clark and Watts 825. OS unknown cyperaceous sp.
# File lib/bio/db/embl/sptr.rb, line 460
#
# Returns the parsed OS (organism species) lines.
#
# Without +num+ an Array of Hashes is returned, one per organism, with the
# keys 'name' (the parenthesised part, or nil when there is none) and 'os'
# (the text matched in front of it). With +num+ the organism at that index
# is returned as the String "<os> <name>".
#
# The Array is cached in @data['OS'].
# Raises RuntimeError when a piece of the OS text cannot be parsed.
def os(num = nil)
  unless @data['OS']
    organisms = fetch('OS').split(/, and|, /).map do |tmp|
      unless tmp =~ /(\w+ *[\w\d \:\'\+\-\.]+[\w\d\.])/
        raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
      end
      org = $1
      {'name' => tmp[/(\(.+\))/, 1], 'os' => org}
    end
    @data['OS'] = organisms
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  selected = @data['OS'][num]
  "#{selected['os']} #{@data['OS'][num]['name']}"
end
returns a Hash of organism taxonomy cross-references.
Bio::SPTR#ox -> Hash
{'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
OX NCBI_TaxID=1234; OX NCBI_TaxID=1234, 2345, 3456, 4567;
# File lib/bio/db/embl/sptr.rb, line 504
#
# Returns the parsed OX (taxonomy cross-reference) lines as a Hash that maps
# each database name to an Array of identifier Strings, e.g.
# {'NCBI_TaxID' => ['1234', '2345']}. The Hash is cached in @data['OX'].
def ox
  return @data['OX'] if @data['OX']

  fields = fetch('OX').sub(/\.$/,'').split(/;/).map { |e| e.strip }
  xrefs = fields.each_with_object(Hash.new) do |field, table|
    db, refs = field.split(/=/)
    table[db] = refs.split(/, */)
  end
  @data['OX'] = xrefs
  @data['OX']
end
returns the proposed official name of the protein. Returns a String.
Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full name, which is taken from the "RecName: Full=" or "SubName: Full=" line, normally at the beginning of the DE lines. Unlike the parser for the old format, there is no special treatment for fragments or precursors.
For old format, the method parses the DE lines and returns the protein name as a String.
"DE #{OFFICIAL_NAME} (#{SYNONYM})" "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]." OFFICIAL_NAME 1/entry SYNONYM >=0 CONTAINS >=0
# File lib/bio/db/embl/sptr.rb, line 251
#
# Returns the proposed official name of the protein as a String.
#
# New DE format (parse_DE_line_rel14 returned data; since UniProtKB release
# 14.0 of 22-Jul-2008): the value of the first "Full" pair found in a RecName
# or SubName record, or '' when there is none.
#
# Old DE format (before Rel. 13.x): the text in front of the first "(" and
# the first "[", with ' (Fragment)' appended when the DE text marks the
# entry as a fragment. The marker is matched case-insensitively, so
# "(Fragment)" and "(FRAGMENT)" are both recognised.
def protein_name
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed_de_line = @data['DE']

  if parsed_de_line
    # since UniProtKB release 14.0 of 22-Jul-2008
    name = nil
    parsed_de_line.each do |a|
      case a[0]
      when 'RecName', 'SubName'
        name_pair = a[1..-1].find { |b| b[0] == 'Full' }
        if name_pair
          name = name_pair[1]
          break
        end
      end
    end
    name.to_s
  else
    # old format (before Rel. 13.x)
    name = ""
    de_line = fetch('DE')
    if de_line
      # everything preceding the first [ (the "contains" part)
      str = de_line[/^[^\[]*/]
      name = str[/^[^(]*/].strip
      name << ' (Fragment)' if str =~ /fragment/i
    end
    name
  end
end
returns contents in the R lines.
Bio::EMBLDB::Common#ref -> [ <reference information Hash>* ]
where <reference information Hash> is:
{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
R Lines
RN RC RP RX RA RT RL RG
# File lib/bio/db/embl/sptr.rb, line 557
#
# Returns the parsed reference (R*) lines as an Array of Hashes, one per
# reference, with the keys 'RN', 'RC', 'RP', 'RX', 'RA', 'RT', 'RL' and 'RG'.
# The text of each line type is concatenated (a blank after every line) and
# then passed through the matching set_RN ... set_RG method.
#
# The result is cached in @data['R'].
# Raises RuntimeError when a line does not start with a known R tag.
def ref
  return @data['R'] if @data['R']

  tags = %w[RN RC RP RX RA RT RL RG]
  @data['R'] = get('R').split(/\nRN /).map { |str|
    hash = {}
    tags.each { |tag| hash[tag] = '' }

    # Splitting removed the "RN " lead-in from every reference but the first.
    str = 'RN ' + str unless /^RN / =~ str
    str.split("\n").each do |line|
      unless /^(R[NPXARLCTG]) (.+)/ =~ line
        raise "Invalid format in R lines, \n[#{line}]\n"
      end
      hash[$1] += $2 + ' '
    end

    tags.each { |tag| hash[tag] = send("set_#{tag}", hash[tag]) }
    hash
  }
  @data['R']
end
returns Bio::Reference object from Bio::EMBLDB::Common#ref.
# File lib/bio/db/embl/sptr.rb, line 651
#
# Converts every Hash returned by ref into a Reference and returns them
# wrapped in a References object, cached in @data['references'].
#
# Field mapping: 'RA' is split on ", " into 'authors'; 'RT' becomes 'title';
# 'RL' is split into 'journal', 'volume', 'issue', 'pages' and 'year' when it
# has the form "Journal VOL (ISSUE), FROM-TO (YEAR)" and is stored whole as
# 'journal' otherwise; every tag/value pair of 'RX' is stored under the
# down-cased tag (PUBMED, MEDLINE, DOI).
def references
  return @data['references'] if @data['references']

  ary = self.ref.map do |ent|
    attrs = Hash.new('')
    ent.each do |key, value|
      case key
      when 'RA'
        attrs['authors'] = value.split(/, /)
      when 'RT'
        attrs['title'] = value
      when 'RL'
        if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
          attrs['journal'] = $1
          attrs['volume']  = $2
          attrs['issue']   = $3
          attrs['pages']   = $4
          attrs['year']    = $5
        else
          attrs['journal'] = value
        end
      when 'RX' # PUBMED, MEDLINE, DOI
        value.each { |tag, xref| attrs[ tag.downcase ] = xref }
      end
    end
    Reference.new(attrs)
  end

  @data['references'] = References.new(ary)
  @data['references']
end
returns a Bio::Sequence::AA of the amino acid sequence.
blank Line; sequence data (>=1)
# File lib/bio/db/embl/sptr.rb, line 1306
#
# Returns the sequence as a Sequence::AA built from the untagged lines with
# all blanks and digit runs removed. The object is cached in @data[''].
def seq
  @data[''] ||= Sequence::AA.new( fetch('').gsub(/ |\d+/,'') )
end
returns a SEQUENCE_LENGTH in the ID line.
A short-cut for Bio::SPTR#id_line('SEQUENCE_LENGTH').
# File lib/bio/db/embl/sptr.rb, line 118
#
# Returns the 'SEQUENCE_LENGTH' field of the ID line (delegates to id_line).
def sequence_length
  field = 'SEQUENCE_LENGTH'
  id_line(field)
end
# File lib/bio/db/embl/sptr.rb, line 588
#
# Normalises the collected RN text: returns +data+ with leading and trailing
# whitespace removed.
def set_RN(data)
  trimmed = data.strip
  trimmed
end
returns a Hash of the contents of the SQ lines.
Bio::SPTR#sq -> hsh
returns a value of a key given in the SQ lines.
Bio::SPTR#sq(key) -> int or str
Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
'CRC64']
SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64; SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
# File lib/bio/db/embl/sptr.rb, line 1278
#
# Returns the parsed SQ line.
#
# Without +key+ the Hash {'aalen' => Integer, 'MW' => Integer,
# 'CRC64' => String} is returned. With +key+: keys matching mw, molecular or
# weight return the 'MW' value; keys matching len, length or AA return the
# 'aalen' value; any other key is looked up in the Hash directly.
#
# The Hash is cached in @data['SQ'].
# Raises RuntimeError when the SQ line does not have the expected layout.
def sq(key = nil)
  unless @data['SQ']
    unless fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
    @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
  end

  summary = @data['SQ']
  return summary unless key

  case key
  when /mw/, /molecular/, /weight/
    summary['MW']
  when /len/, /length/, /AA/
    summary['aalen']
  else
    summary[key]
  end
end
returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.
Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full or short names which are taken from "RecName: Short=", "RecName: EC=", and AltName lines, except those after "Contains:" or "Includes:". To keep compatibility with the old format parser, "RecName: EC=N.N.N.N" is reported as "EC N.N.N.N". In addition, to prevent confusion, "Allergen=" and "CD_antigen=" prefixes are added for the corresponding fields.
For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.
# File lib/bio/db/embl/sptr.rb, line 294
#
# Returns the synonyms (unofficial and/or alternative names) as an Array of
# Strings.
#
# New DE format (parse_DE_line_rel14 returned data; since UniProtKB release
# 14.0 of 22-Jul-2008): every value of the RecName, SubName and AltName
# records that differs from protein_name, up to the first "Includes" or
# "Contains" record. 'EC' values are reported as "EC N.N.N.N"; 'Allergen'
# and 'CD_antigen' values keep their field name as a "Name=" prefix.
#
# Old DE format (before Rel. 13.x): the contents of every "(...)" group
# outside the "[...]" part, except the fragment marker. The marker is matched
# case-insensitively, so "(Fragment)" is never reported as a synonym.
def synonyms
  ary = Array.new
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed_de_line = @data['DE']

  if parsed_de_line
    # since UniProtKB release 14.0 of 22-Jul-2008
    official = self.protein_name # looked up once, not once per name pair
    parsed_de_line.each do |a|
      case a[0]
      when 'Includes', 'Contains'
        break # the each loop
      when 'RecName', 'SubName', 'AltName'
        a[1..-1].each do |b|
          value = b[1]
          next unless value && value != official

          name =
            case b[0]
            when 'EC'
              "EC " + value
            when 'Allergen', 'CD_antigen'
              b[0] + '=' + value
            else
              value
            end
          ary.push name
        end
      end
    end
  else
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    if de_line
      # ignore stuff between [ and ]. That's the "contains" part
      line = de_line.sub(/\[.*\]/,'')
      line.scan(/\([^)]+/) do |synonym|
        next if synonym =~ /fragment/i
        ary << synonym[1..-1].strip # index to remove the leading (
      end
    end
  end

  ary
end
Generated with the Darkfish Rdoc Generator 2.