class Bio::SPTR
Parser class for UniProtKB/SwissProt and TrEMBL database entry.
Public Instance Methods
returns contents in the CC lines.
returns an object of contents in the TOPIC.
returns contents of the “ALTERNATIVE PRODUCTS”.
-
#cc('ALTERNATIVE PRODUCTS') -> Hash
{'Event' => str, 'Named isoforms' => int, 'Comment' => str, 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]} CC -!- ALTERNATIVE PRODUCTS: CC Event=Alternative splicing; Named isoforms=15; ... CC placentae isoforms. All tissues differentially splice exon 13; CC Name=A; Synonyms=no del; CC IsoId=P15529-1; Sequence=Displayed;
returns contents of the “DATABASE”.
-
#cc('DATABASE') -> Array
[{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...] CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
returns contents of the “MASS SPECTROMETRY”.
-
#cc('MASS SPECTROMETRY') -> Array
[{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...] CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
CC lines (>=0, optional)¶ ↑
CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT CC IN LIVER, KIDNEY, LUNG AND BRAIN. CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK; CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
See also www.expasy.org/sprot/userman.html#CC_line
# File lib/bio/db/embl/sptr.rb, line 775
#
# Returns the contents of the CC (comment) lines.
#
# topic:: nil to get the whole Hash (topic name => Array of comment
#         bodies), or a topic name such as 'FUNCTION' or 'DATABASE' to
#         get that topic in its topic-specific form.
#
# Returns nil when the requested topic is not present in the entry.
# Returns an empty Hash, whatever +topic+ is, when the entry has no CC
# lines at all (behaviour kept from the original implementation).
#
# Raises RuntimeError when a comment block does not look like
# "TOPIC: text" or when the CC lines cannot be split at all.
def cc(topic = nil)
  unless @data['CC']
    topics = {}
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return topics if get('CC').size == 0

    cc_raw = fetch('CC')

    # Removing the copyright statement.
    cc_raw.sub!(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return topics if cc_raw == ''

    begin
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip
        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          body = $2
          # Re-join words that were hyphenated across a line break.
          body.gsub!(/- (?!AND)/, '-')
          body.strip!
          (topics[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NameError also covers NoMethodError (its subclass), e.g. when the
      # split above yields nil.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = topics
  end

  return @data['CC'] if topic.nil?

  entries = @data['CC'][topic]
  # A topic that is absent from the entry is reported as nil for every
  # topic, instead of crashing inside join or a topic-specific helper.
  return nil if entries.nil?

  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(entries)
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(entries)
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the topic is spelled 'CATALITIC' here and no
    # cc_catalytic_activity helper is visible in this file; confirm before
    # correcting the spelling.
    cc_catalytic_activity(entries)
  when 'CAUTION'
    cc_caution(entries)
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    entries.join('')
  when 'INTERACTION'
    cc_interaction(entries)
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(entries)
  when 'PATHWAY'
    cc_pathway(entries)
  when 'RNA EDITING'
    cc_rna_editing(entries)
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(entries)
  when 'WEB RESOURCE'
    cc_web_resource(entries)
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries.map do |e|
      record = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop only the terminating period, not an arbitrary last character.
      e.sub(/\.$/, '').split(/;/).each do |field|
        case field
        when /NAME=(.+)/
          record['NAME'] = $1
        when /NOTE=(.+)/
          record['NOTE'] = $1
        when /WWW="(.+)"/
          record['WWW'] = $1
        when /FTP="(.+)"/
          record['FTP'] = $1
        end
      end
      record
    end
  else
    # ALLERGEN, BIOTECHNOLOGY, COFACTOR, DOMAIN, MISCELLANEOUS,
    # PHARMACEUTICAL, POLYMORPHISM, PTM, SIMILARITY, SUBUNIT,
    # TISSUE SPECIFICITY, TOXIC DOSE and unknown topics: raw Array.
    entries
  end
end
# File lib/bio/db/embl/sptr.rb, line 1131
#
# Returns the DR (database cross-reference) information.
#
# Without +key+ the value of embl_dr is returned unchanged. With +key+
# the entries stored under that key are returned as an Array of Hashes;
# an unknown key gives an empty Array.
def dr(key = nil)
  return embl_dr unless key

  entries = embl_dr[key] || []
  entries.map do |x|
    {'Accession' => x[0],
     'Version' => x[1],
     ' ' => x[2],
     'Molecular Type' => x[3]}
  end
end
returns a Hash of information in the DT lines.
hash keys: ['created', 'sequence', 'annotation']
Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is changed, and the word “annotation” is no longer used in DT lines. Despite the change, the word “annotation” is still used for keeping compatibility.
returns a String of information in the DT lines by a given key.
DT Line; date (3/entry)¶ ↑
DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.) DT DD-MMM-YYY (sequence version NN) DT DD-MMM-YYY (entry version NN)
The format has been changed in UniProtKB release 7.0 of 07-Feb-2006. Below is the older format.
Old format of DT Line; date (3/entry)¶ ↑
DT DD-MMM-YYY (rel. NN, Created) DT DD-MMM-YYY (rel. NN, Last sequence update) DT DD-MMM-YYY (rel. NN, Last annotation update)
# File lib/bio/db/embl/sptr.rb, line 158
#
# Returns the information in the DT lines.
#
# Without +key+ a Hash with the keys 'created', 'sequence' and
# 'annotation' is returned (first, second and third DT line, with the
# line tag removed). With +key+ the String stored under that key is
# returned. The parsed Hash is cached in @data['DT'].
#
# An entry with fewer than three DT lines yields '' for the missing
# values instead of raising NoMethodError.
def dt(key = nil)
  return dt[key] if key
  return @data['DT'] if @data['DT']

  lines = get('DT').split(/\n/)
  # lines[i] is nil when the line is missing; to_s turns that into ''.
  created, sequence, annotation = (0..2).map do |i|
    lines[i].to_s.sub(/\w{2} /, '').strip
  end

  @data['DT'] = {
    'created' => created,
    'sequence' => sequence,
    'annotation' => annotation
  }
end
returns a ENTRY_NAME in the ID line.
# File lib/bio/db/embl/sptr.rb, line 99
#
# Returns the ENTRY_NAME field of the ID line.
# Short-cut for id_line('ENTRY_NAME').
def entry_id
  id_line('ENTRY_NAME')
end
returns contents in the feature table.
Examples¶ ↑
sp = Bio::SPTR.new(entry) ft = sp.ft ft.class #=> Hash ft.keys.each do |feature_key| ft[feature_key].each do |feature| feature['From'] #=> '1' feature['To'] #=> '21' feature['Description'] #=> '' feature['FTId'] #=> '' feature['diff'] #=> [] feature['original'] #=> [feature_key, '1', '21', '', ''] end end
-
{FEATURE_KEY => [{'From' => int, 'To' => int, 'Description' => aStr, 'FTId' => aStr, 'diff' => [original_residues, changed_residues], 'original' => aAry }],...}
returns an Array of the information about the feature_name in the feature table.
FT Line; feature table data (>=0, optional)¶ ↑
Col Data item ----- ----------------- 1- 2 FT 6-13 Feature name 15-20 `FROM' endpoint 22-27 `TO' endpoint 35-75 Description (>=0 per key) ----- -----------------
Note: 'FROM' and 'TO' endpoints are allowed to use non-numerical characters including '<', '>' or '?'. (c.f. '<1', '?42')
See also www.expasy.org/sprot/userman.html#FT_line
# File lib/bio/db/embl/sptr.rb, line 1196
#
# Returns the contents of the feature table (FT lines).
#
# Without +feature_key+ a Hash is returned that maps each feature key to
# an Array of Hashes with the keys 'From', 'To', 'Description', 'FTId',
# 'diff' and 'original'. With +feature_key+ only the Array stored under
# that key is returned. The parsed Hash is cached in @data['FT'].
#
# 'diff' is [original_residues, changed_residues] for VARSPLIC, VARIANT,
# VAR_SEQ and CONFLICT features and [] for all others. For a "Missing"
# feature the changed residues are ''.
#
# Raises RuntimeError ("Invalid FT Lines ...") when parsing fails.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  features = {}
  begin
    table = []
    get('FT').split("\n").each do |line|
      # A new feature has its name starting in column 6, i.e. exactly three
      # blanks after the "FT" tag; anything else continues the description.
      if line =~ /^FT {3}\w/
        cols = line.chomp.ljust(74)
        table << [cols[5..12].strip,   # Feature Name
                  cols[14..19].strip,  # From
                  cols[21..26].strip,  # To
                  cols[34..74].strip]  # Description
      else
        # sub (not sub!) so that a line without the prefix is kept as is
        # instead of pushing nil.
        table.last << line.chomp.sub(/^FT +/, '')
      end
    end

    # Joining Description lines
    table = table.map do |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0], feature[1], feature[2],
                   feature[3, feature.size - 3].join(' ')]
      end
      feature << (ftid || '')
    end

    table.each do |feature|
      key = feature[0]
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To' => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (features[key] ||= []) << entry

      case key
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Capture both groups before gsub replaces the match data.
          original_res, changed_res = $1, $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{get('FT')}'\n"
  end

  @data['FT'] = features
end
returns a String of the first gene name in the GN line.
# File lib/bio/db/embl/sptr.rb, line 438
#
# Returns the first gene name in the GN line as a String, or nil when
# gene_names yields nothing (an entry without gene names no longer
# raises NoMethodError here).
def gene_name
  names = gene_names
  names ? names.first : nil
end
returns a Array of gene names in the GN line.
# File lib/bio/db/embl/sptr.rb, line 427
#
# Returns an Array of gene names in the GN line.
#
# For the structured (UniProt) format this is the :name of every gene
# record; for the old format it is the first group of alternative names.
# An entry without gene names yields an empty Array rather than nil.
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  return [] if records.nil? || records.empty?

  if records.first.class == Hash
    records.collect { |record| record[:name] }
  else
    records.first
  end
end
returns gene names in the GN line.
New UniProt/SwissProt format:
-
#gn -> [ <gene record>* ]
where <gene record> is:
{ :name => '...', :synonyms => [ 's1', 's2', ... ], :loci => [ 'l1', 'l2', ... ], :orfs => [ 'o1', 'o2', ... ] }
Old format:
GN Line: Gene name(s) (>=0, optional)¶ ↑
# File lib/bio/db/embl/sptr.rb, line 351
#
# Returns the gene names of the GN line, parsed once and cached in
# @data['GN'].
#
# A line containing any of the structured keys (Name=, ORFNames=,
# OrderedLocusNames=, Synonyms=) is handed to gn_uniprot_parser; anything
# else goes to gn_old_parser.
def gn
  @data['GN'] ||=
    if fetch('GN') =~ /Name=|ORFNames=|OrderedLocusNames=|Synonyms=/
      gn_uniprot_parser
    else
      gn_old_parser
    end
end
The HI line¶ ↑
#hi #=> hash
# File lib/bio/db/embl/sptr.rb, line 691
#
# Returns the HI line as an Array of Hashes, one per ". "-separated
# section, each with the keys 'Category', 'Keywords' (all but the last
# "; "-separated item) and 'Keyword' (the last item, trailing period
# removed). The result is cached in @data['HI'].
def hi
  return @data['HI'] if @data['HI']

  # The cache slot is filled while parsing, as in the original code.
  parsed = @data['HI'] = []
  fetch('HI').split(/\. /).each do |section|
    category, keywords = section.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    parsed << {'Category' => category, 'Keywords' => keywords, 'Keyword' => keyword}
  end
  parsed
end
returns a Hash of the ID line.
returns a content (Int or String) of the ID line by a given key. Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
ID Line (since UniProtKB release 9.0 of 31-Oct-2006)¶ ↑
ID P53_HUMAN Reviewed; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
Examples¶ ↑
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
ID Line (older style)¶ ↑
ID P53_HUMAN STANDARD; PRT; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
Examples¶ ↑
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
# File lib/bio/db/embl/sptr.rb, line 74
#
# Returns the ID line as a Hash with the keys 'ENTRY_NAME', 'DATA_CLASS',
# 'MOLECULE_TYPE' and 'SEQUENCE_LENGTH', or the single value stored under
# +key+ when a key is given. The Hash is cached in @data['ID'].
#
# When the fifth blank-separated field is "AA." the line is in the format
# used after UniProtKB release 9.0 of 31-Oct-2006
# (http://www.uniprot.org/docs/sp_news.htm), which has no molecule type;
# 'MOLECULE_TYPE' is nil in that case.
def id_line(key = nil)
  return id_line[key] if key
  return @data['ID'] if @data['ID']

  fields = @orig['ID'].split(/ +/)
  new_style = (fields[4].to_s.chomp == 'AA.')

  if new_style
    molecule_type = nil
    length_field = fields[3]
  else
    molecule_type = fields[3].sub(/;/, '')
    length_field = fields[4]
  end

  @data['ID'] = {
    'ENTRY_NAME' => fields[1],
    'DATA_CLASS' => fields[2].sub(/;/, ''),
    'MOLECULE_TYPE' => molecule_type,
    'SEQUENCE_LENGTH' => length_field.to_i
  }
end
returns a MOLECULE_TYPE in the ID line.
A short-cut for #id_line('MOLECULE_TYPE').
# File lib/bio/db/embl/sptr.rb, line 109
#
# Returns the MOLECULE_TYPE field of the ID line.
# Short-cut for id_line('MOLECULE_TYPE').
def molecule
  id_line('MOLECULE_TYPE')
end
The OH Line; ¶ ↑
OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line
# File lib/bio/db/embl/sptr.rb, line 521
#
# Returns the OH (organism host) line as an Array of Hashes with the keys
# 'NCBI_TaxID' (String of digits) and 'HostName' (String, or nil when no
# name follows the taxonomy id). The result is cached in @data['OH'].
#
# Raises ArgumentError when a section has no "NCBI_TaxID=<digits>;".
def oh
  return @data['OH'] if @data['OH']

  hosts = fetch('OH').split("\. ").map do |section|
    id_match = /NCBI_TaxID=(\d+);/.match(section)
    unless id_match
      raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end

    host_name = section[/NCBI_TaxID=\d+; (.+)/, 1]
    host_name.sub!(/\.$/, '') if host_name

    {'NCBI_TaxID' => id_match[1], 'HostName' => host_name}
  end

  @data['OH'] = hosts
end
returns an Array of Hashes, or a String of the OS line when a key is given.
-
Bio::EMBLDB#os -> Array
[{'name' => '(Human)', 'os' => 'Homo sapiens'}, {'name' => '(Rat)', 'os' => 'Rattus norveticus'}]
-
Bio::EPTR#os -> Hash
{'name' => "(Human)", 'os' => 'Homo sapiens'}
-
#os['name'] -> “(Human)”
-
Bio::EPTR#os(0) -> “Homo sapiens (Human)”
OS Line; organism species (>=1)¶ ↑
OS Genus species (name). OS Genus species (name0) (name1). OS Genus species (name0) (name1). OS Genus species (name0), G s0 (name0), and G s (name0) (name1). OS Homo sapiens (Human), and Rarrus norveticus (Rat) OS Hippotis sp. Clark and Watts 825. OS unknown cyperaceous sp.
# File lib/bio/db/embl/sptr.rb, line 460
#
# Returns the organisms named in the OS line.
#
# Without +num+ an Array of Hashes {'name' => "(Common name)" or nil,
# 'os' => "Genus species"} is returned (cached in @data['OS']). With
# +num+ the entry at that index is returned as a "os name" String.
#
# Raises RuntimeError when a part of the line has no recognisable
# organism name.
def os(num = nil)
  unless @data['OS']
    @data['OS'] = fetch('OS').split(/, and|, /).map do |part|
      organism = /(\w+ *[\w\d \:\\+\-\.]+[\w\d\.])/.match(part)
      raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n" unless organism

      {'name' => part[/(\(.+\))/, 1], 'os' => organism[1]}
    end
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  selected = @data['OS'][num]
  "#{selected['os']} #{selected['name']}"
end
returns a Hash of organism taxonomy cross-references.
OX Line; organism taxonomy cross-reference (>=1 per entry)¶ ↑
OX NCBI_TaxID=1234; OX NCBI_TaxID=1234, 2345, 3456, 4567;
# File lib/bio/db/embl/sptr.rb, line 504
#
# Returns the OX line as a Hash that maps each database name (e.g.
# 'NCBI_TaxID') to an Array of identifier Strings. The result is cached
# in @data['OX'].
def ox
  unless @data['OX']
    xrefs = {}
    fetch('OX').sub(/\.$/, '').split(/;/).each do |item|
      db, refs = item.strip.split(/=/)
      xrefs[db] = refs.split(/, */)
    end
    @data['OX'] = xrefs
  end
  @data['OX']
end
returns the proposed official name of the protein. Returns a String.
Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full name which is taken from the “RecName: Full=” or “SubName: Full=” line, normally at the beginning of the DE lines. Unlike the parser for the old format, there is no special treatment for fragments or precursors.
For old format, the method parses the DE lines and returns the protein name as a String.
DE Line; description (>=1)¶ ↑
"DE #{OFFICIAL_NAME} (#{SYNONYM})" "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]." OFFICIAL_NAME 1/entry SYNONYM >=0 CONTEINS >=0
# File lib/bio/db/embl/sptr.rb, line 251
#
# Returns the proposed official name of the protein as a String.
#
# New format (parse_DE_line_rel14 returns an Array): the "Full" name of
# the first RecName or SubName section that has one, or '' when none
# does. Old format: the text before the first "(" or "[", with
# " (Fragment)" appended when the description mentions a fragment.
def protein_name
  parsed = (@data['DE'] ||= parse_DE_line_rel14(get('DE')))

  if parsed
    # since UniProtKB release 14.0 of 22-Jul-2008
    name = nil
    parsed.each do |section|
      next unless ['RecName', 'SubName'].include?(section[0])
      full = section[1..-1].find { |pair| pair[0] == 'Full' }
      next unless full
      name = full[1]
      break
    end
    return name.to_s
  end

  # old format (before Rel. 13.x)
  name = ""
  de_line = fetch('DE')
  if de_line
    head = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
    name = head[/^[^(]*/].strip
    name << ' (Fragment)' if head =~ /fragment/i
  end
  name
end
returns contents in the R lines.
-
Bio::EMBLDB::Common#ref -> [ <reference information Hash>* ]
where <reference information Hash> is:
{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
R Lines
-
RN RC RP RX RA RT RL RG
# File lib/bio/db/embl/sptr.rb, line 557
#
# Returns the contents of the R (reference) lines as an Array with one
# Hash per reference. Each Hash has the keys 'RN', 'RC', 'RP', 'RX',
# 'RA', 'RT', 'RL' and 'RG'; the text collected for each key is passed
# through the matching set_R* helper. The result is cached in @data['R'].
#
# Raises RuntimeError when a line does not start with a known R tag.
def ref
  unless @data['R']
    # One chunk per reference; every chunk but the first loses its "RN"
    # tag in the split and gets it back below.
    @data['R'] = get('R').split(/\nRN +/).map do |str|
      hash = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
              'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
      str = 'RN ' + str unless /^RN / =~ str

      str.split("\n").each do |line|
        # " +" accepts any amount of padding after the tag, so the value
        # never starts with blanks left over from the line layout.
        if /^(R[NPXARLCTG]) +(.+)/ =~ line
          hash[$1] += $2 + ' '
        else
          raise "Invalid format in R lines, \n[#{line}]\n"
        end
      end

      hash['RN'] = set_RN(hash['RN'])
      hash['RC'] = set_RC(hash['RC'])
      hash['RP'] = set_RP(hash['RP'])
      hash['RX'] = set_RX(hash['RX'])
      hash['RA'] = set_RA(hash['RA'])
      hash['RT'] = set_RT(hash['RT'])
      hash['RL'] = set_RL(hash['RL'])
      hash['RG'] = set_RG(hash['RG'])
      hash
    end
  end
  @data['R']
end
returns Bio::Reference object from Bio::EMBLDB::Common#ref.
# File lib/bio/db/embl/sptr.rb, line 651
#
# Returns a References object built from #ref, with one Reference per
# entry. The result is cached in @data['references'].
#
# 'RA' becomes 'authors' (split on ", "), 'RT' becomes 'title', and each
# 'RX' pair is stored under its lower-cased tag. An 'RL' value of the
# form "Journal VOL (ISSUE), PAGES (YEAR)" is split into 'journal',
# 'volume', 'issue', 'pages' and 'year'; any other 'RL' value is stored
# whole as 'journal'.
def references
  unless @data['references']
    list = ref.map do |entry|
      attrs = Hash.new('')
      entry.each do |key, value|
        case key
        when 'RA'
          attrs['authors'] = value.split(/, /)
        when 'RT'
          attrs['title'] = value
        when 'RL'
          m = /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/.match(value)
          if m
            attrs['journal'], attrs['volume'], attrs['issue'],
              attrs['pages'], attrs['year'] = m.captures
          else
            attrs['journal'] = value
          end
        when 'RX' # PUBMED, MEDLINE, DOI
          value.each { |tag, xref| attrs[tag.downcase] = xref }
        end
      end
      Reference.new(attrs)
    end
    @data['references'] = References.new(list)
  end
  @data['references']
end
returns a Bio::Sequence::AA of the amino acid sequence.
blank Line; sequence data (>=1)
# File lib/bio/db/embl/sptr.rb, line 1306
#
# Returns the amino acid sequence as a Sequence::AA, built from the
# sequence data with blanks and digits removed. Cached in @data[''].
def seq
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
end
returns a SEQUENCE_LENGTH in the ID line.
A short-cut for #id_line('SEQUENCE_LENGTH').
# File lib/bio/db/embl/sptr.rb, line 118
#
# Returns the SEQUENCE_LENGTH field of the ID line.
# Short-cut for id_line('SEQUENCE_LENGTH').
def sequence_length
  id_line('SEQUENCE_LENGTH')
end
# File lib/bio/db/embl/sptr.rb, line 588
#
# Normalises the text collected for the RN line: returns +data+ without
# leading and trailing whitespace.
def set_RN(data)
  data.strip
end
returns a Hash of contents in the SQ lines.
-
Bio::SPTRL#sq -> hsh
returns a value of a key given in the SQ lines.
-
Bio::SPTRL#sq(key) -> int or str
-
Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
'CRC64']
SQ Line; sequence header (1/entry)¶ ↑
SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64; SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
# File lib/bio/db/embl/sptr.rb, line 1278
#
# Returns the contents of the SQ line.
#
# Without +key+ the Hash {'aalen' => Integer, 'MW' => Integer,
# 'CRC64' => String} is returned (cached in @data['SQ']). A key matching
# mw/molecular/weight selects 'MW', a key matching len/length/AA selects
# 'aalen', and any other key is looked up in the Hash directly.
#
# Raises RuntimeError when the SQ line does not have the expected form.
def sq(key = nil)
  unless @data['SQ']
    m = /(\d+) AA\; (\d+) MW; (.+) CRC64;/.match(fetch('SQ'))
    raise "Invalid SQ Line: \n'#{fetch('SQ')}'" unless m

    @data['SQ'] = {'aalen' => m[1].to_i, 'MW' => m[2].to_i, 'CRC64' => m[3]}
  end

  return @data['SQ'] unless key

  case key
  when /mw/, /molecular/, /weight/
    @data['SQ']['MW']
  when /len/, /length/, /AA/
    @data['SQ']['aalen']
  else
    @data['SQ'][key]
  end
end
returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.
Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. For keeping compatibility with old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.
For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.
# File lib/bio/db/embl/sptr.rb, line 294
#
# Returns synonyms (unofficial and/or alternative names) as an Array of
# Strings.
#
# New format: every value of the RecName, SubName and AltName sections
# that differs from protein_name, up to the first Includes/Contains
# section. "EC" values are reported as "EC N.N.N.N"; "Allergen" and
# "CD_antigen" values keep their field name as a "Name=" prefix.
# Old format: every "(...)" group outside "[...]" that does not mention
# a fragment.
def synonyms
  names = []
  parsed = (@data['DE'] ||= parse_DE_line_rel14(get('DE')))

  if parsed
    # since UniProtKB release 14.0 of 22-Jul-2008
    parsed.each do |section|
      category = section[0]
      break if category == 'Includes' || category == 'Contains'
      next unless ['RecName', 'SubName', 'AltName'].include?(category)

      section[1..-1].each do |field|
        value = field[1]
        next unless value && value != protein_name

        names.push(
          case field[0]
          when 'EC'
            "EC " + value
          when 'Allergen', 'CD_antigen'
            field[0] + '=' + value
          else
            value
          end
        )
      end
    end
  else
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    if de_line
      # ignore stuff between [ and ]. That's the "contains" part
      without_contains = de_line.sub(/\[.*\]/, '')
      without_contains.scan(/\([^)]+/) do |synonym|
        next if synonym =~ /fragment/i
        names << synonym[1..-1].strip # index to remove the leading (
      end
    end
  end

  names
end
Private Instance Methods
# File lib/bio/db/embl/sptr.rb, line 913
#
# Parses the bodies of the CC "ALTERNATIVE PRODUCTS" topic.
#
# data:: Array of comment body Strings, or nil when the topic is absent.
#
# Returns nil for nil input, otherwise a Hash with the keys 'Event'
# (Array of Strings, or "" when absent), 'Named isoforms' (String),
# 'Comment' (String) and 'Variants' (Array with one element per
# "Name=...Sequence=...;" group, as built by
# cc_alternative_products_variants).
def cc_alternative_products(data)
  # The guard has to come before join: join never returns nil, and nil
  # input would otherwise raise NoMethodError.
  return data unless data

  ap = data.join('')

  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
  tmp = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
         'Variants' => []}

  if /Event=(.+?);/ =~ ap
    tmp['Event'] = $1.sub(/;/, '').split(/, /)
  end
  if /Named isoforms=(\S+?);/ =~ ap
    tmp['Named isoforms'] = $1
  end
  if /Comment=(.+?);/m =~ ap
    tmp['Comment'] = $1
  end

  ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
    tmp['Variants'] << cc_alternative_products_variants(ent)
  end
  tmp
end
# File lib/bio/db/embl/sptr.rb, line 937
#
# Parses one "Name=...; ...; Sequence=...;" group of the ALTERNATIVE
# PRODUCTS comment.
#
# data:: String of "Key=Value" fields separated by "; ".
#
# Returns a Hash whose 'Sequence', 'Synonyms' and 'IsoId' values are
# Arrays (split on ", ") and whose other values are Strings. Keys that
# do not occur keep their defaults ('' for 'Name', [] for the lists).
def cc_alternative_products_variants(data)
  variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}
  data.split(/; /).each do |field|
    # Limit 2 keeps a value that itself contains '=' in one piece.
    key, value = field.split('=', 2)
    # A fragment without '=' carries no value; skip it instead of
    # crashing or storing a nil entry.
    next if value.nil?

    case key
    when 'Sequence', 'Synonyms', 'IsoId'
      value = value.sub(/;/, '').split(/, /)
    end
    variant[key] = value
  end
  variant
end
# File lib/bio/db/embl/sptr.rb, line 951
#
# Parses the first body of the CC "BIOPHYSICOCHEMICAL PROPERTIES" topic.
#
# data:: Array of comment body Strings (only the first is used), or nil
#        when the topic is absent.
#
# Returns nil for nil input (as cc_mass_spectrometry does), otherwise a
# Hash with the keys 'Absorption' and 'Kinetic parameters' (Hashes) and
# 'pH dependence', 'Redox potential' and 'Temperature dependence'
# (Strings, '' when not present).
def cc_biophysiochemical_properties(data)
  return data unless data

  text = data[0]

  hash = {'Absorption' => {},
          'Kinetic parameters' => {},
          'pH dependence' => "",
          'Redox potential' => "",
          'Temperature dependence' => ""}

  if text =~ /Absorption: Abs\(max\)=(.+?);/
    hash['Absorption']['Abs(max)'] = $1
  end
  if text =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/
    hash['Absorption']['Note'] = $1
  end
  if text =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
    hash['Kinetic parameters']['KM'] = $1
    hash['Kinetic parameters']['Vmax'] = $2
  end
  if text =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/
    hash['Kinetic parameters']['Note'] = $1
  end
  if text =~ /pH dependence: (.+?);/
    hash['pH dependence'] = $1
  end
  if text =~ /Redox potential: (.+?);/
    hash['Redox potential'] = $1
  end
  if text =~ /Temperature dependence: (.+?);/
    hash['Temperature dependence'] = $1
  end
  hash
end
# File lib/bio/db/embl/sptr.rb, line 986
#
# Returns the bodies of the CC "CAUTION" topic joined into one String,
# or nil when +data+ is nil (topic absent), as cc_mass_spectrometry does.
def cc_caution(data)
  return data unless data

  data.join('')
end
returns contents in a line of the CC INTERACTION section.
CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
# File lib/bio/db/embl/sptr.rb, line 995
#
# Parses the bodies of the CC "INTERACTION" topic.
#
# data:: Array of comment body Strings, or nil when the topic is absent.
#
# Returns nil for nil input, otherwise an Array with one Hash per
# "<partner>; NbExp=<n>; IntAct=<ids>;" record. The keys are 'SP_Ac',
# 'identifier', 'NbExp', 'IntAct' (Array of Strings) and
# 'optional_identifier'. For a "Self" partner both 'SP_Ac' and
# 'identifier' are set to entry_id.
def cc_interaction(data)
  return data unless data

  str = data.join('')
  str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/).map do |ent|
    ent.map! { |x| x.strip }
    partner = ent[0]
    spac = spid = optid = nil

    if partner =~ /^(.+):(.+)/
      spac = $1
      spid = $2.split(' ')[0]
    elsif partner =~ /Self/
      spac = entry_id
      spid = entry_id
    end
    # Anything after a blank in "AC:ID extra" is kept as the optional id.
    optid = $1 if partner =~ /^.+:.+ (.+)/

    {'SP_Ac' => spac,
     'identifier' => spid,
     'NbExp' => ent[1],
     'IntAct' => ent[2].split(', '),
     'optional_identifier' => optid}
  end
end
# File lib/bio/db/embl/sptr.rb, line 1023
#
# Parses the bodies of the CC "MASS SPECTROMETRY" topic.
#
#   MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
#
# data:: Array of comment body Strings, or nil when the topic is absent.
#
# Returns nil for nil input, otherwise an Array with one Hash per body.
# The keys are 'MW', 'MW_ERR', 'METHOD', 'RANGE' and 'NOTE'; values are
# Strings, or nil for fields that do not occur.
def cc_mass_spectrometry(data)
  return data unless data

  data.map do |m|
    mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
            'NOTE' => nil}
    # Remove only a terminating period (as cc_pathway does); a body
    # without one keeps its last character.
    m.sub(/\.$/, '').split(/;/).each do |line|
      case line
      when /MW=(.+)/
        mass['MW'] = $1
      when /MW_ERR=(.+)/
        mass['MW_ERR'] = $1
      when /METHOD=(.+)/
        mass['METHOD'] = $1
      when /RANGE=(\d+-\d+)/
        mass['RANGE'] = $1 # RANGE class ?
      when /NOTE=(.+)/
        mass['NOTE'] = $1
      end
    end
    mass
  end
end
# File lib/bio/db/embl/sptr.rb, line 1050
#
# Parses the first body of the CC "PATHWAY" topic.
#
# data:: Array of comment body Strings (only the first is used), or nil
#        when the topic is absent.
#
# Returns nil for nil or empty input, otherwise the first body without
# its trailing period, split on "; ", " and " and ": ".
def cc_pathway(data)
  first = data && data.first
  return nil unless first

  # Only the first body is ever returned, so only that one is parsed.
  first.sub(/\.$/, '').split(/; | and |: /)
end
# File lib/bio/db/embl/sptr.rb, line 1058
#
# Parses the bodies of the CC "RNA EDITING" topic.
#
# data:: Array of comment body Strings, or nil when the topic is absent.
#
# Returns nil for nil input, otherwise a Hash with the keys
# 'Modified_positions' (Array of Strings) and 'Note' (String, "" when
# there is no note).
#
# Raises ArgumentError when the text has no "Modified_positions=" field.
def cc_rna_editing(data)
  return data unless data

  text = data.join('')
  entry = {'Modified_positions' => [], 'Note' => ""}

  if text =~ /Modified_positions=(.+?)(\.|;)/
    entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
  else
    raise ArgumentError, "Invarid CC RNA Editing lines (#{entry_id}):#{$!}\n#{get('CC')}"
  end

  entry['Note'] = $1 if text =~ /Note=(.+)/
  entry
end
# File lib/bio/db/embl/sptr.rb, line 1074
#
# Parses the first body of the CC "SUBCELLULAR LOCATION" topic.
#
# data:: Array of comment body Strings (only the first is used), or nil
#        when the topic is absent.
#
# Returns nil for nil or empty input, otherwise an Array with one Array
# per ". "-separated sentence, each holding its "; "-separated parts
# with a trailing period removed.
def cc_subcellular_location(data)
  first = data && data.first
  return nil unless first

  # Only the first body is ever returned, so only that one is parsed.
  first.split('. ').map do |sentence|
    sentence.split('; ').map { |part| part.sub(/\.$/, '') }
  end
end
# File lib/bio/db/embl/sptr.rb, line 1092
#
# Parses the bodies of the CC "WEB RESOURCE" topic.
#
# data:: Array of comment body Strings, or nil when the topic is absent.
#
# Returns nil for nil input, otherwise an Array with one Hash per body.
# The keys are 'Name', 'Note' and 'URL'; values are Strings, or nil for
# fields that do not occur. Upper-case NAME=/NOTE= fields are stored
# under 'Name'/'Note' as well.
def cc_web_resource(data)
  return data unless data

  data.map do |body|
    entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
    body.split(';').each do |field|
      case field
      when /(Name|Note)\=(.+)/
        key = $1
        val = $2.strip
        entry[key] = val
      when /(NAME|NOTE)\=(.+)/
        key = $1.downcase.capitalize
        val = $2.strip
        entry[key] = val
      when /URL\=\"(.+)\"/
        entry['URL'] = $1.strip
      end
    end
    entry
  end
end
returns contents in the old style GN line.
GN Line: Gene name(s) (>=0, optional)¶ ↑
GN HNS OR DRDX OR OSMZ OR BGLY. GN CECA1 AND CECA2. GN CECA1 AND (HOGE OR FUGA). GN NAME1 [(AND|OR) NAME]+.
#gn -> Array # AND
#gn[0] -> Array # OR #gene_names -> Array
# File lib/bio/db/embl/sptr.rb, line 375
#
# Parses an old style GN line and stores the result in @data['GN'].
#
# Returns an Array (groups separated by " AND ") of Arrays (names
# separated by " OR "), with parentheses removed and each name stripped.
# An entry without GN lines gives an empty Array.
def gn_old_parser
  groups = []
  if get('GN').size > 0
    groups = fetch('GN').sub(/\.$/, '').split(/ AND /).map do |group|
      group.gsub(/\(|\)/, '').split(/ OR /).map { |name| name.strip }
    end
  end
  @data['GN'] = groups
end
returns contents in the structured GN line. The new format of the GN line is:
GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...]; GN ORFNames=[, ...];
-
#gn -> [ <gene record>* ]
where <gene record> is:
{ :name => '...', :synonyms => [ 's1', 's2', ... ], :loci => [ 'l1', 'l2', ... ], :orfs => [ 'o1', 'o2', ... ] }
# File lib/bio/db/embl/sptr.rb, line 401
#
# Parses a structured GN line and stores the result in @data['GN'].
#
#   GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
#   GN ORFNames=[, ...];
#
# Returns an Array of gene records, each a Hash with the keys :name
# (String), :synonyms, :loci and :orfs (Arrays of Strings). Gene records
# are separated by the word "and" standing in front of the next
# "Key=" field.
def gn_uniprot_parser
  @data['GN'] = []
  gn_line = fetch('GN').strip

  # Split only on a stand-alone "and" that is followed by a "Key=" field,
  # so that gene names containing the letters "and" stay intact.
  records = gn_line.split(/\s*\band\s+(?=\w+=)/)

  records.each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    record.each_line(';') do |element|
      key = case element
            when /Name=/ then :name
            when /Synonyms=/ then :synonyms
            when /OrderedLocusNames=/ then :loci
            when /ORFNames=/ then :orfs
            end
      next unless key

      # $' is the text after the "Key=" that matched; drop the terminating
      # ';' only when it is there, so an unterminated last field is not
      # shortened by one character.
      value = $'.sub(/;\z/, '')
      gene_hash[key] = (key == :name) ? value : value.split(/\s*,\s*/)
    end
    @data['GN'] << gene_hash
  end
  @data['GN']
end
(private) parses DE line (description lines) since UniProtKB release 14.0 of 22-Jul-2008
Return array containing array.
www.uniprot.org/docs/sp_news.htm
# File lib/bio/db/embl/sptr.rb, line 177
#
# (private) Parses DE (description) lines in the format used since
# UniProtKB release 14.0 of 22-Jul-2008.
#
# str:: the raw DE lines, each starting with the "DE" tag.
#
# Returns nil when +str+ is not in the new format (no RecName, AltName
# or SubName line). Otherwise returns an Array of Arrays:
#   ['RecName' | 'AltName' | 'SubName', [subcategory, text], ...]
#   ['Includes'] or ['Contains']
#   ['Flags', [flag, ...]]
#
# Lines in an unknown format are skipped with a warning.
def parse_DE_line_rel14(str)
  # Returns if it is not the new format since Rel.14.
  # " +" accepts any amount of padding after the tag, like the other
  # patterns below.
  return nil unless /^DE +(RecName|AltName|SubName)\: / =~ str

  ret = []
  cur = nil
  str.each_line do |line|
    case line
    when /^DE +(Includes|Contains)\: *$/
      ret.push [ $1 ]
      # Following sub-lines must open their own category.
      cur = nil
      next
    when /^DE *(RecName|AltName|SubName)\: +(.*)/
      category = $1
      subcat_and_desc = $2
      cur = [ category ]
      ret.push cur
    when /^DE *(Flags)\: +(.*)/
      category = $1
      desc = $2
      flags = desc.strip.split(/\s*\;\s*/) || []
      ret.push [ category, flags ]
      cur = nil
      next
    when /^DE *(.*)/
      # Continuation line: belongs to the current category.
      subcat_and_desc = $1
    else
      warn "Warning: skipped DE line in unknown format: #{line.inspect}"
      next
    end

    case subcat_and_desc
    when nil
      # does nothing
    when /\A([^\=]+)\=(.*)/
      subcat = $1
      desc = $2
      desc.sub!(/\;\s*\z/, '')
      unless cur
        warn "Warning: unknown category in DE line: #{line.inspect}"
        cur = [ '' ]
        ret.push cur
      end
      cur.push [ subcat, desc ]
    else
      warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
    end
  end
  ret
end
# File lib/bio/db/embl/sptr.rb, line 626
#
# Normalises the text collected for the RA (authors) line: returns
# +data+ without the terminating ";" and any blanks after it.
def set_RA(data)
  data.sub(/; *$/, '')
end
# File lib/bio/db/embl/sptr.rb, line 592
#
# Parses the text collected for the RC (reference comment) line.
#
# data:: String of "TOKEN=text;" items whose token starts with S, T or P
#        (e.g. STRAIN, TISSUE, PLASMID).
#
# Returns an Array of {'Token' => token, 'Text' => text} Hashes, one per
# text; a text list separated by ", " or ", and " gives one Hash per
# item.
def set_RC(data)
  # The text is matched non-greedily so that every "TOKEN=...;" item is
  # found on its own instead of being swallowed by the first token.
  data.scan(/([STP]\w+)=(.+?);/).map { |token, texts|
    texts.split(/, and |, /).map { |text|
      {'Token' => token, 'Text' => text}
    }
  }.flatten
end
# File lib/bio/db/embl/sptr.rb, line 642
#
# Splits the text collected for the RG (reference group) line on "; "
# and returns the parts as an Array of Strings.
def set_RG(data)
  data.split('; ')
end
# File lib/bio/db/embl/sptr.rb, line 637
#
# Normalises the text collected for the RL (reference location) line:
# returns +data+ without leading and trailing whitespace.
def set_RL(data)
  data.strip
end
# File lib/bio/db/embl/sptr.rb, line 601
#
# Parses the text collected for the RP (reference position) line.
#
# Returns an Array of Strings: +data+ without surrounding whitespace and
# trailing period, split on ", " and ", AND " (case-insensitive), with
# each item stripped and runs of blanks reduced to a single blank.
def set_RP(data)
  text = data.strip.sub(/\.$/, '')
  text.split(/, AND |, /i).map do |item|
    # squeeze collapses repeated blanks left over from joined lines.
    item.strip.squeeze(' ')
  end
end
# File lib/bio/db/embl/sptr.rb, line 631
#
# Normalises the text collected for the RT (reference title) line:
# removes the terminating ";" (and blanks after it) and then a double
# quote at the start and at the end.
def set_RT(data)
  data.sub(/; *$/, '').gsub(/(^"|"$)/, '')
end
# File lib/bio/db/embl/sptr.rb, line 611
#
# Parses the text collected for the RX (reference cross-reference) line.
#
# Returns a Hash with the keys 'MEDLINE', 'PubMed' and 'DOI'; each value
# is the text between "KEY=" and the next ";", or nil when the key does
# not occur.
def set_RX(data)
  rx = {'MEDLINE' => nil, 'PubMed' => nil, 'DOI' => nil}
  rx.keys.each do |db|
    rx[db] = $1 if data =~ /#{db}=(.+?);/
  end
  rx
end