Object
Determine the method to use to measure String length in bytes, because StringScanner#pos can only be set in bytes.
In Ruby 1.8, String#length always returns the string length in bytes.
In Ruby 1.9+ String#length returns string length in characters and we need to use String#bytesize instead.
Some keywords can be followed by regular expressions (e.g., return and throw). Others can be followed by division.
The first 6 are always reserved in ECMAScript 5.1; the others are only reserved in strict mode. See www.ecma-international.org/ecma-262/5.1/#sec-7.6.1.2 and developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Reserved_Words
JavaScript whitespace can consist of any Unicode space separator characters.
In Ruby 1.9+ we can just use the [[:space:]] character class and match them all.
In Ruby 1.8 we need a regex that identifies the specific bytes in UTF-8 text.
# Builds the tokenizer's lexeme table.
#
# Registers each token type via #token (defined elsewhere in this class),
# keyed by the first character(s) that can start the lexeme: comments,
# strings, ASCII whitespace (:S), floating-point and integer/hex/octal
# numbers, identifiers/keywords/reserved words (via the KEYWORDS and
# RESERVED tables), regexp literals, multi-character punctuators (from
# LITERALS), and single-character fallbacks.
#
# NOTE(review): this listing is RDoc-extracted and whitespace-collapsed;
# the :REGEXP pattern's delimiter appears here as %{ ... } but in the
# actual source is presumably an extended-mode regexp literal (%r{...}x)
# spanning multiple lines — verify against lib/rkelly/tokenizer.rb.
# NOTE(review): the :NUMBER handlers call eval on the matched text; the
# patterns restrict input to numeric forms, but Integer()/Float() would
# be the safer conversion — worth confirming upstream.
# File lib/rkelly/tokenizer.rb, line 114 def initialize(&block) @lexemes = Hash.new {|hash, key| hash[key] = [] } token(:COMMENT, /\/(?:\*(?:.)*?\*\/|\/[^\n]*)/, ['/']) token(:STRING, /"(?:[^"\\]*(?:\\.[^"\\]*)*)"|'(?:[^'\\]*(?:\\.[^'\\]*)*)'/, ["'", '"']) # Matcher for basic ASCII whitespace. # (Unicode whitespace is handled separately in #match_lexeme) # # Can't use just "\s" in regex, because in Ruby 1.8 this # doesn't include the vertical tab "\v" character token(:S, /[ \t\r\n\f\v]*/, [" ", "\t", "\r", "\n", "\f", "\v"]) # A regexp to match floating point literals (but not integer literals). digits = ('0'..'9').to_a token(:NUMBER, /\d+\.\d*(?:[eE][-+]?\d+)?|\d+(?:\.\d*)?[eE][-+]?\d+|\.\d+(?:[eE][-+]?\d+)?/, digits+['.']) do |type, value| value.gsub!(/\.(\D)/, '.0\1') if value =~ /\.\w/ value.gsub!(/\.$/, '.0') if value =~ /\.$/ value.gsub!(/^\./, '0.') if value =~ /^\./ [type, eval(value)] end token(:NUMBER, /0[xX][\da-fA-F]+|0[0-7]*|\d+/, digits) do |type, value| [type, eval(value)] end word_chars = ('a'..'z').to_a + ('A'..'Z').to_a + ['_', '$'] token(:RAW_IDENT, /([_\$A-Za-z][_\$0-9A-Za-z]*)/, word_chars) do |type,value| if KEYWORDS[value] [KEYWORDS[value], value] elsif RESERVED[value] [:RESERVED, value] else [:IDENT, value] end end # To distinguish regular expressions from comments, we require that # regular expressions start with a non * character (ie, not look like # /*foo*/). Note that we can't depend on the length of the match to # correctly distinguish, since `/**/i` is longer if matched as a regular # expression than as matched as a comment. # Incidentally, we're also not matching empty regular expressions # (eg, // and //g). Here we could depend on match length and priority to # determine that these are actually comments, but it turns out to be # easier to not match them in the first place. token(:REGEXP, %{ / (?# beginning ) (?: [^\r\n\[/\\]+ (?# any char except \r \n [ / \ ) | \\ [^\r\n] (?# escape sequence ) | \[ (?:[^\]\\]|\\.)* \] (?# [...] 
can contain any char including / ) (?# only \ and ] have to be escaped here ) )+ /[gim]* (?# ending + modifiers ) }, ['/']) literal_chars = LITERALS.keys.map {|k| k.slice(0,1) }.uniq literal_regex = Regexp.new(LITERALS.keys.sort_by { |x| x.length }.reverse.map { |x| "#{x.gsub(/([|+*^])/, '\\\\\1')}" }.join('|')) token(:LITERALS, literal_regex, literal_chars) do |type, value| [LITERALS[value], value] end symbols = ('!'..'/').to_a + (':'..'@').to_a + ('['..'^').to_a + ['`'] + ('{'..'~').to_a token(:SINGLE_CHAR, /./, symbols) do |type, value| [value, value] end end
# File lib/rkelly/tokenizer.rb, line 192
# Scans +string+ and returns the flat array of raw tokens.
#
# At each position the longest matching lexeme is obtained from
# #match_lexeme. Whether a '/' may begin a regexp literal depends on the
# previous significant token, so the flag is updated from
# #followable_by_regex for every token except whitespace (:S). The
# scanner position is advanced in bytes, because StringScanner#pos can
# only be set in bytes; each token is also stamped with its character
# range via CharRange.
def raw_tokens(string)
  scanner = StringScanner.new(string)
  result = []
  current_range = CharRange::EMPTY
  regexp_allowed = true
  until scanner.eos?
    tok = match_lexeme(scanner, regexp_allowed)
    # Whitespace does not change what may legally follow.
    regexp_allowed = followable_by_regex(tok) unless tok.name == :S
    # Advance by byte length (String#length on 1.8, String#bytesize on 1.9+).
    scanner.pos += tok.value.send(BYTESIZE_METHOD)
    current_range = current_range.next(tok.value)
    tok.range = current_range
    result << tok
  end
  result
end
Generated with the Darkfish Rdoc Generator 2.