class Prawn::Format::Lexer

The Lexer class is used by the formatting subsystem to scan a string and extract tokens from it. The tokens it looks for are either text, XML entities, or XML tags.

Note that the lexer only scans for a subset of XML–it is not a true XML scanner, and understands just enough to provide a basic markup language for use in formatting documents.

The subset includes only XML entities and tags–instructions, comments, and the like are not supported.

Constants

ENTITY_MAP

Attributes

verbatim[RW]

Controls whether whitespace is lexed verbatim or not. If not, adjacent whitespace is compressed into a single space character (this includes newlines).

Public Class Methods

new(text) click to toggle source

Create a new lexer that will scan the given text. The text must be UTF-8 encoded, and must consist of well-formed XML in the subset understood by the lexer.

# File lib/prawn/format/lexer.rb, line 30
# Sets up a lexer over +text+.
#
# The text is wrapped in a StringScanner, the state machine begins in the
# :start state, and verbatim whitespace handling is initially disabled.
def initialize(text)
  @state, @verbatim = :start, false
  @scanner = StringScanner.new(text)
end

Public Instance Methods

each() click to toggle source

Iterates over each token in the string, until the end of the string is reached. Each token is yielded. See next for a discussion of the available token types.

# File lib/prawn/format/lexer.rb, line 64
# Yields each remaining token to the given block, stopping as soon as #next
# returns +nil+ (or +false+).
def each
  # +next+ is also a Ruby keyword, so the method has to be invoked with an
  # explicit receiver; a bare +next+ would parse as the loop-control
  # statement rather than a method call.
  while (token = self.next)
    yield token
  end
end

private

  # Hands control to the scanning routine that matches the current value of
  # @state. Evaluates to nil when @state is neither :start nor :self_close.
  def scan_next_token
    if @state == :start
      scan_start_state
    elsif @state == :self_close
      scan_self_close_state
    end
  end

  # Two alternative definitions of scan_other_text follow. Which one is
  # defined is decided once, when this code is evaluated, by comparing
  # RUBY_VERSION (a string comparison) against "1.9.0".
  if RUBY_VERSION >= "1.9.0"
    # Consumes a run of one or more characters, none of which is a hyphen,
    # whitespace, "<", "&", or matched by the \xE2\x80\x94 escape (the UTF-8
    # byte sequence of an em dash). Returns the consumed text, or nil when
    # nothing matches at the current position.
    def scan_other_text
      @scanner.scan(/[^-\xE2\x80\x94\s<&]+/)
    end
  else
    # Scans forward to the next hyphen, whitespace, "<", "&" or em dash and
    # returns the text in front of it, leaving that delimiter unconsumed.
    # Returns nil at end of input, or when the scanned result is exactly "<"
    # or "&". When no delimiter remains, the rest of the input is returned
    # and the scanner is moved to the end.
    def scan_other_text
      return nil if @scanner.eos?

      # result is everything up to and including the delimiter that matched.
      result = @scanner.scan_until(/[-\s<&]|\xE2\x80\x94/)
      if result
        # Step back over the delimiter so that it is scanned separately.
        @scanner.pos -= @scanner.matched.length
        # The delimiter was the very first thing scanned: no text precedes it.
        return nil if result == "<" || result == "&"
        # Strip the trailing delimiter off the returned text.
        return result[0,result.length - @scanner.matched.length]
      else
        # No delimiter anywhere ahead: the remainder is a single chunk.
        result = @scanner.rest
        @scanner.terminate
        return result
      end
    end
  end
error(message) click to toggle source
# File lib/prawn/format/lexer.rb, line 216
# Raises InvalidFormat with +message+, followed by the scanner's current
# position and the first 50 characters of the inspected remaining input.
def error(message)
  position = @scanner.pos
  preview = @scanner.rest.inspect[0, 50]
  raise InvalidFormat, "#{message} at #{position} -> #{preview}..."
end
next() click to toggle source

Returns the next token from the scanner. If the end of the string has been reached, this will return nil. Otherwise, the token itself is returned as a hash. The hash will always include a :type key, identifying the type of the token. It will be one of :text, :open, or :close.

For :text tokens, the hash will also contain a :text key, which will point to an array of strings. Each element of the array contains either a word, whitespace, or some other character at which the line may be broken.

For :open tokens, the hash will contain a :tag key which identifies the name of the tag (as a symbol), and an :options key, which is another hash that contains the options that were given with the tag.

For :close tokens, the hash will contain only a :tag key.

# File lib/prawn/format/lexer.rb, line 53
# Produces the next token, or nil once the lexer is in the :start state and
# the scanner has reached the end of its input.
def next
  return nil if @state == :start && @scanner.eos?

  scan_next_token
end
scan_end_tag() click to toggle source
# File lib/prawn/format/lexer.rb, line 204
# Scans the remainder of a closing tag; the scanner is expected to sit just
# after the "</" that introduced it.
#
# Returns a hash of the form { :type => :close, :tag => <symbol> }. The tag
# name is downcased, mirroring scan_open_tag, so that an element's opening
# and closing tokens carry the same symbol regardless of letter case.
#
# Reports a problem through #error when no tag name follows "</", or when
# the tag is not terminated by ">".
def scan_end_tag
  # Check for a name first: calling to_sym on a failed scan (nil) would
  # raise NoMethodError instead of a proper lexer error.
  name = @scanner.scan(/\w+/) or error("'</' without valid tag")
  tag = name.downcase.to_sym
  @scanner.skip(/\s*/)
  @scanner.scan(/>/) or error("unclosed ending tag #{tag.inspect}")
  { :type => :close, :tag => tag }
end
scan_entity() click to toggle source
# File lib/prawn/format/lexer.rb, line 156
# Scans an entity reference; the scanner is expected to sit just after the
# "&" that introduced it.
#
# Three forms are handled:
# * decimal character references such as "#233;"
# * hexadecimal character references such as "#xE9;" (digits in either case)
# * named entities, looked up in ENTITY_MAP
#
# Returns a :text token whose :text array holds the single decoded string.
# Reports a problem through #error when the entity is malformed, is not
# terminated by ";", or is not a known name.
def scan_entity
  entity = @scanner.scan(/(?:#x?)?\w+/) or error("bad format for entity")
  @scanner.scan(/;/) or error("missing semicolon to terminate entity")

  # The patterns are anchored so that the whole reference has to be numeric;
  # otherwise something like "#12ab" would quietly decode as code point 12.
  text = case entity
    when /\A#(\d+)\z/ then [$1.to_i].pack("U*")
    when /\A#x([0-9a-f]+)\z/i then [$1.to_i(16)].pack("U*")
    else
      result = ENTITY_MAP[entity] or error("unrecognized entity #{entity.inspect}")
      # Hand out a copy so callers cannot modify the string held in the map.
      result.dup
    end

  { :type => :text, :text => [text] }
end
scan_next_text_chunk() click to toggle source
# File lib/prawn/format/lexer.rb, line 118
# Scans one chunk of text, choosing the verbatim or the non-verbatim
# routine according to the @verbatim flag.
def scan_next_text_chunk
  @verbatim ? scan_verbatim_text_chunk : scan_nonverbatim_text_chunk
end
scan_nonverbatim_text_chunk() click to toggle source
# File lib/prawn/format/lexer.rb, line 113
# Scans one chunk with whitespace compression: any run of whitespace is
# consumed and reported as a single space; everything else is delegated to
# scan_text_chunk.
def scan_nonverbatim_text_chunk
  return " " if @scanner.scan(/\s+/)

  scan_text_chunk
end
scan_open_tag() click to toggle source
# File lib/prawn/format/lexer.rb, line 171
# Scans the remainder of an opening tag; the scanner is expected to sit just
# after the "<" that introduced it.
#
# The tag name and every option name are downcased and turned into symbols.
# Option values may be wrapped in single or double quotes, or left bare, in
# which case the value runs up to the next whitespace or ">".
#
# Returns { :type => :open, :tag => <symbol>, :options => <hash> }. For a
# self-closing tag ("<br/>") the lexer also records the tag and switches to
# the :self_close state; otherwise the state is set to :start.
#
# Reports a problem through #error when the tag name is missing, an option
# has no "=", a quoted value is not terminated, or the closing ">" is absent.
def scan_open_tag
  tag = @scanner.scan(/\w+/) or error("'<' without valid tag")
  tag = tag.downcase.to_sym

  options = {}
  @scanner.skip(/\s*/)
  # check looks ahead by pattern without consuming anything and is nil at end
  # of input. Inspecting a single byte with peek(1) would slice a multibyte
  # character in half and hand an invalid string to the regexp match.
  while @scanner.check(/\w/)
    name = @scanner.scan(/\w+/)
    @scanner.scan(/\s*=\s*/) or error("expected assignment after option #{name}")
    if (delim = @scanner.scan(/['"]/))
      # Quoted value: everything up to the matching quote character.
      value = @scanner.scan(/[^#{delim}]*/)
      @scanner.scan(/#{delim}/) or error("expected option value to end with #{delim}")
    else
      # Bare value: everything up to whitespace or the end of the tag.
      value = @scanner.scan(/[^\s>]*/)
    end
    options[name.downcase.to_sym] = value
    @scanner.skip(/\s*/)
  end

  if @scanner.scan(%r(/))
    # Remember the tag so that the :self_close state can report its closing.
    @self_close = true
    @tag = tag
    @state = :self_close
  else
    @self_close = false
    @state = :start
  end

  @scanner.scan(/>/) or error("unclosed tag #{tag.inspect}")

  { :type => :open, :tag => tag, :options => options }
end
scan_other_text() click to toggle source
# File lib/prawn/format/lexer.rb, line 84
# Scans a run of ordinary text: everything in front of the next hyphen,
# whitespace character, "<", "&" or em dash. That delimiter itself is left
# unconsumed.
#
# Returns the text, or nil when no text can be consumed at the current
# position (end of input, or a delimiter comes first). When no delimiter
# remains, the rest of the input is returned and the scanner is moved to the
# end.
def scan_other_text
  return nil if @scanner.eos?

  consumed = @scanner.scan_until(/[-\s<&]|\xE2\x80\x94/)
  unless consumed
    remainder = @scanner.rest
    @scanner.terminate
    return remainder
  end

  delimiter = @scanner.matched
  # The scanner position counts bytes, so rewind by the byte size of the
  # match; a character count would land inside a multibyte delimiter.
  @scanner.pos -= @scanner.matched_size
  text = consumed[0, consumed.length - delimiter.length]
  # An empty prefix means nothing was consumed. Report nil, not "", because
  # an empty string is truthy and the caller's collection loop would never
  # end.
  text.empty? ? nil : text
end
scan_self_close_state() click to toggle source
# File lib/prawn/format/lexer.rb, line 211
# Emits the :close token for the tag remembered in @tag and puts the lexer
# back into the :start state.
def scan_self_close_state
  closing = { :type => :close, :tag => @tag }
  @state = :start
  closing
end
scan_start_state() click to toggle source
# File lib/prawn/format/lexer.rb, line 126
# Scans one token from the :start state.
#
# "<" introduces a tag (a closing one when followed by "/"), "&" introduces
# an entity, and anything else is gathered, chunk by chunk, into a single
# :text token until no further chunk can be scanned.
def scan_start_state
  if @scanner.scan(/</)
    return @scanner.scan(%r(/)) ? scan_end_tag : scan_open_tag
  end
  return scan_entity if @scanner.scan(/&/)

  pieces = []
  while (chunk = scan_next_text_chunk)
    pieces << chunk
  end
  { :type => :text, :text => pieces }
end
scan_text_chunk() click to toggle source
# File lib/prawn/format/lexer.rb, line 100
# Scans one chunk of non-whitespace text: a single hyphen, a single em dash,
# or, failing both, whatever scan_other_text yields.
def scan_text_chunk
  hyphen = @scanner.scan(/-/)
  return hyphen if hyphen

  mdash = @scanner.scan(/\xe2\x80\x94/)
  return mdash if mdash

  scan_other_text
end
scan_verbatim_text_chunk() click to toggle source
# File lib/prawn/format/lexer.rb, line 106
# Scans one chunk while preserving whitespace exactly as written: a single
# line break (CRLF, CR or LF), a single tab, a run of spaces, or a single
# form feed / vertical tab. Anything else is delegated to scan_text_chunk.
def scan_verbatim_text_chunk
  @scanner.scan(/\r\n|\r|\n/) || # newline
  @scanner.scan(/\t/)         || # tab
  @scanner.scan(/ +/)         || # spaces
  # \s also covers form feed and vertical tab, which the text scanners treat
  # as delimiters; consume them here so the lexer cannot stall on them.
  @scanner.scan(/[\f\v]/)     || # other whitespace
  scan_text_chunk
end