class Rouge::Lexer

@abstract A lexer transforms text into a stream of `[token, chunk]` pairs.

Public Class Methods

aliases(*args) click to toggle source

Used to specify alternate names this lexer class may be found by.

@example

class Erb < Lexer
  tag 'erb'
  aliases 'eruby', 'rhtml'
end

Lexer.find('eruby') # => Erb
# File lib/rouge/lexer.rb, line 274
# Records alternate lookup names for this lexer class.
#
# Every argument is converted with +to_s+, registered through
# Lexer.register against +self+, and appended to this class's alias list.
#
# @param args [Array<#to_s>] the alternate names
# @return [Array<String>] all aliases accumulated on this class so far
def aliases(*args)
  names = args.map(&:to_s)
  names.each { |name| Lexer.register(name, self) }

  @aliases ||= []
  @aliases.concat(names)
end
all() click to toggle source

@return a list of all lexers.

# File lib/rouge/lexer.rb, line 100
# Lists every registered lexer exactly once, even when a lexer is stored in
# the registry under several names.
#
# @return [Array] the distinct values of the registry
def all
  registered = registry.values
  registered.uniq
end
analyze_text(text) click to toggle source

@abstract

Return a number between 0 and 1 indicating the likelihood that the text given should be lexed with this lexer. The default implementation returns 0. Values under 0.5 will only be used to disambiguate filename or mimetype matches.

@param [TextAnalyzer] text

the text to be analyzed, with a couple of handy methods on it,
like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
# File lib/rouge/lexer.rb, line 429
# Default text-analysis hook: ignores +text+ and always reports 0.
#
# @param text the text to analyse (unused by this default implementation)
# @return [Integer] always 0
def self.analyze_text(text)
  0
end
assert_utf8!(str) click to toggle source

@private

# File lib/rouge/lexer.rb, line 301
# Guards against strings in an unsupported encoding.
#
# Strings whose encoding is named US-ASCII, UTF-8 or ASCII-8BIT are accepted.
#
# @param str [String] the string to check
# @return [nil] when the encoding is accepted
# @raise [EncodingError] for any other encoding; the message lists all names
#   of the offending encoding
def assert_utf8!(str)
  encoding = str.encoding
  accepted = %w(US-ASCII UTF-8 ASCII-8BIT)

  unless accepted.include?(encoding.name)
    message = "Bad encoding: #{encoding.names.join(',')}. " \
              "Please convert your string to UTF-8."
    raise EncodingError, message
  end
end
default_options(o={}) click to toggle source
# File lib/rouge/lexer.rb, line 22
# Reads and/or extends the default options stored on this class.
#
# @param o [Hash] options to merge into the stored defaults (may be empty)
# @return [Hash] the stored defaults hash itself (not a copy), after merging
def default_options(o={})
  defaults = (@default_options ||= {})
  defaults.merge!(o)
end
demo(arg=:absent) click to toggle source

Specify or get a small demo string for this lexer

# File lib/rouge/lexer.rb, line 93
# Specify or get a small demo string for this lexer.
#
# With an argument, stores it as the demo string. Without one, returns the
# stored demo; the file named by +demo_file+ is only read (as UTF-8) when no
# demo string is stored yet, and the result is then cached. Previously the
# getter re-read the file on every call and overwrote an explicitly
# specified demo.
#
# @param arg [String, :absent] the demo string to store; omit to read
# @return [String] the demo string
def demo(arg=:absent)
  return @demo = arg unless arg == :absent

  @demo ||= File.read(demo_file, encoding: 'utf-8')
end
demo_file(arg=:absent) click to toggle source

Specify or get the path name containing a small demo for this lexer (can be overridden by {demo}).

# File lib/rouge/lexer.rb, line 86
# Specify or get the path of the file holding a small demo for this lexer.
#
# With an argument, stores it wrapped in a Pathname. Without one, returns the
# stored path, falling back to <this file's directory>/demos/<tag> only when
# no path is stored yet. Previously the getter always recomputed the default
# and overwrote an explicitly specified path.
#
# @param arg [String, Pathname, :absent] the path to store; omit to read
# @return [Pathname] the demo file path
def demo_file(arg=:absent)
  return @demo_file = Pathname.new(arg) unless arg == :absent

  @demo_file ||= Pathname.new(__FILE__).dirname.join('demos', tag)
end
desc(arg=:absent) click to toggle source

Specify or get this lexer's description.

# File lib/rouge/lexer.rb, line 76
# Specify or get this lexer's description.
#
# @param arg [Object, :absent] the description to store; omit to read
# @return the stored description (nil if none has been stored)
def desc(arg=:absent)
  return @desc if arg == :absent

  @desc = arg
end
filenames(*fnames) click to toggle source

Specify a list of filename globs associated with this lexer.

@example

class Ruby < Lexer
  filenames '*.rb', '*.ruby', 'Gemfile', 'Rakefile'
end
# File lib/rouge/lexer.rb, line 286
# Adds filename globs to the list kept on this lexer class.
#
# @param fnames [Array<String>] globs to append (may be empty)
# @return [Array<String>] every glob recorded so far
def filenames(*fnames)
  @filenames ||= []
  @filenames.concat(fnames)
end
find(name) click to toggle source

Given a string, return the correct lexer class.

# File lib/rouge/lexer.rb, line 29
# Looks a lexer up in the registry by name.
#
# @param name [#to_s] a registered name; converted with +to_s+ before lookup
# @return the registry entry for that name, or nil when there is none
def find(name)
  key = name.to_s
  registry[key]
end
find_fancy(str, code=nil) click to toggle source

Find a lexer, with fancy shiny features.

  • The string you pass can include CGI-style options

    Lexer.find_fancy('erb?parent=tex')
    
  • You can pass the special name 'guess' so we guess for you, and you can pass a second argument of the code to guess by

    Lexer.find_fancy('guess', "#!/bin/bash\necho Hello, world")
    

This is used in the Redcarpet plugin as well as Rouge's own markdown lexer for highlighting internal code blocks.

# File lib/rouge/lexer.rb, line 47
def find_fancy(str, code=nil)
  name, opts = str ? str.split('?', 2) : [nil, '']

  # parse the options hash from a cgi-style string
  opts = CGI.parse(opts || '').map do |k, vals|
    [ k.to_sym, vals.empty? ? true : vals[0] ]
  end

  opts = Hash[opts]

  lexer_class = case name
  when 'guess', nil
    self.guess(:source => code, :mimetype => opts[:mimetype])
  when String
    self.find(name)
  end

  lexer_class && lexer_class.new(opts)
end
guess(info={}) click to toggle source

Guess which lexer to use based on a hash of info.

@option info :mimetype

A mimetype to guess by

@option info :filename

A filename to guess by

@option info :source

The source itself, which, if guessing by mimetype or filename
fails, will be searched for shebangs, <!DOCTYPE ...> tags, and
other hints.

@see ::analyze_text @see ::guesses

# File lib/rouge/lexer.rb, line 153
# Picks a single lexer from the candidates produced by +guesses+.
#
# @param info [Hash] passed straight through to +guesses+
# @return Lexers::PlainText when there are no candidates, otherwise the sole
#   candidate
# @raise [AmbiguousGuess] when more than one candidate remains
def guess(info={})
  candidates = guesses(info)

  case candidates.size
  when 0 then Lexers::PlainText
  when 1 then candidates[0]
  else raise AmbiguousGuess.new(candidates)
  end
end
guess_by_filename(fname) click to toggle source
# File lib/rouge/lexer.rb, line 166
# Shorthand for +guess+ with only the :filename hint.
#
# @param fname [String] the filename to guess by
# @return whatever +guess+ returns
def guess_by_filename(fname)
  guess(:filename => fname)
end
guess_by_mimetype(mt) click to toggle source
# File lib/rouge/lexer.rb, line 162
# Shorthand for +guess+ with only the :mimetype hint.
#
# @param mt [String] the mimetype to guess by
# @return whatever +guess+ returns
def guess_by_mimetype(mt)
  guess(:mimetype => mt)
end
guess_by_source(source) click to toggle source
# File lib/rouge/lexer.rb, line 170
# Shorthand for +guess+ with only the :source hint.
#
# @param source the source text to guess by
# @return whatever +guess+ returns
def guess_by_source(source)
  guess(:source => source)
end
guesses(info={}) click to toggle source

Guess which lexer to use based on a hash of info.

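This accepts the same arguments as ::guess, but never raises AmbiguousGuess: it returns a (possibly empty) list of potential lexers to use. Note that a `:source` that is neither a String nor readable, or that is in an unsupported encoding, can still raise an error.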

# File lib/rouge/lexer.rb, line 109
# Narrows the registered lexers down using the hints in +info+.
#
# The :mimetype filter runs first, then the :filename filter; as soon as
# exactly one candidate is left it is returned. Otherwise, if :source is
# given, the best-scoring candidate according to +best_by_source+ is
# returned (or nothing if none scores above the threshold). Without :source
# an empty list is returned, even if several candidates remain.
#
# @param info [Hash] may contain :mimetype, :filename and :source
# @return [Array] zero or one lexer
def guesses(info={})
  mimetype = info[:mimetype]
  filename = info[:filename]
  source = info[:source]

  candidates = registry.values.uniq
  full_count = candidates.size

  candidates = filter_by_mimetype(candidates, mimetype) if mimetype
  return candidates if candidates.size == 1

  candidates = filter_by_filename(candidates, filename) if filename
  return candidates if candidates.size == 1

  return [] unless source

  # When still looking at *every* lexer, only confident analyze_text scores
  # count. Once the list has been narrowed, any positive score is trusted.
  narrowed = candidates.size < full_count
  threshold = narrowed ? 0 : 0.5
  [best_by_source(candidates, source, threshold)].compact
end
lex(stream, opts={}, &b) click to toggle source

Lexes `stream` with the given options. The lex is delegated to a new instance.

@see lex

# File lib/rouge/lexer.rb, line 18
# Class-level convenience: builds a lexer with +opts+ and lexes +stream+
# with it, forwarding the block.
#
# @param stream the input handed to the new instance's #lex
# @param opts [Hash] options for the new lexer instance
# @return whatever the instance-level #lex returns
def lex(stream, opts={}, &b)
  lexer = new(opts)
  lexer.lex(stream, &b)
end
mimetypes(*mts) click to toggle source

Specify a list of mimetypes associated with this lexer.

@example

class Html < Lexer
  mimetypes 'text/html', 'application/xhtml+xml'
end
# File lib/rouge/lexer.rb, line 296
# Adds mimetypes to the list kept on this lexer class.
#
# @param mts [Array<String>] mimetypes to append (may be empty)
# @return [Array<String>] every mimetype recorded so far
def mimetypes(*mts)
  @mimetypes ||= []
  @mimetypes.concat(mts)
end
new(opts={}) click to toggle source

Create a new lexer with the given options. Individual lexers may specify extra options. The only current globally accepted option is `:debug`.

@option opts :debug

Prints debug information to stdout.  The particular info depends
on the lexer in question.  In regex lexers, this will log the
state stack at the beginning of each step, along with each regex
tried and each stream consumed.  Try it, it's pretty useful.
# File lib/rouge/lexer.rb, line 326
# Stores +opts+ through #options, then caches the value of the :debug
# option in @debug.
#
# @param opts [Hash] options for this lexer instance
def initialize(opts={})
  # must run first: #option below reads what #options stored
  options(opts)

  @debug = option(:debug)
end
tag(t=nil) click to toggle source

Used to specify or get the canonical name of this lexer class.

@example

class MyLexer < Lexer
  tag 'foo'
end

MyLexer.tag # => 'foo'

Lexer.find('foo') # => MyLexer
# File lib/rouge/lexer.rb, line 258
# Specify or get the canonical name of this lexer class.
#
# With an argument, stores its +to_s+ as the tag and registers this class
# under that name via Lexer.register.
#
# @param t [#to_s, nil] the tag to set; omit (or pass nil) to read
# @return the stored tag when reading; the result of Lexer.register when
#   setting
def tag(t=nil)
  if t.nil?
    @tag
  else
    @tag = t.to_s
    Lexer.register(@tag, self)
  end
end
title(t=nil) click to toggle source

Specify or get this lexer's title. Meant to be human-readable.

# File lib/rouge/lexer.rb, line 68
# Specify or get this lexer's human-readable title.
#
# With an argument, the title is always stored. Previously `@title ||= t`
# silently dropped an explicit title whenever the title had already been
# read (which cached the default) or set before. Without an argument, the
# stored title is returned; when none is stored, the capitalized tag is
# cached and returned.
#
# @param t [String, nil] the title to store; omit (or pass nil) to read
# @return [String] the title
def title(t=nil)
  return @title = t unless t.nil?

  # the default relies on +tag+ having been set
  @title ||= tag.capitalize
end

Protected Class Methods

register(name, lexer) click to toggle source

@private

# File lib/rouge/lexer.rb, line 243
# Stores +lexer+ in the registry under the string form of +name+.
#
# @param name [#to_s] the lookup name
# @param lexer the lexer to store
# @return the stored lexer
def register(name, lexer)
  registry.store(name.to_s, lexer)
end

Private Class Methods

best_by_source(lexers, source, threshold=0) click to toggle source
# File lib/rouge/lexer.rb, line 212
# Chooses the lexer whose +analyze_text+ score for +source+ is highest.
#
# +source+ may be a String or anything responding to +read+. The text is
# checked with +assert_utf8!+ and wrapped in a TextAnalyzer before scoring.
# A score of exactly 1 wins immediately; otherwise only scores strictly
# above +threshold+ are considered, and on a tie the earlier lexer wins.
# A nil score counts as 0.
#
# @param lexers [Array] the candidate lexers
# @param source [String, #read] the source text
# @param threshold [Numeric] the score a candidate must exceed
# @return the best lexer, or nil when none qualifies
# @raise [RuntimeError] 'invalid source' when +source+ is of neither kind
def best_by_source(lexers, source, threshold=0)
  text =
    if source.is_a?(String)
      source
    elsif source.respond_to?(:read)
      source.read
    else
      raise 'invalid source'
    end

  assert_utf8!(text)
  analyzer = TextAnalyzer.new(text)

  winner = nil
  high_score = threshold
  lexers.each do |candidate|
    score = candidate.analyze_text(analyzer) || 0
    return candidate if score == 1
    next unless score > high_score

    winner = candidate
    high_score = score
  end

  winner
end
filter_by_filename(lexers, fname) click to toggle source

Returns the lexers whose filename patterns match the given filename most specifically, i.e. with the fewest wildcards in the matching pattern; lexers that tie for the best specificity are all returned. This helps disambiguate between, e.g., the Nginx lexer, which matches `nginx.conf`, and the Conf lexer, which matches `*.conf`. In this case, nginx will win because its pattern has no wildcards, while `*.conf` has one. If no lexer matches the filename at all, the given list is returned unchanged.

# File lib/rouge/lexer.rb, line 186
# Keeps the lexers whose filename globs match the basename of +fname+ with
# the fewest wildcard characters (`*`, `?`, `[`).
#
# Each lexer is scored by its most specific matching glob; all lexers tied
# for the lowest score are returned in their original order. Globs are
# matched with File::FNM_DOTMATCH.
#
# @param lexers [Array] candidates responding to +filenames+
# @param fname [String] a filename or path; only its basename is matched
# @return [Array] the most specific matches, or +lexers+ itself when nothing
#   matches
def filter_by_filename(lexers, fname)
  basename = File.basename(fname)

  scored = []
  lexers.each do |lexer|
    # specificity is better the fewer wildcards there are
    wildcard_counts = lexer.filenames
      .select { |glob| File.fnmatch?(glob, basename, File::FNM_DOTMATCH) }
      .map { |glob| glob.scan(/[*?\[]/).size }

    scored << [lexer, wildcard_counts.min] unless wildcard_counts.empty?
  end
  return lexers if scored.empty?

  fewest = scored.map(&:last).min
  scored.select { |_, count| count == fewest }.map(&:first)
end
filter_by_mimetype(lexers, mt) click to toggle source
# File lib/rouge/lexer.rb, line 175
# Keeps the lexers that list +mt+ among their mimetypes.
#
# @param lexers [Array] candidates responding to +mimetypes+
# @param mt [String] the mimetype to look for
# @return [Array] the matching lexers, or +lexers+ itself when none match
def filter_by_mimetype(lexers, mt)
  matching = lexers.select { |candidate| candidate.mimetypes.include?(mt) }
  return lexers if matching.none?

  matching
end
registry() click to toggle source
# File lib/rouge/lexer.rb, line 310
# The name-to-lexer table kept in this object's @registry, created empty on
# first access.
#
# @return [Hash] the registry hash itself (not a copy)
def registry
  @registry = {} unless @registry
  @registry
end

Public Instance Methods

debug() { || ... } click to toggle source

@deprecated Instead of `debug { "foo" }`, simply `puts "foo" if @debug`.

Leave a debug message if the `:debug` option is set. The message is given as a block because some debug messages contain calculated information that is unnecessary for lexing in the real world.

Calls to this method should be guarded with `if @debug` for best performance when debugging is turned off.

@example

debug { "hello, world!" } if @debug
# File lib/rouge/lexer.rb, line 360
# @deprecated Always emits a deprecation warning; additionally prints the
#   block's value with +puts+ when @debug is set. The block is not evaluated
#   otherwise.
#
# @yieldreturn the message to print
# @return [nil]
def debug
  warn "Lexer#debug is deprecated.  Simply puts if @debug instead."
  return unless @debug

  puts yield
end
lex(string, opts={}, &b) click to toggle source

Given a string, yield [token, chunk] pairs. If no block is given, an enumerator is returned.

@option opts :continue

Continue the lex from the previous state (i.e. don't call #reset!)
# File lib/rouge/lexer.rb, line 377
# Lexes +string+, yielding +[token, chunk]+ pairs with consecutive chunks of
# the same token merged into one. Empty chunks are skipped.
#
# The string is first checked with Lexer.assert_utf8!, and +reset!+ is
# called unless +opts[:continue]+ is set. The tokens themselves come from
# +stream_tokens+.
#
# @param string [String] the text to lex
# @param opts [Hash] only :continue is read here
# @yield [token, chunk] each merged pair
# @return [Enumerator] when no block is given
def lex(string, opts={}, &b)
  return enum_for(:lex, string, opts) unless block_given?

  Lexer.assert_utf8!(string)

  reset! unless opts[:continue]

  # consolidate consecutive tokens of the same type
  last_token = nil
  last_val = nil
  stream_tokens(string) do |tok, val|
    next if val.empty?

    if tok == last_token
      last_val << val
      next
    end

    b.call(last_token, last_val) if last_token
    last_token = tok
    # Work on a copy: merging appends in place, which must neither modify the
    # string handed over by stream_tokens nor fail when that string is frozen.
    last_val = val.dup
  end

  b.call(last_token, last_val) if last_token
end
option(k, v=:absent) click to toggle source

get or specify one option for this lexer

# File lib/rouge/lexer.rb, line 340
# Get or specify a single option for this lexer.
#
# @param k the option key
# @param v the value to store; omit to read
# @return the option's value when reading; the result of #options when
#   setting
def option(k, v=:absent)
  return options[k] if v == :absent

  options({ k => v })
end
options(o={}) click to toggle source

get and/or specify the options for this lexer.

# File lib/rouge/lexer.rb, line 333
# Get and/or specify the options for this lexer.
#
# +o+ is merged into the instance's own options first. The result is the
# class's default options overlaid with the instance options, as a new hash.
#
# @param o [Hash] options to store on this instance (may be empty)
# @return [Hash] class defaults merged with the instance options
def options(o={})
  @options ||= {}
  @options.merge!(o)

  defaults = self.class.default_options
  defaults.merge(@options)
end
reset!() click to toggle source

@abstract

Called by #lex before lexing begins, unless the `:continue` option is set. The default implementation is a no-op.

# File lib/rouge/lexer.rb, line 369
# Hook with an empty default body. The instance-level #lex calls it before
# lexing unless the :continue option is set.
def reset!
end
stream_tokens(stream, &b) click to toggle source

@abstract

Yield `[token, chunk]` pairs, given a prepared input stream. This must be implemented.

@param [StringScanner] stream

the stream
# File lib/rouge/lexer.rb, line 415
# Abstract hook: this base implementation only raises.
#
# @param stream the prepared input stream
# @raise [RuntimeError] always, with the message 'abstract'
def stream_tokens(stream, &b)
  raise RuntimeError, 'abstract'
end
tag() click to toggle source

delegated to {Lexer.tag}

# File lib/rouge/lexer.rb, line 404
# Instance-level shortcut for the class-level tag.
#
# @return whatever +self.class.tag+ returns
def tag
  self.class.tag
end