class Babosa::Identifier

This class provides some string-manipulation methods specific to slugs.

Note that this class includes many “bang methods” such as {#clean!} and {#normalize!} that perform actions on the string in-place. Each of these methods has a corresponding “bangless” method (e.g., +Identifier#clean!+ and +Identifier#clean+) which does not appear in the documentation because it is generated dynamically.

All of the bang methods return an instance of String, while the bangless versions return an instance of Babosa::Identifier, so that calls to methods specific to this class can be chained:

string = Identifier.new("hello world")
string.with_separators! # => "hello-world"
string.with_separators  # => <Babosa::Identifier:0x000001013e1590 @wrapped_string="hello-world">

@see www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table

Constants

Error

Attributes

to_s[R]
wrapped_string[R]

Public Class Methods

new(string) click to toggle source

@param string [#to_s] The string to use as the basis of the Identifier.

# File lib/babosa/identifier.rb, line 64
# Wraps +string+ (converted with #to_s) and then runs #tidy_bytes! followed by
# #normalize_utf8! on the wrapped value.
#
# @param string [#to_s] the value to use as the basis of the Identifier
def initialize(string)
  @wrapped_string = string.to_s
  # Order matters: bytes are tidied first, then the result is normalized.
  tidy_bytes!
  normalize_utf8!
end
utf8_proxy() click to toggle source

Return the proxy used for UTF-8 support. @see Babosa::UTF8::Proxy

# File lib/babosa/identifier.rb, line 49
# Reader for the class-level UTF-8 proxy.
#
# @return [Object] the object currently held in the @@utf8_proxy class variable
def self.utf8_proxy
  @@utf8_proxy
end
utf8_proxy=(obj) click to toggle source

Set a proxy object used for UTF-8 support. @see Babosa::UTF8::Proxy

# File lib/babosa/identifier.rb, line 55
# Writer for the class-level UTF-8 proxy. The object is stored in the
# @@utf8_proxy class variable, which the instance methods #downcase!, #upcase!,
# #normalize_utf8! and #tidy_bytes! call into.
#
# @param obj [Object] the proxy to use; it is stored as-is, without validation
def self.utf8_proxy=(obj)
  @@utf8_proxy = obj
end

Public Instance Methods

==(value) click to toggle source
# File lib/babosa/identifier.rb, line 70
# Loose equality: both the wrapped string and +value+ are converted with #to_s
# before being compared.
#
# @param value [#to_s] the object to compare against
# @return [Boolean] whether the two string representations are equal
def ==(value)
  @wrapped_string.to_s == value.to_s
end
approximate_ascii!(*kinds)
Alias for: transliterate!
clean!() click to toggle source

Converts dashes to spaces, removes leading and trailing whitespace, and collapses runs of consecutive spaces into a single space. @return String

# File lib/babosa/identifier.rb, line 130
# Tidy the wrapped string in three steps: turn every dash into a space,
# collapse runs of consecutive spaces into one, and strip leading/trailing
# whitespace. Only runs of the space character are collapsed; other whitespace
# (tabs, newlines) inside the string is left alone.
#
# @return [String] the cleaned string, which is also stored as the wrapped string
def clean!
  dashes_as_spaces = @wrapped_string.tr("-", " ")
  single_spaced    = dashes_as_spaces.squeeze(" ")
  @wrapped_string  = single_spaced.strip
end
default_normalize_options() click to toggle source

The default options for {#normalize!}. Override to set your own defaults.

# File lib/babosa/identifier.rb, line 272
# Baseline options that #normalize! merges caller-supplied options into.
# A new Hash is built on every call, so callers may modify the result freely.
# Override this method to change the defaults.
#
# @return [Hash] the keys :transliterate, :max_length and :separator
def default_normalize_options
  {
    :transliterate => true,
    :max_length    => 255,
    :separator     => "-"
  }
end
downcase!() click to toggle source

Perform UTF-8 sensitive downcasing. @return String

# File lib/babosa/identifier.rb, line 240
# Replaces the wrapped string with whatever the class-level UTF-8 proxy's
# #downcase returns for it.
#
# @return [Object] the proxy's result — presumably a String; the proxy is
#   defined outside this file section, so confirm against its implementation
def downcase!
  @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
end
empty?() click to toggle source
# File lib/babosa/identifier.rb, line 78
# Whether the wrapped string is empty.
#
# @return [Boolean] the result of calling #empty? on the wrapped string
def empty?
  # Defined explicitly (rather than left to method_missing) so that
  # respond_to?(:empty?) is true, for compatibility with Active Support's
  # #blank?
  @wrapped_string.empty?
end
eql?(value) click to toggle source
# File lib/babosa/identifier.rb, line 74
# Strict equality: compares the wrapped string with +value+ using String#==,
# without first converting +value+ with #to_s (unlike #==, which converts both
# sides).
#
# @param value [Object] the object to compare against
# @return [Boolean] the result of the comparison
def eql?(value)
  @wrapped_string == value
end
method_missing(symbol, *args, &block) click to toggle source
# File lib/babosa/identifier.rb, line 59
# Forward any message this class does not define itself to the wrapped String,
# passing the arguments and block through unchanged. This is what lets
# methods in this class call String methods such as +unpack+ with no receiver.
#
# @param symbol [Symbol] the name of the missing method
# @param args [Array] arguments for the forwarded call
# @param block [Proc] optional block for the forwarded call
# @return [Object] whatever the wrapped string returns
# @raise [NoMethodError] if the wrapped string does not have the method either
def method_missing(symbol, *args, &block)
  @wrapped_string.__send__(symbol, *args, &block)
end

# Keep #respond_to? (and #method) consistent with the delegation performed by
# #method_missing: a message is answered if the wrapped String answers it.
#
# @param symbol [Symbol] the method name being queried
# @param include_private [Boolean] whether private methods count
# @return [Boolean]
def respond_to_missing?(symbol, include_private = false)
  @wrapped_string.respond_to?(symbol, include_private) || super
end
normalize!(options = nil) click to toggle source

Normalize the string for use as a URL slug. Note that in this context, “normalize” means stripping, removing non-letters/numbers, downcasing, truncating to 255 bytes (by default) and converting whitespace to dashes. @param Options @return String

# File lib/babosa/identifier.rb, line 146
# Run the full slug-normalization pipeline on the wrapped string.
#
# The supplied options are merged over #default_normalize_options. Keys read:
# * :transliterate    - +true+ transliterates with the kinds listed under
#                       :transliterations (if any); any other truthy value is
#                       itself splatted as the kind(s); a falsy value skips
#                       transliteration.
# * :transliterations - kinds used when :transliterate is exactly +true+.
# * :to_ascii         - when truthy, #to_ascii! is applied.
# * :max_length       - passed to #truncate_bytes!.
# * :separator        - passed to #with_separators!.
#
# @param options [Hash, nil] overrides for the default options
# @return [Object] the return value of the final #with_separators! call
def normalize!(options = nil)
  settings = default_normalize_options.merge(options || {})

  translit = settings[:transliterate]
  if translit == true
    transliterate!(*settings[:transliterations])
  elsif translit
    transliterate!(*translit)
  end

  to_ascii! if settings[:to_ascii]

  # Cleaning happens both before and after stripping non-word characters.
  clean!
  word_chars!
  clean!
  downcase!
  truncate_bytes!(settings[:max_length])
  with_separators!(settings[:separator])
end
normalize_utf8!() click to toggle source

Perform Unicode composition on the wrapped string. @return String

# File lib/babosa/identifier.rb, line 246
# Replaces the wrapped string with whatever the class-level UTF-8 proxy's
# #normalize_utf8 returns for it.
#
# @return [Object] the proxy's result — presumably a String; the proxy is
#   defined outside this file section, so confirm against its implementation
def normalize_utf8!
  @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
end
tidy_bytes!() click to toggle source

Attempt to convert characters encoded using CP1252 and ISO-8859-1 to UTF-8. @return String

# File lib/babosa/identifier.rb, line 253
# Replaces the wrapped string with whatever the class-level UTF-8 proxy's
# #tidy_bytes returns for it.
#
# @return [Object] the proxy's result — presumably a String; the proxy is
#   defined outside this file section, so confirm against its implementation
def tidy_bytes!
  @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
end
to_ascii!() click to toggle source

Delete any non-ascii characters. @return String

# File lib/babosa/identifier.rb, line 192
# Removes every character outside the range \x00-\x7f from the wrapped string.
# Characters are dropped, not transliterated.
#
# @return [String] the ASCII-only string, which is also stored as the wrapped string
def to_ascii!
  @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
end
to_identifier() click to toggle source
# File lib/babosa/identifier.rb, line 267
# Returns the receiver itself; no copy is made.
#
# @return [Babosa::Identifier] self
def to_identifier
  self
end
Also aliased as: to_slug
to_ruby_method!(allow_bangs = true) click to toggle source

Normalize a string so that it can safely be used as a Ruby method name.

# File lib/babosa/identifier.rb, line 166
# Normalize the wrapped string so that it can safely be used as a Ruby method
# name.
#
# The stripped string is split into a "leader" (everything but the last
# character) and a "trailer" (the last character). The trailer is downcased
# and may keep one of the method-name suffixes "!", "=" or "?" when
# +allow_bangs+ is true. The leader is transliterated, reduced to ASCII word
# characters and cleaned. Finally whitespace is replaced with underscores.
#
# @param allow_bangs [Boolean] whether a trailing "!", "=" or "?" is kept
# @return [String] the normalized method name
# @raise [Error] if nothing usable remains after normalization
def to_ruby_method!(allow_bangs = true)
  # (.*) rather than (.+): a one-character input has an empty leader and must
  # not be rejected. /m lets the leader span newlines, which are turned into
  # separators at the end like any other whitespace.
  leader, trailer = @wrapped_string.strip.scan(/\A(.*)(.)\z/m).flatten
  leader          = leader.to_s
  trailer         = trailer.to_s.downcase
  disallowed      = allow_bangs ? /[^a-z0-9!=\?]/ : /[^a-z0-9]/
  trailer         = trailer.gsub(disallowed, '')

  id = leader.to_identifier
  id.transliterate!
  id.to_ascii!
  id.clean!
  id.word_chars!
  id.clean!

  @wrapped_string = id.to_s + trailer
  raise Error, "Input generates impossible Ruby method name" if @wrapped_string == ""

  with_separators!("_")
end
to_slug()
Alias for: to_identifier
transliterate!(*kinds) click to toggle source

Approximate an ASCII string. This works only for Western strings using characters that are Roman-alphabet characters + diacritics. Non-letter characters are left unmodified.

string = Identifier.new "Łódź, Poland"
string.transliterate                 # => "Lodz, Poland"
string = Identifier.new "日本"
string.transliterate                 # => "日本"

You can pass any key(s) from Characters.approximations as arguments. This allows for contextual approximations. Various languages are supported, you can see which ones by looking at the source of {Babosa::Transliterator::Base}.

string = Identifier.new "Jürgen Müller"
string.transliterate                 # => "Jurgen Muller"
string.transliterate :german         # => "Juergen Mueller"
string = Identifier.new "¡Feliz año!"
string.transliterate                 # => "¡Feliz ano!"
string.transliterate :spanish        # => "¡Feliz anio!"

The approximations are a hash, which you can modify if you choose:

# Make Spanish use "nh" rather than "ni"
Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"

Notice that this method does not simply convert to ASCII; if you want to remove non-ASCII characters such as “¡” and “¿”, use {#to_ascii!}:

string.transliterate!(:spanish)       # => "¡Feliz anio!"
string.transliterate!                 # => "¡Feliz anio!"
string.to_ascii!                      # => "Feliz anio!"

@param *kinds <Symbol> @return String

# File lib/babosa/identifier.rb, line 117
# Apply one or more transliterators to the wrapped string, in the order given.
# +nil+ entries are ignored; when no kind remains, :latin is used.
#
# @param kinds [Array<Symbol, nil>] keys understood by Transliterator.get
# @return [String] the transliterated string, which is also stored as the wrapped string
def transliterate!(*kinds)
  requested = kinds.compact
  requested = [:latin] if requested.empty?
  requested.each do |kind|
    # Each pass feeds on the output of the previous one.
    @wrapped_string = Transliterator.get(kind).instance.transliterate(@wrapped_string)
  end
  @wrapped_string
end
Also aliased as: approximate_ascii!
truncate!(max) click to toggle source

Truncate the string to max characters. @example

"üéøá".to_identifier.truncate(3) #=> "üéø"

@return String

# File lib/babosa/identifier.rb, line 200
# Keep at most the first +max+ characters (Unicode code points) of the
# wrapped string.
#
# @param max [Integer] number of characters to keep
# @return [String] the truncated string, which is also stored as the wrapped string
def truncate!(max)
  codepoints      = @wrapped_string.unpack("U*")
  @wrapped_string = codepoints.slice(0...max).pack("U*")
end
truncate_bytes!(max) click to toggle source

Truncate the string to max bytes. This can be useful for ensuring that a UTF-8 string will always fit into a database column with a certain max byte length. The resulting string may be shorter than max bytes if the string must be truncated at a multibyte character boundary. @example

"üéøá".to_identifier.truncate_bytes(3) #=> "ü"

@return String

# File lib/babosa/identifier.rb, line 211
# Shorten the wrapped string so that it occupies at most +max+ bytes, cutting
# only at character boundaries. The result can therefore be shorter than
# +max+ bytes when the next character would not fit entirely.
#
# @param max [Integer] the byte budget
# @return [String] the (possibly shortened) string, which is also stored as the wrapped string
def truncate_bytes!(max)
  return @wrapped_string if @wrapped_string.bytesize <= max

  # Re-encode each code point on its own so its byte size can be measured.
  encoded_chars = @wrapped_string.unpack("U*").map { |codepoint| [codepoint].pack("U") }

  # Keep the longest prefix whose cumulative byte size stays within budget.
  bytes_used = 0
  kept = encoded_chars.take_while { |char| (bytes_used += char.bytesize) <= max }

  @wrapped_string = kept.join
end
upcase!() click to toggle source

Perform UTF-8 sensitive upcasing. @return String

# File lib/babosa/identifier.rb, line 234
# Replaces the wrapped string with whatever the class-level UTF-8 proxy's
# #upcase returns for it.
#
# @return [Object] the proxy's result — presumably a String; the proxy is
#   defined outside this file section, so confirm against its implementation
def upcase!
  @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
end
with_dashes!(char = "-")
Alias for: with_separators!
with_separators!(char = "-") click to toggle source

Replaces each whitespace character with the given separator (a dash, “-”, by default). @return String

# File lib/babosa/identifier.rb, line 228
# Replaces every character matched by \s in the wrapped string with +char+.
# Each whitespace character is replaced individually, so a run of several
# whitespace characters yields several separators.
#
# @param char [String] the replacement for each whitespace character
# @return [String] the resulting string, which is also stored as the wrapped string
def with_separators!(char = "-")
  @wrapped_string = @wrapped_string.gsub(/\s/u, char)
end
Also aliased as: with_dashes!
word_chars!() click to toggle source

Remove any non-word characters. For this library's purposes, this means anything other than letters, numbers, spaces, newlines and linefeeds. @return String

# File lib/babosa/identifier.rb, line 137
# Drop every code point listed in Babosa::STRIPPABLE from the wrapped string.
#
# @return [String] the filtered string, which is also stored as the wrapped string
def word_chars!
  codepoints      = @wrapped_string.unpack("U*")
  kept            = codepoints - Babosa::STRIPPABLE
  @wrapped_string = kept.pack("U*")
end

Private Instance Methods

send_to_new_instance(*args) click to toggle source

Used as the basis of the bangless methods.

# File lib/babosa/identifier.rb, line 285
# Build a new Identifier wrapping this object's #to_s value, invoke the given
# method on it, and return the new Identifier (not the method's own return
# value). Identifier.allocate is used so that #initialize is not run on the
# copy.
#
# @param args [Array] method name followed by its arguments, as for #send
# @return [Identifier] the new instance after the method has been applied
def send_to_new_instance(*args)
  Identifier.allocate.tap do |copy|
    copy.instance_variable_set(:@wrapped_string, to_s)
    copy.send(*args)
  end
end