module Addressable::IDNA

Constants

ACE_MAX_LENGTH
ACE_PREFIX
COMPOSITION_TABLE
HANGUL_LBASE
HANGUL_LCOUNT
HANGUL_NCOUNT
HANGUL_SBASE
HANGUL_SCOUNT
HANGUL_TBASE
HANGUL_TCOUNT
HANGUL_VBASE
HANGUL_VCOUNT
PUNYCODE_BASE
PUNYCODE_DAMP
PUNYCODE_DELIMITER
PUNYCODE_INITIAL_BIAS
PUNYCODE_INITIAL_N
PUNYCODE_MAXINT
PUNYCODE_PRINT_ASCII
PUNYCODE_SKEW
PUNYCODE_TMAX
PUNYCODE_TMIN
UNICODE_DATA

This is a sparse Unicode table. Codepoints without entries are assumed to have the value: [0, 0, nil, nil, nil, nil, nil]

UNICODE_DATA_CANONICAL
UNICODE_DATA_COMBINING_CLASS
UNICODE_DATA_COMPATIBILITY
UNICODE_DATA_EXCLUSION
UNICODE_DATA_LOWERCASE
UNICODE_DATA_TITLECASE
UNICODE_DATA_UPPERCASE
UNICODE_MAX_LENGTH
UNICODE_TABLE

This module is loosely based on idn_actionmailer by Mick Staugaard, the unicode library by Yoshida Masato, and the punycode implementation by Kazuhiro Nishiyama. Most of the code was copied verbatim, but some reformatting was done, and some translation from C was done.

Without their code to work from as a base, we'd all still be relying on the presence of libidn, which nobody ever seems to have installed.

Original sources: github.com/staugaard/idn_actionmailer, www.yoshidam.net/Ruby.html#unicode, and rubyforge.org/frs/?group_id=2550

UTF8_REGEX
UTF8_REGEX_MULTIBYTE

Public Class Methods

to_ascii(value) click to toggle source
# File lib/addressable/idna/native.rb, line 34
# Runs every dot-separated label of +value+ through IDN::Idna.toASCII.
#
# The value is stringified with +to_s+ and split on '.' with a limit of -1,
# so trailing empty labels are kept. Empty labels are emitted as empty
# strings without calling IDN::Idna.toASCII, which keeps every dot of the
# input in the output.
#
# @param value [#to_s] The dotted name to convert.
# @return [String] The converted labels joined with '.'.
def self.to_ascii(value)
  labels = value.to_s.split('.', -1)
  converted = labels.map do |label|
    label.empty? ? '' : IDN::Idna.toASCII(label)
  end
  converted.join('.')
end
to_unicode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 44
# Runs every dot-separated label of +value+ through IDN::Idna.toUnicode.
#
# The value is stringified with +to_s+ and split on '.' with a limit of -1,
# so trailing empty labels are kept. Empty labels are emitted as empty
# strings without calling IDN::Idna.toUnicode, which keeps every dot of the
# input in the output.
#
# @param value [#to_s] The dotted name to convert.
# @return [String] The converted labels joined with '.'.
def self.to_unicode(value)
  labels = value.to_s.split('.', -1)
  converted = labels.map do |label|
    label.empty? ? '' : IDN::Idna.toUnicode(label)
  end
  converted.join('.')
end
unicode_normalize_kc(value) click to toggle source
# File lib/addressable/idna/native.rb, line 30
# Stringifies +value+ and hands it to IDN::Stringprep.nfkc_normalize.
#
# @param value [#to_s] The text to normalize.
# @return Whatever IDN::Stringprep.nfkc_normalize returns (presumably the
#   NFKC-normalized String; that helper is not defined in this file).
def self.unicode_normalize_kc(value)
  text = value.to_s
  IDN::Stringprep.nfkc_normalize(text)
end

Private Class Methods

lookup_unicode_combining_class(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 276
# Looks up the combining class recorded for +codepoint+ in UNICODE_DATA.
#
# UNICODE_DATA is sparse, so a codepoint without an entry, or an entry
# without a combining class, yields 0.
#
# @param codepoint The key to look up in UNICODE_DATA.
# @return The stored combining class, or 0 when none is recorded.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
lookup_unicode_compatibility(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 284
# Looks up the compatibility field recorded for +codepoint+ in
# UNICODE_DATA.
#
# @param codepoint The key to look up in UNICODE_DATA.
# @return The stored compatibility value, or nil when the codepoint has no
#   entry in the sparse table.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
lookup_unicode_composition(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 299
# Looks +unpacked+ up in COMPOSITION_TABLE.
#
# @param unpacked The table key; unicode_compose_pair passes the UTF-8
#   bytes of the two codepoints it is trying to combine.
# @return The value stored under that key, or whatever the table yields for
#   a missing key (nil for a plain Hash).
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
lookup_unicode_lowercase(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 291
# Looks up the lowercase mapping recorded for +codepoint+ in UNICODE_DATA.
#
# @param codepoint The key to look up in UNICODE_DATA.
# @return The stored lowercase value, or +codepoint+ itself when the sparse
#   table has no entry or no lowercase mapping for it.
def self.lookup_unicode_lowercase(codepoint)
  entry = UNICODE_DATA[codepoint]
  return codepoint unless entry

  entry[UNICODE_DATA_LOWERCASE] || codepoint
end
punycode_adapt(delta, numpoints, firsttime) click to toggle source

Bias adaptation method

# File lib/addressable/idna/pure.rb, line 654
# Bias adaptation step of the Punycode algorithm.
#
# @param delta [Integer] The delta to adapt the bias from.
# @param numpoints [Integer] Divisor used to scale the delta; must be
#   non-zero.
# @param firsttime [Boolean] When truthy the delta is divided by
#   PUNYCODE_DAMP; otherwise it is halved.
# @return [Integer] The new bias.
def self.punycode_adapt(delta, numpoints, firsttime)
  # The right shift is a cheap way of halving the delta.
  scaled = if firsttime
    delta / PUNYCODE_DAMP
  else
    delta >> 1
  end
  scaled += scaled / numpoints

  span = PUNYCODE_BASE - PUNYCODE_TMIN
  threshold = (span * PUNYCODE_TMAX) / 2

  # Shrink the delta until it is within the threshold, counting one
  # PUNYCODE_BASE per reduction.
  offset = 0
  until scaled <= threshold
    scaled /= span
    offset += PUNYCODE_BASE
  end

  offset + (span + 1) * scaled / (scaled + PUNYCODE_SKEW)
end
punycode_basic?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 622
# Tells whether +codepoint+ is below 0x80, i.e. inside the 7-bit range that
# Punycode treats as basic.
#
# @param codepoint [Integer] The codepoint to test.
# @return [Boolean] true when the codepoint is less than 0x80.
def self.punycode_basic?(codepoint)
  codepoint < 0x80
end
punycode_decode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 26
# Stringifies +value+ and hands it to IDN::Punycode.decode.
#
# @param value [#to_s] The text to decode.
# @return Whatever IDN::Punycode.decode returns (presumably the decoded
#   String; that helper is not defined in this file).
def self.punycode_decode(value)
  encoded = value.to_s
  IDN::Punycode.decode(encoded)
end
punycode_decode_digit(codepoint) click to toggle source

Returns the numeric value of a basic codepoint (for use in representing integers) in the range 0 to base - 1, or PUNYCODE_BASE if codepoint does not represent a value.

# File lib/addressable/idna/pure.rb, line 640
# Returns the numeric value of a basic codepoint (for use in representing
# integers) in the range 0 to base - 1, or PUNYCODE_BASE if the codepoint
# does not represent a value.
#
# The ranges are checked explicitly. The C original relied on unsigned
# wrap-around (`cp - 48 < 10`); Ruby integers go negative instead, which
# would let codepoints such as '-' (45) or ':' (58) pass as digits.
#
# @param codepoint [Integer] The codepoint to interpret as a digit.
# @return [Integer] 0..25 for letters (either case), 26..35 for '0'..'9',
#   or PUNYCODE_BASE when the codepoint is not a digit.
def self.punycode_decode_digit(codepoint)
  case codepoint
  when 48..57 then codepoint - 22   # '0'..'9' => 26..35
  when 65..90 then codepoint - 65   # 'A'..'Z' => 0..25
  when 97..122 then codepoint - 97  # 'a'..'z' => 0..25
  else PUNYCODE_BASE
  end
end
punycode_delimiter?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 627
# Tells whether +codepoint+ equals PUNYCODE_DELIMITER.
#
# @param codepoint [Integer] The codepoint to test.
# @return [Boolean] true when the codepoint is the Punycode delimiter.
def self.punycode_delimiter?(codepoint)
  codepoint == PUNYCODE_DELIMITER
end
punycode_encode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 22
# Stringifies +value+ and hands it to IDN::Punycode.encode.
#
# @param value [#to_s] The text to encode.
# @return Whatever IDN::Punycode.encode returns (presumably the encoded
#   String; that helper is not defined in this file).
def self.punycode_encode(value)
  text = value.to_s
  IDN::Punycode.encode(text)
end
punycode_encode_digit(d) click to toggle source
# File lib/addressable/idna/pure.rb, line 632
# Maps a digit value to the codepoint that represents it.
#
# Values below 26 are offset by 97 (so 0 maps to 'a'); all other values are
# offset by 22 (so 26 maps to '0').
#
# @param d [Integer] The digit value to encode.
# @return [Integer] The codepoint for the digit.
def self.punycode_encode_digit(d)
  if d < 26
    d + 97
  else
    d + 22
  end
end
unicode_compose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 132
# Composes adjacent codepoints into precomposed codepoints where
# unicode_compose_pair knows a composition.
#
# The method walks the array with a current "starter". While the starter
# has combining class 0, each following codepoint is offered to
# unicode_compose_pair; on success the composite becomes the new starter,
# otherwise the starter is emitted and the following codepoint takes its
# place.
#
# The starter's combining class is tracked across iterations. The previous
# code assigned it to a misspelled local, so the class of the very first
# codepoint governed the whole string and input that began with a combining
# mark was never composed at all.
#
# @param unpacked [Array<Integer>] The codepoints to compose.
# @return [Array<Integer>] The composed codepoints. An empty input is
#   returned as-is (same object).
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  starter = unpacked[0]
  starter_cc = lookup_unicode_combining_class(starter)
  # Any non-zero value blocks composition; 256 is outside the range of real
  # combining classes.
  starter_cc = 256 if starter_cc != 0

  unpacked.drop(1).each do |ch|
    cc = lookup_unicode_combining_class(ch)
    composite = starter_cc == 0 ? unicode_compose_pair(starter, ch) : nil

    if composite.nil?
      composed << starter
      starter = ch
      starter_cc = cc
    else
      starter = composite
      starter_cc = lookup_unicode_combining_class(composite)
    end
  end

  composed << starter
  composed
end
unicode_compose_pair(ch_one, ch_two) click to toggle source
# File lib/addressable/idna/pure.rb, line 160
# Tries to combine two codepoints into a single precomposed codepoint.
#
# Hangul is handled arithmetically: a leading consonant followed by a vowel
# forms an LV syllable, and an LV syllable followed by a trailing consonant
# forms an LVT syllable. Every other pair is encoded to UTF-8 bytes and
# looked up through lookup_unicode_composition.
#
# The trailing-consonant test is strict (ch_two > HANGUL_TBASE).
# HANGUL_TBASE itself corresponds to T index 0, which is not a trailing
# consonant; accepting it would return ch_one unchanged and silently drop
# ch_two.
#
# @param ch_one [Integer] The first codepoint.
# @param ch_two [Integer] The second codepoint.
# @return [Integer, nil] The composite codepoint, or whatever
#   lookup_unicode_composition yields when the pair has no composition.
def self.unicode_compose_pair(ch_one, ch_two)
  if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
      ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
    # Hangul L + V
    return HANGUL_SBASE + (
      (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
    ) * HANGUL_TCOUNT
  elsif ch_one >= HANGUL_SBASE &&
      ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
      (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
      ch_two > HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
    # Hangul LV + T
    return ch_one + (ch_two - HANGUL_TBASE)
  end

  # The composition lookup is keyed by the UTF-8 bytes of both codepoints.
  utf8_bytes = []
  append_utf8 = lambda do |ch|
    if ch < 128
      utf8_bytes << ch
    elsif ch < 2048
      utf8_bytes << (ch >> 6 | 192)
      utf8_bytes << (ch & 63 | 128)
    elsif ch < 0x10000
      utf8_bytes << (ch >> 12 | 224)
      utf8_bytes << (ch >> 6 & 63 | 128)
      utf8_bytes << (ch & 63 | 128)
    elsif ch < 0x200000
      utf8_bytes << (ch >> 18 | 240)
      utf8_bytes << (ch >> 12 & 63 | 128)
      utf8_bytes << (ch >> 6 & 63 | 128)
      utf8_bytes << (ch & 63 | 128)
    elsif ch < 0x4000000
      utf8_bytes << (ch >> 24 | 248)
      utf8_bytes << (ch >> 18 & 63 | 128)
      utf8_bytes << (ch >> 12 & 63 | 128)
      utf8_bytes << (ch >> 6 & 63 | 128)
      utf8_bytes << (ch & 63 | 128)
    elsif ch < 0x80000000
      utf8_bytes << (ch >> 30 | 252)
      utf8_bytes << (ch >> 24 & 63 | 128)
      utf8_bytes << (ch >> 18 & 63 | 128)
      utf8_bytes << (ch >> 12 & 63 | 128)
      utf8_bytes << (ch >> 6 & 63 | 128)
      utf8_bytes << (ch & 63 | 128)
    end
  end

  append_utf8.call(ch_one)
  append_utf8.call(ch_two)

  lookup_unicode_composition(utf8_bytes)
end
unicode_decompose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 238
# Expands every codepoint of +unpacked+ into its decomposed form.
#
# Codepoints in the Hangul syllable range are split with
# unicode_decompose_hangul. Any other codepoint that has a compatibility
# mapping is replaced by the recursive decomposition of that mapping
# (unpacked as UTF-8). Everything else is copied through unchanged.
#
# @param unpacked [Array<Integer>] The codepoints to decompose.
# @return [Array<Integer>] A new array holding the decomposed codepoints.
def self.unicode_decompose(unpacked)
  unpacked.each_with_object([]) do |codepoint, decomposed|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      decomposed << lead
      decomposed << vowel if vowel
      decomposed << trail if trail
    elsif (mapping = lookup_unicode_compatibility(codepoint))
      decomposed.concat(unicode_decompose(mapping.unpack("U*")))
    else
      decomposed << codepoint
    end
  end
end
unicode_decompose_hangul(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 259
# Splits a Hangul syllable codepoint into its jamo parts.
#
# @param codepoint [Integer] The codepoint to split.
# @return [Array] A three-element array [l, v, t]. For a codepoint outside
#   the Hangul syllable range this is [codepoint, nil, nil]. Inside the
#   range, t is nil when the computed trailing part equals HANGUL_TBASE.
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  unless syllable_index >= 0 && syllable_index < HANGUL_SCOUNT
    return [codepoint, nil, nil]
  end

  lead = HANGUL_LBASE + syllable_index / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (syllable_index % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trail_offset = syllable_index % HANGUL_TCOUNT
  # An offset of zero lands exactly on HANGUL_TBASE, which is reported as
  # "no trailing part".
  trail = trail_offset == 0 ? nil : HANGUL_TBASE + trail_offset

  [lead, vowel, trail]
end
unicode_downcase(input) click to toggle source

Unicode aware downcase method.

@api private

@param [String] input The input string.

@return [String] The downcased result.

# File lib/addressable/idna/pure.rb, line 124
# Unicode aware downcase method.
#
# The input is unpacked into codepoints, each codepoint is mapped through
# lookup_unicode_lowercase, and the result is packed back into a string.
#
# @api private
# @param [String] input
#   The input string. Anything that is not a String is converted with
#   +to_s+ first.
# @return [String] The downcased result.
def self.unicode_downcase(input)
  text = input.is_a?(String) ? input : input.to_s
  lowered = text.unpack("U*").map do |codepoint|
    lookup_unicode_lowercase(codepoint)
  end
  lowered.pack("U*")
end
unicode_sort_canonical(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 214
# Returns a copy of +unpacked+ in which neighbouring codepoints with
# non-zero combining classes are ordered by ascending combining class.
#
# The method swaps an adjacent pair whenever both codepoints have a non-zero
# combining class and the left one's class is greater, then steps back one
# position to re-check the pair before it. Codepoints with class 0 are never
# moved. The argument itself is not modified.
#
# @param unpacked [Array<Integer>] The codepoints to reorder.
# @return [Array<Integer>] The reordered copy.
def self.unicode_sort_canonical(unpacked)
  sorted = unpacked.dup
  count = sorted.length
  return sorted if count < 2

  index = 1
  while index < count
    previous_cc = lookup_unicode_combining_class(sorted[index - 1])
    current_cc = lookup_unicode_combining_class(sorted[index])

    if current_cc == 0 || previous_cc == 0 || previous_cc <= current_cc
      index += 1
    else
      sorted[index - 1], sorted[index] = sorted[index], sorted[index - 1]
      # Step back so the swapped codepoint is compared with its new left
      # neighbour, but never before the first pair.
      index -= 1 if index > 1
    end
  end

  sorted
end