module ClassifierReborn::Hasher

Constants

STOPWORDS

A lazily loaded hash of stopword data, keyed by language.

STOPWORDS_PATH

Public Instance Methods

clean_word_hash(str, language = 'en') click to toggle source

Return a word hash containing only stemmed words, with punctuation and short tokens removed.

# File lib/classifier-reborn/extensions/hasher.rb, line 22
# Builds a word-frequency hash from +str+ after discarding every character
# that is neither a word character nor whitespace.
#
# The remaining text is lowercased and split on whitespace; the resulting
# tokens are handed to word_hash_for_words together with +language+.
def clean_word_hash(str, language = 'en')
  letters_only = str.gsub(/[^\p{WORD}\s]/, '')
  tokens = letters_only.downcase.split
  word_hash_for_words(tokens, language)
end
word_hash(str, language = 'en') click to toggle source

Return a Hash mapping symbols to integers. Each word in the string is stemmed and interned, and maps to its frequency in the document. Punctuation characters are counted as well.

# File lib/classifier-reborn/extensions/hasher.rb, line 15
# Combines two frequency hashes for +str+: the one from clean_word_hash
# (word tokens) and the one from word_hash_for_symbols, which is fed every
# single character of +str+ that is neither whitespace nor a word character.
#
# On a key collision the symbol count replaces the word count, because the
# symbol hash is the argument to Hash#merge.
def word_hash(str, language = 'en')
  clean_word_hash(str, language).merge(
    word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
  )
end
word_hash_for_symbols(words) click to toggle source
# File lib/classifier-reborn/extensions/hasher.rb, line 36
# Counts occurrences of each entry in +words+, keyed by the interned
# (Symbol) form of the entry.
#
# The returned Hash has a default value of 0, so looking up an unseen key
# yields 0 rather than nil.
def word_hash_for_symbols(words)
  words.each_with_object(Hash.new(0)) do |token, counts|
    counts[token.intern] += 1
  end
end
word_hash_for_words(words, language = 'en') click to toggle source
# File lib/classifier-reborn/extensions/hasher.rb, line 26
# Counts the stemmed, interned form of each entry in +words+.
#
# An entry is skipped when it is 2 characters or shorter, or when it appears
# in STOPWORDS[language]. The stopword list is consulted only for entries
# that pass the length check.
#
# The returned Hash has a default value of 0.
def word_hash_for_words(words, language = 'en')
  words.each_with_object(Hash.new(0)) do |token, counts|
    next if token.length <= 2 || STOPWORDS[language].include?(token)

    counts[token.stem.intern] += 1
  end
end