class String
These are extensions to the String class to provide convenience methods for the Classifier package.
- Author: Lucas Carlson (lucas@rufy.com)
- Copyright: Copyright © 2005 Lucas Carlson
- License: LGPL
Constants
- CORPUS_SKIP_WORDS
Public Instance Methods
clean_word_hash()
click to toggle source
Return a word hash without extra punctuation or short symbols, just stemmed words
# File lib/classifier/extensions/word_hash.rb, line 28
#
# Builds a word-frequency hash from the receiver after deleting every
# character that is neither a word character nor whitespace. The remaining
# text is split on whitespace and counted by word_hash_for_words.
def clean_word_hash
  letters_only = gsub(/[^\w\s]/, "")
  word_hash_for_words(letters_only.split)
end
paragraph_summary( count=1, separator=" [...] " )
click to toggle source
# File lib/classifier/lsi/summary.rb, line 10
#
# Summarizes the receiver paragraph by paragraph: the text is cut up with
# split_paragraphs and the pieces are handed to perform_lsi together with
# +count+ and +separator+.
#
# count::     passed through to perform_lsi (defaults to 1).
# separator:: string placed between the selected paragraphs.
def paragraph_summary(count = 1, separator = " [...] ")
  paragraphs = split_paragraphs
  perform_lsi(paragraphs, count, separator)
end
split_paragraphs()
click to toggle source
# File lib/classifier/lsi/summary.rb, line 18
#
# Splits the receiver on blank-line paragraph breaks ("\n\n", "\r\r" or
# "\r\n\r\n") and returns the pieces as an Array of Strings.
#
# The pattern uses a capture group, so the matched separators are returned
# as array elements between the paragraphs.
#
# The regexp is passed in parentheses so Ruby does not emit its
# "ambiguous first argument" warning for a bare regexp literal argument.
def split_paragraphs
  split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
end
split_sentences()
click to toggle source
# File lib/classifier/lsi/summary.rb, line 14
#
# Splits the receiver at every ".", "!" or "?" and returns the pieces as an
# Array of Strings.
#
# The pattern uses a capture group, so each punctuation mark is returned as
# its own array element after the sentence it terminated.
#
# The regexp is passed in parentheses so Ruby does not emit its
# "ambiguous first argument" warning for a bare regexp literal argument.
def split_sentences
  split(/(\.|\!|\?)/) # TODO: make this less primitive
end
summary( count=10, separator=" [...] " )
click to toggle source
# File lib/classifier/lsi/summary.rb, line 6
#
# Summarizes the receiver sentence by sentence: the text is cut up with
# split_sentences and the pieces are handed to perform_lsi together with
# +count+ and +separator+.
#
# count::     passed through to perform_lsi (defaults to 10).
# separator:: string placed between the selected sentences.
def summary(count = 10, separator = " [...] ")
  sentences = split_sentences
  perform_lsi(sentences, count, separator)
end
without_punctuation()
click to toggle source
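Removes common punctuation symbols, returning a new string. Most symbols are replaced by a single space each (runs of spaces are not collapsed); apostrophes and hyphens are deleted outright. E.g.,
"Hello (greeting's), with {braces} < >...?".without_punctuation => "Hello  greetings   with  braces         "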
# File lib/classifier/extensions/word_hash.rb, line 15
#
# Returns a new String with common punctuation removed; the receiver is not
# modified.
#
# Each of the symbols , ? . ! ; : " @ # $ % ^ & * ( ) _ = + [ ] { } | < > / ` ~
# is replaced by one space (runs of spaces are not collapsed), and
# apostrophes and hyphens are deleted without leaving a space.
def without_punctuation
  # Single-quoted on purpose: the set contains '#$', which would otherwise
  # be read as interpolation. The backslash only escapes the following '|'.
  spaced = tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ")
  # String#delete states the intent directly (the original used tr with an
  # empty replacement); a trailing '-' in the set is a literal hyphen.
  spaced.delete("'-")
end
word_hash()
click to toggle source
Return a Hash of symbols => ints. Each word in the string is stemmed, interned, and mapped to its frequency in the document; runs of non-word characters (punctuation and other symbols) are interned and counted as well.
# File lib/classifier/extensions/word_hash.rb, line 21
#
# Returns the result of clean_word_hash merged with a count of the
# receiver's non-word tokens. Those tokens are found by blanking out every
# word character, splitting on whitespace, and counting the pieces with
# word_hash_for_symbols. On a key collision the symbol count wins, as with
# any Hash#merge.
def word_hash
  symbol_tokens = gsub(/[\w]/, " ").split
  clean_word_hash.merge(word_hash_for_symbols(symbol_tokens))
end
Private Instance Methods
perform_lsi(chunks, count, separator)
click to toggle source
# File lib/classifier/lsi/summary.rb, line 24
#
# Feeds +chunks+ into a fresh Classifier::LSI index (created with
# :auto_rebuild => false), builds the index once, asks it for
# highest_relative_content(count), and returns those entries stripped of
# surrounding whitespace and joined with +separator+.
#
# chunks::    Array of Strings; blank chunks and chunks consisting of a
#             single whitespace-delimited token are not indexed.
# count::     passed to Classifier::LSI#highest_relative_content.
# separator:: String placed between the returned entries.
def perform_lsi(chunks, count, separator)
  lsi = Classifier::LSI.new(:auto_rebuild => false)

  chunks.each do |chunk|
    stripped = chunk.strip
    next if stripped.empty? || stripped.split.size == 1

    # The untrimmed chunk is what gets indexed, as before.
    lsi << chunk
  end

  lsi.build_index
  summaries = lsi.highest_relative_content(count)

  # NOTE(review): the original ran
  #   summaries.reject { |chunk| !summaries.include? chunk }
  # here, which filters the list against itself and so never removes
  # anything. It has been dropped as a quadratic no-op. It may have been
  # meant to iterate +chunks+ to restore document order -- confirm before
  # changing the ordering.
  summaries.map { |text| text.strip }.join(separator)
end
word_hash_for_symbols(words)
click to toggle source
# File lib/classifier/extensions/word_hash.rb, line 46
#
# Counts how often each entry of +words+ occurs. Every entry is converted
# to a Symbol and used as the key; the value is its number of occurrences.
# The returned Hash yields 0 for keys it does not contain.
def word_hash_for_symbols(words)
  words.each_with_object(Hash.new(0)) do |token, tally|
    tally[token.to_sym] += 1
  end
end
word_hash_for_words(words)
click to toggle source
# File lib/classifier/extensions/word_hash.rb, line 34
#
# Counts the stems of the given words. Each word is lower-cased; words that
# appear in CORPUS_SKIP_WORDS or are two characters or shorter are ignored.
# Every remaining word is stemmed, converted to a Symbol, and tallied.
#
# words:: Array of Strings. The strings are NOT modified: the original
#         called downcase! on each element, which altered the caller's data
#         and raised on frozen strings.
#
# Returns a Hash of Symbol => Integer that yields 0 for missing keys.
def word_hash_for_words(words)
  counts = Hash.new(0)
  words.each do |word|
    token = word.downcase
    next if CORPUS_SKIP_WORDS.include?(token) || token.length <= 2

    counts[token.stem.intern] += 1
  end
  counts
end