class Ai4r::Classifiers::ID3

Introduction

This is an implementation of the ID3 algorithm (Quinlan) Given a set of preclassified examples, it builds a top-down induction of decision tree, biased by the information gain and entropy measure.

How to use it

DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target'  ]

DATA_ITEMS = [  
       ['New York',  '<30',      'M', 'Y'],
       ['Chicago',     '<30',      'M', 'Y'],
       ['Chicago',     '<30',      'F', 'Y'],
       ['New York',  '<30',      'M', 'Y'],
       ['New York',  '<30',      'M', 'Y'],
       ['Chicago',     '[30-50)',  'M', 'Y'],
       ['New York',  '[30-50)',  'F', 'N'],
       ['Chicago',     '[30-50)',  'F', 'Y'],
       ['New York',  '[30-50)',  'F', 'N'],
       ['Chicago',     '[50-80]', 'M', 'N'],
       ['New York',  '[50-80]', 'F', 'N'],
       ['New York',  '[50-80]', 'M', 'N'],
       ['Chicago',     '[50-80]', 'M', 'N'],
       ['New York',  '[50-80]', 'F', 'N'],
       ['Chicago',     '>80',      'F', 'Y']
     ]

data_set = DataSet.new(:data_items=>DATA_SET, :data_labels=>DATA_LABELS)
id3 = Ai4r::Classifiers::ID3.new.build(data_set)

id3.get_rules
  # =>  if age_range=='<30' then marketing_target='Y'
        elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
        elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
        elsif age_range=='[50-80]' then marketing_target='N'
        elsif age_range=='>80' then marketing_target='Y'
        else raise 'There was not enough information during training to do a proper induction for this data element' end

id3.eval(['New York', '<30', 'M'])
  # =>  'Y'

A better way to load the data

In the real life you will use lot more data training examples, with more attributes. Consider moving your data to an external CSV (comma separate values) file.

data_file = "#{File.dirname(__FILE__)}/data_set.csv"
data_set = DataSet.load_csv_with_labels data_file
id3 = Ai4r::Classifiers::ID3.new.build(data_set)

A nice tip for data evaluation

id3 = Ai4r::Classifiers::ID3.new.build(data_set)

age_range = '<30'
marketing_target = nil
eval id3.get_rules   
puts marketing_target
  # =>  'Y'

More about ID3 and decision trees

About the project

Author

Sergio Fierens

License

MPL 1.1

Url

ai4r.org/

Constants

LOG2

Attributes

data_set[R]

Private Class Methods

log2(z) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 165
def self.log2(z)
  return 0.0 if z == 0
  Math.log(z)/LOG2
end
sum(values) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 160
def self.sum(values)
  values.inject( 0 ) { |sum,x| sum+x }
end

Public Instance Methods

build(data_set) click to toggle source

Create a new ID3 classifier. You must provide a DataSet instance as parameter. The last attribute of each item is considered as the item class.

# File lib/ai4r/classifiers/id3.rb, line 99
def build(data_set)
  data_set.check_not_empty
  @data_set = data_set
  preprocess_data(@data_set.data_items)
  return self
end
eval(data) click to toggle source

You can evaluate new data, predicting its category. e.g.

id3.eval(['New York',  '<30', 'F'])  # => 'Y'
# File lib/ai4r/classifiers/id3.rb, line 109
def eval(data)
  @tree.value(data) if @tree
end
get_rules() click to toggle source

This method returns the generated rules in ruby code. e.g.

id3.get_rules
  # =>  if age_range=='<30' then marketing_target='Y'
        elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
        elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
        elsif age_range=='[50-80]' then marketing_target='N'
        elsif age_range=='>80' then marketing_target='Y'
        else raise 'There was not enough information during training to do a proper induction for this data element' end

It is a nice way to inspect induction results, and also to execute them:

age_range = '<30'
marketing_target = nil
eval id3.get_rules   
puts marketing_target
  # =>  'Y'
# File lib/ai4r/classifiers/id3.rb, line 130
def get_rules
  #return "Empty ID3 tree" if !@tree
  rules = @tree.get_rules
  rules = rules.collect do |rule|
      "#{rule[0..-2].join(' and ')} then #{rule.last}"
  end
  return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
end

Private Instance Methods

build_node(data_examples, flag_att = []) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 145
def build_node(data_examples, flag_att = [])
  return ErrorNode.new if data_examples.length == 0
  domain = domain(data_examples)   
  return CategoryNode.new(@data_set.data_labels.last, domain.last[0]) if domain.last.length == 1
  min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
  flag_att << min_entropy_index
  split_data_examples = split_data_examples(data_examples, domain, min_entropy_index)
  return CategoryNode.new(@data_set.data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1
  nodes = split_data_examples.collect do |partial_data_examples|  
    build_node(partial_data_examples, flag_att)
  end
  return EvaluationNode.new(@data_set.data_labels, min_entropy_index, domain[min_entropy_index], nodes)
end
domain(data_examples) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 217
def domain(data_examples)
  #return build_domains(data_examples)
  domain = []
  @data_set.data_labels.length.times { domain << [] }
  data_examples.each do |data|
    data.each_index do |i|
      domain[i] << data[i] if i<domain.length && !domain[i].include?(data[i])
    end
  end
  return domain
end
entropy(freq_grid, total_examples) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 249
def entropy(freq_grid, total_examples)
  #Calc entropy of each element
  entropy = 0
  freq_grid.each do |att_freq|
    att_total_freq = ID3.sum(att_freq)
    partial_entropy = 0
    if att_total_freq != 0
      att_freq.each do |freq|
        prop = freq.to_f/att_total_freq
        partial_entropy += (-1*prop*ID3.log2(prop))
      end
    end
    entropy += (att_total_freq.to_f/total_examples) * partial_entropy
  end
  return entropy
end
freq_grid(att_index, data_examples, domain) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 230
def freq_grid(att_index, data_examples, domain)
  #Initialize empty grid
  grid_element = []
  domain.last.length.times { grid_element << 0} 
  grid = [] 
  domain[att_index].length.times { grid << grid_element.clone }
  #Fill frecuency with grid
  data_examples.each do |example|
    att_val = example[att_index]
    att_val_index = domain[att_index].index(att_val)
    category = example.last
    category_index = domain.last.index(category)
    freq = grid[att_val_index][category_index] + 1
    grid[att_val_index][category_index] = freq
  end
  return grid
end
min_entropy_index(data_examples, domain, flag_att=[]) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 202
def min_entropy_index(data_examples, domain, flag_att=[])
  min_entropy = nil
  min_index = 0
  domain[0..-2].each_index do |index|
    freq_grid = freq_grid(index, data_examples, domain)
    entropy = entropy(freq_grid, data_examples.length)
    if (!min_entropy || entropy < min_entropy) && !flag_att.include?(index)
      min_entropy = entropy 
      min_index = index 
    end
  end
  return min_index
end
most_freq(examples, domain) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 171
def most_freq(examples, domain)
  freqs = []
  domain.last.length.times { freqs << 0}
  examples.each do |example|
    cat_index = domain.last.index(example.last)
    freq = freqs[cat_index] + 1
    freqs[cat_index] = freq
  end
  max_freq = freqs.max
  max_freq_index = freqs.index(max_freq)
  domain.last[max_freq_index]
end
preprocess_data(data_examples) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 140
def preprocess_data(data_examples)
  @tree = build_node(data_examples)
end
split_data_examples(data_examples, domain, att_index) click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 185
def split_data_examples(data_examples, domain, att_index)
  data_examples_array = []
  att_value_examples = {}
  data_examples.each do |example|
    example_set = att_value_examples[example[att_index]]
    example_set = [] if !example_set
    example_set << example
    att_value_examples.store(example[att_index], example_set)
  end
  att_value_examples.each_pair do |att_value, example_set|
     att_value_index = domain[att_index].index(att_value)
     data_examples_array[att_value_index] = example_set
  end
  return data_examples_array
end