class Ai4r::Classifiers::ID3
Introduction¶ ↑
This is an implementation of the ID3 algorithm (Quinlan). Given a set of preclassified examples, it builds a decision tree by top-down induction, biased by the information gain and entropy measure.
How to use it¶ ↑
DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ] DATA_ITEMS = [ ['New York', '<30', 'M', 'Y'], ['Chicago', '<30', 'M', 'Y'], ['Chicago', '<30', 'F', 'Y'], ['New York', '<30', 'M', 'Y'], ['New York', '<30', 'M', 'Y'], ['Chicago', '[30-50)', 'M', 'Y'], ['New York', '[30-50)', 'F', 'N'], ['Chicago', '[30-50)', 'F', 'Y'], ['New York', '[30-50)', 'F', 'N'], ['Chicago', '[50-80]', 'M', 'N'], ['New York', '[50-80]', 'F', 'N'], ['New York', '[50-80]', 'M', 'N'], ['Chicago', '[50-80]', 'M', 'N'], ['New York', '[50-80]', 'F', 'N'], ['Chicago', '>80', 'F', 'Y'] ] data_set = DataSet.new(:data_items=>DATA_ITEMS, :data_labels=>DATA_LABELS) id3 = Ai4r::Classifiers::ID3.new.build(data_set) id3.get_rules # => if age_range=='<30' then marketing_target='Y' elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y' elsif age_range=='[30-50)' and city=='New York' then marketing_target='N' elsif age_range=='[50-80]' then marketing_target='N' elsif age_range=='>80' then marketing_target='Y' else raise 'There was not enough information during training to do a proper induction for this data element' end id3.eval(['New York', '<30', 'M']) # => 'Y'
A better way to load the data ¶ ↑
In real life you will use many more training examples, with more attributes. Consider moving your data to an external CSV (comma-separated values) file.
data_file = "#{File.dirname(__FILE__)}/data_set.csv" data_set = DataSet.load_csv_with_labels data_file id3 = Ai4r::Classifiers::ID3.new.build(data_set)
A nice tip for data evaluation¶ ↑
id3 = Ai4r::Classifiers::ID3.new.build(data_set) age_range = '<30' marketing_target = nil eval id3.get_rules puts marketing_target # => 'Y'
More about ID3 and decision trees¶ ↑
About the project¶ ↑
- Author
-
Sergio Fierens
- License
-
MPL 1.1
- Url
Constants
- LOG2
Attributes
data_set[R]
Private Class Methods
log2(z)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 165
# Base-2 logarithm used by the entropy calculation.
# Defined so that log2(0) == 0.0, matching the convention that the
# 0 * log(0) term of the entropy formula contributes nothing.
# Uses Math.log's base argument instead of dividing by a cached
# LOG2 constant.
def self.log2(z)
  return 0.0 if z == 0
  Math.log(z, 2)
end
sum(values)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 160
# Adds up all values in the array; returns 0 for an empty array.
def self.sum(values)
  total = 0
  values.each { |v| total += v }
  total
end
Public Instance Methods
build(data_set)
click to toggle source
Trains the ID3 classifier. You must provide a DataSet instance as parameter. The last attribute of each item is considered as the item class.
# File lib/ai4r/classifiers/id3.rb, line 99
# Trains the ID3 classifier. You must provide a DataSet instance as
# parameter; the last attribute of each item is treated as the item's
# class. Raises (via check_not_empty) on an empty data set.
# Returns self so calls can be chained.
def build(data_set)
  data_set.check_not_empty
  @data_set = data_set
  preprocess_data(data_set.data_items)
  self
end
eval(data)
click to toggle source
You can evaluate new data, predicting its category. e.g.
id3.eval(['New York', '<30', 'F']) # => 'Y'
# File lib/ai4r/classifiers/id3.rb, line 109
# Predicts the category of a new data item, e.g.
#   id3.eval(['New York', '<30', 'F'])  # => 'Y'
# Returns nil when the classifier has not been built yet.
def eval(data)
  return nil unless @tree
  @tree.value(data)
end
get_rules()
click to toggle source
This method returns the generated rules in ruby code. e.g.
id3.get_rules # => if age_range=='<30' then marketing_target='Y' elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y' elsif age_range=='[30-50)' and city=='New York' then marketing_target='N' elsif age_range=='[50-80]' then marketing_target='N' elsif age_range=='>80' then marketing_target='Y' else raise 'There was not enough information during training to do a proper induction for this data element' end
It is a nice way to inspect induction results, and also to execute them:
age_range = '<30' marketing_target = nil eval id3.get_rules puts marketing_target # => 'Y'
# File lib/ai4r/classifiers/id3.rb, line 130
# Renders the induced tree as executable Ruby code: a chain of
# if/elsif clauses, one per rule, ending in an else that raises when
# the training data did not cover the evaluated element.
def get_rules
  clauses = @tree.get_rules.map do |rule|
    conditions = rule[0..-2].join(' and ')
    "#{conditions} then #{rule.last}"
  end
  "if #{clauses.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
end
Private Instance Methods
build_node(data_examples, flag_att = [])
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 145 def build_node(data_examples, flag_att = []) return ErrorNode.new if data_examples.length == 0 domain = domain(data_examples) return CategoryNode.new(@data_set.data_labels.last, domain.last[0]) if domain.last.length == 1 min_entropy_index = min_entropy_index(data_examples, domain, flag_att) flag_att << min_entropy_index split_data_examples = split_data_examples(data_examples, domain, min_entropy_index) return CategoryNode.new(@data_set.data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1 nodes = split_data_examples.collect do |partial_data_examples| build_node(partial_data_examples, flag_att) end return EvaluationNode.new(@data_set.data_labels, min_entropy_index, domain[min_entropy_index], nodes) end
domain(data_examples)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 217
# Collects, per attribute, the distinct values observed in the given
# examples (insertion order preserved). One sub-array per data label;
# the last one holds the category values.
def domain(data_examples)
  attribute_count = @data_set.data_labels.length
  domain = Array.new(attribute_count) { [] }
  data_examples.each do |example|
    example.each_index do |i|
      next unless i < attribute_count
      domain[i] << example[i] unless domain[i].include?(example[i])
    end
  end
  domain
end
entropy(freq_grid, total_examples)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 249
# Weighted entropy of a frequency grid. Each row of freq_grid holds the
# category counts for one value of an attribute; a row's entropy is
# weighted by the fraction of the examples it covers. Empty rows
# contribute 0.
def entropy(freq_grid, total_examples)
  freq_grid.inject(0) do |acc, category_counts|
    row_total = ID3.sum(category_counts)
    row_entropy = 0
    if row_total != 0
      category_counts.each do |freq|
        proportion = freq.to_f / row_total
        row_entropy -= proportion * ID3.log2(proportion)
      end
    end
    acc + (row_total.to_f / total_examples) * row_entropy
  end
end
freq_grid(att_index, data_examples, domain)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 230
# Builds a frequency grid for one attribute: one row per value in the
# attribute's domain, one column per category, each cell counting the
# examples with that (value, category) pair.
def freq_grid(att_index, data_examples, domain)
  categories = domain.last
  grid = Array.new(domain[att_index].length) { Array.new(categories.length, 0) }
  data_examples.each do |example|
    row = domain[att_index].index(example[att_index])
    col = categories.index(example.last)
    grid[row][col] += 1
  end
  grid
end
min_entropy_index(data_examples, domain, flag_att=[])
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 202
# Returns the index of the attribute (category column excluded) whose
# split has the lowest entropy, skipping attributes already used on
# this branch (flag_att). Falls back to 0 when every attribute is
# flagged. Skipping flagged attributes up front avoids building their
# frequency grids and entropies only to discard them, and avoids the
# original's locals shadowing the freq_grid/entropy method names.
def min_entropy_index(data_examples, domain, flag_att = [])
  min_entropy = nil
  min_index = 0
  domain[0..-2].each_index do |index|
    # Already used higher in the tree: do no work for it.
    next if flag_att.include?(index)
    grid = freq_grid(index, data_examples, domain)
    attribute_entropy = entropy(grid, data_examples.length)
    if !min_entropy || attribute_entropy < min_entropy
      min_entropy = attribute_entropy
      min_index = index
    end
  end
  min_index
end
most_freq(examples, domain)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 171
# Returns the category occurring most often among the examples (the
# last element of each example). Ties resolve to the category listed
# first in the domain.
def most_freq(examples, domain)
  categories = domain.last
  counts = Array.new(categories.length, 0)
  examples.each do |example|
    counts[categories.index(example.last)] += 1
  end
  categories[counts.index(counts.max)]
end
preprocess_data(data_examples)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 140 def preprocess_data(data_examples) @tree = build_node(data_examples) end
split_data_examples(data_examples, domain, att_index)
click to toggle source
# File lib/ai4r/classifiers/id3.rb, line 185
# Partitions the examples by their value of the attribute at att_index.
# The resulting partitions are ordered by the attribute value's position
# in the domain, so they line up with the child nodes built from them.
def split_data_examples(data_examples, domain, att_index)
  grouped = {}
  data_examples.each do |example|
    (grouped[example[att_index]] ||= []) << example
  end
  partitions = []
  grouped.each_pair do |value, members|
    partitions[domain[att_index].index(value)] = members
  end
  partitions
end