Parent

Scrubyt::Pattern

Group more filters into one

Serves as an umbrella for filters which are conceptually extracting the same thing - for example a price or a title or ...

Sometimes the same piece of information cannot be extracted with a single filter across several result instances (for example, a price has one XPath in record n, but because record n+1 also has a discount price, the real price is pushed to a different XPath, etc.) - in this case the several filters which extract the same thing are held in the same pattern.

Constants

PATTERN_OPTIONS

These options can be set upon wrapper creation

VALID_OPTIONS
VALID_OUTPUT_TYPES

Model patterns are shown in the output

OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
#Temp patterns are skipped in the output (their ancestors are appended to the parent
#of the pattern which was skipped)
OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
VALID_PATTERN_EXAMPLE_TYPES

:determine - default value; indicates that the type of the example still needs to be determined
:string - represents a node with example type EXAMPLE_TYPE_STRING

VALID_PATTERN_TYPES

# a root pattern represents a (surprise!) root pattern
PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
# a tree pattern represents a HTML region
PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
# represents an attribute of the node extracted by the parent pattern
PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
# represents a pattern which filters its output with a regexp
PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
# represents a pattern which crawls to the detail page and extracts information from there
PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
# represents a download pattern
PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
# write out the HTML subtree beginning at the matched element
PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE

Attributes

children[RW]
constraints[RW]
extractor[RW]
filters[RW]
indices_to_extract[RW]
modifier_calls[RW]
name[RW]
next_page_url[R]
options[RW]
parent[RW]
referenced_extractor[RW]
referenced_pattern[RW]
result_indexer[R]

Public Class Methods

new(name, args=[], extractor=nil, parent=nil, &block) click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 68
# Builds a pattern and its filters.
#
# name      - the pattern name; names are also inspected for the shortcut
#             (/.+_url/), detail (/.+_detail/), 'page_list' and 'next_page' cases
# args      - examples, optionally followed by an options Hash as last element
# extractor - stored as-is in @extractor
# parent    - stored as-is in @parent; a nil parent enables generalization by default
# block     - child pattern definitions (or the referenced extractor for a
#             detail pattern)
def initialize(name, args=[], extractor=nil, parent=nil, &block)
  # plain state first; everything below fills these in
  @name, @extractor, @parent = name, extractor, parent
  @options = {}
  @children, @filters, @constraints, @modifier_calls = [], [], [], []

  # examples are collected before the trailing options hash is parsed
  examples = look_for_examples(args)
  trailing_arg = args[-1]
  parse_options_hash(trailing_arg) if trailing_arg.is_a?(Hash)

  # special cases: shortcut patterns only apply when no example was given
  examples = check_if_shortcut_pattern if examples.nil?
  check_if_detail_page(block)
  @options[:output_type] = :page_list if name == 'page_list'

  # one filter per example, or a single default filter when there is none
  if examples.nil?
    @filters << Scrubyt::BaseFilter.create(self)
  else
    examples.each { |example| @filters << Scrubyt::BaseFilter.create(self, example) }
  end

  # generalize only when the caller did not decide explicitly: on for a
  # pattern without parent, off when the first example is a String ending in
  # a bracketed part such as "...[name...]"
  if @options[:generalize].nil?
    @options[:generalize] = true if parent.nil?
    leading_filter = filters[0]
    if leading_filter.example.is_a?(String) && leading_filter.example =~ /.+\[[a-zA-Z].+\]$/
      @options[:generalize] = false
    end
  end

  # child patterns are parsed from the block, except for detail pages
  parse_child_patterns(&block) unless block.nil? || type == :detail_page

  # the remaining work applies to tree patterns only (TODO: subclass?)
  return unless type == :tree

  skip_xpath_generation = (@name == 'next_page')
  @filters.each do |filter|
    filter.generate_XPath_for_example(false) unless skip_xpath_generation
    filter.generate_regexp_for_example
  end

  # with this pattern's xpaths known, the children can make theirs relative
  own_xpaths = @filters.map { |filter| filter.xpath }
  @children.each { |child| child.generate_relative_XPaths(own_xpaths) }
end

Public Instance Methods

check_if_detail_page(block) click to toggle source

Check whether the currently created pattern is a detail pattern (i.e. it references a subextractor). Also check if the currently created pattern is an ancestor of a detail pattern, and if so store this in a hash (to be able to traverse the pattern structure on detail pages as well).

# File lib/scrubyt/core/scraping/pattern.rb, line 162
# Marks this pattern as a detail page pattern when its name matches
# /.+_detail/: the pattern type option becomes :detail_page and the given
# block is kept as the referenced extractor.
#
# Returns the block when the name matched, nil otherwise.
def check_if_detail_page(block)
  return unless @name =~ /.+_detail/

  @options[:type] = :detail_page
  @referenced_extractor = block
end
check_if_shortcut_pattern() click to toggle source

Shortcut patterns, as their name says, are a shortcut for creating patterns from predefined rules; for example:

detail_url

is equivalent to

detail_url 'href', :type => :attribute

i.e. the system figures out on its own that, because of the postfix, the example should be looked up (but it should never override the user input!). Another example (will be available later):

every_img

is equivalent to

every_img '//img'
# File lib/scrubyt/core/scraping/pattern.rb, line 151
# Detects shortcut patterns by name. A name matching /.+_url/ switches the
# pattern type option to :attribute.
#
# Returns ['href'] as the example list for a shortcut pattern, nil otherwise.
def check_if_shortcut_pattern()
  return nil unless @name =~ /.+_url/

  @options[:type] = :attribute
  ['href']
end
current=(value) click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 180
# Stores +value+ in @current, the object that method_missing applies options
# and child patterns to.
def current=(value)
  instance_variable_set(:@current, value)
end
evaluate(source, filter_indices) click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 236
# Applies this pattern's filters to +source+ and builds the result nodes.
#
# source         - the input handed to each filter's #evaluate
# filter_indices - indices into @filters selecting which filters to apply;
#                  nil applies all of them
#
# Returns
#   * the first filter's raw result for a :detail_page pattern,
#   * the array of result nodes when output_type is :model,
#   * [] when output_type is :page_list (the nodes are handed to the
#     extractor's next page list instead),
#   * nil for any other output type.
#
# Raises RuntimeError when this pattern has more than one filter and a child
# has a different number of filters.
def evaluate(source, filter_indices)
  if type == :detail_page # DIRTY!
    return @filters[0].evaluate(source)
  end

  #we apply all filters if filter_indices is nil
  indices_to_evaluate = filter_indices.nil? ? 0...@filters.size : filter_indices
  #stores the (de-duplicated) results of all filters, in first-seen order
  all_filter_results = []
  #remembers which filters have returned a certain result
  indices_mapping = {}
  #evaluate filters and collect filter results
  indices_to_evaluate.each do |filter_index|
    @filters[filter_index].evaluate(source).each do |result|
      #add result to list if not already there
      all_filter_results << result unless all_filter_results.include?(result)
      #add the current filter's index to the mapping
      (indices_mapping[result] ||= []) << filter_index
    end
  end

  #apply constraints: a result survives only if every constraint accepts it
  unless @constraints.empty?
    all_filter_results = all_filter_results.select do |result|
      @constraints.all? { |constraint| constraint.check(result) }
    end
  end
  #apply indexer
  all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results) unless @result_indexer.nil?

  #create result nodes and evaluate children
  result_nodes = []
  all_filter_results.each do |result|
    #create result node
    node = ResultNode.new(@name, result, @options)
    node.generated_by_leaf = @children.empty?
    #evaluate children
    @children.each do |child|
      if filter_count != 1 && child.filter_count != filter_count
        raise "Pattern '#{@name}' has #{filter_count} filters but its child '#{child.name}' has " \
              "#{child.filter_count}; a child must match its parent's filter count unless the parent has exactly one filter"
      end
      #with a single parent filter all child filters are evaluated (nil);
      #otherwise only the child filters at the indices that produced this result
      child_filter_indices = filter_count == 1 ? nil : indices_mapping[result]
      node.push(*child.evaluate(result, child_filter_indices))
    end
    #apply child constraints (ensure_presence_of_pattern)
    required_child_names = @constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN }.map {|c| c.target}
    unless required_child_names.empty?
      #walk the node tree and tick off every required name that is present
      check = lambda { |node_to_check|
        required_child_names.delete node_to_check.name
        node_to_check.each { |child| check.call child }
      }
      check.call node
    end
    #drop the node if a required name was not found anywhere below it
    next unless required_child_names.empty?
    #add the current result node to the list
    result_nodes << node
  end

  #fall back to a single node carrying the :default option, if one is set
  if result_nodes.empty? && @options[:default]
    result_nodes << ResultNode.new(@name, @options[:default], @options)
  end

  case output_type
  when :model
    result_nodes
  when :page_list
    result_nodes.each { |result_node| @extractor.add_to_next_page_list result_node }
    []
  end
end
filter_count() click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 173
# Number of filters currently held by this pattern.
def filter_count
  @filters.length
end
generate_relative_XPaths(parent_xpaths) click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 124
# Asks every filter to make its XPath relative to the parent's XPath(s).
# Does nothing unless this is a tree pattern.
#
# parent_xpaths - either a single XPath, which is shared by all filters, or
#                 exactly one XPath per filter (matched up by position)
#
# Raises ArgumentError for any other number of parent XPaths.
def generate_relative_XPaths(parent_xpaths)
  return unless type == :tree

  single_parent = parent_xpaths.size == 1
  raise ArgumentError.new unless single_parent || parent_xpaths.size == @filters.size #TODO: should be checked earlier with proper error message

  @filters.each_with_index do |filter, position|
    filter.generate_relative_XPath(parent_xpaths[single_parent ? 0 : position])
  end
end
method_missing(method_name, *args, &block) click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 183
# Turns unknown method calls into pattern definitions on @current.
#
# A name starting with an underscore sets an option: the underscore is
# stripped, the rest becomes the option key, and every argument is stored
# under that key. The first value is stored as-is; further values turn the
# entry into an array and are appended to it.
#
# Any other name creates a child pattern of @current with that name, passing
# along the arguments and the block, and returns the new child.
def method_missing(method_name, *args, &block)
  if method_name.to_s[0..0] == '_'
    #add hash option
    key = method_name.to_s[1..-1].to_sym
    check_option(key)
    args.each do |arg|
      current_value = @current.options[key]
      if current_value.nil?
        @current.options[key] = arg
      else
        #promote a single stored value to an array before appending
        @current.options[key] = [current_value] unless current_value.is_a?(Array)
        @current.options[key] << arg
      end
    end
  else
    #create child pattern
    child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
    @current.children << child
    child
  end
end
parent_of_leaf() click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 169
# True when at least one child of this pattern has no children of its own;
# false otherwise (including when there are no children at all).
def parent_of_leaf
  @children.any? { |child| child.children.empty? }
end
parse_child_patterns(&block) click to toggle source
# File lib/scrubyt/core/scraping/pattern.rb, line 177
# Evaluates +block+ as a pattern definition DSL for this pattern.
#
# The block is run via instance_eval on a fresh, otherwise empty object whose
# method_missing forwards everything to this pattern (held in @current):
#   * a call whose name starts with an underscore sets an option on this
#     pattern (first value stored as-is, further values collected in an array);
#   * any other call creates a child pattern with that name.
def parse_child_patterns(&block)
  context = Object.new
  context.instance_eval do
    #the pattern that receives the options and children defined in the block
    def current=(value)
      @current = value
    end
    def method_missing(method_name, *args, &block)
      if method_name.to_s[0..0] == '_'
        #add hash option
        key = method_name.to_s[1..-1].to_sym
        # NOTE(review): check_option is not defined on this bare context object
        # in the code visible here; unless it is reachable from a plain Object,
        # this call lands in this very method_missing and is treated as a child
        # pattern name - confirm.
        check_option(key)
        args.each do |arg|
          current_value = @current.options[key]
          if current_value.nil?
            @current.options[key] = arg
          else
            #promote a single stored value to an array before appending
            @current.options[key] = [current_value] unless current_value.is_a?(Array)
            @current.options[key] << arg
          end
        end
      else
        #create child pattern
        child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
        @current.children << child
        child
      end
    end
  end
  context.current = self
  context.instance_eval(&block)
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.