Object
Serves as an umbrella for filters which conceptually extract the same thing - for example a price or a title.
Sometimes the same piece of information cannot be extracted with a single filter across several result instances (for example, a price has one XPath in record n, but because record n+1 also has a discount price, the real price is pushed to a different XPath, etc.). In this case, the multiple filters which extract the same thing are held in the same pattern.
These options can be set upon wrapper creation
Model patterns are shown in the output.
OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
# Temp patterns are skipped in the output (their ancestors are appended to the
# parent of the pattern which was skipped).
# NOTE(review): "ancestors" is the original wording; it may have meant the
# skipped pattern's children - confirm against the output code.
OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
:determine - the default value; indicates that the type of the example still needs to be determined. :string - indicates a node whose example type is EXAMPLE_TYPE_STRING.
# a root pattern represents a (surprise!) root pattern
PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
# a tree pattern represents a HTML region
PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
# represents an attribute of the node extracted by the parent pattern
PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
# represents a pattern which filters its output with a regexp
PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
# represents a pattern which crawls to the detail page and extracts information from there
PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
# represents a download pattern
PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
# write out the HTML subtree beginning at the matched element
PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
# File lib/scrubyt/core/scraping/pattern.rb, line 68
#
# Sets up a pattern: stores its name, extractor and parent, collects examples
# and options from +args+, creates one filter per example (or a single default
# filter when there are no examples), parses any child patterns declared in
# +block+, and - for tree patterns - generates XPaths and regexps.
#
# name      - pattern name; 'page_list' and 'next_page' get special handling below
# args      - argument list; a trailing Hash is handed to parse_options_hash
# extractor - stored in @extractor
# parent    - stored in @parent; nil marks a pattern without a parent
# block     - optional child-pattern definitions (or the referenced extractor
#             of a detail pattern, see check_if_detail_page)
def initialize(name, args=[], extractor=nil, parent=nil, &block)
  # plain state
  @name = name
  @extractor = extractor
  @parent = parent
  @options = {}
  @children = []
  @filters = []
  @constraints = []
  @modifier_calls = []

  # examples first, then the trailing options hash (if there is one)
  examples = look_for_examples(args)
  trailing = args[-1]
  parse_options_hash(trailing) if trailing.is_a?(Hash)

  # special cases: shortcut patterns supply their own examples, but only
  # when none were found in args
  examples = check_if_shortcut_pattern() if examples.nil?
  check_if_detail_page(block)
  @options[:output_type] = :page_list if name == 'page_list'

  # one filter per example, or a single default filter
  if examples.nil?
    @filters << Scrubyt::BaseFilter.create(self)
  else
    examples.each { |example| @filters << Scrubyt::BaseFilter.create(self, example) }
  end

  # generalize only when it was not set explicitly: on for a pattern without
  # a parent, off when the first example is a String ending in a bracketed
  # expression that starts with a letter
  if @options[:generalize].nil?
    @options[:generalize] = true if parent.nil?
    first_filter = filters[0]
    if first_filter.example.is_a?(String) && first_filter.example =~ /.+\[[a-zA-Z].+\]$/
      @options[:generalize] = false
    end
  end

  # child patterns are parsed for everything except detail pages
  parse_child_patterns(&block) unless block.nil? || type == :detail_page

  # the remaining work applies to tree patterns only (TODO: subclass?)
  if type == :tree
    @filters.each do |filter|
      filter.generate_XPath_for_example(false) unless @name == 'next_page'
      filter.generate_regexp_for_example
    end
    # once this pattern's XPaths exist, the children can make theirs relative
    xpaths = @filters.map { |filter| filter.xpath }
    @children.each { |child| child.generate_relative_XPaths(xpaths) }
  end
end
Check whether the currently created pattern is a detail pattern (i.e. it references a subextractor). Also check whether the currently created pattern is an ancestor of a detail pattern, and if so, store this in a hash (to be able to traverse the pattern structure on detail pages as well).
# File lib/scrubyt/core/scraping/pattern.rb, line 162
#
# Marks this pattern as a detail page when @name contains at least one
# character followed by "_detail": sets @options[:type] to :detail_page and
# keeps +block+ in @referenced_extractor.
#
# Returns +block+ when the name matches, nil otherwise.
def check_if_detail_page(block)
  return unless @name =~ /.+_detail/

  @options[:type] = :detail_page
  @referenced_extractor = block
end
Shortcut patterns, as their name says, are a shortcut for creating patterns from predefined rules; for example:
detail_url is equivalent to detail_url 'href', :type => :attribute
i.e. the system figures out on its own that, because of the postfix, the example should be looked up (but it should never override the user input!). Another example (will be available later):
every_img is equivalent to every_img '//img'
# File lib/scrubyt/core/scraping/pattern.rb, line 151
#
# Handles shortcut patterns: when @name contains at least one character
# followed by "_url", the pattern becomes an attribute pattern
# (@options[:type] = :attribute) and ['href'] is returned as its example list.
#
# Returns ['href'] for a shortcut name, nil otherwise.
def check_if_shortcut_pattern()
  return nil unless @name =~ /.+_url/

  @options[:type] = :attribute
  ['href']
end
# File lib/scrubyt/core/scraping/pattern.rb, line 180
#
# Writer for @current, the object whose options and children are modified
# by method_missing.
#
# value - the new @current
def current=(value)
  @current = value
end
# File lib/scrubyt/core/scraping/pattern.rb, line 236
#
# Evaluates this pattern against +source+ and builds the result nodes.
#
# source         - passed unchanged to every filter's evaluate
# filter_indices - indices into @filters to evaluate; nil means all filters
#
# Returns
# * for a :detail_page pattern: whatever the first filter's evaluate returns;
# * for output_type :model: the array of ResultNode objects;
# * for output_type :page_list: [] (every node is handed to
#   @extractor.add_to_next_page_list instead);
# * nil for any other output_type (the final case has no else branch).
#
# Raises RuntimeError when this pattern has more than one filter and a child
# has a different number of filters.
def evaluate(source, filter_indices)
  if type == :detail_page # DIRTY!
    return @filters[0].evaluate(source)
  end

  # we apply all filters if filter_indices is nil
  indices_to_evaluate = filter_indices.nil? ? 0...@filters.size : filter_indices

  # distinct results of all filters, in first-seen order
  all_filter_results = []
  # remembers which filters have returned a certain result
  indices_mapping = {}

  # evaluate filters and collect filter results
  indices_to_evaluate.each do |filter_index|
    @filters[filter_index].evaluate(source).each do |result|
      all_filter_results << result unless all_filter_results.include?(result)
      (indices_mapping[result] ||= []) << filter_index
    end
  end

  # apply constraints: a result survives only if every constraint accepts it
  unless @constraints.empty?
    all_filter_results = all_filter_results.select do |result|
      @constraints.all? { |constraint| constraint.check(result) }
    end
  end

  # apply indexer
  unless @result_indexer.nil?
    all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results)
  end

  # create result nodes and evaluate children
  result_nodes = []
  all_filter_results.each do |result|
    node = ResultNode.new(@name, result, @options)
    node.generated_by_leaf = @children.empty?

    @children.each do |child|
      if filter_count == 1
        # evaluate all child filters
        node.push(*child.evaluate(result, nil))
      elsif child.filter_count == filter_count
        # evaluate only the child filters paired with the filters that
        # produced this result
        node.push(*child.evaluate(result, indices_mapping[result]))
      else
        raise "Pattern '#{@name}' has #{filter_count} filters but one of its child patterns " \
              "has #{child.filter_count}; a child must have the same number of filters " \
              "unless the parent has exactly one"
      end
    end

    # apply child constraints (ensure_presence_of_pattern); the list is
    # rebuilt for every result because the check below empties it
    required_child_names = @constraints.select { |c|
      c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
    }.map { |c| c.target }
    unless required_child_names.empty?
      # walk the node tree and tick off every required name that is found
      check = lambda { |node_to_check|
        required_child_names.delete node_to_check.name
        node_to_check.each { |child| check.call child }
      }
      check.call node
    end
    # drop the result if a required name was not found
    next unless required_child_names.empty?

    # add the current result node to the list
    result_nodes << node
  end

  # fall back to the :default option when nothing was extracted
  if result_nodes.empty?
    result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
  end

  case output_type
  when :model
    return result_nodes
  when :page_list
    result_nodes.each { |result_node| @extractor.add_to_next_page_list result_node }
    return []
  end
end
# File lib/scrubyt/core/scraping/pattern.rb, line 173
#
# Returns the number of filters held by this pattern.
def filter_count
  @filters.length
end
# File lib/scrubyt/core/scraping/pattern.rb, line 124
#
# Asks every filter of this pattern to make its XPath relative to the
# parent's XPath. Does nothing unless this pattern's type is :tree.
#
# parent_xpaths - the parent's XPaths. Either a single XPath, which is then
#                 shared by all filters, or exactly one XPath per filter,
#                 paired by index.
#
# Returns nil for non-tree patterns, otherwise @filters.
# Raises ArgumentError when the number of parent XPaths is neither 1 nor the
# number of filters.
def generate_relative_XPaths(parent_xpaths)
  return if type != :tree

  # TODO: should be checked earlier
  unless parent_xpaths.size == 1 || parent_xpaths.size == @filters.size
    raise ArgumentError,
          "expected 1 or #{@filters.size} parent XPaths for pattern '#{@name}', got #{parent_xpaths.size}"
  end

  shared = parent_xpaths.size == 1
  @filters.each_with_index do |filter, index|
    filter.generate_relative_XPath(shared ? parent_xpaths[0] : parent_xpaths[index])
  end
end
# File lib/scrubyt/core/scraping/pattern.rb, line 183
#
# Turns unknown method calls into either pattern options or child patterns
# of @current.
#
# * A name starting with "_" (e.g. +_foo+) sets option :foo on @current.
#   Every argument is added in turn: the first value is stored as-is, and as
#   soon as a second value arrives the option becomes an Array holding all
#   values. Returns +args+.
# * Any other name creates a Scrubyt::Pattern child named after the method,
#   with +args+, @current's extractor, @current as parent and +block+; the
#   child is appended to @current.children and returned.
def method_missing(method_name, *args, &block)
  name = method_name.to_s
  if name[0..0] == '_'
    # add hash option
    key = name[1..-1].to_sym
    # NOTE(review): check_option is resolved on self; if self does not define
    # it, this call re-enters method_missing and takes the child-pattern
    # branch - confirm that this is intended.
    check_option(key)
    args.each do |arg|
      current_value = @current.options[key]
      if current_value.nil?
        @current.options[key] = arg
      else
        # promote a single value to a list before appending
        @current.options[key] = [current_value] unless current_value.is_a?(Array)
        @current.options[key] << arg
      end
    end
  else
    # create child pattern
    child = Scrubyt::Pattern.new(name, args, @current.extractor, @current, &block)
    @current.children << child
    child
  end
end
# File lib/scrubyt/core/scraping/pattern.rb, line 169
#
# Returns true when at least one direct child of this pattern has no
# children of its own, false otherwise (including when there are no children).
def parent_of_leaf
  @children.any? { |child| child.children.empty? }
end
# File lib/scrubyt/core/scraping/pattern.rb, line 177
#
# Runs +block+ inside a throw-away context object whose method_missing turns
# the calls made in the block into options and child patterns of this pattern:
#
# * +_foo value+ sets option :foo on this pattern; repeated values are
#   collected in an Array;
# * any other call creates a Scrubyt::Pattern child named after the method
#   and appends it to this pattern's children.
#
# Returns the value of the block.
def parse_child_patterns(&block)
  context = Object.new
  context.instance_eval do
    # the pattern that receives the options and children
    def current=(value)
      @current = value
    end

    def method_missing(method_name, *args, &block)
      name = method_name.to_s
      if name[0..0] == '_'
        # add hash option
        key = name[1..-1].to_sym
        # NOTE(review): check_option is resolved on this context object; no
        # definition is visible here, in which case the call re-enters
        # method_missing and creates a child pattern named "check_option" -
        # confirm whether it should be sent to @current instead.
        check_option(key)
        args.each do |arg|
          current_value = @current.options[key]
          if current_value.nil?
            @current.options[key] = arg
          else
            # promote a single value to a list before appending
            @current.options[key] = [current_value] unless current_value.is_a?(Array)
            @current.options[key] << arg
          end
        end
      else
        # create child pattern
        child = Scrubyt::Pattern.new(name, args, @current.extractor, @current, &block)
        @current.children << child
        child
      end
    end
  end
  context.current = self
  context.instance_eval(&block)
end
Generated with the Darkfish Rdoc Generator 2.