class Anemone::Core
Constants
- DEFAULT_OPTS
Attributes
opts[R]
Hash of options for the crawl
Public Class Methods
Convenience method to start a new crawl
  # File lib/anemone/core.rb, line 89
  def self.crawl(urls, opts = {})
    self.new(urls, opts) do |core|
      yield core if block_given?
      core.run
    end
  end
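For example, a minimal crawl that prints every URL it encounters (the site URL and options here are illustrative):

  require 'anemone'

  Anemone::Core.crawl("http://example.com", :threads => 4) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end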
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block
  # File lib/anemone/core.rb, line 72
  def initialize(urls, opts = {})
    @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
    @urls.each{ |url| url.path = '/' if url.path.empty? }

    @tentacles = []
    @on_every_page_blocks = []
    @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
    @skip_link_patterns = []
    @after_crawl_blocks = []
    @opts = opts

    yield self if block_given?
  end
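Because the block is yielded before the crawl starts, a crawl can also be built up and then run manually; a sketch with an Array of starting URLs (URLs illustrative):

  core = Anemone::Core.new(["http://example.com", URI("http://example.org")]) do |anemone|
    anemone.skip_links_like(/logout/)
  end
  core.run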
Public Instance Methods
Add a block to be executed on the PageStore after the crawl is finished
  # File lib/anemone/core.rb, line 100
  def after_crawl(&block)
    @after_crawl_blocks << block
    self
  end
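The block receives the PageStore when the crawl completes. A sketch, assuming PageStore delegates size to its underlying storage:

  Anemone::Core.crawl("http://example.com") do |anemone|
    anemone.after_crawl do |pages|
      puts "Crawl finished: #{pages.size} pages stored"
    end
  end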
Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.
  # File lib/anemone/core.rb, line 140
  def focus_crawl(&block)
    @focus_crawl_block = block
    self
  end
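For example, to follow only links that stay on the same host as the page they were found on (the host comparison is an illustrative strategy, not part of the API):

  Anemone::Core.crawl("http://example.com") do |anemone|
    anemone.focus_crawl do |page|
      page.links.select { |link| link.host == page.url.host }
    end
  end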
Add a block to be executed on every Page as it is encountered during the crawl
  # File lib/anemone/core.rb, line 118
  def on_every_page(&block)
    @on_every_page_blocks << block
    self
  end
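For example, logging each page's HTTP response code alongside its URL (Page#code is assumed to be available, as in Anemone's Page class):

  Anemone::Core.crawl("http://example.com") do |anemone|
    anemone.on_every_page do |page|
      puts "#{page.code} #{page.url}"
    end
  end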
Add a block to be executed on Page objects with a URL matching one or more patterns
  # File lib/anemone/core.rb, line 127
  def on_pages_like(*patterns, &block)
    if patterns
      patterns.each do |pattern|
        @on_pages_like_blocks[pattern] << block
      end
    end
    self
  end
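Patterns are matched against the full URL string (see do_page_blocks below). For example (patterns illustrative):

  Anemone::Core.crawl("http://example.com") do |anemone|
    anemone.on_pages_like(%r{/articles/\d+}, %r{/blog/}) do |page|
      puts "Content page: #{page.url}"
    end
  end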
Perform the crawl
  # File lib/anemone/core.rb, line 148
  def run
    process_options

    @urls.delete_if { |url| !visit_link?(url) }
    return if @urls.empty?

    link_queue = Queue.new
    page_queue = Queue.new

    @opts[:threads].times do
      @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
    end

    @urls.each{ |url| link_queue.enq(url) }

    loop do
      page = page_queue.deq
      @pages.touch_key page.url
      puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
      do_page_blocks page
      page.discard_doc! if @opts[:discard_page_bodies]

      links = links_to_follow page
      links.each do |link|
        link_queue << [link, page.url.dup, page.depth + 1]
      end
      @pages.touch_keys links

      @pages[page.url] = page

      # if we are done with the crawl, tell the threads to end
      if link_queue.empty? and page_queue.empty?
        until link_queue.num_waiting == @tentacles.size
          Thread.pass
        end
        if page_queue.empty?
          @tentacles.size.times { link_queue << :END }
          break
        end
      end
    end

    @tentacles.each { |thread| thread.join }
    do_after_crawl_blocks
    self
  end
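Since run and the registration methods above all return self, a crawl can be configured and started in a single chained expression (URL and patterns illustrative):

  Anemone::Core.new("http://example.com").
    skip_links_like(/\.(jpg|png|pdf)$/).
    on_every_page { |page| puts page.url }.
    run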
Add one or more Regexp patterns for URLs which should not be followed
  # File lib/anemone/core.rb, line 109
  def skip_links_like(*patterns)
    @skip_link_patterns.concat [patterns].flatten.compact
    self
  end
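Note that patterns are matched against each link's path, not the full URL (see skip_link? below). For example:

  Anemone::Core.crawl("http://example.com") do |anemone|
    anemone.skip_links_like(%r{^/admin}, /\.pdf$/)
  end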
Private Instance Methods
Returns true if we are obeying robots.txt and the link is granted access in it. Always returns true when we are not obeying robots.txt.
  # File lib/anemone/core.rb, line 267
  def allowed(link)
    @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
  rescue
    false
  end
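robots.txt checking is opt-in; a sketch of enabling it:

  Anemone::Core.crawl("http://example.com", :obey_robots_txt => true)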
Execute the #after_crawl blocks
  # File lib/anemone/core.rb, line 220
  def do_after_crawl_blocks
    @after_crawl_blocks.each { |block| block.call(@pages) }
  end
Execute the #on_every_page blocks for page
  # File lib/anemone/core.rb, line 227
  def do_page_blocks(page)
    @on_every_page_blocks.each do |block|
      block.call(page)
    end

    @on_pages_like_blocks.each do |pattern, blocks|
      blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
    end
  end
Freeze the opts Hash so that no options can be modified once the crawl begins
  # File lib/anemone/core.rb, line 211
  def freeze_options
    @opts.freeze
    @opts.each_key { |key| @opts[key].freeze }
    @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
  end
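After run begins, attempts to mutate the options raise an error; a sketch (FrozenError on current Rubies, RuntimeError on older ones):

  core = Anemone::Core.new("http://example.com")
  core.run
  core.opts[:verbose] = true  # raises: can't modify frozen Hash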
Return an Array of links to follow from the given page, based on whether or not each link has already been crawled and on the block given to #focus_crawl
  # File lib/anemone/core.rb, line 242
  def links_to_follow(page)
    links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
    links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
  end
Merge DEFAULT_OPTS into the user-supplied options, set up the PageStore and the robots.txt checker, and freeze the options

  # File lib/anemone/core.rb, line 197
  def process_options
    @opts = DEFAULT_OPTS.merge @opts
    @opts[:threads] = 1 if @opts[:delay] > 0
    storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
    @pages = PageStore.new(storage)
    @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

    freeze_options
  end
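One consequence worth noting: any non-zero :delay forces a single thread, so :threads is effectively ignored in that case:

  # These two crawls behave identically; :threads is overridden when :delay > 0
  Anemone::Core.crawl("http://example.com", :delay => 1, :threads => 4)
  Anemone::Core.crawl("http://example.com", :delay => 1)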
Returns true if link should not be visited because its URL matches a skip_link pattern.
  # File lib/anemone/core.rb, line 297
  def skip_link?(link)
    @skip_link_patterns.any? { |pattern| link.path =~ pattern }
  end
Returns true if link should not be visited because it has a query string and the skip_query_strings option is true.
  # File lib/anemone/core.rb, line 289
  def skip_query_string?(link)
    @opts[:skip_query_strings] && link.query
  end
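A sketch of enabling this option:

  # With :skip_query_strings, a link such as /search?q=foo would not be visited
  Anemone::Core.crawl("http://example.com", :skip_query_strings => true)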
Returns true if we are over the page depth limit. This only works when coming from a page and with the depth_limit option set; when neither is the case, it always returns false.
  # File lib/anemone/core.rb, line 277
  def too_deep?(from_page)
    if from_page && @opts[:depth_limit]
      from_page.depth >= @opts[:depth_limit]
    else
      false
    end
  end
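A sketch of setting a depth limit; the starting URLs have no from_page, so they are always visited:

  # Links found on pages two hops from the start are not followed
  Anemone::Core.crawl("http://example.com", :depth_limit => 2)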
Returns true if link has not been visited already, is not excluded by a skip_link pattern, is not excluded by robots.txt, and is not deeper than the depth limit. Returns false otherwise.
  # File lib/anemone/core.rb, line 254
  def visit_link?(link, from_page = nil)
    !@pages.has_page?(link) &&
    !skip_link?(link) &&
    !skip_query_string?(link) &&
    allowed(link) &&
    !too_deep?(from_page)
  end