Parent

Files

Anemone::Core

Constants

DEFAULT_OPTS

Attributes

opts[R]

Hash of options for the crawl

pages[R]

PageStore storing all Page objects encountered during the crawl

Public Class Methods

crawl(urls, opts = {}) click to toggle source

Convenience method to start a new crawl

# File lib/anemone/core.rb, line 89
# Convenience method to start a new crawl.
#
# Builds a new instance from +urls+ and +opts+. Inside the construction
# block the instance is first yielded to the caller's block (if one was
# given) so it can be configured, and then #run is invoked on it.
#
# Returns the result of the +new+ call.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    yield(crawler) if block_given?
    crawler.run
  end
end
new(urls, opts = {}) click to toggle source

Initialize the crawl with starting urls (single URL or Array of URLs) and optional block

# File lib/anemone/core.rb, line 72
# Initialize the crawl with starting URLs and an optional configuration block.
#
# urls - a single URL or a (possibly nested) Array of URLs. Each entry may
#        be a URI object or any value accepted by Kernel#URI.
# opts - Hash of options for the crawl; stored as given in @opts.
#
# URI objects supplied by the caller are duplicated before the empty-path
# normalisation below, so the caller's objects are never modified and
# frozen URIs are accepted.
#
# Yields the new instance when a block is given.
def initialize(urls, opts = {})
  @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url.dup : URI(url) }
  # A URL with an empty path is normalised to the root path.
  @urls.each { |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  # Each pattern lazily gets its own Array of blocks on first access.
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts

  yield self if block_given?
end

Public Instance Methods

after_crawl(&block) click to toggle source

Add a block to be executed on the PageStore after the crawl is finished

# File lib/anemone/core.rb, line 100
# Register a block to be executed on the PageStore after the crawl is
# finished. The block is appended to @after_crawl_blocks.
#
# Returns self so that configuration calls can be chained.
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end
focus_crawl(&block) click to toggle source

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.

# File lib/anemone/core.rb, line 140
# Specify a block which will select which links to follow on each page.
# The block should return an Array of URI objects. Any previously stored
# block is replaced.
#
# Returns self so that configuration calls can be chained.
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end
on_every_page(&block) click to toggle source

Add a block to be executed on every Page as it is encountered during the crawl

# File lib/anemone/core.rb, line 118
# Register a block to be executed on every Page as it is encountered
# during the crawl. The block is appended to @on_every_page_blocks.
#
# Returns self so that configuration calls can be chained.
def on_every_page(&block)
  tap { @on_every_page_blocks.push(block) }
end
on_pages_like(*patterns, &block) click to toggle source

Add a block to be executed on Page objects with a URL matching one or more patterns

# File lib/anemone/core.rb, line 127
# Register a block to be executed on Page objects with a URL matching one
# or more patterns. The same block is appended to the list kept for each
# given pattern in @on_pages_like_blocks.
#
# patterns - zero or more patterns; with none given, nothing is registered.
#
# Returns self so that configuration calls can be chained.
def on_pages_like(*patterns, &block)
  # A splat parameter is always an Array, so no nil guard is needed.
  patterns.each { |pattern| @on_pages_like_blocks[pattern] << block }
  self
end
run() click to toggle source

Perform the crawl

# File lib/anemone/core.rb, line 148
# Perform the crawl.
#
# Starts @opts[:threads] worker threads (each running a Tentacle) that
# communicate with this method through two queues: link_queue (work handed
# to the workers) and page_queue (pages handed back). This method consumes
# page_queue, records each page in @pages, queues the links to follow, and
# shuts the workers down once no work is left.
#
# Returns nil without crawling when no starting URL passes visit_link?;
# otherwise returns self after the after-crawl step has run.
def run
  # Defined outside this view; presumably prepares @opts (e.g. defaults) --
  # TODO confirm.
  process_options

  # Drop starting URLs that visit_link? rejects; nothing to do if none remain.
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  # One worker thread per configured :threads, all sharing both queues.
  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

  # Seed the link queue. Starting URLs are enqueued bare, whereas links found
  # during the crawl (below) are enqueued as [link, source page URL, depth].
  @urls.each{ |url| link_queue.enq(url) }

  loop do
    # Blocks until a worker has produced a page.
    page = page_queue.deq
    # NOTE(review): touch_key presumably marks the URL as seen in the
    # PageStore -- confirm against PageStore.
    @pages.touch_key page.url
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    # Defined outside this view; presumably runs the registered page
    # callbacks -- TODO confirm.
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    # Queue every link selected for this page, one level deeper than the page.
    links = links_to_follow page
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # If we are done with the crawl, tell the threads to end.
    if link_queue.empty? and page_queue.empty?
      # Spin until every worker is blocked waiting on link_queue, i.e. none
      # is still busy with an item it already took off the queue.
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      # Re-check: a worker may have pushed a page while we were waiting above.
      if page_queue.empty?
        # One :END sentinel per worker; presumably each Tentacle stops when it
        # dequeues :END -- confirm against Tentacle#run.
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  # Wait for all workers to finish before running the after-crawl step.
  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.