Robotex

Attributes

user_agent [R]

Public Class Methods

get_robots_txt(uri, user_agent)
# File lib/robotex.rb, line 100
def self.get_robots_txt(uri, user_agent)
  begin
    Timeout::timeout(Robotex.timeout) do
      io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
    end 
  rescue Timeout::Error
    STDERR.puts "robots.txt request timed out"
  end
end
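
A minimal sketch of calling this class method directly; the host and user agent strings are only placeholders:

require 'robotex'
require 'uri'

# Returns the opened robots.txt response, or nil on error or timeout.
io = Robotex.get_robots_txt(URI('http://www.example.com/'), 'MyCrawler/1.0')
puts io.read if io
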
new(user_agent = nil)
# File lib/robotex.rb, line 118
def initialize(user_agent = nil)
  user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
  @user_agent = user_agent
  @last_accessed = Time.at(1)
  @parsed = {}
end
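
A minimal construction sketch; the custom user agent string is only an example:

require 'robotex'

robotex = Robotex.new                   # uses the default Robotex user agent
crawler = Robotex.new('MyCrawler/1.0')  # custom user agent (example value)
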
timeout()
# File lib/robotex.rb, line 114
def self.timeout
  @timeout || DEFAULT_TIMEOUT
end
timeout=(t)
# File lib/robotex.rb, line 110
def self.timeout=(t)
  @timeout = t
end
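
For example, to change the robots.txt request timeout before any requests are made (the value shown is arbitrary):

require 'robotex'

Robotex.timeout = 5   # seconds (arbitrary example value)
Robotex.timeout       # => 5; DEFAULT_TIMEOUT is used when no value has been set
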

Public Instance Methods

allowed?(uri)

Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise

# File lib/robotex.rb, line 133
def allowed?(uri)
  parse_host(uri).allowed?(uri, @user_agent)
end
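
A usage sketch, assuming a Robotex instance and a hypothetical URL:

require 'robotex'

robotex = Robotex.new('MyCrawler/1.0')  # example user agent
if robotex.allowed?('http://www.example.com/private/page.html')  # example URL
  # fetch the page
else
  # skip it: disallowed by robots.txt for this user agent
end
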
delay(uri)

Return the value of the Crawl-Delay directive for this user agent, or nil if none is specified

# File lib/robotex.rb, line 139
def delay(uri)
  parse_host(uri).delay(@user_agent)
end
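
For example, with a placeholder host:

require 'robotex'

robotex = Robotex.new('MyCrawler/1.0')            # example user agent
delay = robotex.delay('http://www.example.com/')  # example host
puts delay ? "Crawl-Delay: #{delay}s" : "no Crawl-Delay specified"
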
delay!(uri)

Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server

# File lib/robotex.rb, line 146
def delay!(uri)
  delay = delay(uri)
  if delay
    # Sleep only for the remaining portion of the Crawl-Delay, never a negative interval
    wait = delay - (Time.now - @last_accessed)
    sleep(wait) if wait > 0
  end
  @last_accessed = Time.now
end
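
A sketch of a polite fetch loop built on allowed? and delay!; the URLs are placeholders:

require 'robotex'

robotex = Robotex.new('MyCrawler/1.0')                           # example user agent
urls = ['http://www.example.com/a', 'http://www.example.com/b']  # placeholder URLs

urls.each do |url|
  next unless robotex.allowed?(url)
  robotex.delay!(url)   # sleeps only as long as the server's Crawl-Delay requires
  # ... fetch url here ...
end
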
parse_host(uri)
# File lib/robotex.rb, line 125
def parse_host(uri)
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
end
