Object
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
# File lib/anemone/page.rb, line 198
#
# Rebuild a page from a string-keyed hash of the shape produced by #to_hash.
#
# The page is constructed from hash['url'] and the remaining state is then
# written directly into its instance variables.
#
# NOTE(review): 'headers' and 'data' are passed to Marshal.load, which is
# only safe on trusted input -- confirm that callers never feed this
# method a hash from an untrusted source.
def self.from_hash(hash)
  page = new(URI(hash['url']))

  page.instance_variable_set('@headers', Marshal.load(hash['headers']))
  page.instance_variable_set('@data', Marshal.load(hash['data']))
  page.instance_variable_set('@body', hash['body'])
  page.instance_variable_set('@links', hash['links'].map { |link| URI(link) })
  page.instance_variable_set('@code', hash['code'].to_i)
  page.instance_variable_set('@visited', hash['visited'])
  page.instance_variable_set('@depth', hash['depth'].to_i)
  page.instance_variable_set('@referer', hash['referer'])

  # A missing or empty 'redirect_to' entry means "no redirect".
  redirect = hash['redirect_to']
  redirect_uri = (redirect && !redirect.empty?) ? URI(redirect) : nil
  page.instance_variable_set('@redirect_to', redirect_uri)

  page.instance_variable_set('@response_time', hash['response_time'].to_i)
  page.instance_variable_set('@fetched', hash['fetched'])

  page
end
Create a new page
# File lib/anemone/page.rb, line 36 def initialize(url, params = {}) @url = url @data = OpenStruct.new @code = params[:code] @headers = params[:headers] || {} @headers['content-type'] ||= [''] @aliases = Array(params[:aka]).compact @referer = params[:referer] @depth = params[:depth] || 0 @redirect_to = to_absolute(params[:redirect_to]) @response_time = params[:response_time] @body = params[:body] @error = params[:error] @fetched = !params[:code].nil? end
Base URI taken from the BASE element in the head of the HTML document (see www.w3.org/TR/html4/struct/links.html#edef-BASE).
# File lib/anemone/page.rb, line 138
#
# URI given by the href of the document's <head><base> element.
#
# Returns nil when there is no parsed document, when building the URI
# raises, or when the resulting URI is empty. A truthy result is cached
# in @base; a nil result is recomputed on the next call.
def base
  unless @base
    @base =
      if doc
        href = doc.search('//head/base/@href')
        begin
          href.nil? ? nil : URI(href.to_s)
        rescue StandardError
          # An href that cannot be turned into a URI is treated as absent.
          nil
        end
      end
  end

  # An empty URI stays cached in @base but is reported as "no base".
  return nil if @base && @base.to_s.empty?

  @base
end
The content-type returned by the HTTP request for this page
# File lib/anemone/page.rb, line 106
#
# The content type of this page: the first element of the
# 'content-type' entry in the headers.
def content_type
  type_values = headers['content-type']
  type_values.first
end
Delete the Nokogiri document and response body to conserve memory
# File lib/anemone/page.rb, line 83
#
# Drop the parsed document and the raw body to conserve memory.
#
# The links are extracted first because links builds its result from
# the document that is about to be thrown away. Returns nil.
def discard_doc!
  links
  @body = nil
  @doc = nil
end
Nokogiri document for the HTML body
# File lib/anemone/page.rb, line 75
#
# Nokogiri document for the HTML body.
#
# The document is parsed on first use and cached in @doc. Returns nil
# when there is no body, when the page is not HTML, or when the check or
# the parse raises.
def doc
  return @doc if @doc

  begin
    return nil unless @body && html?

    @doc = Nokogiri::HTML(@body)
  rescue StandardError
    # Best effort: a failed check or parse means "no document".
    nil
  end
end
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
# File lib/anemone/page.rb, line 92
#
# Whether this page was fetched. Returns the stored @fetched flag
# unchanged.
def fetched?
  @fetched
end
Returns true if the page is an HTML document, returns false otherwise.
# File lib/anemone/page.rb, line 114
#
# Returns true if the page is an HTML document, false otherwise.
#
# A page counts as HTML when its content type starts with "text/html" or
# "application/xhtml+xml"; trailing parameters such as "; charset=utf-8"
# are allowed. A nil content type yields false.
def html?
  # %r{} builds a Regexp (a bare %{} is a String, and String =~ String
  # raises TypeError). The "+" is escaped so that it matches the literal
  # plus sign in "xhtml+xml" instead of acting as a quantifier, and \A
  # anchors the match to the very start of the value.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end
Returns true if uri is in the same domain as the page, returns false otherwise
# File lib/anemone/page.rb, line 171
#
# Returns true when +uri+ has the same host as this page's URL, false
# otherwise.
def in_domain?(uri)
  candidate_host = uri.host
  candidate_host == @url.host
end
Array of distinct A tag HREFs from the page
# File lib/anemone/page.rb, line 57
#
# Array of distinct, absolute, same-host URIs taken from the href
# attributes of the page's A elements.
#
# The result is computed once and cached in @links. It is empty when no
# document is available.
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  doc.search("//a[@href]").each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    begin
      target = to_absolute(href)
    rescue StandardError
      # An href that cannot be resolved is skipped.
      next
    end

    @links << target if in_domain?(target)
  end

  @links.uniq!
  @links
end
# File lib/anemone/page.rb, line 175
#
# State handed to Marshal when a page is dumped: a twelve-element array
# in the order url, headers, data, body, links, code, visited, depth,
# referer, redirect_to, response_time, fetched. marshal_load reads the
# array back in the same order.
def marshal_dump
  state = [@url, @headers, @data, @body, @links, @code]
  state.push(@visited, @depth, @referer, @redirect_to, @response_time, @fetched)
  state
end
# File lib/anemone/page.rb, line 179
#
# Restore the page state from +ary+, an array laid out in the order that
# marshal_dump produces: url, headers, data, body, links, code, visited,
# depth, referer, redirect_to, response_time, fetched.
def marshal_load(ary)
  @url, @headers, @data, @body,
    @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched = ary
end
Returns true if the page was not found (returned 404 code), returns false otherwise.
# File lib/anemone/page.rb, line 130
#
# Returns true when the stored response code is 404, false otherwise.
def not_found?
  @code == 404
end
Returns true if the page is an HTTP redirect, returns false otherwise.
# File lib/anemone/page.rb, line 122
#
# Returns true when the stored response code lies in 300..307, false
# otherwise (including when no code is set).
def redirect?
  redirect_codes = 300..307
  redirect_codes.member?(@code)
end
Converts relative URL link into an absolute URL based on the location of the page
# File lib/anemone/page.rb, line 153
#
# Converts a relative URL +link+ into an absolute URI based on the
# location of the page.
#
# The link is resolved against base when that returns a URI and against
# @url otherwise. An empty path in the result is replaced by '/'.
#
# Returns nil when +link+ is nil. Errors raised while parsing or merging
# the link are not rescued here.
def to_absolute(link)
  return nil if link.nil?

  # remove anchor (only fragments made of letters, digits, "_" and "-")
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')

  # URI.encode / URI.decode were removed in Ruby 3.0; the default parser's
  # escape / unescape are the methods those helpers delegated to.
  # Unescaping first keeps an already-escaped link from being escaped twice.
  parser = URI::DEFAULT_PARSER
  escaped = parser.escape(parser.unescape(without_anchor))

  relative = URI(escaped)
  absolute = (base || @url).merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end
# File lib/anemone/page.rb, line 183
#
# Flatten the page into a string-keyed hash.
#
# The headers and data are serialized with Marshal.dump; the url, links,
# referer and redirect target are stored as strings (a nil referer or
# redirect target becomes ''); the remaining values are stored as they
# are.
def to_hash
  serialized = {}
  serialized['url'] = @url.to_s
  serialized['headers'] = Marshal.dump(@headers)
  serialized['data'] = Marshal.dump(@data)
  serialized['body'] = @body
  serialized['links'] = links.map { |link| link.to_s }
  serialized['code'] = @code
  serialized['visited'] = @visited
  serialized['depth'] = @depth
  serialized['referer'] = @referer.to_s
  serialized['redirect_to'] = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched'] = @fetched
  serialized
end
Generated with the Darkfish Rdoc Generator 2.