module Scraper::Reader
Constants
- DEFAULT_TIMEOUT
- PARSERS
- Page
- Parsed
- REDIRECT_LIMIT
- TIDY_OPTIONS
Public Instance Methods
find_tidy()

Sets Tidy.path to the bundled libtidy binary for the current platform, trying the Linux (.so), Windows (.dll) and Mac OS X (.dylib) builds in turn. Does nothing if the path is already set.

  # File lib/scraper/reader.rb, line 224
  def find_tidy()
    return if Tidy.path
    begin
      Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
    rescue LoadError
      begin
        Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
      rescue LoadError
        Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
      end
    end
  end
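If none of the bundled binaries match your platform, you can point Tidy at a system-installed library yourself before parsing (a sketch; the path is hypothetical and varies by system):

  require "tidy"

  # Hypothetical path: wherever HTML Tidy is installed on your system.
  Tidy.path = "/usr/lib/libtidy.so" unless Tidy.path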
parse_page(content, encoding = nil, options = nil, parser = :tidy)

Parses an HTML page and returns a Parsed struct holding the HTML element and the document encoding. Raises HTMLParseError if it cannot parse the HTML.

The options hash is passed to the parser; for example, when using Tidy you can include Tidy cleanup options in it.

The last argument selects which parser to use (see PARSERS). By default Tidy is used.
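A minimal usage sketch (it assumes the gem is loaded with require "scraper" and that you hold the page content as a string; since these are documented as instance methods, the sketch mixes the module in):

  require "scraper"

  class MyReader
    include Scraper::Reader
  end

  reader = MyReader.new
  html = File.read("page.html")       # raw HTML obtained earlier
  parsed = reader.parse_page(html)    # Tidy is the default parser
  parsed.document                     # the <html> element
  parsed.encoding                     # e.g. "utf8", from the meta header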
  # File lib/scraper/reader.rb, line 189
  def parse_page(content, encoding = nil, options = nil, parser = :tidy)
    begin
      # Get the document encoding from the meta header.
      if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
        if meta = meta[0].match(/charset=([\w-]*)/i)
          encoding = meta[1]
        end
      end
      encoding ||= "utf8"
      case (parser || :tidy)
      when :tidy
        # Make sure the Tidy path is set and always apply the default
        # options (these only control things like errors, output type).
        find_tidy
        options = (options || {}).update(TIDY_OPTIONS)
        options[:input_encoding] = encoding.gsub("-", "").downcase
        document = Tidy.open(options) do |tidy|
          html = tidy.clean(content)
          HTML::Document.new(html).find(:tag=>"html")
        end
      when :html_parser
        document = HTML::HTMLParser.parse(content).root
      else
        raise HTMLParseError, "No parser #{parser || "unspecified"}"
      end
      return Parsed[document, encoding]
    rescue Exception=>error
      raise HTMLParseError.new(error)
    end
  end
read_page(url, options = nil)

Reads a Web page and returns its URL, content and cache control headers.

The request reads the Web page at the specified URL, given either as a URI object or as a string that parses into one. It accepts the following options:
- :last_modified – Last modified header (from a previous request).
- :etag – ETag header (from a previous request).
- :redirect_limit – Number of redirects allowed (default is 3).
- :user_agent – The User-Agent header to send.
- :http_timeout – HTTP open connection and read timeout (in seconds).
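For example, a first fetch with a custom agent and timeout might look like this (a sketch reusing the reader from the parse_page example; the URL and header value are illustrative):

  page = reader.read_page(URI.parse("http://example.com/"),
    :user_agent=>"MyScraper/1.0", :http_timeout=>30)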
It returns a Page struct with the following fields:

- :url – The URL of the requested page (may change by permanent redirect).
- :content – The content of the response (may be nil if cached).
- :encoding – Document encoding for the page, taken from the charset of the Content-Type response header (may be nil).
- :last_modified – Last modified cache control header (may be nil).
- :etag – ETag cache control header (may be nil).
If the page has not been modified since the last request, the content is nil.
Raises HTTPError if an error prevents it from reading the page.
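Putting the cache control fields to work, a revisit of a previously fetched page might look like this (a sketch; cached_html stands in for whatever cache the caller keeps):

  revisit = reader.read_page(page.url,
    :last_modified=>page.last_modified, :etag=>page.etag)
  if revisit.content.nil?
    html = cached_html       # hypothetical cache; the page was not modified
  else
    html = revisit.content   # fresh content, update the cache
  end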
  # File lib/scraper/reader.rb, line 109
  def read_page(url, options = nil)
    options ||= {}
    redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
    raise HTTPRedirectLimitError if redirect_limit == 0
    if url.is_a?(URI)
      uri = url
    else
      begin
        uri = URI.parse(url)
      rescue Exception=>error
        raise HTTPInvalidURLError.new(error)
      end
    end
    raise HTTPInvalidURLError unless uri.scheme =~ /^http(s?)$/
    begin
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.close_on_empty_response = true
      http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
      path = uri.path.dup # required so we don't modify path
      path << "?#{uri.query}" if uri.query
      # TODO: Specify which content types are accepted.
      # TODO: GZip support.
      headers = {}
      headers["User-Agent"] = options[:user_agent] if options[:user_agent]
      headers["Last-Modified"] = options[:last_modified] if options[:last_modified]
      headers["ETag"] = options[:etag] if options[:etag]
      response = http.request_get(path, headers)
      # TODO: Ignore content types that do not map to HTML.
    rescue TimeoutError=>error
      raise HTTPTimeoutError.new(error)
    rescue Exception=>error
      raise HTTPUnspecifiedError.new(error)
    end
    case response
    when Net::HTTPSuccess
      encoding = if content_type = response["Content-Type"]
        if match = content_type.match(/charset=([^\s]+)/i)
          match[1]
        end
      end
      return Page[(options[:source_url] || uri), response.body, encoding,
                  response["Last-Modified"], response["ETag"]]
    when Net::HTTPNotModified
      return Page[(options[:source_url] || uri), nil, nil,
                  options[:last_modified], options[:etag]]
    when Net::HTTPMovedPermanently
      return read_page(response["location"], # New URL takes effect
                       :last_modified=>options[:last_modified],
                       :etag=>options[:etag],
                       :redirect_limit=>redirect_limit-1)
    when Net::HTTPRedirection
      return read_page(response["location"],
                       :last_modified=>options[:last_modified],
                       :etag=>options[:etag],
                       :redirect_limit=>redirect_limit-1,
                       :source_url=>(options[:source_url] || uri)) # Old URL still in effect
    when Net::HTTPNotFound
      raise HTTPNotFoundError
    when Net::HTTPUnauthorized, Net::HTTPForbidden
      raise HTTPNoAccessError
    when Net::HTTPRequestTimeOut
      raise HTTPTimeoutError
    else
      raise HTTPUnspecifiedError
    end
  end
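Callers will typically rescue these errors around the request; a sketch, assuming the HTTP*Error classes share HTTPError as a base (as the description above implies) and using hypothetical retry_request and log helpers:

  begin
    page = reader.read_page("http://example.com/")
  rescue Scraper::Reader::HTTPTimeoutError=>error
    retry_request       # hypothetical helper: try again later
  rescue Scraper::Reader::HTTPError=>error
    log(error)          # hypothetical helper: record and skip this page
  end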