Start spidering from a URL:
Spidr.start_at('http://tenderlovemaking.com/')
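All of the snippets below assume the spidr gem has already been installed and loaded; a minimal setup for a standalone script would look something like this:
# install once with: gem install spidr
require 'spidr'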
Spider a host:
Spidr.host('coderrr.wordpress.com')
Spider a site:
Spidr.site('http://rubyflow.com/')
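Roughly speaking, the three entry points differ only in scope: Spidr.start_at follows links wherever they lead, Spidr.host keeps the spider on the named host, and Spidr.site keeps it within the host of the given URL.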
Spider multiple hosts:
Spidr.start_at(
  'http://company.com/',
  :hosts => [
    'company.com',
    /host\d\.company\.com/
  ]
)
Ignore certain links:
Spidr.site('http://matasano.com/', :ignore_links => [/log/])
Ignore certain ports:
Spidr.site('http://sketchy.content.com/', :ignore_ports => [8000, 8010, 8080])
Print every URL the spider visits:
Spidr.site('http://rubyinside.org/') do |spider|
  spider.every_url { |url| puts url }
end
Print every URL that failed:
Spidr.site('http://sketchy.content.com/') do |spider|
  spider.every_failed_url { |url| puts url }
end
Find all pages that contain broken links:
# Map each destination URL back to the pages that link to it.
url_map = Hash.new { |hash, key| hash[key] = [] }

spider = Spidr.site('http://intranet.com/') do |spider|
  spider.every_link do |origin, dest|
    url_map[dest] << origin
  end
end

# spider.failures holds every URL that could not be requested.
spider.failures.each do |url|
  puts "Broken link #{url} found in:"

  url_map[url].each { |page| puts "  #{page}" }
end
Search HTML or XML pages:
Spidr.site('http://company.withablog.com/') do |spider|
  spider.every_page do |page|
    puts "[-] #{page.url}"

    # Collect every <meta> tag and print its name/value pair.
    page.search('//meta').each do |meta|
      name  = (meta.attributes['name'] || meta.attributes['http-equiv'])
      value = meta.attributes['content']

      puts "    #{name} = #{value}"
    end
  end
end
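page.search accepts XPath (and CSS) expressions because the parsed page is backed by Nokogiri, so anything Nokogiri can search for should work here as well.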
Print the title of every page:
Spidr.site('http://www.rubypulse.com/') do |spider|
  spider.every_html_page do |page|
    puts page.title
  end
end
Check the response headers to find out which web servers a site uses:
require 'set'

servers = Set[]

Spidr.host('generic.company.com') do |spider|
  spider.all_headers do |headers|
    servers << headers['server']
  end
end
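Once the spider has finished, the collected values can simply be printed; a minimal sketch (assuming the header values come back as strings or arrays of strings) would be:
servers.each { |server| puts server }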
Pause the spider on a forbidden page:
spider = Spidr.host('overnight.startup.com') do |spider|
  spider.every_forbidden_page do |page|
    spider.pause!
  end
end
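A paused agent keeps its queue and history, so it can be resumed later from where it stopped; assuming the agent's paused? and continue! actions, something like:
spider.continue! if spider.paused?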
Skip the processing of certain pages:
Spidr.host('sketchy.content.com') do |spider|
  spider.every_missing_page do |page|
    spider.skip_page!
  end
end
Skip the processing of certain links:
Spidr.host('sketchy.content.com') do |spider|
  spider.every_url do |url|
    if url.path.split('/').find { |dir| dir.to_i > 1000 }
      spider.skip_link!
    end
  end
end
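Roughly, the difference between the two skips is that skip_page! abandons a page that has already been fetched, while skip_link! stops the offending URL from being enqueued and visited in the first place.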