https://github.com/hellysmile/fake-useragent
scrapy使用fake-useragent
在全局配置文件中禁用掉默認的UA,將其設置爲None便可
settings.py
# settings.py — disable Scrapy's built-in User-Agent middleware by mapping it
# to None, so only the custom random-UA middleware (below) sets the header.
# NOTE(review): 'scrapy.contrib.*' is the legacy module layout; on modern
# Scrapy the path is 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware'
# — confirm against the installed Scrapy version.
DOWNLOADER_MIDDLEWARES = { ... 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None, }
在中間件中編寫本身的middleware
middlewares.py
class RandomUserAgentMiddleware(object):
    """Downloader middleware that stamps each outgoing request with a
    User-Agent produced by fake-useragent's ``UserAgent`` provider.

    The UA flavour ('random', 'chrome', 'firefox', ...) comes from the
    ``RANDOM_UA_TYPE`` setting and defaults to 'random'.
    """

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy extension hook: gives us the crawler so settings are reachable.
        return cls(crawler)

    def process_request(self, request, spider):
        # setdefault: only stamp the header when the request lacks one already.
        request.headers.setdefault('User-Agent', getattr(self.ua, self.ua_type))
將本身寫的middleware配置進settings中
# settings.py — register the custom middleware and drop the built-in one.
DOWNLOADER_MIDDLEWARES = {
    # Must point at the class actually defined in middlewares.py
    # (the original snippet registered a non-existent 'CustomDownloaderMiddleware',
    # so the random-UA middleware was never enabled). 'myproject' is a placeholder
    # for the real project package name.
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    # Mapping to None removes Scrapy's default User-Agent middleware.
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
}
https://github.com/scrapy-plugins/scrapy-crawlera
爬取西刺IP代理網站獲取IP
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.html import remove_tags

from ArticleSpider.items import ProxyIpItemLoader, ProxyIpItem


class ProxyIpSpider(CrawlSpider):
    """Crawl www.xicidaili.com and yield one ProxyIpItem per proxy table row."""

    name = 'proxy'
    allowed_domains = ['www.xicidaili.com']
    start_urls = ['http://www.xicidaili.com']

    # Follow the paginated high-anonymity listing pages ("nn/1", "nn/2", ...).
    # Raw string fixes the invalid '\d' escape sequence of the original, and
    # the explicit 1-tuple avoids the misleading ('...') non-tuple parens.
    rules = (
        Rule(LinkExtractor(allow=(r'nn/\d*',)), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Parse the #ip_list table; the first <tr> is the header row."""
        ip_list = response.css('#ip_list tr')
        for ipdata in ip_list[1:]:
            item_loader = ProxyIpItemLoader(item=ProxyIpItem(), response=response)
            data = ipdata.css('td')
            # Column layout: [country flag, ip, port, addr, anonymity, type,
            #                 speed, connect time, live time, check time]
            item_loader.add_value('ip', data[1].css('td::text').extract_first())
            item_loader.add_value('port', data[2].css('td::text').extract_first())
            item_loader.add_value('addr', self.get_addr(data[3]))
            item_loader.add_value('ishidden', data[4].css('td::text').extract_first())
            item_loader.add_value('type', data[5].css('td::text').extract_first())
            # Speed and connect time are stored in the title attribute of a
            # gauge <div>, not in the cell text.
            item_loader.add_value('speed', data[6].css('div::attr(title)').extract_first())
            item_loader.add_value('conn_time', data[7].css('div::attr(title)').extract_first())
            item_loader.add_value('live_time', data[8].css('td::text').extract_first())
            item_loader.add_value('check_time', data[9].css('td::text').extract_first())
            proxy_ip_item = item_loader.load_item()
            yield proxy_ip_item

    def get_addr(self, value):
        """Return the region text of the address cell, or '未知' (unknown)."""
        # The cell carries an <a> with the region only for some rows.
        if value.css('a'):
            return remove_tags(value.extract()).strip()
        else:
            return "未知"
對數據進行簡單處理
def live_time(value):
    """Normalise a live-time string ('N分鐘' / 'N小時' / 'N天') to minutes.

    Unrecognised input yields None, which MapCompose silently drops —
    NOTE(review): confirm that dropping unparsed values is intended.
    """
    if '分鐘' in value:
        return int(value.split('分鐘')[0])
    elif '小時' in value:
        return int(value.split('小時')[0]) * 60
    elif '天' in value:
        return int(value.split('天')[0]) * 60 * 24


def ishidden_to_int(value):
    """Map the anonymity column to an int flag: '高匿' (high anonymity) -> 1."""
    return 1 if '高匿' in value else 0


def check_time(value):
    """Parse the site's two-digit-year timestamp, e.g. '12-12-31 18:52'."""
    return datetime.datetime.strptime(value, "%y-%m-%d %H:%M")


# The processor helpers above MUST live at module level: ProxyIpItem's class
# body references them by bare name, so nesting them inside the loader class
# (as the original did) raises NameError when ProxyIpItem is defined.

class ProxyIpItemLoader(ItemLoader):
    """ItemLoader whose fields are scalars collected via add_value."""
    default_output_processor = TakeFirst()


class ProxyIpItem(scrapy.Item):
    """One proxy server row, e.g.:

    {'addr': '陝西西安', 'check_time': '12-12-31 18:52',
     'conn_time': '0.82秒', 'ip': '113.133.160.203', 'ishidden': '高匿',
     'live_time': '1分鐘', 'port': '6675', 'speed': '3.595秒',
     'type': 'socks4/5'}
    """
    ip = scrapy.Field()
    port = scrapy.Field()
    addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, lambda x: x.strip())
    )
    ishidden = scrapy.Field(
        input_processor=MapCompose(ishidden_to_int)
    )
    type = scrapy.Field()
    speed = scrapy.Field()
    conn_time = scrapy.Field()
    live_time = scrapy.Field(
        input_processor=MapCompose(live_time)
    )
    # NOTE(review): the check_time() parser above is never wired in as an
    # input_processor, so the raw string is stored — confirm if intentional.
    check_time = scrapy.Field()

    def get_insert_sql(self):
        """Return (sql, params) for inserting this item into the proxy_ip table."""
        insert_sql = """
            insert into proxy_ip(ip, port, addr, ishidden, type, speed, conn_time, live_time, check_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        params = (self["ip"], self["port"], self["addr"], self["ishidden"],
                  self["type"], self["speed"], self["conn_time"],
                  self["live_time"], self["check_time"])
        return insert_sql, params
在pipeline中進行數據的再次清洗,拋棄全部的特殊端口的item,並數據進行保存
在中間件中建立切換IP的中間件,在主配置文件中啓用這個中間件
IP是否可用,只須要請求百度便可
不必本身寫一個驗證碼識別代碼
能夠使用雲打碼平臺進行驗證碼識別
http://www.yundama.com/
須要分別註冊一個普通用戶和一個開發者帳號
下載pythonhttp版本
http://www.yundama.com/apidoc/YDM_SDK.html#DLL
解壓后里面有一個3.x的文件,打開後進行配置
# 用戶名(普通用戶) username = 'username' # 密碼(普通用戶) password = 'password' # 軟件ID,開發者分紅必要參數。登陸開發者後臺【個人軟件】得到! appid = 1 # 軟件密鑰,開發者分紅必要參數。登陸開發者後臺【個人軟件】得到! appkey = '22cc5376925e9387a23cf797cb9ba745' # 圖片文件 filename = 'getimage.jpg' # 驗證碼類型,# 例:1004表示4位字母數字,不一樣類型收費不一樣。請準確填寫,不然影響識別率。在此查詢全部類型 http://www.yundama.com/price.html codetype = 1004 # 超時時間,秒 timeout = 60