Mobile App Packet-Capture Crawler
import scrapy

class DouyuspiderItem(scrapy.Item):
    # name under which the image is stored
    name = scrapy.Field()
    # URL of the image
    imagesUrls = scrapy.Field()
    # local path where the image is saved
    imagesPath = scrapy.Field()
import scrapy
import json

from douyuSpider.items import DouyuspiderItem

class DouyuSpider(scrapy.Spider):
    name = "douyu"
    allowed_domains = ["capi.douyucdn.cn"]

    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        # parse the JSON response and take the "data" list
        data = json.loads(response.text)["data"]

        for each in data:
            item = DouyuspiderItem()
            item["name"] = each["nickname"]
            item["imagesUrls"] = each["vertical_src"]
            yield item

        # request the next page
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
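For reference, a minimal sketch of the response shape that parse() assumes; the field names come from the code above, while the values are placeholders rather than real API output:

# Placeholder illustration of the JSON the spider expects, not real Douyu data.
response_json = {
    "data": [
        {
            "nickname": "some_streamer",                   # becomes item["name"]
            "vertical_src": "http://example.com/room.jpg"  # becomes item["imagesUrls"]
        }
    ]
}
# parse() then does: data = json.loads(response.text)["data"]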
ITEM_PIPELINES = {'douyuSpider.pipelines.ImagesPipeline': 1}

# where the images are stored; referenced later in pipelines.py
IMAGES_STORE = "/Users/Power/lesson_python/douyuSpider/Images"

# user-agent
USER_AGENT = 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)'
import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

class ImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["imagesUrls"]
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # standard idiom: collect the paths of the successfully downloaded images
        # (see the ImagesPipeline source for details)
        image_path = [x["path"] for ok, x in results if ok]

        # rename the downloaded file to the streamer's nickname
        os.rename(self.IMAGES_STORE + "/" + image_path[0],
                  self.IMAGES_STORE + "/" + item["name"] + ".jpg")
        item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"]

        return item

get_media_requests generates a Request object for every image URL; the output of this method becomes the results argument of item_completed. results is a list of tuples, each of the form (success, image_info_or_failure). If success is True, image_info_or_failure is a dict with three keys: url, path, and checksum.
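A minimal sketch of what results might look like for a single successfully downloaded image (the values are illustrative, the structure follows the description above):

# Illustrative example of the results value passed to item_completed:
results = [
    (True, {
        "url": "http://example.com/photo.jpg",              # original image URL (illustrative)
        "path": "full/0a1b2c3d4e5f.jpg",                      # path relative to IMAGES_STORE
        "checksum": "2611adcacbeb700af6a945c1ba5ea366",       # checksum of the file (illustrative)
    }),
]
# item_completed then extracts "full/0a1b2c3d4e5f.jpg" into image_path[0].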
from scrapy import cmdline
cmdline.execute('scrapy crawl douyu'.split())
http://wz.sun0769.com/index.php/question/questionType?type=4
Crawl the number, URL, title, and body content of each complaint post.
import scrapy

class DongguanItem(scrapy.Item):
    # title of each post
    title = scrapy.Field()
    # number of each post
    number = scrapy.Field()
    # text content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem
import time

class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # rule matching each listing page
    pagelink = LinkExtractor(allow=('type=4'))
    # rule matching each post
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+.shtml')

    rules = [
        # special case for this site: the links on each listing page must be
        # post-processed by deal_links before they can be followed
        Rule(pagelink, process_links="deal_links", follow=True),
        Rule(contentlink, callback='parse_item')
    ]

    # Rewrite every link on the listing pages: the site emits URLs such as
    # 'Type&type=4?page=xxx', which must be turned into 'Type?type=4&page=xxx'
    # (or 'Type&page=xxx?type=4' into 'Type?page=xxx&type=4'); otherwise the
    # request cannot be sent.
    def deal_links(self, links):
        for link in links:
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
            print link.url
        return links

    def parse_item(self, response):
        print response.url
        item = DongguanItem()

        # title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # number, taken from the end of the title
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # text content: first try the layout used when the post contains images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # if that is empty, fall back to the layout used for text-only posts
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
            # content is a list; join it into a string and strip leading/trailing whitespace
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()

        # URL
        item['url'] = response.url

        yield item
# -*- coding: utf-8 -*-
# codecs lets us open the file with an explicit encoding
import codecs
import json

class JsonWriterPipeline(object):

    def __init__(self):
        # open a write-only file with utf-8 encoding
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider on pipelines when the spider finishes
        self.filename.close()
ITEM_PIPELINES = {
    # register the JsonWriterPipeline defined in pipelines.py
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
from scrapy import cmdline
cmdline.execute('scrapy crawl sun'.split())
python2 main.py
Crawl all top-level categories, subcategories, and the sub-links within each subcategory on the Sina news navigation page, as well as the news content of each sub-link page.
Demo screenshot: (image not included here)
import scrapy
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaItem(scrapy.Item):
    # title and URL of the top-level category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # title and URL of the subcategory
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # local directory where the subcategory is stored
    subFilename = scrapy.Field()

    # sub-links under the subcategory
    sonUrls = scrapy.Field()

    # article headline and body
    head = scrapy.Field()
    content = scrapy.Field()
# -*- coding: utf-8 -*-
from Sina.items import SinaItem
import scrapy
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaSpider(scrapy.Spider):
    name = "sina"
    allowed_domains = ["sina.com.cn"]
    start_urls = ["http://news.sina.com.cn/guide/"]

    def parse(self, response):
        items = []

        # URLs and titles of all top-level categories
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all subcategories
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # iterate over all top-level categories
        for i in range(0, len(parentTitle)):
            # directory path and name for this top-level category
            parentFilename = "./Data/" + parentTitle[i]
            # create the directory if it does not exist
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            # iterate over all subcategories
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # store the title and URL of the top-level category
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # check whether the subcategory URL starts with the URL of this top-level
                # category (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # if it belongs to this top-level category, store it under that directory
                if if_belong:
                    subFilename = parentFilename + '/' + subTitle[j]
                    # create the directory if it does not exist
                    if not os.path.exists(subFilename):
                        os.makedirs(subFilename)

                    # store the subcategory url, title and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        # send a Request for each subcategory URL; the Response, together with the
        # attached meta data, is handled by the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # recursively request the sub-links found on each subcategory page
    def second_parse(self, response):
        # retrieve the meta data attached to this Response
        meta_1 = response.meta['meta_1']

        # take all sub-links on the subcategory page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # check whether the link starts with the top-level category URL and ends with .shtml
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # if it belongs to this top-level category, collect all fields into one item for transfer
            if if_belong:
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # send a Request for each sub-link; the Response, together with the attached
        # meta data, is handled by the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # parse the article page: extract the headline and body text
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        # concatenate the text of all <p> tags
        for content_one in content_list:
            content += content_one

        item['head'] = head
        item['content'] = content

        yield item
from scrapy import signals
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']

        # build the file name from the middle part of the sub-link URL,
        # replacing / with _, and save it as a .txt file
        filename = sonUrls[7:-6].replace('/', '_')
        filename += ".txt"

        fp = open(item['subFilename'] + '/' + filename, 'w')
        fp.write(item['content'])
        fp.close()

        return item

settings.py

BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

ITEM_PIPELINES = {
    'Sina.pipelines.SinaPipeline': 300,
}

LOG_LEVEL = 'DEBUG'
from scrapy import cmdline
cmdline.execute('scrapy crawl sina'.split())
python2 main.py
import scrapy

class CoserItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader
from Cosplay.items import CoserItem

class CoserSpider(scrapy.Spider):
    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173',
    )

    def parse(self, response):
        sel = Selector(response)

        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
            link = 'http://bcy.net%s' % link
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)

        return l.load_item()
import requests
from Cosplay import settings
import os

class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []

            # one subdirectory per spider under IMAGES_STORE
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

            for image_url in item['image_urls']:
                # build a file name from the URL path segments
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)

                # skip images that have already been downloaded
                if os.path.exists(file_path):
                    continue

                # stream the image to disk in 1 KB blocks
                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)

            item['images'] = images
        return item
ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25    # 250 ms of delay
from scrapy import cmdline
cmdline.execute('scrapy crawl coser'.split())
python2 main.py
Crawl the movie data from the Douban Top 250 list (movie.douban.com/top250) and store it in MongoDB.
import scrapy

class DoubanspiderItem(scrapy.Item):
    # movie title
    title = scrapy.Field()
    # movie rating
    score = scrapy.Field()
    # movie details
    content = scrapy.Field()
    # short description
    info = scrapy.Field()
import scrapy
from doubanSpider.items import DoubanspiderItem

class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]

    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        item = DoubanspiderItem()

        movies = response.xpath('//div[@class="info"]')

        for each in movies:
            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            # join all elements of the content list into one string, separated by ';'
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            item['info'] = info[0]

            # hand the item over to the pipelines
            yield item

        # request the next page until start exceeds 225 (the last page of the Top 250)
        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)
from scrapy.conf import settings
import pymongo

class DoubanspiderPipeline(object):
    def __init__(self):
        # read the host, port and database name from settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # pymongo.MongoClient(host, port) creates the MongoDB connection
        client = pymongo.MongoClient(host=host, port=port)

        # select the database
        mdb = client[dbname]

        # select the collection that stores the data
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        # insert the data into the collection
        self.post.insert(data)
        return item

settings.py

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host (loopback address 127.0.0.1)
MONGODB_HOST = '127.0.0.1'
# port, 27017 by default
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = 'DouBan'
# collection that stores this crawl's data
MONGODB_DOCNAME = 'DouBanMovies'
Starting MongoDB involves two commands:

mongod: the MongoDB database server process itself
mongo:  the command-line shell client

sudo mongod    # start the database service first, then run Scrapy
sudo mongo     # start the database shell

Inside the mongo shell:

# show the current database
> db

# list all databases
> show dbs

# switch to the DouBan database
> use DouBan

# list all collections
> show collections

# show the documents in the collection
> db.DouBanMovies.find()
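Alternatively, the stored data can be checked from Python with pymongo; a minimal sketch, assuming the MONGODB_* values configured in settings.py above:

import pymongo

# assumes the MONGODB_* values from settings.py above
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = client['DouBan']
collection = db['DouBanMovies']

# count the stored movies and print the first few titles
print collection.count()
for movie in collection.find().limit(5):
    print movie['title'], movie['score']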
COOKIES_ENABLED (the cookies middleware) must be enabled: set COOKIES_ENABLED = True, or simply leave # COOKIES_ENABLED = False commented out in settings.py.
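To confirm that cookies are actually being sent and received during a simulated login, Scrapy's COOKIES_DEBUG setting can be turned on; a small settings.py sketch:

# settings.py
COOKIES_ENABLED = True   # default: the cookies middleware is on

# optional: log every Cookie / Set-Cookie header, useful when debugging simulated logins
COOKIES_DEBUG = True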
This approach works for anything that only needs POST data to be submitted. In the example below, the POSTed data is an account name and password:
# -*- coding: utf-8 -*-
import scrapy

class Renren1Spider(scrapy.Spider):
    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is Scrapy's way of sending a POST request
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page)

    def parse_page(self, response):
        with open("mao2.html", "w") as filename:
            filename.write(response.body)
The orthodox way to simulate a login:
First send a GET request for the login page and extract the parameters the login requires (for example, the _xsrf field on Zhihu's login page), then POST them to the server together with the account and password to log in.
# -*- coding: utf-8 -*-
import scrapy

class Renren2Spider(scrapy.Spider):
    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    # handle the response of the login URL in start_urls and extract the
    # parameters needed for login (if any)
    def parse(self, response):
        # extract the parameters needed for login
        # _xsrf = response.xpath("//_xsrf").extract()[0]

        # submit the form data, letting from_response fill in the rest of the form,
        # and handle the result in the given callback
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},  # , "_xsrf": _xsrf
            callback=self.parse_page
        )

    # once logged in, visit a page that requires authentication
    def parse_page(self, response):
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    # handle the response content
    def parse_newpage(self, response):
        with open("xiao.html", "w") as filename:
            filename.write(response.body)
If nothing else works, you can simulate the login this way. It is a bit more tedious, but the success rate is essentially 100%:
# -*- coding: utf-8 -*-
import scrapy

class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    )

    # cookie values taken from an already logged-in session
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome"
    }

    # override the Spider class's start_requests method so every request
    # carries the cookie values
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)

    # handle the response content
    def parse_page(self, response):
        print "===========" + response.url
        with open("deng.html", "w") as filename:
            filename.write(response.body)
With the Fiddler packet-capture tool you can capture a phone's network traffic, provided the phone and the computer are on the same LAN (Wi-Fi or hotspot). Then configure the following:
1. Open the Fiddler settings.
2. Under Connections, allow remote computers to connect, confirm, and restart Fiddler.

Then set up the Android device:

1. At the command prompt, run ipconfig to find the computer's IP address.
2. On the Android device, open "Settings" -> "WLAN", find the network you want to connect to, long-press it, and choose "Modify network"; in the network settings dialog that appears, check "Show advanced options".
3. Under "Proxy", select "Manual"; enter the computer's IP address in the "Proxy hostname" field and 8888 in the "Proxy port" field, then tap "Save".
4. Open the browser on the Android device and visit any web page; the complete requests and responses will appear in Fiddler.
For an iPhone the overall process is much the same; only the phone-side settings differ:
iPhone: go to Settings > Wi-Fi > the connected network > HTTP Proxy > Manual:
Proxy server (the computer's IP): 192.168.xx.xxx
Port: 8888