Requirement: capture the phone's traffic and download the images (renaming each image)
1.3 Set up a proxy server on the phone
Run the ipconfig command on Windows to look up the IP address of the hotspot created by 獵豹免費WiFi (Cheetah Free WiFi).
Point the phone's proxy server at that IP; a scripted way to find the address is sketched below.
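If you would rather script this than read it off the ipconfig output, here is a minimal sketch in Python. Note the 8.8.8.8 address is only a routable placeholder: connect() on a UDP socket sends no packets, it just makes the OS pick the outgoing interface.

# Sketch: find the PC's LAN IP, which the phone's Wi-Fi proxy should point at.
import socket

def lan_ip():
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # No packet is actually sent; this only selects the outgoing interface.
        s.connect(("8.8.8.8", 80))
        return s.getsockname()[0]
    finally:
        s.close()

if __name__ == "__main__":
    print(lan_ip())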
2. Letvlive.py
import scrapy
import json
from Letv.items import LetvItem


# The class name (LetvliveSpider) can be anything; it inherits from
# scrapy.Spider, the basic spider class
class LetvliveSpider(scrapy.Spider):
    # Spider name; it must be unique within the current project
    name = 'Letvlive'
    # Only sites within these domains are crawled; if this line is commented
    # out there is no domain restriction, so any site can be crawled
    allowed_domains = ['letv.com']
    page = 1
    pre = "http://dynamic.live.app.m.letv.com/android/dynamic.php?luamod=main&mod=live&ctl=liveHuya&act=channelList&pcode=010210000&version=7.17&channelId=2168&pages="
    suf = "&country=CN&provinceid=1&districtid=9&citylevel=1&location=%E5%8C%97%E4%BA%AC%E5%B8%82%7C%E6%9C%9D%E9%98%B3%E5%8C%BA&lang=chs&region=CN"
    # URLs in start_urls are not restricted by allowed_domains
    start_urls = [pre + str(page) + suf]

    def parse(self, response):
        json_text = response.text
        # Convert json_text into a Python dict
        python_dict = json.loads(json_text)
        for item in python_dict["body"]["result"]:
            letvItem = LetvItem()
            # Grab the nickname and the screenshot URL
            nick = item["nick"]
            image = item["screenshot"]
            letvItem["nick"] = nick
            letvItem["image"] = image
            print(letvItem)
            # Hand the item over to the pipelines
            yield letvItem
        if python_dict.get("header").get("status") == "1":
            self.page += 1
            new_url = self.pre + str(self.page) + self.suf
            # Scrapy de-duplicates requests: a URL that has already been
            # requested is not requested again when it is scheduled a second
            # time. Once every unique URL has been fetched, the spider finishes.
            yield scrapy.Request(new_url, callback=self.parse)
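The spider imports LetvItem from Letv/items.py, which this post does not show. A minimal sketch inferred from the fields used above and in the pipelines (nick, image, image_path) would be:

# Letv/items.py -- minimal sketch; the original file is not shown in the post.
import scrapy

class LetvItem(scrapy.Item):
    nick = scrapy.Field()        # streamer nickname; reused as the image file name
    image = scrapy.Field()       # screenshot URL to download
    image_path = scrapy.Field()  # set by LetvImagePipeline after renaming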
3. pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline  # pipeline for saving images
import json
import os
from Letv.settings import IMAGES_STORE
# from scrapy.utils.project import get_project_settings


class LetvImagePipeline(ImagesPipeline):
    # IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    # Schedule the image download requests
    def get_media_requests(self, item, info):
        # URL of the image to download
        image = item["image"]
        # Hand the image URL to the Scrapy engine so the downloader
        # fetches the image for us
        yield scrapy.Request(image)

    # Called after the image has been downloaded; the download results,
    # including the stored file path, are passed back in `results`
    def item_completed(self, results, item, info):
        print("results===", results)
        image = [x["path"] for ok, x in results if ok][0]
        print(image)
        # Rename the downloaded image
        old_image_name = IMAGES_STORE + "/" + image
        new_image_name = IMAGES_STORE + "/" + item["nick"] + ".jpg"  # e.g. ./images/黑做坊丶小美兒.jpg
        print("old_image_name==", old_image_name)
        print("new_image_name==", new_image_name)
        # Rename
        os.rename(old_image_name, new_image_name)
        print(image)
        item["image_path"] = new_image_name
        return item


# Handles the text (JSON) output by default
class LetvPipeline(object):
    # Called when the spider starts
    def open_spider(self, spider):
        self.file = open(spider.name + ".json", "w")

    def process_item(self, item, spider):
        python_dict = dict(item)
        # Python dict --> JSON string
        json_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    # Called when the spider finishes
    def close_spider(self, spider):
        self.file.close()
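For reference, the results argument passed to item_completed is a list of (success, info) two-tuples, one per media request; for a successful download the info dict carries the relative path used above. Illustratively (all values here are invented):

# Illustrative shape of `results` in item_completed; values are made up.
results = [
    (True, {
        "url": "http://i0.letvimg.com/screenshot.jpg",  # hypothetical request URL
        "path": "full/0a1b2c3d4e5f.jpg",                # path relative to IMAGES_STORE
        "checksum": "abcdef0123456789",                 # MD5 of the downloaded file
    }),
]

Also note that os.rename will raise an error if two streamers share the same nick, or if a nickname contains characters the filesystem rejects.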
4. settings.py
# Do not obey the robots.txt protocol
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'Letv.pipelines.LetvPipeline': 301,       # save text (JSON)
    'Letv.pipelines.LetvImagePipeline': 300,  # save images (lower number runs first)
}
# Path where downloaded images are stored. This MUST be set, and set
# correctly; otherwise the images pipeline will not download anything.
IMAGES_STORE = "./images"
5. Launcher script --- start.py
from scrapy import cmdline

cmdline.execute("scrapy crawl Letvlive".split())
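As a sketch of an alternative (not from the original post), the same crawl can be started through Scrapy's CrawlerProcess API instead of shelling out to the CLI, assuming start.py lives in the project root next to scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py (requires running from the project root)
process = CrawlerProcess(get_project_settings())
process.crawl("Letvlive")  # spider name as defined in Letvlive.py
process.start()            # blocks until the crawl finishes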