About functions
# A mutable object should not be used as a default argument value
# def foo(arg, li=[]):
#     li.append(arg)
#     return li
#
# list1 = foo(21)
# list2 = foo(21, [1,])
# list3 = foo(28)
#
# print(list1)
# print(list2)
# print(list3)

# li.append() has no return value (it returns None)
def foo(arg, li=[]):
    return li.append(arg)

list1 = foo(21)
list2 = foo(21, [1,])
list3 = foo(28)

print(list1)
print(list2)
print(list3)

# list5 = [11, 22, 33, 44, 55]
# print(list5[10:])

# Shuffle a list in place
# import random
# random.shuffle(list5)
# print(list5)
The core of web scraping: take what you capture with a packet-capture tool, analyze the request interface, and build the corresponding request.
Proxy crawling: find free proxies at http://www.goubanjia.com/
import requests

url = 'http://www.baidu.com/s?&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}
proxy2 = {
    'http': '101.4.136.34:81'
}
response = requests.get(url=url, proxies=proxy2, headers=headers)
with open('./daili.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
print('123')
190815
https://www.luffycity.com/micro/play/5070/3074
1. Crawling multiple specified pages of data
2. The role of cookies: using requests.Session() to simulate fetching data after login (see the sketch after this list)
3. Using proxies
4. Solving captchas with a third-party platform
5. Regular expression review
(1. specify the URL; 2. send the request; 3. get the page data; 4. parse the data; 5. persist the results)
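For item 2 above, a minimal sketch of how requests.Session() carries cookies between requests. The login URL and form fields here are placeholders for illustration, not any real site's API.

import requests

# Hypothetical login endpoint and form fields; replace them with whatever a
# packet capture of the real site shows.
login_url = 'https://example.com/login'
profile_url = 'https://example.com/profile'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}
login_data = {'email': 'user@example.com', 'password': 'secret'}

session = requests.Session()                               # the Session object stores cookies set by the server
session.post(login_url, data=login_data, headers=headers)  # the login response sets the session cookie
response = session.get(profile_url, headers=headers)       # sent with that cookie, so it returns the logged-in page
with open('./profile.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)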
Basic Zhihu image scraping with the re module:
import requests, re
import os, time

url = 'https://www.zhihu.com/question/308457217'
if not os.path.exists('./zhihuImg'):
    os.mkdir('zhihuImg')
headers = {
    # arbitrary request header info can be stored here
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# print(response.text)
pic_list = re.findall('<noscript>.*?<img src=".*?">.*?</noscript>', response.text, re.S)
# print(pic_list)
new_list = []
for li in pic_list:
    st = (re.findall('https.*?_hd.jpg', li))[0]
    # st = st.replace('_hd', '_r')
    new_list.append(st)
# print(new_list)
img_path = './zhihuImg/'
inx = 0
for index, li in enumerate(new_list):
    # time.sleep(5)
    inx = inx + 1
    img_data = requests.get(url=li, headers=headers).content
    img_name = img_path + str(inx) + '.jpg'
    with open(img_name, 'wb') as f:
        f.write(img_data)
print('over')
Basic Zhihu image scraping with the lxml module:
import requests, os
from lxml import etree

url = 'https://www.zhihu.com/question/308457217'
headers = {
    # arbitrary request header info can be stored here
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
tree = etree.HTML(requests.get(url=url, headers=headers).text)
pic_li = tree.xpath('//div[@class="List-item"]//img/@data-original')
pic_list = []
for li in pic_li:
    if li not in pic_list:
        pic_list.append(li)
img_path = './zhihuImg2'
if not os.path.exists(img_path):
    os.mkdir(img_path)
inx = 0
for li in pic_list:
    inx = inx + 1
    img_data = requests.get(url=li, headers=headers).content
    img_name = os.path.join(img_path, str(inx) + '.jpg')
    with open(img_name, 'wb') as f:
        f.write(img_data)
print('over')
Basic Zhihu image scraping with the bs4 module:
import requests, os
from bs4 import BeautifulSoup

def down(picurl, dirname, filename):
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    with open('%s/%s.jpg' % (dirname, filename), 'wb') as f:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        # time.sleep(3)
        response = requests.get(url=picurl, headers=headers)
        if response:
            f.write(response.content)
            print(filename)

url = 'https://www.zhihu.com/question/285321190'
headers = {
    # arbitrary request header info can be stored here
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
numb = 0
res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding
soup = BeautifulSoup(res.text, features="html.parser")
x = soup.find_all('img')
print(x)
for index, imgs in enumerate(x):
    numb = numb + 1
    urls = imgs.attrs.get('src')
    url = urls.replace('\\"', '').replace('com/50/', 'com/').replace('_hd', '_r').replace('_ipico', '_r').replace('_120x160', '_r').replace('_180x120', '_r')
    print(url)
    if url.startswith('http'):
        print(index, url)
        down(url, '女生有一雙好看的腿是怎樣的體驗', str(numb) + str(index))
Zhihu's API:
url = https://www.zhihu.com/api/v4/questions/68381376/answers?sort_by=default&include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&limit=20&offset=
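A rough sketch of paging through that API with requests by stepping the offset parameter. The JSON fields read below (data, paging.is_end, content, voteup_count) are assumptions drawn from the include list above; Zhihu may require extra headers or change the response format.

import requests

question_id = '68381376'
api = ('https://www.zhihu.com/api/v4/questions/{}/answers'
       '?sort_by=default&include=data[*].content,voteup_count&limit=20&offset={}')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}

offset = 0
while True:
    data = requests.get(api.format(question_id, offset), headers=headers).json()
    for answer in data.get('data', []):
        # 'content' holds the answer HTML when it appears in the include list
        print(answer.get('voteup_count'), len(answer.get('content', '')))
    # the paging block signals when there are no more answers
    if data.get('paging', {}).get('is_end', True):
        break
    offset += 20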
Simulating a browser: you need to download the matching driver (drivers are backward compatible): http://chromedriver.storage.googleapis.com/index.html
from selenium import webdriver
import time, os

executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/chromedriver.exe'
if not os.path.exists(executable_path):
    print('Driver not found! (Instead of the try/except below, putting the rest in an else block also works)')
try:
    bro = webdriver.Chrome(executable_path=executable_path)
    bro.get('https://www.baidu.com')
    text = bro.find_element_by_id('kw')
    text.send_keys('人民幣')
    button = bro.find_element_by_id('su')
    button.click()
    time.sleep(5)
    bro.quit()
except:
    print('Driver not found!')
PhantomJS:
from selenium import webdriver
import time, os

executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/phantomjs.exe'
if not os.path.exists(executable_path):
    print('Driver not found! (Instead of try/except, writing the rest in an else block also works)')
else:
    bro = webdriver.PhantomJS(executable_path=executable_path)
    bro.get('https://www.baidu.com')
    bro.save_screenshot('./1.png')  # take a screenshot
    text = bro.find_element_by_id('kw')
    text.send_keys('人民幣')
    button = bro.find_element_by_id('su')
    button.click()
    time.sleep(4)
    bro.save_screenshot('./2.png')
    bro.quit()
Simulating a browser with PhantomJS:
from selenium import webdriver
import time, os

# executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/chromedriver.exe'
executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/phantomjs.exe'
if not os.path.exists(executable_path):
    print('Driver not found! (Instead of try/except, writing the rest in an else block also works)')
else:
    dir_name = '0817'
    dir_path = os.path.join('./', dir_name)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # bro = webdriver.Chrome(executable_path=executable_path)
    bro = webdriver.PhantomJS(executable_path=executable_path)
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    bro.get('https://www.zhihu.com/question/21471417')
    print('sleep001')
    time.sleep(1)
    bro.save_screenshot('./知乎001.png')  # take a screenshot
    bro.execute_script(js)
    print('sleep002')
    time.sleep(1)
    bro.save_screenshot('./知乎002.png')  # take a screenshot
    bro.quit()
    print('exec_done')
Installing Scrapy:
i. Linux / macOS: pip install scrapy
ii. Windows:
    1. pip install wheel
    2. Download Twisted from https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted, then: pip install <the downloaded .whl>
    3. pip install pywin32
    4. pip install scrapy
Scrapy persistence (two approaches: file export and item pipelines): firstScrapy.zip
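Since the zip itself is not reproduced here, the pipeline approach looks roughly like the sketch below; the class, field, and file names are placeholders, not the contents of firstScrapy.zip.

# pipelines.py (sketch): write items to a file
class FirstscrapyPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.fp = open('./data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(item['author'] + ':' + item['content'] + '\n')
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.fp.close()

# settings.py: register the pipeline (lower number = higher priority)
# ITEM_PIPELINES = {
#     'firstScrapy.pipelines.FirstscrapyPipeline': 300,
# }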
MySQL-based storage: qiubaiPro.zip
(A mistake I kept making myself: when connecting to the database with pymysql, port must be 3306 (an int), not '3306' (a string). I agonized over it for a whole afternoon because the error messages made no sense to me!)
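A minimal sketch of a pymysql-backed pipeline showing that connection call, assuming a local MySQL with an illustrative qiubai database and table; the port=3306 detail is the actual point.

import pymysql

class QiubaiMysqlPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # port must be the int 3306; the string '3306' fails with a confusing error
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='123456', db='qiubai', charset='utf8')

    def process_item(self, item, spider):
        sql = 'insert into qiubai values("%s", "%s")' % (item['author'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()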
Scrapy can store what it scrapes in three ways at the same time; the main work is defining a class for each target in pipelines.py and registering the corresponding classes in the settings file. Note: the approaches in the example are all correct, but the Redis syntax may be slightly off, so that particular store is not guaranteed to succeed. qiubaiPro.zip
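For reference, running three pipelines side by side looks roughly like this; the redis-py calls and key name are my own assumption, which is exactly the part the note above warns may not work as written.

# pipelines.py (sketch): a Redis pipeline alongside the file and MySQL ones
import json
import redis

class QiubaiRedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # push each item onto a Redis list as a JSON string
        self.conn.lpush('qiubai_data', json.dumps(dict(item), ensure_ascii=False))
        return item

# settings.py: all three classes registered, lower numbers run first
# ITEM_PIPELINES = {
#     'qiubaiPro.pipelines.QiubaiproPipeline': 300,     # file
#     'qiubaiPro.pipelines.QiubaiMysqlPipeline': 301,   # MySQL
#     'qiubaiPro.pipelines.QiubaiRedisPipeline': 302,   # Redis
# }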
Looping over multiple patterned URLs in one crawl: firstScrapy多個url.zip
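The idea is to keep a page counter and manually yield a request for the next patterned URL from parse; a minimal sketch with a placeholder URL pattern and page limit:

import scrapy

class PagedSpider(scrapy.Spider):
    name = 'paged'
    # hypothetical paged URL pattern
    url_template = 'https://example.com/list/page/%d'
    page_num = 1
    start_urls = [url_template % 1]

    def parse(self, response):
        # ... parse the current page and yield items here ...

        # then manually request the next page until some limit
        if self.page_num < 5:
            self.page_num += 1
            yield scrapy.Request(url=self.url_template % self.page_num,
                                 callback=self.parse)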
POST requests (add a start_requests() method before the parse() method): postPro.zip
# -*- coding: utf-8 -*-
import scrapy

# Goal: get the translation result for a given term from Baidu Translate
class PostdemoSpider(scrapy.Spider):
    name = 'postDemo'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    # This method actually comes from the parent class: it sends GET requests for the elements of the start_urls list.
    # To send POST requests instead:
    # 1. assign 'post' to the method parameter of Request
    # 2. use FormRequest(), which sends a POST request (recommended)
    def start_requests(self):
        print('start_requests()')
        # parameters of the POST request
        data = {
            'kw': 'dog',
        }
        for url in self.start_urls:
            # formdata: the dict of request parameters
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        print(response.text)
After logging in via POST and then requesting the personal homepage, Scrapy automatically carries the cookies from the previous request along: doubanPro.zip
import scrapy

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    # allowed_domains = ['www.douban.com']
    start_urls = ['https://www.douban.com/accounts/login']

    # Override start_requests
    def start_requests(self):
        # Pack the request parameters into a dict
        data = {
            'source': 'index_nav',
            'form_email': '15027900535',
            'form_password': 'bobo@15027900535'
        }
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    # Parse the personal homepage data
    def parseBySecondPage(self, response):
        fp = open('second.html', 'w', encoding='utf-8')
        fp.write(response.text)
        # Here you can run whatever parsing you need on the current user's homepage

    def parse(self, response):
        # Store the page data returned after a successful login
        fp = open('main.html', 'w', encoding='utf-8')
        fp.write(response.text)
        # Request the current user's personal homepage
        url = 'https://www.douban.com/people/185687620/'
        yield scrapy.Request(url=url, callback=self.parseBySecondPage)
Using proxies (proxyPro.zip): a proxy requires a downloader middleware. In middlewares.py, write your own class (with object as its parent) and implement the process_request method.
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# Custom downloader middleware: implement process_request (handles requests intercepted by the middleware)
class MyProxy(object):
    def process_request(self, request, spider):
        # swap the IP used for this request
        request.meta['proxy'] = "https://178.128.90.1:8080"
Then enable this downloader middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'proxyPro.middlewares.MyProxy': 543,  # <project name>.<middlewares module>.<class name>
}
Log levels: configured in settings.py
Log levels (types):
    ERROR: errors
    WARNING: warnings
    INFO: general information
    DEBUG: debugging information (the default)
Output only one level of log messages:
    settings: LOG_LEVEL = 'ERROR'
Write the log to a specified file instead of showing it in the terminal:
    settings: LOG_FILE = 'log.txt'
Passing arguments along with a request (when part of the data has to be scraped from a child page): moviePro.zip
# Key point: when the main callback schedules the child-page callback, it passes values through the meta parameter
# -*- coding: utf-8 -*-
import scrapy
from moviePro.items import MovieproItem

class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.id97.com']
    start_urls = ['http://www.id97.com/movie']

    # Parses the data on the second-level child page
    def parseBySecondPage(self, response):
        actor = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
        language = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[6]/td[2]/text()').extract_first()
        longTime = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()').extract_first()
        # Take the dict passed in through the Request meta parameter (response.meta)
        item = response.meta['item']
        item['actor'] = actor
        item['language'] = language
        item['longTime'] = longTime
        # Hand the item over to the pipeline
        yield item

    def parse(self, response):
        # name, genre, director, language, runtime
        div_list = response.xpath('/html/body/div[1]/div[1]/div[2]/div')
        for div in div_list:
            name = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
            # The xpath call below returns a list (with 4 elements here)
            kind = div.xpath('.//div[@class="otherinfo"]//text()').extract()
            # Join the kind list into a string
            kind = "".join(kind)
            url = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
            print(kind)
            # Create the item object
            item = MovieproItem()
            item['name'] = name
            item['kind'] = kind
            # Question: how do we get the remaining movie-detail fields into this item object? (meta)
            # We need to request the detail url, fetch that page, and parse the specified data from it
            # meta only accepts a dict, so wrap the item object in a dict first
            yield scrapy.Request(url=url, callback=self.parseBySecondPage, meta={'item': item})
CrawlSpider: crawlSpider代碼.zip
Question: what if we want to crawl a site's entire dataset?
Solutions:
    1. Send the requests manually
    2. CrawlSpider (recommended)
CrawlSpider concept: CrawlSpider is simply a subclass of Spider with more powerful features (link extractors, rule parsers).
Code:
    1. Create a CrawlSpider-based spider file:
        a) scrapy genspider -t crawl <spider name> <start url>
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    # allowed_domains = ['dig.chouti.com']
    start_urls = ['https://dig.chouti.com/']

    # Instantiate a link extractor
    # Link extractor: extracts the specified links (URLs)
    # allow parameter: takes a regular expression
    # The link extractor pulls every link matching the regex out of the page
    # and hands all of them to the rule parser
    link = LinkExtractor(allow=r'/all/hot/recent/\d+')
    rules = (
        # Instantiate a rule parser
        # After receiving links from the link extractor, the rule parser requests them,
        # fetches the corresponding pages, and parses the specified data according to the given rule
        # callback: the parsing rule (a method/function)
        # follow: whether to keep applying the link extractor to the pages behind the extracted links
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
Terminal commands for creating projects and so on:
scrapy startproject firstScrapy                        # create a project
scrapy genspider (-t crawl) firstScrapy www.baidu.com  # with -t crawl this creates a CrawlSpider; firstScrapy is the spider name, www.baidu.com the start URL
scrapy crawl firstScrapy --nolog                       # run the spider
Distributed crawling: wangyiPro.zip
Using the selenium module inside the Scrapy framework:
A second form of scrapy-redis-based distributed crawling:
1. Distributed crawler based on RedisSpider (NetEase news)
    a) Code changes (spider class):
        i. Import: from scrapy_redis.spiders import RedisSpider
        ii. Change the spider class's parent class to RedisSpider
        iii. Comment out the start_urls list and add a redis_key attribute (the name of the scheduler queue)
    b) Redis configuration file (redisxxx.conf):
        i. #bind 127.0.0.1
        ii. protected-mode no
    c) Project settings:
        i. Configure the Redis service's IP and port:
            REDIS_HOST = '<ip of the redis service>'
            REDIS_PORT = 6379
            # REDIS_PARAMS = {'password': '123456'}
        ii. Use the scrapy-redis dedupe queue and scheduler:
            DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
            SCHEDULER = "scrapy_redis.scheduler.Scheduler"
            # allow pause/resume
            SCHEDULER_PERSIST = True
        iii. Use the shareable pipeline:
            ITEM_PIPELINES = {
                # 'wangyiPro.pipelines.WangyiproPipeline': 300,
                'scrapy_redis.pipelines.RedisPipeline': 400,
            }
    d) Start the Redis server: redis-server <config file>
    e) Run the spider file: scrapy runspider wangyi.py
    f) Push a start URL into the scheduler queue:
        i. Open a Redis client
        ii. lpush wangyi https://news.163.com
2. UA pool:
    a) In the middleware module, import:
        from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
    b) Write a class based on UserAgentMiddleware and override its process_request method
3. Proxy pool: pay attention to whether the requested URL's scheme is http or https
4. How selenium is applied in Scrapy (a sketch of the resulting spider file follows after this outline):
    a) Import the webdriver class in the spider file
    b) Instantiate the browser in the spider class's constructor
    c) Close the browser in the spider class's closed method
    d) Run the browser automation in the downloader middleware's process_response method

Requirement: scrape text-based news data (domestic, international, military, aviation)

PROXY = [
    '173.82.219.113:3128',
    '92.243.6.37:80',
    '117.102.96.59:8080',
    '213.234.28.94:8080',
    '101.51.123.88:8080',
    '158.58.131.214:41258'
]

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
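To make steps 1.a and 4.a to 4.c above concrete, here is a rough sketch of what the spider file might look like after those changes; the class name, redis_key, and driver path are illustrative, not the contents of wangyiPro.zip.

# wangyi.py (sketch only)
import scrapy
from selenium import webdriver
from scrapy_redis.spiders import RedisSpider

class WangyiSpider(RedisSpider):              # 1.a.ii: parent class is RedisSpider
    name = 'wangyi'
    # start_urls is commented out; the start URL is pushed into Redis instead
    # start_urls = ['https://news.163.com']
    redis_key = 'wangyi'                      # 1.a.iii: name of the scheduler queue

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 4.b: instantiate the browser once per spider; the driver path is a placeholder
        self.bro = webdriver.Chrome(executable_path='./chromedriver.exe')

    def parse(self, response):
        # by the time this runs, the downloader middleware below has already swapped in
        # the selenium-rendered page for the four dynamically loaded section URLs
        print(response.url)

    def closed(self, reason):
        # 4.c: close the browser when the spider shuts down
        self.bro.quit()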
import time
from scrapy.http import HtmlResponse

# The browser object is instantiated in the spider class's constructor.
# The middleware then overrides process_response to intercept the original response
# (the page content is loaded dynamically, so selenium has to drive a browser here).
class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the response object (the response the downloader passes to the Spider)
    # request: the request object corresponding to the response
    # response: the intercepted response object
    # spider: the instance of the spider class from the spider file
    def process_response(self, request, response, spider):
        # Tamper with the page data stored in the response object.
        # There are five request URLs: one start URL and four section URLs we want to crawl;
        # only the four section pages are loaded dynamically.
        if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/', 'http://news.163.com/air/', 'http://war.163.com/']:
            spider.bro.get(url=request.url)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            spider.bro.execute_script(js)
            time.sleep(2)  # give the browser some time to finish loading the data
            # page_text now contains the dynamically loaded news data
            page_text = spider.bro.page_source
            # Replace the response object
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response
Purpose of the UA pool: the code fires hundreds of requests within a few seconds, which anti-crawling mechanisms will notice, so a UA pool and a proxy pool are used to spread the load. They are also written in the downloader middleware. 基於RedisSpider的分佈式爬蟲.zip
from scrapy import signals
from scrapy.http import HtmlResponse
import time
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
import random

# UA pool (a separate downloader middleware class just for the UA pool)
# 1. import the UserAgentMiddleware class
class RandomUserAgent(UserAgentMiddleware):
    def process_request(self, request, spider):
        # pick a random UA value from the list
        ua = random.choice(user_agent_list)
        # write that UA into the currently intercepted request
        request.headers.setdefault('User-Agent', ua)

# Swap the IP for each intercepted request
class Proxy(object):
    def process_request(self, request, spider):
        # Check the scheme of the intercepted request's URL (http or https)
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # the request's scheme
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip