xpath contains()
- `//div[contains(@class,'i')]` selects the div elements whose class attribute contains the substring "i"
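A tiny sketch of what contains() matches, on a made-up HTML snippet:

# coding=utf-8
# contains(@class,'i') does substring matching on the class attribute, so it also
# matches classes such as "item" or "title" that merely contain the letter i.
from lxml import etree

html = etree.HTML('<div class="item i1">a</div><div class="other">b</div>')
print(html.xpath("//div[contains(@class,'i')]/text()"))  # ['a'] - only the first div matches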
The general recipe for building a crawler
- Prepare the URLs
    - Prepare a start_url
        - when the URL pattern is not obvious and the total number of pages is unknown
        - extract the next page's URL from the response in code
            - with xpath
            - or by building the URL from parameters found in the current response (e.g. the current page number and the total page count)
    - Prepare a url_list
        - when the total number of pages is known
        - when the URL pattern is obvious
- Send requests, get responses
    - Add a random User-Agent to counter anti-crawling measures (a request-step sketch follows this outline)
    - Add random proxy IPs to counter anti-crawling measures
    - Once the other side has identified us as a crawler, add more header fields, including cookies
    - Cookies can be handled with a session
    - Prepare a pool of usable cookies
        - Without logging in
            - Collect the cookies from the first successful requests, i.e. the cookies the site sets in the response
            - On later requests, pick a cookie from that list
        - With logging in
            - Prepare several accounts
            - Obtain each account's cookies programmatically
            - When requesting pages that require login, pick one of those cookies at random
- Extract the data
    - Work out where the data is
        - If the data is in the response of the current URL
            - Data on the list page
                - request the list-page URL directly, no need to open the detail page
            - Data on the detail page
                - 1. work out the URL
                - 2. send the request
                - 3. extract the data
                - 4. return
        - If the data is not in the response of the current URL
            - Look for it in other responses
                - 1. go through the Network panel from top to bottom
                - 2. use Chrome's filter buttons to exclude js, css and img requests
                - 3. use Chrome's "search all files" to search for the numbers and text seen on the page
    - Extracting the data
        - xpath: extract whole blocks from the HTML; group first, then extract from each group
        - re: extract things like max_time, price, or a JSON string embedded in the HTML (a sketch follows this outline)
        - json
- Save
    - Save locally as text, json or csv (a csv sketch follows this outline)
    - Save to a database
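A minimal sketch of the request step described above: a random User-Agent, a random proxy, and a requests session so the cookies the site sets are sent back on later requests. The URL, the User-Agent list and the proxy list are placeholders, not working values.

# coding=utf-8
# Minimal sketch: random User-Agent, random proxy, session-based cookie handling.
# The url, USER_AGENT_LIST and PROXY_LIST below are placeholders.
import random
import requests

USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
]
PROXY_LIST = [
    {"http": "http://127.0.0.1:8888"},  # placeholder proxy
]

session = requests.Session()  # the session stores cookies from responses and sends them on later requests

def parse_url(url):
    headers = {"User-Agent": random.choice(USER_AGENT_LIST)}
    proxies = random.choice(PROXY_LIST)
    response = session.get(url, headers=headers, proxies=proxies, timeout=10)
    return response.content.decode()

if __name__ == '__main__':
    print(parse_url("http://example.com"))  # placeholder url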
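Where the data sits in the page as an embedded JSON string rather than in tags, a minimal sketch of pulling it out with re and parsing it with json; html_str and the regular expression are made up for illustration and need to be adapted to the real page source.

# coding=utf-8
# Minimal sketch: extract a JSON string embedded in the HTML with re, then parse it with json.
# html_str and the pattern are illustrative only.
import json
import re

html_str = '<script>var data = {"max_time": "20180123", "price": 12.5};</script>'
json_str = re.search(r"var data = (\{.*?\});", html_str).group(1)
data = json.loads(json_str)
print(data["max_time"], data["price"])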
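A minimal sketch of the csv option for saving; the file name and field names are placeholders, and the item shape mirrors the dicts built by the spiders below.

# coding=utf-8
# Minimal sketch: save extracted items to csv with the standard library.
import csv

content_list = [{"title": "example", "href": "http://example.com/1"}]  # placeholder data
with open("result.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "href"])
    writer.writeheader()
    writer.writerows(content_list)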
# coding=utf-8
import requests
from lxml import etree
import json
class TiebaSpider:
def __init__(self,tieba_name):
self.tieba_name = tieba_name
self.start_url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw="+tieba_name+"&pn=0"
self.part_url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/"
self.headers= {"User-Agent":"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
    def parse_url(self,url):  # send the request, get the response
print(url)
response = requests.get(url,headers=self.headers)
return response.content
    def get_content_list(self,html_str):  # extract the data
html = etree.HTML(html_str)
        div_list = html.xpath("//div[contains(@class,'i')]")  # group by div
content_list = []
for div in div_list:
item = {}
item["title"] = div.xpath("./a/text()")[0] if len(div.xpath("./a/text()"))>0 else None
item["href"] = self.part_url+div.xpath("./a/@href")[0] if len(div.xpath("./a/@href"))>0 else None
            item["img_list"] = self.get_img_list(item["href"],[]) if item["href"] is not None else []
            item["img_list"] = [requests.utils.unquote(i).split("src=")[-1] for i in item["img_list"]]
content_list.append(item)
        # extract the next page's url
next_url = self.part_url+html.xpath("//a[text()='下一頁']/@href")[0] if len(html.xpath("//a[text()='下一頁']/@href"))>0 else None
return content_list,next_url
    def get_img_list(self,detail_url,total_img_list):  # get all the images in a post
        #3.2 request the detail-page url, get the first page of the detail page
detail_html_str = self.parse_url(detail_url)
detail_html = etree.HTML(detail_html_str)
        #3.3 extract the images on this detail page and the address of its next page
img_list = detail_html.xpath("//img[@class='BDE_Image']/@src")
total_img_list.extend(img_list)
        #3.4 request the detail page's next-page address, looping over 3.2-3.4
detail_next_url = detail_html.xpath("//a[text()='下一頁']/@href")
if len(detail_next_url)>0:
detail_next_url = self.part_url + detail_next_url[0]
return self.get_img_list(detail_next_url,total_img_list)
# else:
# return total_img_list
return total_img_list
    def save_content_list(self,content_list):  # save the data
file_path = self.tieba_name+".txt"
with open(file_path,"a",encoding="utf-8") as f:
for content in content_list:
f.write(json.dumps(content,ensure_ascii=False,indent=2))
f.write("\n")
        print("saved")
    def run(self):  # main logic
next_url = self.start_url
while next_url is not None:
            #1. start_url
            #2. send the request, get the response
html_str = self.parse_url(next_url)
            #3. extract the data and the next page's url
            #3.1 extract the urls and titles from the list page
            #3.2 request the detail-page url, get the first page of the detail page
            #3.3 extract the images on the detail page and the address of its next page
            #3.4 request the detail page's next-page address, looping over 3.2-3.4
content_list,next_url = self.get_content_list(html_str)
            #4. save the data
self.save_content_list(content_list)
            #5. request the next page's url, looping over steps 2-5
if __name__ == '__main__':
tieba_spider = TiebaSpider("作頭髮")
tieba_spider.run()
# coding=utf-8
import requests
from lxml import etree
class QiubaiSpider:
def __init__(self):
self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
def get_url_list(self):
return [self.url_temp.format(i) for i in range(1,14)]
def parse_url(self,url):
print(url)
response = requests.get(url,headers=self.headers)
return response.content.decode()
    def get_content_list(self,html_str):  # extract the data
html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # group
content_list = []
for div in div_list:
item= {}
item["content"] = div.xpath(".//div[@class='content']/span/text()")
item["content"] = [i.replace("\n","") for i in item["content"]]
item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
            item["author_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
            item["author_age"] = item["author_age"][0] if len(item["author_age"])>0 else None
item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
item["author_img"] = "https:"+item["author_img"][0] if len(item["author_img"])>0 else None
item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
content_list.append(item)
return content_list
    def save_content_list(self,content_list):  # save
for i in content_list:
print(i)
    def run(self):  # main logic
        #1. build the url_list
url_list = self.get_url_list()
        #2. iterate over the urls, send requests, get responses
for url in url_list:
html_str = self.parse_url(url)
            #3. extract the data
content_list = self.get_content_list(html_str)
            #4. save
self.save_content_list(content_list)
if __name__ == '__main__':
    qiubai = QiubaiSpider()
qiubai.run()
# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue
class QiubaiSpider:
def __init__(self):
self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
self.url_queue = Queue()
self.html_queue = Queue()
self.content_queue = Queue()
def get_url_list(self):
# return [self.url_temp.format(i) for i in range(1,14)]
for i in range(1,4):
self.url_queue.put(self.url_temp.format(i))
def parse_url(self):
while True:
url = self.url_queue.get()
print(url)
response = requests.get(url,headers=self.headers)
self.html_queue.put(response.content.decode())
self.url_queue.task_done()
    def get_content_list(self):  # extract the data
while True:
html_str = self.html_queue.get()
html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")  # group
content_list = []
for div in div_list:
item= {}
item["content"] = div.xpath(".//div[@class='content']/span/text()")
item["content"] = [i.replace("\n","") for i in item["content"]]
item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
                item["author_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["author_age"] = item["author_age"][0] if len(item["author_age"])>0 else None
item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
item["author_img"] = "https:"+item["author_img"][0] if len(item["author_img"])>0 else None
item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
content_list.append(item)
self.content_queue.put(content_list)
self.html_queue.task_done()
    def save_content_list(self):  # save
while True:
content_list = self.content_queue.get()
for i in content_list:
# print(i)
pass
self.content_queue.task_done()
    def run(self):  # main logic
thread_list = []
        #1. url_list
t_url = threading.Thread(target=self.get_url_list)
thread_list.append(t_url)
        #2. iterate, send requests, get responses
for i in range(20):
t_parse = threading.Thread(target=self.parse_url)
thread_list.append(t_parse)
        #3. extract the data
for i in range(2):
t_html = threading.Thread(target=self.get_content_list)
thread_list.append(t_html)
        #4. save
t_save = threading.Thread(target=self.save_content_list)
thread_list.append(t_save)
for t in thread_list:
            t.setDaemon(True)  # make the child threads daemon threads: they are not essential, so they end when the main thread ends
t.start()
for q in [self.url_queue,self.html_queue,self.content_queue]:
            q.join()  # block the main thread until every task in the queue has been processed
        print("main thread finished")
'''
The essence of the multithreaded version:
1. Make the child threads depend on the main thread by setting t.setDaemon(True), then make the main thread depend on the queues with q.join()
2. Pass the data between threads through the queues
'''
if __name__ == '__main__':
    qiubai = QiubaiSpider()
qiubai.run()
import os
import re
from lxml import etree
from retrying import retry
import requests
class Tieba():
def __init__(self):
self.start_url = "https://tieba.baidu.com/f?ie=utf-8&kw=DOTA2&fr=search"
self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"}
@retry(stop_max_attempt_number = 3)
def _parse_url(self, url):
response = requests.get(url, headers = self.headers, timeout=8)
assert response.status_code == 200
try:
html_str = response.content.decode()
except:
html_str = response.text
return html_str
def parse_url(self, url):
try:
html_str = self._parse_url(url)
except Exception as e:
print(e)
html_str = None
return html_str
def analysis(self, html_str):
html_str = re.sub(r'<!--|-->', "", html_str)
nodes = etree.HTML(html_str)
title = nodes.xpath('//ul//a[@rel="noreferrer" and @class="j_th_tit "]')
        ret = []  # this list holds dicts of the form { title: href }
        for t in title:
            temp_dict = dict()
            title_text = t.xpath("@title")[0]
            href = t.xpath("@href")[0] if len(t.xpath("@href"))>0 else None
            temp_dict[title_text] = href  # get_pic reads the href from the values and the title from the keys
            ret.append(temp_dict)
        return ret
def get_pic(self, item_list):
for item in item_list:
if list(item.values())[0] is None:
continue
url = "https://tieba.baidu.com{}".format(list(item.values())[0])
try:
html_str = self.parse_url(url)
html_str = re.sub(r'<!--|-->', "", html_str)
ret = etree.HTML(html_str)
                pic_list = ret.xpath("//img[@class='BDE_Image']/@src")  # images in the post body
            except Exception as e:
                print(e)
                continue  # skip this post if the request or parsing failed
            os.mkdir(list(item.keys())[0])  # one directory per post, named after the post title
for pic in pic_list:
                if pic.startswith("//"):  # some images in the tieba start with //, so prepend http:
pic_url = "http:"+ pic
else:
pic_url = pic
pic_str = requests.get(pic_url, headers=self.headers).content
if pic_str is not None:
                    pic_rex = re.search(r"(\.jpg|\.png|\.gif)", pic_url)  # work out whether the image is jpg, png or gif
if pic_rex is not None:
pic_style = pic_rex.group(0)
else:
pic_style = ".jpg"
with open(list(item.keys())[0]+"/"+str(pic_list.index(pic))+ pic_style, "wb") as f:
f.write(pic_str)
def get_next(self, html_str):
ret = etree.HTML(html_str)
next_page = ret.xpath('//a[text()="下一頁>"]/@href')
print(next_page)
return next_page
def run(self):
html_str = self.parse_url(self.start_url)
if html_str is not None:
# html_str = re.sub(r'<!--|-->', "", html_str)
item_list = self.analysis(html_str)
self.get_pic(item_list)
ret = self.get_next(html_str)
while len(ret)>0:
ret[0] = "http:"+ ret[0]
html_str = self.parse_url(ret[0])
# html_str = re.sub(r'<!--|-->', "", html_str)
item_list = self.analysis(html_str)
self.get_pic(item_list)
ret = self.get_next(html_str)
if __name__ == '__main__':
tieba = Tieba()
tieba.run()
Captcha recognition
- URL unchanged, captcha unchanged
    - Request the captcha's address, get the response, recognize it
- URL unchanged, captcha changes
    - Idea: when the server returns a captcha it associates it with that user's session, and when the user sends the post request the server compares the captcha sent in the post with the one stored server-side for that user
    - 1. instantiate a session
    - 2. use the session to request the login page and get the captcha's address
    - 3. use the session to request the captcha, recognize it
    - 4. use the session to send the post request (a sketch follows this list)
- Logging in with selenium and hitting a captcha
    - URL unchanged, captcha unchanged: same as above
    - URL unchanged, captcha changes
        - 1. use selenium to request the login page and get the captcha's address
        - 2. take the cookies from the selenium driver, hand them to requests to fetch the captcha, recognize it
        - 3. type in the captcha and click login
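A minimal sketch of steps 1-4 above with a requests session, assuming a hypothetical login page: the URLs, xpath and form field names are placeholders, and indetify() stands in for whatever captcha-recognition service is available (the same helper is imported in the douban/selenium example further down).

# coding=utf-8
# Sketch: session-based login with captcha recognition. login_url, the xpath and the
# form field names describe a hypothetical site, not a real one.
import requests
from lxml import etree
from yundama.dama import indetify  # assumed recognition helper, as in the douban example below

session = requests.Session()
login_url = "http://example.com/login"  # placeholder
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"}

# 1-2. request the login page with the session and pull out the captcha's address
login_html = session.get(login_url, headers=headers).content.decode()
captcha_src = etree.HTML(login_html).xpath("//img[@id='captcha_image']/@src")[0]  # placeholder xpath

# 3. request the captcha with the same session, so it matches the captcha stored server-side for this session
captcha_code = indetify(session.get(captcha_src, headers=headers).content)

# 4. send the post request with the same session; the field names are placeholders
post_data = {"username": "user", "password": "password", "captcha": captcha_code}
response = session.post(login_url, data=post_data, headers=headers)
print(response.status_code)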
Notes on using selenium
- Getting text and attributes
    - Locate the element first, then call `.text` or `get_attribute` to read them
- The page data selenium sees is the content of the elements panel in the browser
- The difference between find_element and find_elements
    - find_element returns a single element and raises an error if there is none
    - find_elements returns a list, which is empty if nothing matches
    - When checking whether there is a next page, use find_elements and look at the length of the returned list (see the sketch after this list)
- If the page contains an iframe or frame, call driver.switch_to.frame to switch into it before locating elements inside it
- When selenium requests the first page it waits for the page to finish loading before returning data, but after clicking to the next page it returns data immediately; the data may not be loaded yet and the code may fail, so add something like time.sleep(3)
- selenium's find_element_by_class_name only accepts a single class value; you cannot pass several classes at once
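A small sketch of the find_elements-based next-page check and the post-click sleep mentioned above; the URL and the link text are placeholders.

# coding=utf-8
# Sketch: use find_elements (plural) to check whether a next-page link exists, and
# sleep after clicking so the new page has time to load. The url and link text are placeholders.
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("http://example.com/list")  # placeholder url

while True:
    # ...extract data from the current page here...
    next_page = driver.find_elements_by_link_text("下一頁>")  # a list; empty when there is no next page
    if len(next_page) == 0:
        break
    next_page[0].click()
    time.sleep(3)  # the data may not be loaded right after the click, so wait before extracting
driver.quit()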
# coding=utf-8
from selenium import webdriver
import time
# instantiate a browser
driver = webdriver.Chrome()
# driver = webdriver.PhantomJS()
# set the window size
# driver.set_window_size(1920,1080)
# maximize the window
driver.maximize_window()
# send the request
driver.get("http://www.baidu.com")
# take a screenshot of the page
driver.save_screenshot("./baidu.png")
# ways of locating elements
driver.find_element_by_id("kw").send_keys("python")
driver.find_element_by_id("su").click()
# get the html string from the driver
# print(driver.page_source)  # the content of the elements panel in the browser
print(driver.current_url)
# get cookies from the driver
# cookies = driver.get_cookies()
# print(cookies)
# print("*"*100)
# cookies = {i["name"]:i["value"] for i in cookies}
# print(cookies)
# quit the browser
time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver
import time
import requests
from yundama.dama import indetify
# instantiate the driver
driver = webdriver.Chrome()
driver.get("https://www.douban.com/")
driver.find_element_by_id("form_email").send_keys("784542623@qq.com")
driver.find_element_by_id("form_password").send_keys("zhoudawei123")
# recognize the captcha
captcha_image_url = driver.find_element_by_id("captcha_image").get_attribute("src")
captcha_content = requests.get(captcha_image_url).content
captcha_code = indetify(captcha_content)
print("captcha recognition result:",captcha_code)
# enter the captcha
driver.find_element_by_id("captcha_field").send_keys(captcha_code)
driver.find_element_by_class_name("bn-submit").click()
# get the cookies
cookies = {i["name"]:i["value"] for i in driver.get_cookies()}
print(cookies)
time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver
driver = webdriver.Chrome()
# driver.get("http://neihanshequ.com/")
driver.get("https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python&rn=&oq=&rsv_pq=87739988000939bf&rsv_t=b194dxdCny6hrJFXQrh4D6bavkKZwfpeT4s7j7V6AvGfiiAvTgxqGAvWbCM&rqlang=cn")
# ret1 = driver.find_elements_by_xpath("//ul[@id='detail-list']/li")
# # print(ret1)
# # print(ret1)
# for li in ret1:
# print(li.find_element_by_xpath(".//h1/p").text)
# print(li.find_element_by_xpath(".//a[@class='image share_url1']").get_attribute("href"))
#find_element_by_link_text
print(driver.find_element_by_link_text("下一頁>").get_attribute("href"))
# find_element_by_partial_link_text: matches the a tag whose text contains "下一頁"
print(driver.find_element_by_partial_link_text("下一頁").get_attribute("href"))
driver.quit()
# coding=utf-8
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get("https://mail.qq.com/")
# switch into the iframe
driver.switch_to.frame("login_frame")
driver.find_element_by_id("u").send_keys("12312312312")
time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get("https://www.bilibili.com/v/kichiku/mad/#/all/stow")
print(driver.find_element_by_xpath("//ul[@class='vd-list mod-2']/li//a[@class='title']").text)
# go to the next page
driver.find_element_by_xpath("//button[@class='nav-btn iconfont icon-arrowdown3']").click()
time.sleep(3)
print(driver.find_element_by_xpath("//ul[@class='vd-list mod-2']/li//a[@class='title']").text)
driver.quit()