import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}
Method 1: requests
url = 'https://pic.qiushibaike.com/system/pictures/12217/122176396/medium/OM37E794HBL3OFFF.jpg'
img_data = requests.get(url=url, headers=headers).content  # .content returns the response body as bytes
with open('./123.jpg', 'wb') as fp:
    fp.write(img_data)
Method 2: urllib
from urllib import request

url = 'https://pic.qiushibaike.com/system/pictures/12217/122176396/medium/OM37E794HBL3OFFF.jpg'
request.urlretrieve(url, './456.jpg')
Method 2 cannot use the UA-spoofing mechanism shown above, because urlretrieve accepts no headers argument. urllib is an older network-request module: before requests appeared, sending requests was done with urllib.
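That limitation only applies to urlretrieve itself; urllib can still carry a custom User-Agent through a Request object. A minimal sketch, reusing the url and headers defined above (the output filename './456_ua.jpg' is illustrative):

from urllib import request

# Build a Request that carries the same UA header, then write the bytes manually;
# urlretrieve takes no headers, so urlopen is used instead.
req = request.Request(url=url, headers=headers)
with request.urlopen(req) as resp, open('./456_ua.jpg', 'wb') as fp:
    fp.write(resp.read())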
Task: scrape the funny-picture (糗圖) data from Qiushibaike
import re
import os

dir_name = './qiutuLibs'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url, headers=headers).text

# Parse the image addresses out of the page with a regex; re.S lets '.' match
# newlines, since each <div class="thumb"> block spans multiple lines
ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
img_src_list = re.findall(ex, page_text, re.S)

for src in img_src_list:
    src = 'https:' + src
    img_name = src.split('/')[-1]
    img_path = dir_name + '/' + img_name
    # Send a separate request to each image address to fetch the image data
    request.urlretrieve(src, img_path)
    print(img_name, 'downloaded successfully!!!')
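Note the mix here: the list page is fetched with requests (carrying the UA header), but each image is pulled with urlretrieve, which carries no UA. A hedged alternative keeps the whole job on requests, replacing the urlretrieve call inside the loop:

# Sketch: fetch each image with requests so the UA header applies to the image request too
img_data = requests.get(url=src, headers=headers).content
with open(img_path, 'wb') as fp:
    fp.write(img_data)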
from bs4 import BeautifulSoup

fp = open('html/test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(fp, 'lxml')  # load the page source to be parsed into this object

print(soup.p)                          # first <p> tag in the document
soup.find('div', class_='song')        # first tag matching the class
soup.find_all('div', class_='song')    # all tags matching the class
soup.select('.tang')                   # CSS class selector
soup.select('#feng')                   # CSS id selector
soup.select('.tang > ul > li')         # direct children only
soup.select('.tang li')                # all descendants
li_6 = soup.select('.tang > ul > li')[6]   # the seventh <li> (index 6)
i_tag = li_6.i                         # the <i> child of that <li>
i_tag.string                           # direct text of the tag only
soup.find('div', class_='tang').text   # all text inside, including children
soup.find('a', id="feng")['href']      # attribute access
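Since html/test.html is not included with this section, a self-contained sketch with hypothetical markup shows the same calls:

from bs4 import BeautifulSoup

# Hypothetical markup standing in for test.html
html = '<div class="tang"><ul><li><a id="feng" href="http://example.com">poem</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.tang > ul > li'))      # [<li>...</li>]
print(soup.find('a', id='feng')['href'])   # http://example.com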
Task: scrape the full text of Romance of the Three Kingdoms (三國演義) from http://www.shicimingju.com/book/sanguoyanyi.html
# From the home page, parse the chapter titles and each chapter's detail-page url
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')

fp = open('sanguo.txt', 'w', encoding='utf-8')
for a in a_list:
    detail_url = 'http://www.shicimingju.com' + a['href']
    chap_title = a.string
    # Request each chapter's detail page and parse the chapter content out of it
    detail_page_text = requests.get(detail_url, headers=headers).text
    soup = BeautifulSoup(detail_page_text, 'lxml')
    chap_content = soup.find('div', class_="chapter_content").text
    fp.write(chap_title + ':' + chap_content + '\n')
    print(chap_title, 'scraped successfully!')
fp.close()
from lxml import etree

tree = etree.parse('./test.html')
tree.xpath('/html/head/meta')[0]   # absolute path
tree.xpath('//meta')[0]            # relative path: locates every meta anywhere in the page source
tree.xpath('/html//meta')[0]
# attribute-based location
tree.xpath('//div[@class="song"]')
# index-based location (xpath indexes start at 1)
tree.xpath('//div[@class="tang"]/ul/li[3]')
tree.xpath('//div[@class="tang"]//li[3]')
# extracting text
tree.xpath('//p[1]/text()')
tree.xpath('//div[@class="song"]//text()')
# extracting attributes
tree.xpath('//a[@id="feng"]/@href')
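One gotcha worth flagging: etree.parse uses a strict XML parser by default and tends to fail on real-world HTML. Passing an HTML parser, or feeding a string to etree.HTML as the later examples do, is the usual workaround. A minimal sketch:

from lxml import etree

# Parse a local HTML file leniently instead of as strict XML
tree = etree.parse('./test.html', etree.HTMLParser())
# Or build a tree straight from a string of page source
tree = etree.HTML('<div class="song"><a id="feng" href="http://example.com">x</a></div>')
print(tree.xpath('//a[@id="feng"]/@href'))  # ['http://example.com']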
Task: scrape job postings from Boss直聘
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'cookie': 'lastCity=101010100; __c=1566877560; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1566877561; _uab_collina=156687756118178796315757; __l=l=%2Fwww.zhipin.com%2F&r=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DidbSvNzz2fLSl1WXiEmtINauVHUZYSNqejHny725pc5RTwaHqh5uDx1LewpyGmaT%26wd%3D%26eqid%3Dbadf667700040677000000025d64a772&friend_source=0&friend_source=0; __zp_stoken__=91d9QItKEtUk5dMMnDG7lwzq8mBW1g%2FkEsFOHXIi%2FwMd%2FPRRXc%2FPMKjsDYwsfC4b7vAT3FVnTmYBjGp8gW1OeZ5TdA%3D%3D; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1566879753; __a=69160831.1566877560..1566877560.16.1.16.16'
}
url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
page_text = requests.get(url, headers=headers).text

# Parse the data
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="job-list"]/ul/li')
for li in li_list:
    # Extract the relevant data from the partial page source that this li represents.
    # When an xpath expression is used inside a loop, it must start with ./ or .//
    detail_url = 'https://www.zhipin.com' + li.xpath('.//div[@class="info-primary"]/h3/a/@href')[0]
    job_title = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()')[0]
    salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()')[0]
    company = li.xpath('.//div[@class="info-company"]/div/h3/a/text()')[0]
    # Request the detail-page url and parse out the job description
    detail_page_text = requests.get(detail_url, headers=headers).text
    tree = etree.HTML(detail_page_text)
    job_desc = tree.xpath('//div[@class="text"]//text()')
    job_desc = ''.join(job_desc)
    print(job_title, salary, company, job_desc)
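The ./ rule in that loop is worth isolating: calling xpath on an element with a //-prefixed expression still searches the whole document, not just that element. A minimal sketch with made-up markup:

from lxml import etree

tree = etree.HTML('<ul><li><a>first</a></li><li><a>second</a></li></ul>')
for li in tree.xpath('//li'):
    print(li.xpath('./a/text()')[0])   # './' scopes the search to this <li>
    # li.xpath('//a/text()') would match every <a> in the document instead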
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}
url = 'https://www.qiushibaike.com/text/page/4/'
page_text = requests.get(url, headers=headers).text

tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="content-left"]/div')
for div in div_list:
    # The | union matches either of two possible author markups on the page
    author = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0]
    content = div.xpath('.//div[@class="content"]/span//text()')
    content = ''.join(content)
    print(author, content)
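The | in the author expression is XPath's union operator, so one query covers both layouts in a single pass. A self-contained sketch with hypothetical markup standing in for the two cases:

from lxml import etree

tree = etree.HTML('<div><a><h2>user1</h2></a></div><div><span><h2>anon</h2></span></div>')
# One union expression collects results from both branches
print(tree.xpath('//div/a/h2/text() | //div/span/h2/text()'))  # ['user1', 'anon']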
# Define a generic url template
url = 'http://pic.netbian.com/4kmeishi/index_%d.html'
for page in range(1, 3):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeishi/'
    else:
        new_url = url % page
    response = requests.get(new_url, headers=headers)
    #response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/b/text()')[0]
        # The page is GBK-encoded; re-encode the mis-decoded text and decode it as gbk
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # Download and save the image ('.jpg' suffix assumed from the site's image urls)
        img_data = requests.get(img_src, headers=headers).content
        with open('./' + img_name + '.jpg', 'wb') as fp:
            fp.write(img_data)
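Why the encode/decode dance on img_name: when the server declares no charset, requests typically decodes the body as ISO-8859-1, so every GBK byte becomes one Latin-1 character. Re-encoding recovers the original bytes, and decoding those as GBK yields the real Chinese text. A tiny illustration:

# '±±¾©' is what the GBK bytes b'\xb1\xb1\xbe\xa9' look like after a Latin-1 decode
garbled = '±±¾©'
print(garbled.encode('iso-8859-1').decode('gbk'))  # 北京 ("Beijing")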