Focused crawler: data parsing
The principle of data parsing: first locate the target tags, then extract the text or attribute values stored in them.
Ways to implement data parsing in Python: regular expressions, bs4 (BeautifulSoup), and xpath (lxml).
Parsing data with regular expressions: scraping the image data from Qiushibaike
# Parse data with regex: scrape the image data from Qiushibaike
import requests
import re, os
from urllib import request

if not os.path.exists('./qiutuLibs'):
    os.mkdir('./qiutuLibs')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# Define a generic URL template; %d is swapped for the page number
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5201079'
for page in range(1, 36):
    # The complete URL for one specific page number
    new_url = url % page
    # Use a general-purpose request to fetch the whole page source for the current URL
    page_text = requests.get(url=new_url, headers=headers).text
    # Data parsing: all of the image addresses
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    # re.S is the regex flag for handling newlines (it lets '.' match them)
    img_src = re.findall(ex, page_text, re.S)
    for src in img_src:
        src = 'https:' + src
        # Slice off the last segment of the path to use as the image name
        img_name = src.split('/')[-1]
        img_path = './qiutuLibs/' + img_name
        request.urlretrieve(src, img_path)
        print(img_name, 'downloaded successfully')
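A quick aside on re.S, since it is what makes the pattern above work across lines. A minimal, self-contained sketch with a made-up HTML fragment (not taken from the target site):

import re

html = '<div class="thumb">\n<img src="//pic.example.com/a.jpg" alt="demo">\n</div>'
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
# Without re.S, '.' refuses to match the newline, so nothing is found
print(re.findall(ex, html))        # []
# With re.S, '.' matches newlines too, and the src is captured
print(re.findall(ex, html, re.S))  # ['//pic.example.com/a.jpg']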
Parsing principle (bs4): instantiate a BeautifulSoup object, load the page source into it, then call the object's attributes and methods to locate tags and extract data.
Installing the environment: pip install bs4, plus pip install lxml for the parser used below.
Instantiating the BeautifulSoup object: BeautifulSoup(fp, 'lxml') for a local file object, or BeautifulSoup(page_text, 'lxml') for page source fetched over the network.
Usage example
from bs4 import BeautifulSoup

fp = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(fp, 'lxml')

# Tag positioning
# soup.tagName: locates the first occurrence of that tag in the source
# print(soup.div)
# soup.find('tagName', attrName='value'): attribute positioning
# print(soup.find('div', class_='tang'))      # the attribute needs a trailing underscore; returns a single match
# print(soup.find_all('div', class_='tang'))  # the attribute needs a trailing underscore; returns all matches
# select('selector'): tag, class, id, and hierarchy selectors
# print(soup.select('#feng'))          # id selector
# print(soup.select('.tang > ul > li'))  # class selector
# print(soup.select('.tang li'))       # a space spans multiple levels; '>' spans exactly one level

# Data extraction
# print(soup.p.string)      # only the text directly nested in the tag
# print(soup.p.text)        # all of the text inside the tag
# print(soup.p.get_text())  # all of the text inside the tag
# Difference: string returns only the direct text (None if the tag holds other tags); text and get_text() return everything nested
# print(soup.select('.song')[0].get_text())

# Taking attributes
# print(soup.img['src'])
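Because test.html is not reproduced here, the following self-contained sketch (a made-up fragment with hypothetical ids and classes) shows the same calls end to end:

from bs4 import BeautifulSoup

html = '<div class="tang"><ul><li><a id="feng" href="http://www.example.com">qinshimingyue</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.a.string)                         # qinshimingyue: direct text of the first <a>
print(soup.find('div', class_='tang').text)  # all nested text under the div
print(soup.select('.tang > ul > li > a')[0]['href'])  # http://www.example.com: taking an attribute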
Scrape the full text of the novel Romance of the Three Kingdoms
# Scrape the full text of the novel Romance of the Three Kingdoms
import requests
from bs4 import BeautifulSoup

url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text

# Data parsing: the chapter titles
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu>ul>li>a')
fp = open('./sanguo.txt', 'w', encoding='utf-8')
for a in a_list:
    title = a.string
    detail_url = 'http://www.shicimingju.com' + a['href']
    # Fetch the detail page's data
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    soup = BeautifulSoup(detail_page_text, 'lxml')
    content = soup.find('div', class_="chapter_content").text
    fp.write(title + ':' + content + '\n')
    print(title, 'download finished')
    fp.flush()
fp.close()
# bs4's drawback is that it only works in Python, but it can extract text content together with its tags
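As a small design note, the manual flush/close pair can be replaced with a with block, which flushes and closes the file even if one of the requests in the loop raises. A sketch with placeholder data standing in for the parsed chapters:

# Placeholder chapter data; in the real script this comes from the parsing loop above
chapters = [('Chapter 1', '...'), ('Chapter 2', '...')]
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for title, content in chapters:
        fp.write(title + ':' + content + '\n')
# No explicit fp.flush()/fp.close() needed: the with block handles both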
Parsing principle (xpath): instantiate an etree object, load the page source into it, then call the object's xpath method with XPath expressions to locate tags and extract data.
Installing the environment: pip install lxml
Instantiating the etree object: etree.parse('filePath') for a local HTML file, or etree.HTML(page_text) for page source fetched over the network.
Using the xpath method
from lxml import etree  # import the module

tree = etree.parse('./test.html')
# Start at the root node and search level by level for the target tag
title = tree.xpath('/html/head/title')
titles = tree.xpath('//title')  # '//' does not start the search from the root node

# Attribute positioning
div = tree.xpath('//div[@class="song"]')

# Index positioning (indices start at 1)
li = tree.xpath('//div[@class="tang"]/ul/li[5]')
lis = tree.xpath('//div[@class="tang"]//li[5]')

# Taking values: /text() gives the direct text content, //text() gives all text content
a = tree.xpath('//div[@class="tang"]/a[1]/text()')
print(''.join(a))
divs = tree.xpath('//div[@class="song"]//text()')

# Taking attributes
a_href = tree.xpath('//div[@class="song"]/a[1]/@href')
print(a_href[0])
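test.html is again not included, so here is a self-contained check of /text() versus //text() against a made-up fragment:

from lxml import etree

tree = etree.HTML('<div class="tang"><a href="http://www.example.com">li<span>bai</span></a></div>')
print(tree.xpath('//div[@class="tang"]/a/text()'))   # ['li']: direct text only
print(tree.xpath('//div[@class="tang"]/a//text()'))  # ['li', 'bai']: all nested text
print(tree.xpath('//div[@class="tang"]/a/@href'))    # ['http://www.example.com']: taking an attribute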
Scrape job titles, salaries, and company names from Boss Zhipin
# Scrape job titles, salaries, and company names from Boss Zhipin
import requests
from lxml import etree

url = 'https://www.zhipin.com/c101010100/?query=java&page={}&ka=page-{}'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
fp = open('./java.txt', 'w', encoding='utf-8')
for page in range(1, 11):
    new_url = url.format(page, page)
    page_text = requests.get(url=new_url, headers=headers).text
    # Data parsing
    tree = etree.HTML(page_text)
    # Extraction: an xpath over the whole page source does not need a leading '.'
    li_list = tree.xpath('//div[@class="job-list"]/ul/li')
    for li in li_list:
        # An xpath scoped to one tag's local content does need the leading '.'
        job_title = li.xpath('.//div[@class="job-title"]/text()')[0]
        salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()')[0]
        company = li.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
        fp.write(job_title + ' ' + salary + ' ' + company + '\n')
        fp.flush()
fp.close()
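The global-versus-local distinction is easy to get wrong, so here is a minimal, self-contained illustration (the HTML is made up, not the Boss page):

from lxml import etree

tree = etree.HTML('<div class="job-list"><ul><li><span>python dev</span></li><li><span>java dev</span></li></ul></div>')
li_list = tree.xpath('//div[@class="job-list"]/ul/li')  # global: searches the whole document, no leading '.'
for li in li_list:
    # local: the leading '.' restricts the search to this li's subtree
    print(li.xpath('.//span/text()')[0])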
Scrape all of the images from http://pic.netbian.com/4kmeinv/
import requests
from lxml import etree
from urllib import request
import os

if not os.path.exists('./4k'):
    os.mkdir('./4k')

url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# Fetch the pages (the first page has no index suffix)
for page in range(1, 197):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    # response.encoding = 'utf-8'
    page_text = response.text
    # Data parsing: the image addresses
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # The site serves GBK but requests decoded it as ISO-8859-1, so re-interpret the bytes
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_path = './4k/' + img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, 'downloaded successfully')
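The encode/decode round trip above is worth a note: the server sent GBK bytes, but requests guessed ISO-8859-1, so every byte was decoded into the wrong character. Re-encoding to ISO-8859-1 recovers the original bytes losslessly, which can then be decoded as GBK. A minimal sketch with a made-up string:

raw = '美女'.encode('gbk')          # the bytes as the server actually sent them
mangled = raw.decode('iso-8859-1')  # what requests produced with the wrong guess
fixed = mangled.encode('iso-8859-1').decode('gbk')  # the lossless round trip
print(mangled, '->', fixed)         # ÃÀÅ® -> 美女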
Scrape the names of all cities nationwide from https://www.aqistudy.cn/historydata/
# Scrape the names of all cities nationwide from https://www.aqistudy.cn/historydata/
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
# Data parsing: the names of all cities and of the hot cities
tree = etree.HTML(page_text)
# host_city_name = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
# all_city_name = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
# Several xpath expressions can be merged into one with the bitwise-or '|', which makes them easy to extend
city_name = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
print(city_name)
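A self-contained look at the '|' union (a hypothetical fragment, not the AQI page):

from lxml import etree

tree = etree.HTML('<ul><li><a>Beijing</a></li></ul><div><a>Shanghai</a></div>')
# '|' merges the results of two independent expressions, in document order
print(tree.xpath('//li/a/text() | //div/a/text()'))  # ['Beijing', 'Shanghai']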
Scrape 58.com second-hand housing listings (house title, price, and the overview that lives on the detail page)
# Scrape 58.com second-hand housing listings (house title, price, and the overview that lives on the detail page)
import requests
from lxml import etree

# Build a URL template
url = 'https://bj.58.com/shahe/ershoufang/pn%d/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
for page in range(1, 3):
    new_url = url % page
    page_text = requests.get(url=new_url, headers=headers).text
    # Data parsing: the detail page URL, house title, and price
    tree = etree.HTML(page_text)
    li_list = tree.xpath('/html/body/div[5]/div[5]/div[1]/ul/li')
    for li in li_list:
        title = li.xpath('./div[@class="list-info"]/h2/a/text()')[0]
        # //text() returns every text node; join them into a single string
        price = ''.join(li.xpath('./div[3]//text()'))
        detail_url = li.xpath('./div[2]/h2/a/@href')[0]
        # Request the detail page and parse out the overview value
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_page_text)
        desc = ''.join(detail_tree.xpath('//*[@id="generalSituation"]//text()'))
        print(title, price, desc)
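The join-on-//text() idiom shows up twice above; a minimal sketch of why the join is needed (made-up fragment):

from lxml import etree

tree = etree.HTML('<div class="price"><b>300</b><span>wan</span></div>')
# //text() returns a list with one entry per text node, so join them into one string
print(tree.xpath('//div[@class="price"]//text()'))           # ['300', 'wan']
print(''.join(tree.xpath('//div[@class="price"]//text()')))  # 300wan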
Download all of the images on the current page of http://sc.chinaz.com/tupian/rentiyishu.html (the page uses lazy loading)
# Download all of the images on the current page of http://sc.chinaz.com/tupian/rentiyishu.html (lazy loading)
import requests
from lxml import etree

url = 'http://sc.chinaz.com/tupian/rentiyishu.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# Fetch the page text
response = requests.get(url=url, headers=headers)
# Set the encoding explicitly before reading .text, otherwise the Chinese image names come out garbled
response.encoding = 'utf-8'
page_text = response.text

# Parse the page data (get the image links on the page)
# Build the etree object
tree = etree.HTML(page_text)
div_list = tree.xpath('//*[@id="container"]/div')
# Parse out the image address and the image name
for div in div_list:
    # src2 is a pseudo-attribute used for lazy loading; the real src is filled in only when the image scrolls into view
    image_url = div.xpath('.//img/@src2')
    image_name = div.xpath('.//img/@alt')
    print(image_url)   # print the image link
    print(image_name)  # print the image name
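To actually save the files rather than just print their addresses, the urlretrieve approach from the earlier examples carries over. A sketch continuing from the loop above, assuming src2 holds an absolute image URL on this site (prefix the host if it turns out to be relative):

from urllib import request
import os

if not os.path.exists('./chinaz'):
    os.mkdir('./chinaz')
for div in div_list:
    image_url = div.xpath('.//img/@src2')[0]           # take the first (only) match from the list
    image_name = div.xpath('.//img/@alt')[0] + '.jpg'
    request.urlretrieve(image_url, './chinaz/' + image_name)  # assumes src2 is an absolute URL
    print(image_name, 'downloaded successfully')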