import re

import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
}


def handler_url(url):
    # Headers must be passed via the `headers` keyword; passing the dict
    # positionally would send it as query parameters instead.
    response = requests.get(url, headers=HEADERS)
    text = response.text
    # Each poem card: title in <b> inside div.cont, the year and author in the
    # two <a> links of p.source, and the poem body in div.contson.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    years = re.findall(r'<p\sclass="source"><a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    contents_tmp = re.findall(r'<p\sclass="source">.*?<div\sclass="contson"\s.*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for c in contents_tmp:
        # Strip any remaining inline tags (<br>, <p>, ...) from the body.
        x = re.sub(r'<.*?>', '', c)
        contents.append(x.strip())
    poems = []
    for title, year, author, content in zip(titles, years, authors, contents):
        poems.append({
            "title": title,
            "year": year,
            "author": author,
            "content": content,
        })
    for poem in poems:
        print(poem)


def main():
    base_url = "https://www.gushiwen.org/default_{}.aspx"
    for i in range(1, 7):
        url = base_url.format(i)
        handler_url(url)


if __name__ == '__main__':
    main()
import re

import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
}


def get_detail_page(url):
    # Collect the detail-page URLs of the reviews listed on one index page.
    resp = requests.get(url, headers=HEADERS)
    text = resp.text
    detail_pages = re.findall(r'<div\sclass="main-bd">.*?<a\shref="(.*?)">.*?</a>', text, re.DOTALL)
    return detail_pages


def handle_detail_page(url):
    resp = requests.get(url, headers=HEADERS)
    text = resp.text
    titles = re.findall(r'<div\sclass="article">.*?<span.*?>(.*?)</span>', text, re.DOTALL)
    # Grab the review body; the greedy .*</p> is meant to span every
    # paragraph inside the link-report div.
    articles_tmp = re.findall(r'<div\sid="link-report">.*?<p>.*</p>', text, re.DOTALL)
    articles = []
    for a in articles_tmp:
        # Strip all tags, then drop line breaks and stray spaces.
        x = re.sub(r'<.*?>', '', a)
        x = x.replace('\r', '').replace('\n', '').replace(' ', '')
        articles.append(x)
    book = {
        'title': titles,
        'article': articles,
    }
    print(book)


def main():
    urls = []
    base_url = "https://book.douban.com/review/best/?start={}"
    for i in range(0, 41, 20):
        url = base_url.format(i)
        urls.append(url)
    for url in urls:
        detail_urls = get_detail_page(url)
        for detail_url in detail_urls:
            handle_detail_page(detail_url)


if __name__ == '__main__':
    main()
Neither of the sites in these two small crawler projects uses any anti-scraping measures, so fetching the pages is fairly easy; the real work is in parsing the data. I have been studying regular expressions recently, so I picked these two small projects for practice. Both could also be parsed with BeautifulSoup or the lxml library; I will try those other parsing approaches when I find the time.
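As a rough illustration of that alternative, here is a minimal BeautifulSoup sketch of the poem extraction from the first project. It assumes the page structure implied by the regexes above (a div.cont block holding a <b> title, two <a> links inside p.source, and the body in div.contson); the selectors are untested guesses at the live markup, not verified against gushiwen.org.

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
}


def parse_poems(url):
    resp = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")
    poems = []
    for cont in soup.find_all("div", class_="cont"):  # assumed poem-card container
        title_tag = cont.find("b")
        source_links = cont.select("p.source a")      # assumed year/author links
        body = cont.find("div", class_="contson")
        if not (title_tag and body and len(source_links) >= 2):
            continue  # skip div.cont blocks that are not poem cards
        poems.append({
            "title": title_tag.get_text(strip=True),
            "year": source_links[0].get_text(strip=True),
            "author": source_links[1].get_text(strip=True),
            # Keep line breaks between verses instead of regex-stripping tags.
            "content": body.get_text("\n", strip=True),
        })
    return poems

The nice part compared to the regex version is that get_text() replaces the whole strip-tags-and-whitespace loop, and the title, year, author, and content come from the same card, so they cannot drift out of alignment the way four independent findall() lists can.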