使用requests庫進行網頁抓取
import requests

# Minimal example: fetch the CSDN home page and print the HTML the server returns.
target = 'https://www.csdn.net/'
# Without a timeout, requests waits indefinitely if the server never answers.
req = requests.get(target, timeout=10)
print(req.text)
輸出:
</head> <body data-category="home" data-host_type="www"> <script id="toolbar-tpl-scriptId" prod="download" skin="black" src="//csdnimg.cn/public/common/toolbar/js/content_toolbar.js" type="text/javascript" domain="http://blog.csdn.net"></script> <div class="container clearfix"> <nav id="nav" class="clearfix"> <div class="clearfix"> <div class="nav_com"> <ul> <li class="active"><a href="/">推薦</a></li> <li class=""><a href="/nav/watchers">關注</a></li> <li class=""><a href="/nav/career">程序人生</a></li>
使用爬蟲爬取CSDN博客HTML文件
import requests import re import time import numpy as np
def getHtml(url):
    """Download ``url`` and return the response body as text.

    The request uses a 2-second timeout and sends ``Baiduspider`` as the
    User-Agent header. Exceptions raised by ``requests.get`` (timeouts,
    connection errors, ...) are not handled here and propagate to the caller.
    """
    spider_headers = {'User-Agent': 'Baiduspider'}
    response = requests.get(url, timeout=2, headers=spider_headers)
    return response.text
# Matches the article link that follows an empty-class <h4> heading.
# - A capture group replaces the original look-behind/look-ahead pair so the
#   whitespace between the two tags may be any run of spaces/newlines.
# - ``[^"\n]*`` stops at the closing quote of the href, so several links on
#   one physical line are returned separately instead of being swallowed by
#   a greedy ``.*``.
_ARTICLE_LINK_RE = re.compile(
    r'<h4 class="">\s+<a href="([^"\n]*)" target="_blank">'
)


def getURL(list_html):
    """Extract article URLs from the HTML of an article-list page.

    Args:
        list_html: HTML text of one list page.

    Returns:
        A list with the ``href`` value of every
        ``<h4 class=""> <a href="..." target="_blank">`` link, in document
        order. The list is empty when nothing matches.
    """
    return _ARTICLE_LINK_RE.findall(list_html)
def saveFile(file, url):
    """Write the text ``file`` to the path ``url`` using UTF-8.

    Args:
        file: Text content to write (despite the name, this is a string,
            not a file object).
        url: Destination file path. An existing file is overwritten.
    """
    # ``with`` guarantees the handle is closed even if ``write`` raises.
    with open(url, 'w', encoding='UTF-8') as fout:
        fout.write(file)


def loadFile(url):
    """Read the file at path ``url`` as UTF-8 text and return its content.

    Args:
        url: Path of the file to read.

    Returns:
        The whole file content as a string.
    """
    # ``with`` guarantees the handle is closed even if ``read`` raises.
    with open(url, 'r', encoding='utf-8') as fread:
        return fread.read()
# Captures the text after <title> up to </title>, or up to the end of that
# line when the closing tag sits on a later line.
_TITLE_RE = re.compile(r'<title>(.*?)(?:</title>|$)', re.MULTILINE)


def getTitle(html):
    """Return the page title with its last ``_``-separated suffix removed.

    The match is confined to the ``<title>`` element, so underscores that
    appear later on the same line are ignored. A title without any
    underscore is returned unchanged.

    Args:
        html: HTML text of a page.

    Returns:
        The title text before its last underscore.

    Raises:
        ValueError: If ``html`` contains no ``<title>`` tag.
    """
    found = _TITLE_RE.search(html)
    if found is None:
        raise ValueError('no <title> element found in html')
    return found.group(1).rsplit('_', 1)[0]


# All blog article links collected by Init().
blog_urls = []


def Init(page=36, list_url_prefix='https://jkchen.blog.csdn.net/article/list/'):
    """Collect the blog's article links and save them to ``blog_url.npy``.

    Every list page from 1 to ``page`` is downloaded with ``getHtml``, the
    links found by ``getURL`` are appended to the module-level ``blog_urls``
    list, and the list is then written with ``np.save``.

    Args:
        page: Number of list pages to walk.
        list_url_prefix: URL prefix to which the page number is appended.
    """
    for index in range(1, page + 1):
        list_html = getHtml(list_url_prefix + str(index))
        blog_urls.extend(getURL(list_html))
    np.save('blog_url.npy', blog_urls)


def _fetch_with_retry(url, delay=4):
    """Call ``getHtml(url)`` until it succeeds, sleeping ``delay`` seconds between tries."""
    while True:
        try:
            return getHtml(url)
        # Only network-level failures are retried; a bare ``except`` would
        # also swallow KeyboardInterrupt and make the loop hard to stop.
        except requests.RequestException:
            print("Banned, and retry. ")
            time.sleep(delay)


def _safe_filename(name):
    """Replace characters that are not allowed in file names with ``_``."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)


def main():
    """Repeatedly visit every saved blog URL in random order."""
    # Whether the link index needs to be rebuilt.
    refresh = False
    if refresh:
        Init()
    # Whether to save the HTML source of each page
    # (the target folder must already exist).
    toSave = False
    saveUrl = 'HTMLs/'
    urls = np.load('blog_url.npy')
    epoch = 100
    for T in range(epoch):
        np.random.shuffle(urls)
        for index, url in enumerate(urls, start=1):
            html = _fetch_with_retry(url)
            try:
                title = getTitle(html)
            except ValueError:
                # A page without <title> is skipped instead of aborting the run.
                print('epoch: {}, index: {}, no title found: {}'.format(T + 1, index, url))
            else:
                if toSave:
                    saveFile(html, saveUrl + _safe_filename(title) + '.html')
                print('epoch: {}, index: {}, title: {}'.format(T + 1, index, title))
            # Random pause of up to 10 seconds between requests.
            time.sleep(10 * np.random.rand())


if __name__ == '__main__':
    main()