A First Look at Python Web Scraping

Fetching a page with the requests library

import requests

# Target page to fetch
target = 'https://www.csdn.net/'
req = requests.get(target)
print(req.text)

Output (HTML, excerpted):

</head>
<body data-category="home" data-host_type="www">
    <script id="toolbar-tpl-scriptId" prod="download" skin="black" src="//csdnimg.cn/public/common/toolbar/js/content_toolbar.js" type="text/javascript" domain="http://blog.csdn.net"></script>
    <div class="container clearfix">
      <nav id="nav" class="clearfix">
        <div class="clearfix">
        <div class="nav_com">
          <ul>
                  <li class="active"><a href="/">推薦</a></li>
                      <li class=""><a href="/nav/watchers">關注</a></li>
                      <li class=""><a href="/nav/career">程序人生</a></li>

Using a crawler to fetch the HTML files of a CSDN blog

import requests
import re
import time
import numpy as np

Fetch the HTML content of a given URL:

def getHtml(url):
    # Spoof a Baiduspider User-Agent and give up after 2 seconds
    res = requests.get(url, timeout=2, headers={'User-Agent': 'Baiduspider'})
    return res.text
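
With timeout=2, a slow server makes the call raise instead of hanging; requests.exceptions.Timeout is the exception class to catch. A small sketch:

try:
    html = getHtml('https://jkchen.blog.csdn.net/')
except requests.exceptions.Timeout:
    # The server did not answer within 2 seconds
    html = None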

Extract the links to the individual posts from a CSDN article-list page:

def getURL(list_html):
    # Each post link on the list page is wrapped in
    # <h4 class=""> ... <a href="..." target="_blank">
    begin = """<h4 class=\"\">
        <a href=\""""
    end = """\" target=\"_blank\">"""
    # Lookbehind/lookahead keep only the URL itself; non-greedy so a
    # line containing two anchors cannot be merged into one match
    r = r'(?<=' + begin + ').*?(?=' + end + ')'

    res = re.findall(r, list_html)
    return res
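
A quick check of getURL against a hand-written fragment of the list-page markup (the URL is made up for illustration):

sample = """<h4 class="">
        <a href="https://blog.csdn.net/example/article/details/100000001" target="_blank">
        some post title</a></h4>"""
print(getURL(sample))
# ['https://blog.csdn.net/example/article/details/100000001']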

Save the HTML to disk (the url parameter here is the local file path):

def saveFile(file, url):
    # 'url' here is a local path; write the page out as UTF-8
    with open(url, 'w', encoding='UTF-8') as fout:
        fout.write(file)


def loadFile(url):
    # Read a previously saved HTML file back in
    with open(url, 'r', encoding='utf-8') as fread:
        return fread.read()
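
A quick round trip through the two helpers, assuming the working directory is writable (test.html is a throwaway name):

saveFile('<html><title>test</title></html>', 'test.html')
print(loadFile('test.html'))  # <html><title>test</title></html>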

Extract the title of an HTML document:

def getTitle(html):
    # Keep everything between <title> and the last underscore, which
    # drops the "_author's blog-CSDN博客" suffix CSDN appends to titles
    return re.search(r'(?<=<title>).*(?=_)', html)[0]
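
For example, on a CSDN-style title (a made-up string in the site's usual format):

html = '<title>python爬蟲初探_jkchen的博客-CSDN博客</title>'
print(getTitle(html))  # python爬蟲初探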


# All blog post URLs
blog_urls = []


# Collect the blog post links
def Init():
    # Number of pages in the article list
    page = 36
    for index in range(1, page + 1):
        list_url = 'https://jkchen.blog.csdn.net/article/list/' + str(index)
        list_html = getHtml(list_url)
        blog_url_ar = getURL(list_html)
        for url in blog_url_ar:
            blog_urls.append(url)
    # Cache the URL list so later runs can skip the crawl
    np.save('blog_url.npy', blog_urls)
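
np.save stores the Python list as a NumPy unicode-string array, which is why the np.load in the main block below recovers the URLs without needing pickle. A minimal sketch (placeholder URLs):

np.save('blog_url.npy', ['https://a.example/1', 'https://a.example/2'])
urls = np.load('blog_url.npy')
print(urls.dtype, list(urls))  # a '<U...' string dtype and the original URLs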


if __name__ == '__main__':
    # Whether the URL index needs to be rebuilt
    refresh = False
    if refresh:
        Init()

    # Whether to save the HTML sources (the folder must already exist)
    toSave = False
    saveUrl = 'HTMLs/'

    blog_urls = np.load('blog_url.npy')

    epoch = 100
    for T in range(epoch):
        np.random.shuffle(blog_urls)
        index = 0
        for url in blog_urls:
            index += 1
            # Retry until the request succeeds; CSDN sometimes blocks bursts
            while True:
                try:
                    html = getHtml(url)
                    break
                except requests.RequestException:
                    print("Banned, and retry.")
                    time.sleep(4)
            title = getTitle(html)

            if toSave:
                saveFile(html, saveUrl + title + '.html')
            print('epoch: {}, index: {}, title: {}'.format(T + 1, index, title))
            # Random pause of up to 10 s between requests to stay polite
            time.sleep(10 * np.random.rand())