python3 爬蟲繼續爬筆趣閣

學如逆水行舟,不進則退

今天想看小說,找了半天,沒有資源。

只能本身爬了。

想了半天,差點忘記了這個古老的技能。

撿了一下 url

import requests
from bs4 import BeautifulSoup

# Session cookies captured from a browser visit to biquku.la: the site's
# display-preference cookies ('bcolor', 'font', 'size', ...), the book id
# ('clickbids'), and Baidu analytics cookies (Hm_lvt_* / Hm_lpvt_*).
cookies = {
    'bcolor': 'null',
    'font': 'null',
    'size': 'null',
    'color': 'null',
    'width': 'null',
    'clickbids': '18836',
    'Hm_lvt_30876ba2abc5f5253467ef639ca0ad48': '1571030311,1571030949,1571031218',
    'Hm_lpvt_30876ba2abc5f5253467ef639ca0ad48': '1571031588',
}

# Browser-like request headers so the scraper is not trivially rejected as a bot.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# NOTE(review): removed an unused module-level requests.get() of the index
# page — it fired a network request at import time and its result was never
# read; downloder.get_download_url() performs the same request when needed.
class downloder(object):
    """Scraper for a novel hosted on biquku.la.

    Fetches the chapter index page, collects chapter titles and URLs,
    and appends downloaded chapter text to a local file.

    NOTE(review): the class name is a typo for 'downloader'; it is kept
    unchanged to preserve the public interface for existing callers.
    """

    def __init__(self):
        # Site root and the chapter-index page for book 18836.
        # (Previously these pointed at biqukan.com while every request
        # hard-coded biquku.la; the config is now consistent and used.)
        self.server = 'http://www.biquku.la'
        self.target = 'http://www.biquku.la/18/18836/'
        self.names = []  # chapter titles, in index order
        self.urls = []   # absolute chapter-page URLs, parallel to names
        self.nums = 0    # number of chapters found

    def get_download_url(self):
        """Fetch the index page and populate self.names/urls/nums."""
        req = requests.get(self.target, headers=headers, cookies=cookies)
        # Explicit parser: deterministic output, no bs4 "no parser" warning.
        soup = BeautifulSoup(req.text, 'html.parser')
        # Single parse; the chapter list lives in <div id="list">.
        listing = soup.find('div', id='list')
        links = listing.find_all('a')
        for link in links:
            self.names.append(link.string)
            # hrefs on the index page are relative to the index URL.
            self.urls.append(self.target + link.get('href'))
        self.nums = len(links)

    def writer(self, name, path, text):
        """Append one chapter (title line, body, blank separator) to *path*.

        The file is written as UTF-8; repeated calls accumulate chapters.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            # f.write, not f.writelines: text is a single string, and
            # writelines would just iterate it character by character.
            f.write(text)
            f.write('\n\n')

    def get_contents(self, target):
        """Download one chapter page and return its text.

        Returns the raw <div id="content"> markup with '<br/>' translated
        to newlines (original output format, preserved for compatibility).
        """
        req = requests.get(url=target)
        soup = BeautifulSoup(req.content, 'html.parser')
        content = soup.find('div', id='content')
        return str(content).replace('<br/>', '\n')

if __name__ == '__main__':
    dl = downloder()
    dl.get_download_url()
    print(dl.nums)
    print('開始下載')
    # Walk chapters in index order, appending each to the output file.
    # enumerate(zip(...)) replaces the index-into-parallel-lists loop.
    for i, (name, url) in enumerate(zip(dl.names, dl.urls)):
        dl.writer(name, '用點.txt', dl.get_contents(url))
        print(f'{i}章下載完')
    print("下載完成")

不是什麼難的東西。

不懂的可以留言。

相關文章
相關標籤/搜索