【轉】傑奇 jieqi 多線程自動採集同步源站 python源碼

時間 2019-11-13

原文原文鏈接

該工具爲python代碼，對目標源站進行循環採集，同步更新。
採用多線程採集，保證採集速度。採集線程數可根據自身服務器壓力自由調整。
採用小說字數比對，僅當當前字數大於已採集字數時才認爲該小說有章節更新而進行採集，從而減少不必要的資源浪費。
測試目標站爲17K小說網，自己使用的時候請配合自己網站後臺規則修改siteid。同時根據目標列表頁的代碼，修改正則規則。
該正則規則獲取3個參數 [0]爲書號 [1]爲書名 [2]爲採集時候的字數。

使用本工具後，無需使用關關等採集器，能夠做到單Linux服務器運行小說站。
使用本工具的小說站演示:武林書盟

#!coding=UTF-8
#######VPSKK.com原創作品，轉載請註明出處##########
import commands
import os
import re
import sys
import threading
import time
import urllib
import urllib2

# Number of collector threads that run at the same time.
thlen = 10
# Books waiting to be collected; each entry is (book id, title, word count).
books = []
# Collector thread slots.
tsk = []
# Books already collected: key is the source-site book id, value is the
# word count recorded when the book was handed to a collector.
bookdict = {}
# Host name of the site whose admin batch-collect page is called.
domain = 'www.vpskk.com'
# Admin credentials sent with every batch-collect request.
adminuser = 'admin'
adminpass = '******'
# Collection-rule id passed as `siteid` to the batch-collect page.
siteid = '23'
# notaddnew = '0'

# List page on the source site that is scraped for books to collect.
frompage = 'http://all.17k.com/lib/book/2_0_0_0_0_0_2_0_1.html'

def addbooklist():
    """Poll the source list page forever and queue books that need collecting.

    Each cycle sleeps 30 seconds, downloads ``frompage`` and extracts
    (book id, title, word count) triples with a regular expression. A book
    is appended to the module-level ``books`` queue when it is not already
    queued and either has no entry in ``bookdict`` or its current word
    count is greater than the one recorded there.

    A failed download is reported and the cycle is skipped; the loop never
    returns.
    """
    while 1:
        time.sleep(30)
        print('[' + time.strftime('%H:%M:%S') + '] 採集更新列表線程啓動。')
        start = time.time()
        try:
            response = urllib2.urlopen(frompage, timeout=12)
            try:
                content = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception as err:
            # Best-effort polling: report the failure and retry next cycle.
            print('[' + time.strftime('%H:%M:%S') + '] 採集更新列表失敗：' + str(err))
            continue
        elapsed = time.time() - start
        bookattr = re.findall(r'<a class=\"jt\" rel=\"/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class=\"td5\">([0-9]+)</td>', content, re.M)
        print('[' + time.strftime('%H:%M:%S') + '] 採集更新列表結束，用時：' + str(elapsed) + '秒')
        for newbookid, newbookname, newbooksize in bookattr:
            # Iterate the queue directly instead of by index: entries are
            # removed from `books` by the scheduling code while this thread
            # runs, and a stale index would raise IndexError.
            if any(queued[0] == newbookid for queued in books):
                continue
            if newbookid not in bookdict:
                # Never collected in this run: queue it unconditionally.
                books.append([newbookid, newbookname, newbooksize])
                print('書號:' + newbookid + '最近未採集，添加到待採集列表。')
            elif int(newbooksize) > int(bookdict[newbookid]):
                # Collected before; a larger word count means new chapters.
                books.append([newbookid, newbookname, newbooksize])
                print('書號:' + newbookid + '有更新，舊字數:' + bookdict[newbookid] + ' 新字數:' + newbooksize + ' 添加到待採集列表。')
        print('[' + time.strftime('%H:%M:%S') + '] 採集更新列表線程完成，線程休眠。')
def caiji(bookid, bookname, booksize):
    """Ask the site's batch-collect page to collect one book and wait for it.

    Builds the ``batchcollect.php`` URL from the module-level ``domain``,
    ``siteid``, ``adminuser`` and ``adminpass``, requests it with a one-hour
    timeout and reads the response to the end, discarding the data. It then
    sleeps 5 seconds and prints the elapsed time.

    Args:
        bookid: Source-site book id, as a string.
        bookname: Book title, used only in log output.
        booksize: Word count as a string, used only in log output.

    Any exception raised by the request propagates to the caller.
    """
    print('正在採集書號[' + bookid + '] 書名:' + bookname)
    # NOTE(review): the admin credentials travel in the query string of a
    # plain http:// URL.
    url = ('http://' + domain
           + '/modules/article/admin/batchcollect.php?action=bcollect&siteid=' + siteid
           + '&batchids=' + bookid
           + '&jieqi_username=' + adminuser
           + '&jieqi_userpassword=' + adminpass)
    start = time.time()
    page = urllib2.urlopen(url, timeout=3600)
    try:
        # Drain the response in 8 KiB chunks; the content itself is unused.
        while page.read(8192):
            pass
    finally:
        # Close the connection even when reading fails part-way.
        page.close()
    elapsed = time.time() - start
    time.sleep(5)  # wait 5 seconds after collecting so the full book can be generated
    print('書號[' + bookid + '] 書名:' + bookname + '字數:' + booksize + 'k 採集完成！用時：' + str(elapsed) + '秒')
    # Only the message is printed here; this function does not write to any
    # dictionary itself.
    print('書號[' + bookid + '] 書名:' + bookname + '字數:' + booksize + 'k 添加到最近採集書目字典。')
# Initial fetch: read the list page once and seed the queue with the
# (book id, title, word count) triples found on it.
start = time.time()
response = urllib2.urlopen(frompage, timeout=12)
try:
    content = response.read()
finally:
    # Release the connection even when read() fails.
    response.close()
elapsed = time.time() - start
# Named `initial_books` rather than `getattr` so the builtin is not shadowed.
initial_books = re.findall(r'<a class=\"jt\" rel=\"/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class=\"td5\">([0-9]+)</td>', content, re.M)
print('首次獲取要採集的文章共' + str(len(initial_books)) + '篇，用時：' + str(elapsed) + '秒')
books.extend(initial_books)
if len(books) < 3:
    # Fewer than three entries is treated as a failed list-page fetch.
    print('獲取列表頁失敗,退出！')
    # sys.exit instead of the site-provided exit(); non-zero status
    # because this is a failure path.
    sys.exit(1)

# Start the background thread that runs addbooklist.
thaddbooklist = threading.Thread(target=addbooklist, name='taddbooklist')
thaddbooklist.start()

# Create `thlen` collector slots named t0, t1, ... and give each one the
# book at the head of the queue.
for x in range(thlen):
    tname = 't' + str(x)
    if not books:
        # Fewer queued books than slots: park a never-started Thread in the
        # slot. is_alive() is False for an unstarted thread, so code that
        # polls `tsk` for idle slots can hand it a book later.
        tsk.append(threading.Thread(name=tname))
        continue
    bookid, bookname, booksize = books[0]
    th = threading.Thread(target=caiji, name=tname, args=(bookid, bookname, booksize))
    th.start()
    del books[0]
    # Record the word count this book had when it was handed to a collector.
    bookdict[bookid] = booksize
    tsk.append(th)

# Scheduler loop: every 5 seconds look for collector slots whose thread has
# finished and, while the queue is not empty, start a new collector in that
# slot under the same thread name.
while 1:
    time.sleep(5)
    for i, worker in enumerate(tsk):
        if worker.is_alive():
            continue
        print(worker.name + '線程空閒')
        if not books:
            continue
        bookid, bookname, booksize = books[0]
        th = threading.Thread(target=caiji, name=worker.name, args=(bookid, bookname, booksize))
        th.start()
        # Dequeue only after the thread has started.
        del books[0]
        # Record the word count this book had when it was handed to a collector.
        bookdict[bookid] = booksize
        tsk[i] = th