# ------------------------------------------------------------------------------------
# Ideal forum (55188.com) crawler v1.08
# Adds a resume-from-checkpoint mode so downloading can continue later.
# 2018-04-29
# ------------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import threading
import re
import time
import datetime
import os
import json
import colorama
from colorama import Fore, Back, Style

colorama.init()

user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
headers = {'User-Agent': user_agent}

# Top-level topics found on the forum index pages
topics = []
# Directory where the scraped data files are stored
folder = ""
# Final queue of topic pages to download
finalTopics = []
# Topic pages whose download failed (retried on resume)
failedTopics = []


# ------------------------------------
# Find the top-level topics on one forum index page and append them to `topics`.
# pageUrl: URL of the forum index page
# ------------------------------------
def findTopics(pageUrl):
    print("\n開始讀取頁面" + pageUrl + "的帖子")
    try:
        rsp = requests.get(pageUrl, headers=headers)
        rsp.encoding = 'gb18030'  # key to decoding the Chinese pages correctly
        # rsp.text is already Unicode, so bs4 would ignore a from_encoding
        # argument and emit a UserWarning — it is intentionally omitted here.
        soup = BeautifulSoup(rsp.text, 'html.parser')
        for tbodys in soup.find_all('tbody'):
            pageCount = 1
            url = 'none'
            title = 'none'
            # Topic link and title live in the "forumdisplay" span
            for spans in tbodys.find_all('span', class_="forumdisplay"):
                for link in spans.find_all('a'):
                    if link and link.get("href"):
                        url = "http://www.55188.com/" + link.get("href")
                        title = link.text
            # Number of pages in the topic lives in the "threadpages" span
            for spans in tbodys.find_all('span', class_="threadpages"):
                for link in spans.find_all('a'):
                    pageCount = link.text
            if url != 'none' and title != 'none':
                topic = {'pageCount': pageCount, 'url': url, 'title': title}
                topics.append(topic)
    except Exception as e:
        log("findTopics出現異常:" + str(e), 'red')


# ------------------------------------
# Print text to the console in the given color.
# text: message to print, color: 'red' / 'green' / 'yellow' / anything else
# ------------------------------------
def log(text, color):
    colors = {'red': Fore.RED, 'green': Fore.GREEN, 'yellow': Fore.YELLOW}
    if color in colors:
        print(colors[color] + text + Style.RESET_ALL)
    else:
        print(text)


# ------------------------------------
# Find and save one topic page's details.
# index: sequence number, url: page address, title: topic title
# ------------------------------------
# Fetch one topic page and save its per-post records to <folder>/<index>.json.
# index: sequence number, url: page address, title: topic title
# ------------------------------------
def saveTopicDetail(index, url, title):
    try:
        # Fetch and decode the page
        rsp = requests.get(url, headers=headers)
        rsp.encoding = 'gb18030'  # key to decoding the Chinese pages correctly
        # rsp.text is Unicode already, so bs4's from_encoding would be ignored
        soup = BeautifulSoup(rsp.text, 'html.parser')

        # Skip pages that were already downloaded (resume support)
        filename = folder + "/" + str(index) + '.json'
        if os.path.isfile(filename):
            log(currTime() + "已經存在url=" + url + ",index=" + str(index) + "對應的文件", 'yellow')
        else:
            # Extract the per-post records and persist them only if non-empty
            infos = fetchTopicInfos(soup, url, title)
            if len(infos) > 0:
                with open(filename, 'w', encoding='utf-8') as fObj:
                    json.dump(infos, fObj)
            else:
                log(currTime() + "沒法取得url=" + url + "的信息", 'red')

        # Every 50 items: report progress and checkpoint state to save.dat
        if index % 50 == 0:
            log(currTime() + "處理到第" + str(index) + "個數據", 'green')
            saveData = {'folder': folder, 'finalTopics': finalTopics, 'failedTopics': failedTopics}
            savefilename = "./save.dat"
            with open(savefilename, 'w', encoding='utf-8') as fObj:
                json.dump(saveData, fObj)
    except Exception as e:
        # Remember the failure so a later resume run can retry it
        failedTopic = {'index': index, 'url': url, 'title': title}
        failedTopics.append(failedTopic)
        log("saveTopicDetail訪問" + str(failedTopic) + "時出現異常:" + str(e), 'red')


# ------------------------------------
# Return the current time as 'HH:MM:SS ' for log prefixes.
# ------------------------------------
def currTime():
    return time.strftime('%H:%M:%S ', time.localtime(time.time()))


# ------------------------------------
# Extract every post (floor) record from a parsed topic page.
# soup: BeautifulSoup of the full page, url: page address, title: topic title
# Returns a list of dicts; each complete record has exactly 7 keys.
# ------------------------------------
def fetchTopicInfos(soup, url, title):
    infos = []
    # Collapse runs of whitespace into single spaces before splitting
    whitespaceRE = re.compile(r'(\s+)')
    for tds in soup.find_all('td', class_="postcontent"):
        info = {}
        # Floor number, author, date and time live in the "postinfo" div
        for divs in tds.find_all('div', class_="postinfo"):
            line = whitespaceRE.sub(" ", divs.text)
            arr = line.split(' ')
            n = len(arr)
            # Field positions shift depending on how many tokens the line has
            if n == 7:
                info = {'樓層': arr[1],
                        '做者': arr[2].replace('只看:', ''),
                        '日期': arr[4],
                        '時間': arr[5],
                        'title': title,
                        'url': url}
            elif n == 8:
                info = {'樓層': arr[1],
                        '做者': arr[2].replace('只看:', ''),
                        '日期': arr[5],
                        '時間': arr[6],
                        'title': title,
                        'url': url}
        # The post body is nested two "t_msgfont" divs deep inside "postmessage"
        for div in tds.find_all('div', class_="postmessage"):
            for div2 in div.find_all('div', class_="t_msgfont"):
                for div3 in div2.find_all('div', class_="t_msgfont"):
                    info['內容'] = div3.text
        # A record counts only when all seven fields were found
        if len(info) == 7:
            infos.append(info)
    return infos


# ------------------------------------
# "New game": crawl forum index pages start..end, then download every topic page.
# start: first index page, end: last index page (inclusive)
# ------------------------------------
def newGame(start, end):
    # Create a fresh output directory named after the current time
    dirTime = time.strftime('%H_%M_%S', time.localtime(time.time()))
    global folder
    folder = "./" + dirTime
    os.makedirs(folder)
    print("目錄" + folder + "建立完成")

    # Collect the top-level topics from each forum index page
    print('\n將從如下頁面獲取主貼:')
    for i in range(start, end + 1):
        # Forum index pages: forum-8-1.html, forum-8-2.html, ...
        pageUrl = 'http://www.55188.com/forum-8-' + str(i) + '.html'
        findTopics(pageUrl)
    log("共讀取到:" + str(len(topics)) + "個主貼", 'green')

    # Expand each topic into one URL per page of that topic
    index = 0
    # Thread URL shape: thread-<id>-<page>-<forum>.html — group 2 is the page
    pattern = r'-(\d+)-(\d+)-(\d+)'
    for topic in topics:
        lastPage = int(topic['pageCount']) + 1
        title = topic['title']
        for i in range(1, lastPage):
            newUrl = re.sub(pattern,
                            lambda m: '-' + m.group(1) + '-' + str(i) + '-' + m.group(3),
                            topic['url'])
            finalTopics.append({'index': index, 'url': newUrl, 'title': title})
            index = index + 1
    log("共讀取到:" + str(len(finalTopics)) + "個帖子", 'green')

    # Download every topic page in the queue
    while len(finalTopics) > 0:
        topic = finalTopics.pop()
        saveTopicDetail(topic['index'], topic['url'], topic['title'])


# ------------------------------------
# "Resume": reload save.dat and continue downloading where we left off.
# ------------------------------------
def load():
    savefilename = "./save.dat"
    with open(savefilename, 'r', encoding='utf-8') as fObj:
        saveData = json.load(fObj)

    global folder
    folder = saveData['folder']
    if not os.path.isdir(folder):
        os.makedirs(folder)

    global finalTopics
    finalTopics = saveData['finalTopics']
    global failedTopics
    failedTopics = saveData['failedTopics']
    # Fold previously failed pages back into the queue so they are retried
    finalTopics.extend(failedTopics)
    failedTopics = []

    while len(finalTopics) > 0:
        topic = finalTopics.pop()
        saveTopicDetail(topic['index'], topic['url'], topic['title'])


# ------------------------------------
# Entry point: ask the user whether to start fresh or resume from save.dat.
# ------------------------------------
def main():
    msg = input("選擇【新的開始】輸入0,選擇【再續前緣】輸入1:")
    if msg == '0':
        text = input("請輸入起始頁碼和終止頁碼,以逗號分隔:")
        arr = text.split(',')
        newGame(int(arr[0]), int(arr[1]))
    else:
        load()


# Start
main()
控制檯輸出示例:
C:\Users\horn1\Desktop\python\30>python lixiang.py 選擇【新的開始】輸入0,選擇【再續前緣】輸入1: C:\Users\horn1\AppData\Local\Programs\Python\Python36\lib\site-packages\bs4\__init__.py:146: UserWarning: You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored. warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") 15:47:20 已經存在url=http://www.55188.com/thread-5673944-450-2.html,index=5849對應的文 件 15:47:20 已經存在url=http://www.55188.com/thread-5673944-449-2.html,index=5848對應的文 件 15:47:21 已經存在url=http://www.55188.com/thread-5673944-448-2.html,index=5847對應的文 件 15:47:52 處理到第5800個數據 15:48:30 處理到第5750個數據 15:49:14 處理到第5700個數據 15:49:49 處理到第5650個數據 15:50:24 處理到第5600個數據 15:50:58 處理到第5550個數據 15:51:32 處理到第5500個數據 15:52:06 處理到第5450個數據 15:52:41 處理到第5400個數據
插數據入庫的insertDB.py:
# ------------------------------------------------------------------------------------
# insertDB 1.01: read the JSON files produced by the forum crawler and insert
# the records into MySQL.
# 2018-04-29
# ------------------------------------------------------------------------------------
import pymysql
import time
import datetime
import os
import json


# ------------------------------------
# Return the current time as 'YYYY-MM-DD HH:MM:SS' (stored in the addtime column).
# ------------------------------------
def currTime():
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))


# ------------------------------------
# Entry point: load every JSON file in `folder` and insert the records into the DB.
# folder: directory containing the crawler's <index>.json files
# ------------------------------------
def main(folder):
    starttime = datetime.datetime.now()

    # Gather all records from the data files
    allinfos = []
    for filename in os.listdir(folder):
        filePathname = folder + "/" + filename
        with open(filePathname, 'r', encoding='utf-8') as fObj:
            allinfos.extend(json.load(fObj))
    print("擬向數據庫插入" + str(len(allinfos)) + "條記錄")

    # Open the database and insert the records (parameterized statement)
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='12345678',
                           db='test', charset='utf8')
    cur = conn.cursor()
    inserted = 0  # renamed from `sum` to avoid shadowing the builtin
    for info in allinfos:
        try:
            # Drop lone surrogates (broken emoji) that cannot be encoded as
            # UTF-8 — previously such rows failed with
            # "'utf-8' codec can't encode character '\ud83d': surrogates not allowed"
            content = info['內容'].encode('utf-8', 'ignore').decode('utf-8', 'ignore')
            arr = [info['樓層'], info['做者'], info['日期'], info['時間'],
                   currTime(), info['url'], info['title'], content]
            count = cur.execute(
                'insert into test.topic17(floor,author,tdate,ttime,addtime,url,title,content)'
                ' values (%s,%s,%s,%s,%s,%s,%s,%s)', arr)
            inserted += count
        except Exception as e:
            # Log and keep going — one bad record must not abort the batch
            print("出現異常:" + str(e) + ",此時info=" + str(info))
            continue
    conn.commit()
    conn.close()
    print("已向數據庫插入" + str(inserted) + "條記錄")

    # Report elapsed time
    endtime = datetime.datetime.now()
    print("插數據用時" + str((endtime - starttime).seconds) + "秒")


# Start
main("./15_38_48")
控制檯輸出示例:
C:\Users\horn1\Desktop\python\31>python insertDB.py 擬向數據庫插入115149條記錄 出現異常:'utf-8' codec can't encode character '\ud83d' in position 275: surrogates not allowed,此時info={'樓層': '7283樓', '做者': '愛麗說', '日期': '2014-10-22', '時間': '11:33', 'title': ' 擁抱陽光龍理論2018成功的路上並不擁擠,咱們一塊兒邁步前行,找到好老師就有好方法! ', 'url': 'http://www.55188.com/thread-5673944-365-2.html', '內容': '倒黴的我,1 號沒買到,買了2-3號,寶箱更新太不及時,強烈要求老師微信同步\ud83d😓😓😓😓😓😓😭😭'} 出現異常:'utf-8' codec can't encode character '\ud83d' in position 275: surrogates not allowed,此時info={'樓層': '7285樓', '做者': '愛麗說', '日期': '2014-10-22', '時間': '11:37', 'title': ' 擁抱陽光龍理論2018成功的路上並不擁擠,咱們一塊兒邁步前行,找到好老師就有好方法! ', 'url': 'http://www.55188.com/thread-5673944-365-2.html', '內容': '倒黴的我,1 號沒買到,買了2-3號,寶箱更新太不及時,強烈要求老師微信同步\ud83d😓😓😓😓😓😓😭😭'} 已向數據庫插入115147條記錄 插數據用時26秒 C:\Users\horn1\Desktop\python\31>
DB截圖:
2018年4月29日