【Python】理想論壇每小時發帖量統計圖表

時間 2020-06-29

原文鏈接

寫以下代碼的目的是分析一天中各時段理想論壇中用戶發帖回帖的活躍程度，核心是得到結尾那張圖表。

以下代碼由兩種爬蟲協作完成：論壇爬蟲先爬主貼，爬到主貼後啓動帖子爬蟲爬子貼，然後把每一個子貼的發表時間等存入數據庫。

再用一個程序對各個時段中的發帖次數進行統計，然後用Excel生成圖表。

獲取數據的爬蟲代碼如下：

# 論壇爬蟲，用於爬取主貼再爬子貼
from bs4 import BeautifulSoup
import requests
import threading
import re
import pymysql

# User-Agent string presented to the forum server by every HTTP request.
user_agent = 'Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
# Request headers passed to requests.get() by the crawler classes below.
headers = dict([('User-Agent', user_agent)])

# Forum crawler class (multi-threaded)
class forumCrawler(threading.Thread):
    """Thread that downloads one forum listing page and starts a topicCrawler per link.

    Every ``<a href=...>`` found inside a ``<span class="forumdisplay">`` is
    treated as a topic link: its href is appended to the site root and a new
    ``topicCrawler`` thread is started for it.

    Args:
        name: Thread name; also used as the prefix of the spawned crawlers' names.
        url: Absolute URL of the forum listing page to download.
    """

    # Seconds to wait on the HTTP request; without a timeout a stalled
    # server would block this thread forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, name, url):
        threading.Thread.__init__(self, name=name)
        self.name = name
        self.url = url
        self.infos = []

    def run(self):
        """Fetch the listing page and start one topicCrawler thread per topic link.

        Any exception (network, parsing, thread start) is printed and
        swallowed so that a failing page never propagates out of the thread.
        """
        print("線程" + self.name + "開始爬取頁面" + self.url)

        try:
            rsp = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            # rsp.text is already a decoded str, so BeautifulSoup ignores (and
            # warns about) from_encoding; it is therefore not passed.
            soup = BeautifulSoup(rsp.text, 'html.parser')

            # Topic links live inside <span class="forumdisplay"> elements.
            for span in soup.find_all('span', class_="forumdisplay"):
                for link in span.find_all('a'):
                    href = link.get("href")
                    if not href:
                        continue  # anchor without a target; nothing to crawl

                    topicLink = "http://www.55188.com/" + href
                    tc = topicCrawler(name=self.name + '_tc#' + href, url=topicLink)
                    tc.start()

        except Exception as e:
            # Best effort: report the problem and let the other threads carry on.
            print("線程" + self.name + "發生異常。")
            print(e)

# Topic crawler class (multi-threaded)
class topicCrawler(threading.Thread):
    """Thread that walks every page of one topic and stores each post's metadata.

    For each ``<div class="postinfo">`` the floor, author, date and time are
    extracted and appended to ``self.infos``. Paging follows the element that
    comes right after the ``<strong>`` (current page) inside a
    ``<div class="pages">``. When paging ends, the collected records are
    printed and handed to ``insertDB``.

    Args:
        name: Thread name; also stored with every record as the crawler name.
        url: Absolute URL of the first page of the topic.
    """

    # Seconds to wait on each HTTP request before treating it as failed.
    REQUEST_TIMEOUT = 30
    # Consecutive failed downloads of the same page tolerated before giving
    # up on the topic (an unbounded retry would spin forever on a dead URL).
    MAX_RETRIES = 3
    # Compiled once instead of once per post.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, name, url):
        threading.Thread.__init__(self, name=name)
        self.name = name
        self.url = url
        self.infos = []

    @classmethod
    def _parse_post_info(cls, text):
        """Turn the text of one postinfo div into a record dict, or None.

        Runs of whitespace are collapsed to a single space and the text is
        split on spaces. Only the two layouts handled by the original code are
        recognised: 7 tokens (date at index 4) and 8 tokens (date at index 5).
        Any other token count yields None.
        """
        tokens = cls._WHITESPACE.sub(" ", text).split(' ')

        if len(tokens) == 7:
            date_index = 4
        elif len(tokens) == 8:
            date_index = 5
        else:
            return None

        return {'樓層': tokens[1],
                '做者': tokens[2].replace('只看：', ''),
                '日期': tokens[date_index],
                '時間': tokens[date_index + 1]}

    @staticmethod
    def _find_next_url(soup):
        """Return the absolute URL of the next page, or "none" if there is none.

        The next page is the element following the <strong> current-page
        marker. Whitespace-only text nodes between the two are skipped, and a
        sibling that is not a tag (so has no ``get``) counts as "no next page"
        instead of raising.
        """
        next_url = "none"
        for pagesDiv in soup.find_all('div', class_="pages"):
            for strong in pagesDiv.find_all('strong'):
                print('當前爲第' + strong.text + '頁')

                sibling = strong.next_sibling
                # bs4 text nodes are str subclasses; skip blank ones.
                while isinstance(sibling, str) and not sibling.strip():
                    sibling = sibling.next_sibling

                getter = getattr(sibling, "get", None)
                href = getter("href") if callable(getter) else None
                if href:
                    next_url = 'http://www.55188.com/' + href
        return next_url

    def run(self):
        """Crawl the topic page by page, then print and store what was collected.

        A failed download is retried up to MAX_RETRIES consecutive times. A
        failure while parsing a page stops the paging. In every case the posts
        gathered so far are still printed and passed to insertDB, so an error
        on a late page no longer discards the earlier pages.
        """
        failures = 0
        while self.url != "none":
            print("線程" + self.name + "開始爬取頁面" + self.url)

            try:
                rsp = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
                # rsp.text is already decoded, so from_encoding would be ignored.
                soup = BeautifulSoup(rsp.text, 'html.parser')
            except Exception as e:
                failures += 1
                if failures >= self.MAX_RETRIES:
                    print("線程" + self.name + "發生異常。")
                    print(e)
                    break
                print("線程" + self.name + "發生異常。從新爬行")
                print(e)
                continue  # self.url is unchanged, so the same page is retried

            failures = 0
            # Cleared after use; set again only if a next page is found.
            self.url = "none"
            try:
                # One postinfo div per post on the page.
                for div in soup.find_all('div', class_="postinfo"):
                    info = self._parse_post_info(div.text)
                    if info is not None:
                        self.infos.append(info)

                self.url = self._find_next_url(soup)
            except Exception as e:
                # Parsing failed: stop paging (url stays "none") but keep the
                # posts already collected.
                print("線程" + self.name + "發生異常。")
                print(e)

            if self.url != "none":
                print("有下一頁，線程" + self.name + "前往下一頁")

        print("無下一頁，線程" + self.name + '爬取結束，開始打印...')

        for info in self.infos:
            print('\n')
            for key in info:
                print(key + ":" + info[key])

        print("線程" + self.name + '打印結束.')

        try:
            insertDB(self.name, self.infos)
        except Exception as e:
            # Keep the thread from dying with a traceback on a DB problem.
            print("線程" + self.name + "發生異常。")
            print(e)

# Insert crawled records into the database
def insertDB(crawlName,infos):
    """Insert every crawled post record into the ``test.topic`` table.

    Values are sent as query parameters rather than concatenated into the SQL
    text, so scraped text containing quotes cannot break or alter the statement.

    Args:
        crawlName: Name of the crawler thread, stored in the crawlername column.
        infos: Iterable of dicts with the keys '樓層', '做者', '日期' and '時間'.

    Returns:
        None. Returns without opening a connection when there is nothing to insert.
    """
    rows = [(info['樓層'], info['做者'], info['日期'], info['時間'], crawlName)
            for info in infos]
    if not rows:
        return

    sql = ("insert into test.topic(floor,author,tdate,ttime,crawlername,addtime) "
           "values (%s,%s,%s,%s,%s,now())")

    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='12345678', db='test', charset='utf8')
    try:
        with conn.cursor() as cursor:
            for row in rows:
                print(sql, row)
                cursor.execute(sql, row)
        conn.commit()  # a commit is required after write operations
    finally:
        # Release the connection even when a statement fails.
        conn.close()


# Entry point
def main():
    """Start one forumCrawler thread for each of listing pages 1 to 9 of forum 8."""
    first_page, last_page = 1, 9
    for page in range(first_page, last_page + 1):
        page_no = str(page)
        page_url = ''.join(['http://www.55188.com/forum-8-', page_no, '.html'])
        crawler = forumCrawler(url=page_url, name='fc#' + page_no)
        crawler.start()

# Start: launch the forum crawlers as soon as the script is loaded
main()

控制檯輸出太多就不貼了，把插入數據後的數據庫展示一下，ttime字段就是想要獲得的關鍵數據：

再做一個小程序對發帖時間進行統計，代碼如下：

# 對發帖時間進行統計
import pymysql

# Entry point
def main():
    """Count the stored posts per hour of day and print the resulting dict.

    Only the ``ttime`` column is selected, so the result no longer depends on
    the column order of the ``topic`` table. The connection is closed even if
    the query fails.
    """
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='12345678', db='test', charset='utf8')
    try:
        with conn.cursor() as cs:
            cs.execute("select ttime from topic")
            results = cs.fetchall()
    finally:
        conn.close()

    dic = _count_posts_by_hour(row[0] for row in results)

    print(dic)


def _count_posts_by_hour(times):
    """Return a dict mapping each hour '00'..'23' to the number of times in it.

    Each value is expected to look like 'HH:MM' (presumably how the crawler
    stored it; verify against the table). The part before the first ':' is
    zero-padded to two digits, so '9:05' counts as '09'. Values whose hour is
    not one of the 24 keys are skipped instead of raising KeyError.
    """
    dic = {'%02d' % hour: 0 for hour in range(24)}
    for ttime in times:
        hour = str(ttime).split(':')[0].strip().zfill(2)
        if hour in dic:
            dic[hour] += 1
    return dic
# Start: run the hourly statistics as soon as the script is loaded
main()

輸出字典如下：

C:\Users\horn1\Desktop\python\17>python sum.py
{'00': 99, '01': 65, '02': 23, '03': 14, '04': 11, '05': 19, '06': 126, '07': 290, '08': 669, '09': 810, '10': 697, '11': 596, '12': 585, '13': 653, '14': 588, '15': 815, '16': 565, '17': 597, '18': 603, '19': 516, '20': 561, '21': 638, '22': 425, '23': 388}

用Excel來個圖形化看看：

從上圖可以得出以下結論：

1.凌晨0點-6點是交易者最閒的時候，他們大部分都在睡覺，3-5點睡得最熟。

2.發帖峯值一個是在9-10點，一個是15-16點。股市在9：30開盤，低開高開也出來了，行情也走了一段，大家開始熱情高漲地發帖，之後就逐步回落，午休落入低谷；下午15點收盤後，大家又爭相發表對這天行情的看法，但一天的走勢能談多少呢，於是一個小時就消停了。另外15-16點也是股評家發表股評的黃金時段。

3.入夜了，雖然早已收盤，大家依舊在瀏覽論壇，希望從帖子裏發現什麼或者討論什麼，直到23-24點還有很多人從事這項活動。

呵呵，我也玩票了一把數據分析。

2018年4月4日16點14分