%%%%html
#coding:utf-8 import requests from xlwt import Workbook from bs4 import BeautifulSoup import time import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') def baidu(url): response = requests.get(url) soup = BeautifulSoup(response.text,'html.parser') tiezilist = soup.find_all('li',class_=" j_thread_list clearfix") datalist = [] for tiezi in tiezilist: huifu = tiezi.span.string title = ''.join(tiezi.find('div',class_="threadlist_abs threadlist_abs_onlyline ").text.split()) try: author = tiezi.find('span',class_="tb_icon_author ").text except: author = "None" createtime = tiezi.find('span',class_="pull-right is_show_create_time").string if ':' in createtime: date = datetime.datetime.now().strftime('%Y-%m-%d') createtime =date+' '+createtime try: endhuifuren = tiezi.find('span',class_="tb_icon_author_rely j_replyer")['title'] except: endhuifuren ="None" try: endhuifurentime = ''.join(tiezi.find('span',class_="threadlist_reply_date pull_right j_reply_data").text.split()) except: endhuifurentime = "None" data = [title,author,createtime,endhuifuren,endhuifurentime,huifu] datalist.append(data) return datalist def saveToExecl(name,start,end): datalistnew = [] book = Workbook(encoding='utf-8') # 設置execl編碼格式 sheet1 = book.add_sheet('Sheet 1') # 操做execl表格 sheet1.write(0, 0, u'標題') sheet1.write(0, 1, u'發貼人') sheet1.write(0, 2, u'帖子建立時間') sheet1.write(0, 3, u'最後回覆人') sheet1.write(0, 4, u'最後回覆時間') sheet1.write(0, 5, u'回覆數') startnew = int(start)*50 endnew = int(end)*50 for page in range(startnew,endnew, 50): url = "http://tieba.baidu.com/f?kw=%s&ie=utf-8&pn=%s" % (name,page) try: print page/50 datalist = baidu(url) time.sleep(1) except: print page time.sleep(10) datalist = baidu(url) datalistnew = datalistnew + datalist datalist = datalistnew execlNmae = name+'貼吧'+str(start)+'-'+str(end)+'頁數據'+'.xls' try: for data in range(0, len(datalist)): # 遍歷數據列表,而後把數據寫入表格中 title = datalist[data][0] author = datalist[data][1] createtime = datalist[data][2] endhuifuren = datalist[data][3] endhuifurentime = datalist[data][4] huifu = datalist[data][5] sheet1.write(data + 1, 0, title) sheet1.write(data + 1, 1, author) sheet1.write(data + 1, 2, createtime) sheet1.write(data + 1, 3, endhuifuren) sheet1.write(data + 1, 4, endhuifurentime) sheet1.write(data + 1, 5, huifu) book.save(u"%s" % execlNmae) except: book.save(u"%s" % execlNmae) pass if __name__=='__main__': print "*********************百度貼吧數據爬取軟件*********************" baiduPostBar =raw_input("please input Baidu Post Bar Name:") start = raw_input("please input start page number: ") end = raw_input("please input end page number: ") saveToExecl(baiduPostBar,start,end)
%%%%python