A Baidu Tieba crawler that only scrapes the reply content: the author, text, and timestamp of each reply.

```python
# -*- coding: utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as threadpool
import requests
import json
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


def towrite(item):
    # The labels mean "reply author", "reply content", and "reply time"
    f.writelines(u'回帖人: ' + str(item['user_name']) + '\n')
    f.writelines(u'回帖內容: ' + str(item['topic_reply_content']) + '\n')
    f.writelines(u'回帖時間: ' + str(item['topic_reply_time']) + '\n\n')


def spider(url):
    html = requests.get(url)
    # Fetch the page source with requests and parse it so we can run XPath queries
    selector = etree.HTML(html.text)
    # Grab the list of reply blocks on the page first ("grab big, then small")
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    item = {}  # the extracted fields go into this dict
    for each in content_field:  # now the small part: parse each reply block
        print each.xpath('@data-field')[0]  # the data-field attribute holds a JSON string
        print json.loads(each.xpath('@data-field')[0])  # turned into a dict
        # XPath returns a list; parse its first element into a dict
        reply_info = json.loads(each.xpath('@data-field')[0])
        author = reply_info['author']['user_name']  # read the fields out of the dict
        reply_time = reply_info['content']['date']
        # Get the reply text via XPath and strip the spaces
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div/text()')[0].replace(' ', '')
        print content
        print reply_time
        print author
        item['user_name'] = author  # save everything into the item dict
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)  # and write it to the file


if __name__ == '__main__':
    pool = threadpool(1)  # crawl with multiple threads: define the pool
    f = open('content.txt', 'a')
    page = []
    for i in range(1, 2):  # pages to crawl, stored in a list
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)  # crawl by mapping spider() over the page list
    pool.close()
    f.close()
```
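For reference, the per-reply metadata comes from the `data-field` attribute, whose value is a JSON string. Below is a minimal sketch of that parsing step; the sample attribute value is illustrative (real attributes carry more keys), but the two nested keys are exactly the ones the script reads:

```python
import json

# Illustrative data-field value; a real attribute is longer, but the
# script above only relies on author.user_name and content.date.
sample = '{"author": {"user_name": "some_user"}, "content": {"date": "2015-01-11 16:52"}}'

reply_info = json.loads(sample)           # JSON string -> nested dict
print(reply_info['author']['user_name'])  # some_user
print(reply_info['content']['date'])      # 2015-01-11 16:52
```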