import requests
from lxml import etree
import urllib.parse


class ImageSpider:
    """Crawl a Baidu Tieba forum and download every image found in its posts."""

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"}
        self.baseurl = ""

    def getPageUrl(self, url):
        """Fetch one forum listing page and crawl each post linked from it.

        url: full URL of a forum listing page (one page of post links).
        """
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        # Extract the relative URL of every post on the page,
        # e.g. ["/p/32093", "/p/203920", ...]
        parseHtml = etree.HTML(html)
        t_list = parseHtml.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        for t_link in t_list:
            # Post links are site-relative; prepend the host.
            t_url = "http://tieba.baidu.com" + t_link
            self.getImageUrl(t_url)

    def getImageUrl(self, t_url):
        """Fetch one post page and download every image it contains.

        t_url: absolute URL of a single post.
        """
        res = requests.get(t_url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        # Image URLs live on <img> tags inside the post-content divs.
        parseHtml = etree.HTML(html)
        img_list = parseHtml.xpath('//div[@class="d_post_content j_d_post_content clearfix"]/img/@src')
        for img_link in img_list:
            self.writeImage(img_link)

    def writeImage(self, img_link):
        """Download one image and save it to the current directory.

        img_link: absolute URL of the image.
        """
        # Binary download: no text decoding involved, so no encoding to set.
        res = requests.get(img_link, headers=self.headers)
        data = res.content
        # Use the last path segment of the URL as the filename; the raw
        # tail slice used previously could contain '/' and collide.
        filename = img_link.split("/")[-1] or img_link[-8:]
        with open(filename, "wb") as f:
            f.write(data)
        print(filename, "下載成功")

    def workOn(self):
        """Prompt for a forum name and page range, then crawl each page."""
        name = input("請輸入貼吧名:")
        begin = int(input("請輸入開始頁:"))
        end = int(input("請輸入結束頁:"))
        for pn in range(begin, end + 1):
            # Tieba paginates with a 50-post offset per page (page 1 -> pn=0).
            pn = (pn - 1) * 50
            kw = urllib.parse.urlencode({"kw": name})
            fullurl = "http://tieba.baidu.com/f?" + kw + "&pn=" + str(pn)
            self.getPageUrl(fullurl)


if __name__ == "__main__":
    spider = ImageSpider()
    spider.workOn()
import requests
import pymongo
from lxml import etree


class QiushiSpider:
    """Scrape one page of qiushibaike.com jokes and store them in MongoDB."""

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # MongoDB connection / database / collection handles.
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn["Qiushi"]
        self.myset = self.db["qiushiinfo"]

    def getPage(self, url):
        """Fetch one listing page and hand its HTML to the parser.

        url: absolute URL of a qiushibaike listing page.
        """
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    def parsePage(self, html):
        """Parse the listing HTML and insert one document per joke.

        html: decoded HTML text of a listing page.
        """
        parseHtml = etree.HTML(html)
        # Each joke sits in a div whose id starts with "qiushi_tag_".
        base_list = parseHtml.xpath('//div[contains(@id,"qiushi_tag_")]')
        for base in base_list:
            # Author nickname; anonymous posts have no <h2> node.
            h2_list = base.xpath('./div/a/h2')
            if not h2_list or h2_list[0].text is None:
                username = "匿名用戶"
            else:
                username = h2_list[0].text.strip()
            # Joke body text.
            content = base.xpath('.//div[@class="content"]/span')
            # Laugh / comment counters are the first two <i> nodes.
            counters = base.xpath('.//i')
            # Skip structurally incomplete items instead of raising IndexError.
            if not content or content[0].text is None or len(counters) < 2:
                continue
            d = {
                "username": username,
                "content": content[0].text.strip(),
                "laughNum": counters[0].text,
                "pingNum": counters[1].text,
            }
            # Collection.insert was removed in pymongo 4; insert_one is the
            # supported single-document API.
            self.myset.insert_one(d)
            print("成功")


if __name__ == "__main__":
    spider = QiushiSpider()
    spider.getPage("https://www.qiushibaike.com/8hr/page/1/")