項目源碼:python
# -*- coding:utf-8 -*-
"""Download the images attached to answers in a Zhihu collection.

Requires the third-party BeautifulSoup4 package and the ``spider.py`` module
(which provides ``SpiderHTML``) in the same directory.
Environment noted by the original author: Python 3.4 on Windows 7.

Command-line usage::

    zhihu.py          fetch page 1 of the collection
    zhihu.py 3        fetch page 3 only
    zhihu.py 1 5      fetch pages 1 through 5
"""

from spider import SpiderHTML
from multiprocessing import Pool
import sys
import urllib
import http
import os
import random
import re
import time

__author__ = 'waiting'

# Address of the collection; the ``page`` query parameter is appended in code.
url = 'https://www.zhihu.com/collection/30822111'

# Local directory that receives the downloads.  The original author notes it
# is created automatically when missing (presumably by the SpiderHTML save
# helpers -- confirm in spider.py).  Every backslash is escaped explicitly:
# the previous literal relied on the invalid escape sequence "\收", which
# newer Python versions warn about.  The runtime value is unchanged.
store_path = 'E:\\zhihu\\收藏夾\\會員才知道的世界'

# Characters Windows does not allow in file or directory names.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


class zhihuCollectionSpider(SpiderHTML):
    """Walk the pages of one Zhihu collection and save the answers' images."""

    def __init__(self, pageStart, pageEnd, url):
        """Remember the collection address and the inclusive page range.

        Args:
            pageStart: first page to fetch (int or numeric string).
            pageEnd: last page to fetch, inclusive (int or numeric string).
            url: collection address without the ``page`` parameter.

        Raises:
            ValueError: if a page bound cannot be converted to ``int``.
        """
        # NOTE(review): SpiderHTML.__init__ is deliberately not called, exactly
        # as in the original -- confirm the base class needs no set-up.
        self._url = url
        self._pageStart = int(pageStart)
        self._pageEnd = int(pageEnd) + 1  # exclusive upper bound for range()
        self.downLimit = 0  # answers with fewer upvotes than this are skipped

    def start(self):
        """Fetch every page in the configured range and process its questions."""
        for page in range(self._pageStart, self._pageEnd):
            pageUrl = self._url + '?page=' + str(page)
            content = self.getUrl(pageUrl)
            for question in content.find_all('div', class_='zm-item'):
                heading = question.find('h2', class_='zm-item-title')
                # No heading (or no link inside it): the question was removed.
                if heading is None or heading.a is None:
                    continue
                self._processQuestion(heading.a)
                time.sleep(5)  # throttle between question requests

    def _processQuestion(self, link):
        """Fetch one question page and hand its answers to ``_processAnswer``.

        Args:
            link: the ``<a>`` tag of the question title inside the collection.
        """
        rawTitle = link.string
        if rawTitle is None:
            # ``.string`` is None when the link holds nested markup.
            rawTitle = link.get_text()
        Qurl = 'https://www.zhihu.com' + link['href']
        # The title becomes a directory name, so replace forbidden characters.
        Qtitle = _INVALID_NAME_CHARS.sub('#', rawTitle)
        try:
            print('-----正在獲取問題:' + Qtitle + '-----')
        except UnicodeEncodeError:
            # The console encoding cannot represent the title.
            print(r'---問題含有特殊字符沒法顯示---')
        try:
            Qcontent = self.getUrl(Qurl)
        except Exception:
            # Skip this question.  Previously execution fell through and used
            # an undefined (or the previous question's) page content.
            print('!!!!獲取出錯!!!!!')
            return
        answerList = Qcontent.find_all('div', class_='zm-item-answer zm-item-expanded')
        self._processAnswer(answerList, Qtitle)

    @staticmethod
    def _parseVoteCount(text):
        """Convert an upvote label such as ``'37'``, ``'12K'`` or ``'1.5K'`` to int.

        A missing label (``None``) counts as 0.

        Raises:
            ValueError: if the label is not numeric.
        """
        if text is None:
            return 0
        text = str(text).strip()
        if text.endswith('K'):
            # round() avoids float truncation (2.3 * 1000 == 2299.999...).
            return int(round(float(text[:-1]) * 1000))
        return int(text)

    def _extractAuthor(self, answer, index):
        """Build the author record ``{'name', 'introduction', 'link'}`` of an answer.

        Anonymous answers (no author link in the markup) get the placeholder
        name ``'匿名用戶<index>'`` and empty introduction/link.
        """
        author = {'name': '匿名用戶' + str(index), 'introduction': '', 'link': ''}
        authorInfo = answer.find('div', class_='zm-item-answer-author-info')
        if authorInfo is None:
            return author
        authorLink = authorInfo.find('a', class_='author-link')
        if authorLink is None:
            return author
        name = authorLink.string
        if name is None:
            name = authorLink.get_text()
        if not name:
            return author
        author['name'] = str(name)
        # Read the link before the bio: a missing bio used to abort the whole
        # extraction and silently drop the profile link of named authors.
        author['link'] = authorLink.get('href', '')
        bio = authorInfo.find('span', class_='bio')
        if bio is not None:
            author['introduction'] = str(bio.get('title', ''))
        return author

    def _processAnswer(self, answerList, Qtitle):
        """Save author info and images for each sufficiently upvoted answer.

        Args:
            answerList: answer ``<div>`` tags of one question page.
            Qtitle: sanitised question title, used as the directory name.
        """
        for j, answer in enumerate(answerList, start=1):
            countTag = answer.find('span', class_='count')
            upvoted = self._parseVoteCount(None if countTag is None else countTag.string)
            if upvoted < self.downLimit:
                continue

            author = self._extractAuthor(answer, j)

            file_name = os.path.join(store_path, Qtitle, 'info', author['name'] + '_info.txt')
            if os.path.exists(file_name):  # this answer was already fetched
                continue

            self.saveText(file_name, '{introduction}\r\n{link}'.format(**author))
            print('正在獲取用戶`{name}`的答案'.format(**author))
            answerContent = answer.find('div', class_='zm-editable-content clearfix')
            if answerContent is None:  # reported answers carry no content
                continue

            imgs = answerContent.find_all('img')
            if imgs:
                self._getImgFromAnswer(imgs, Qtitle, **author)

    # Collect images
    def _getImgFromAnswer(self, imgs, Qtitle, **author):
        """Download every full-size image of one answer.

        Files are written to ``store_path/Qtitle/<author name>_<n><ext>``.
        A failing download is reported and skipped so the run continues.

        Args:
            imgs: ``<img>`` tags found in the answer body.
            Qtitle: sanitised question title, used as the directory name.
            **author: author record; only ``author['name']`` is read.
        """
        i = 0
        for img in imgs:
            # Zhihu's small inline images are not collected.
            if 'inline-image' in (img.get('class') or []):
                continue
            imgUrl = img.get('src')
            if not imgUrl:
                continue
            i += 1
            extension = os.path.splitext(imgUrl)[1]
            path_name = os.path.join(store_path, Qtitle, author['name'] + '_' + str(i) + extension)
            try:
                self.saveImg(imgUrl, path_name)
            except Exception:
                # Best effort: report the failure and move on to the next image.
                print('!!!!獲取出錯!!!!!', imgUrl)

    # Collect text
    def _getTextFromAnswer(self):
        """Placeholder for saving answer text; not implemented."""
        pass


# Run from the command line, e.g. ``zhihu.py 1 5`` fetches pages 1 to 5.
if __name__ == '__main__':
    paramsNum = len(sys.argv)
    if paramsNum >= 3:
        page, pageEnd = sys.argv[1], sys.argv[2]
    elif paramsNum == 2:
        page = sys.argv[1]
        pageEnd = page
    else:
        page, pageEnd = 1, 1

    spider = zhihuCollectionSpider(page, pageEnd, url)
    spider.start()
很多初學者對 Python 的概念都是模糊不清的:C 語言、Python 能做什麼,學的時候該按照什麼路線去學習,學完往哪方面發展。想深入瞭解,詳情可以點擊有道雲筆記連結:http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa
windows