This crawler is different from the two I wrote before: this one is written from scratch by myself, using the Python libraries requests, redis, and lxml.
There are three files in total: config.ini (account and cookie configuration), zhihusp.py (crawls user IDs), and get-info.py (crawls the rest of each user's information).
The three files are listed below. The two Python files are commented in some detail, so they should be easy to follow.
config.ini
[info]
phone_num = 15*********
password = ************

[cookies]
q_c1 = 5fd5e96aa1cc40f587e2fcaa621030ee|1448986627000|1448986627000
cap_id = Zjk3N2I3MjU1ZmIyNGJkNWJIDOxYmE3ZDEzN2QyOGE=|1449289675|612bbfbnjd2e3bca76d397a2c67c921fe7c852b
_za = b7e8ab32-03b3-473b-87e6-68fe9f9e7933
__utmt = 1
__utma = 51854390.1168696635.1449128833.1449239113.1449289659.5
__utmb = 51854390.6.10.1449289659
__utmc = 51854390
__utmz = 51854390.1449223233.4.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/excited-vczh/followers
__utmv = 51854390.100-2|2=re=1^3=entry_date=20151202=1
z_c0 = QUJDTXpzbTNGd2tYQUFBdffabXowaVZZdHBZbnJIS3FhYjZBQnRTWllWQlZ1T1kyc1dnPT0=|1449289708|7020f5e7c6c95b043e48c02afffb3a9c40035a77
unlock_ticket = QUJDTXpzbTNGd2tYQUFBQVlRSlZUZlJ1WWxaUDlzRGpZTVocGdnUl8xZkVNbDNBPT0=|1554289708|d906b57006b0cd84c58c4f6d6e1eb16e17e64
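Both scripts below consume this file the same way: ConfigParser reads the [cookies] section as (key, value) pairs, and dict() turns them into the mapping that requests accepts through its cookies= keyword. A minimal sketch of that round trip (Python 2, like the scripts; assumes config.ini sits in the working directory):

# -*- coding: utf-8 -*-
import ConfigParser
import requests

cf = ConfigParser.ConfigParser()
cf.read('config.ini')

# items() returns [(key, value), ...] for the section;
# dict() turns it into a plain cookie mapping.
cookies = dict(cf.items('cookies'))

session = requests.session()
# With valid cookies, this request is made as the logged-in user.
r = session.get('http://www.zhihu.com/', cookies=cookies)
print r.status_code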
zhihusp.py: mainly used to grab follower IDs from each user's followers list.
# -*- coding: utf-8 -*-
'''
Web crawler: scraping Zhihu user information
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class ZhihuSpider(object):
    """docstring for ZhihuSpider"""
    r = redis.Redis(host='127.0.0.1', port=6379, db=1)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = dict(cf.items('cookies'))
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    conn.text_factory = str
    cur = conn.cursor()

    # Create the session; if username/password login fails, fall back to cookies
    def create_session(self):
        from pprint import pprint
        pprint(self.cookies)
        phone_num = self.cf.get('info', 'phone_num')
        password = self.cf.get('info', 'password')
        login_data = {'phone_num': phone_num, 'password': password}
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
            'Host': 'www.zhihu.com',
            'Referer': 'http://www.zhihu.com/'
        }
        r = self.session.post('http://www.zhihu.com/login/phone_num',
                              data=login_data,
                              headers=header)
        if r.json()['r'] == 1:
            print 'Login Failed, reason is:',
            for m in r.json()['data']:
                print r.json()['data'][m]
            print 'So we use cookies to login in...'
            has_cookies = False
            for key in self.cookies:
                if key != '__name__' and self.cookies[key] != '':
                    has_cookies = True
                    break
            if has_cookies is False:
                raise ValueError('Please fill in the cookies section of config.ini.')
            else:
                # Log in with the cookies instead
                r = self.session.get('http://www.zhihu.com/login/phone_num',
                                     cookies=self.cookies)

        with open('login.html', 'w') as fp:
            fp.write(r.content)

    # Request a user's followers/followees pages and queue new user IDs
    def follow(self, userid):
        print "NOW Follow:", userid
        self.r.set(userid, False)
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        follower, followee, user_urls = self.getinfo(userid)
        for u_url in user_urls:
            # Loop variable renamed so it no longer clobbers userid
            uid = u_url.split('/')[-1]
            if self.not_in(uid):
                self.r.set(uid, True)
        if follower > 20:
            self.doprofiles(follower, follower_url)

        # Extract the user IDs on the first page of followees
        followee_url = "http://www.zhihu.com/people/" + userid + "/followees"
        response = self.session.get(followee_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')

        for u_url in user_urls:
            uid = u_url.split('/')[-1]
            if self.not_in(uid):
                self.r.set(uid, True)
        if followee > 20:
            self.doprofiles(followee, followee_url)

    # Dynamically fetch the entries hidden behind the "more" button
    def doprofiles(self, attention, url):
        thisheader = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Referer': url,
            # Content-Length is omitted; requests computes it itself
            'Cookie': 'fill in your own'
        }
        hash_id = 'fill in your own'
        xsrf = 'fill in your own'
        # Work out the page count, then fetch the follower info behind "more"
        pages = attention / 20 + 1
        # if pages > 600:
        #     pages = 600
        for x in xrange(1, pages):
            offset = x * 20
            params = json.dumps({"offset": offset,
                                 "order_by": "created",
                                 "hash_id": hash_id})
            payload = {"method": "next", "params": params, "_xsrf": xsrf}
            content = self.session.post("http://www.zhihu.com/node/ProfileFollowersListV2",
                                        headers=thisheader,
                                        data=payload).content
            load = json.loads(content)
            lists = load['msg']
            for item in lists:
                try:
                    userpeople = re.search(r'people/[\w+\d+-]+', item)
                    if userpeople is not None:
                        people = userpeople.group()
                        uid = people.split('/')[-1]
                        print "Found userid:", uid
                        if self.not_in(uid):
                            self.r.set(uid, True)
                except AttributeError:
                    print "ERROR"
        # self.num += 1
        self.gofollow()

    # Keep following the users still marked pending in redis
    def gofollow(self):
        for key in self.r.keys():
            if self.r.get(key) == 'True':
                self.follow(key)

    # Check whether the user ID already exists in redis
    def not_in(self, userid):
        if self.r.exists(userid):
            return False
        else:
            return True

    # Get the follower/followee counts and the follower URLs on the first page
    def getinfo(self, userid):
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        response = self.session.get(follower_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')
        followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
        follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
        return follower, followee, user_urls

if __name__ == '__main__':
    zhihu = ZhihuSpider()
    # Create the table
    zhihu.cur.execute('''create table if not exists userstb
                         (userid text primary key,
                          username text, gender text, followee integer,
                          follower integer, location text,
                          business text, employment text,
                          position text, education text, college text,
                          question_num integer, answer_num text)''')
    zhihu.conn.commit()
    zhihu.create_session()

    # A few high-profile Zhihu users to seed the crawl
    first_users = ['excited-vczh', 'warfalcon', 'gejinyuban']
    for user in first_users:
        if zhihu.r.exists(user):
            continue
        else:
            zhihu.follow(user)
    # Pick the user IDs out of redis that have not been followed yet
    for key in zhihu.r.keys():
        if zhihu.r.exists(key):
            if zhihu.r.get(key) == 'True':
                zhihu.follow(key)
get-info.py: visits each ID's profile page and extracts the information.
# -*- coding: utf-8 -*-
'''
Web crawler: scraping Zhihu user information
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
from time import ctime
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class GetInfo(object):
    """docstring for GetInfo"""

    r1 = redis.Redis(host='127.0.0.1', port=6379, db=1)
    r2 = redis.Redis(host='127.0.0.1', port=6379, db=2)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = dict(cf.items('cookies'))
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    cur = conn.cursor()
    itemlist = []
    useridlist = []
    flag = 0

    # Request the user's profile page, extract the info, store it in the database
    def getinfo(self, userid):
        url = "http://www.zhihu.com/people/" + userid
        print "GET:%s---%s" % (userid, ctime())

        # Exception handling -- necessary!
        try:
            response = self.session.get(url, cookies=self.cookies).content
            page = etree.HTML(response)
            username = page.xpath('//div[@class="title-section ellipsis"]/span[@class="name"]/text()')[0]
            location = page.xpath('//div[@data-name="location"]/span/span[@class="location item"]/@title')
            business = page.xpath('//div[@data-name="location"]/span/span[@class="business item"]/@title')
            gendertit = page.xpath('//div[@data-name="location"]/span/span[@class="item gender"]/i/@class')
            # Gender cannot be read out directly, so infer it from the icon class
            if len(gendertit) == 0:
                gender = 'notsure'
            elif re.search(r'female', gendertit[0]):
                gender = u'女'
            else:
                gender = u'男'
            employment = page.xpath('//div[@data-name="employment"]/span/span[@class="employment item"]/@title')
            position = page.xpath('//div[@data-name="employment"]/span/span[@class="position item"]/@title')
            education = page.xpath('//div[@data-name="education"]/span/span[@class="education item"]/@title')
            college = page.xpath('//div[@data-name="education"]/span/span[@class="education-extra item"]/@title')
            followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
            follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
            question_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[2]/span/text()')[0])
            answer_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[3]/span/text()')[0])

            # Some fields may be left blank by the user, so fall back to None
            location = location[0] if location else None
            business = business[0] if business else None
            employment = employment[0] if employment else None
            position = position[0] if position else None
            education = education[0] if education else None
            college = college[0] if college else None

            # Store into the database and commit
            item = (userid, username, gender, followee, follower,
                    location, business, employment, position, education,
                    college, question_num, answer_num)
            print userid, username
            has_in = self.cur.execute("insert into userstb values(?,?,?,?,?,?,?,?,?,?,?,?,?)", item)
            self.conn.commit()
            if has_in:
                print "Stored successfully"
                self.r2.set(userid, True)
            else:
                print "Store failed"
        except requests.exceptions.RequestException:
            print "Connection error"
            self.main()
        except Exception:
            # Pages that fail to parse are marked done so they are not retried forever
            self.r2.set(userid, True)
            self.main()

    # Main loop: take the IDs out of redis that have not been queried yet
    def main(self):
        while True:
            for key in self.r1.keys():
                if self.r2.exists(key):
                    continue
                else:
                    self.getinfo(key)

if __name__ == '__main__':
    begin = GetInfo()
    begin.main()
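Once get-info.py has run for a while, the profiles sit in the userstb table of zhihuuser.db. A minimal sketch for pulling results back out, here the ten most-followed users collected so far (column names as created in zhihusp.py):

# -*- coding: utf-8 -*-
import sqlite3

conn = sqlite3.connect('zhihuuser.db')
cur = conn.cursor()

# The ten most-followed users collected so far
cur.execute('''select userid, username, follower, followee
               from userstb
               order by follower desc
               limit 10''')
for userid, username, follower, followee in cur.fetchall():
    print userid, username, follower, followee
conn.close()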
GG