Web Crawler: Scraping Zhihu User Information

Unlike the two crawlers I covered before, this one is written from scratch, using the Python libraries requests, redis, and lxml.

There are three files in total: config.ini holds the username and cookie configuration, zhihusp.py scrapes user IDs, and get-info.py scrapes the rest of each user's information.

 

All three files are pasted below. The two Python files are fairly thoroughly commented, so they should be easy to follow.

config.ini
[info]
phone_num = 15*********
password = ************

[cookies]
q_c1 = 5fd5e96aa1cc40f587e2fcaa621030ee|1448986627000|1448986627000
cap_id = Zjk3N2I3MjU1ZmIyNGJkNWJIDOxYmE3ZDEzN2QyOGE=|1449289675|612bbfbnjd2e3bca76d397a2c67c921fe7c852b
_za = b7e8ab32-03b3-473b-87e6-68fe9f9e7933
__utmt = 1
__utma = 51854390.1168696635.1449128833.1449239113.1449289659.5
__utmb = 51854390.6.10.1449289659
__utmc = 51854390
__utmz = 51854390.1449223233.4.2.utmcsr=zhihu.coccn=(referral)|utmcmd=referral|utmcct=/people/excited-vczh/followers
__utmv = 51854390.100-2|2=re=1^3=entry_date=20151202=1
z_c0 = QUJDTXpzbTNGd2tYQUFBdffabXowaVZZdHBZbnJIS3FhYjZBQnRTWllWQlZ1T1kyc1dnPT0=|1449289708|7020f5e7c6c95b043e48c02afffb3a9c40035a77
unlock_ticket = QUJDTXpzbTNGd2tYQUFBQVlRSlZUZlJ1WWxaUDlzRGpZTVocGdnUl8xZkVNbDNBPT0=|1554289708|d906b57006b0cd84c58c4f6d6e1eb16e17e64
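
Each name = value pair under [cookies] becomes one browser cookie; requests accepts them as a plain dict. A minimal sketch of how both scripts below consume this file (Python 2 ConfigParser):

import ConfigParser, requests

cf = ConfigParser.ConfigParser()
cf.read('config.ini')
cookies = dict(cf.items('cookies'))  # every line under [cookies] -> one cookie
session = requests.session()
# all later requests ride on these cookies, e.g.:
# r = session.get('http://www.zhihu.com/people/excited-vczh/followers', cookies=cookies)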

zhihusp.py: mainly used to grab follower IDs from a user's followers list

# -*- coding: utf-8 -*-
'''
Web crawler: scraping Zhihu user information
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class ZhihuSpider(object):
    """docstring for ZhihuSpider"""
    r = redis.Redis(host='127.0.0.1', port=6379, db=1)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = dict(cf.items('cookies'))
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    conn.text_factory = str
    cur = conn.cursor()

    # Create the session; if logging in with username/password fails,
    # fall back to logging in with cookies.
    def create_session(self):
        from pprint import pprint
        pprint(self.cookies)
        phone_num = self.cf.get('info', 'phone_num')
        password = self.cf.get('info', 'password')
        login_data = {'phone_num': phone_num, 'password': password}
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/43.0.2357.124 Safari/537.36',
            'Host': 'www.zhihu.com',
            'Referer': 'http://www.zhihu.com/'
        }
        r = self.session.post('http://www.zhihu.com/login/phone_num',
                              data=login_data,
                              headers=header)
        if r.json()['r'] == 1:
            print 'Login failed, reason is:',
            for m in r.json()['data']:
                print r.json()['data'][m]
            print 'So we use cookies to log in...'
            has_cookies = False
            for key in self.cookies:
                if key != '__name__' and self.cookies[key] != '':
                    has_cookies = True
                    break
            if has_cookies is False:
                raise ValueError('Please fill in the cookies section of config.ini.')
            else:
                # log in with cookies instead
                r = self.session.get('http://www.zhihu.com/login/phone_num',
                                     cookies=self.cookies)

        with open('login.html', 'w') as fp:
            fp.write(r.content)

    # Request the pages listing a user's followers and followees
    def follow(self, userid):
        print "NOW Follow:", userid
        self.r.set(userid, False)
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        follower, followee, user_urls = self.getinfo(userid)
        for u_url in user_urls:
            # use a separate variable: userid is still needed below
            uid = u_url.split('/')[-1]
            if self.not_in(uid):
                self.r.set(uid, True)
        if follower > 20:
            self.doprofiles(follower, follower_url)

        # extract the userids from the first page of followees
        followee_url = "http://www.zhihu.com/people/" + userid + "/followees"
        response = self.session.get(followee_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')

        for u_url in user_urls:
            uid = u_url.split('/')[-1]
            if self.not_in(uid):
                self.r.set(uid, True)
        if followee > 20:
            self.doprofiles(followee, followee_url)

    # Dynamically fetch the entries hidden behind the "more" button
    def doprofiles(self, attention, url):
        thisheader = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/43.0.2357.124 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Referer': url,
            # Content-Length is computed by requests; don't hardcode it
            'Cookie': 'fill in your own'
        }
        hash_id = 'fill in your own'
        xsrf = 'fill in your own'
        # work out the page count, then pull the follower info behind "more"
        pages = attention / 20 + 1
        # if pages > 600:
        #     pages = 600
        for x in xrange(1, pages):
            offset = x * 20
            params = json.dumps({"offset": offset,
                                 "order_by": "created",
                                 "hash_id": hash_id})
            payload = {"method": "next", "params": params, "_xsrf": xsrf}
            content = self.session.post(
                "http://www.zhihu.com/node/ProfileFollowersListV2",
                headers=thisheader, data=payload).content
            load = json.loads(content)
            lists = load['msg']
            for item in lists:
                try:
                    userpeople = re.search(r'people/[\w+\d+-]+', item)
                    if userpeople is not None:
                        people = userpeople.group()
                        userid = people.split('/')[-1]
                        print "FOUND:", userid
                        if self.not_in(userid):
                            self.r.set(userid, True)
                except AttributeError:
                    print "ERROR"
        self.gofollow()

    # keep following users that are still queued
    def gofollow(self):
        for key in self.r.keys():
            if self.r.get(key) == 'True':
                self.follow(key)

    # check whether the userid already exists in redis
    def not_in(self, userid):
        if self.r.exists(userid):
            return False
        else:
            return True

    def getinfo(self, userid):
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        response = self.session.get(follower_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')
        # read the followee/follower counts from the profile sidebar
        followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
        follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
        return follower, followee, user_urls

if __name__ == '__main__':
    zhihu = ZhihuSpider()
    # create the table
    zhihu.cur.execute('''create table if not exists userstb
        (userid text primary key,
        username text, gender text, followee integer,
        follower integer, location text,
        business text, employment text,
        position text, education text, college text,
        question_num integer, answer_num text)''')
    zhihu.conn.commit()
    zhihu.create_session()

    # a few big-name Zhihu users to seed the crawl
    first_users = ['excited-vczh', 'warfalcon', 'gejinyuban']
    for user in first_users:
        if zhihu.r.exists(user):
            continue
        else:
            zhihu.follow(user)
    # pick up userids from redis that have not been followed yet
    for key in zhihu.r.keys():
        if zhihu.r.exists(key):
            if zhihu.r.get(key) == 'True':
                zhihu.follow(key)
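
The crawl frontier lives entirely in redis db 1: a userid set to 'True' is still queued, and follow() flips it to 'False' once that user has been processed. A quick sketch for inspecting the frontier from another shell (assumes the same local redis instance):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=1)
queued = [k for k in r.keys() if r.get(k) == 'True']   # waiting to be followed
done   = [k for k in r.keys() if r.get(k) == 'False']  # already processed
print "queued: %d, done: %d" % (len(queued), len(done))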

get-info.py: visits each ID's profile page and extracts the information

# -*- coding: utf-8 -*-
'''
Web crawler: scraping Zhihu user information
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
from time import ctime
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class GetInfo(object):
    """docstring for GetInfo"""

    r1 = redis.Redis(host='127.0.0.1', port=6379, db=1)
    r2 = redis.Redis(host='127.0.0.1', port=6379, db=2)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = dict(cf.items('cookies'))
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    cur = conn.cursor()
    itemlist = []
    useridlist = []
    flag = 0

    # request the user's profile page, extract the info, store it in the database
    def getinfo(self, userid):
        url = "http://www.zhihu.com/people/" + userid
        print "GET:%s---%s" % (userid, ctime())

        # exception handling -- essential!
        try:
            response = self.session.get(url, cookies=self.cookies).content
            page = etree.HTML(response)
            username = page.xpath('//div[@class="title-section ellipsis"]/span[@class="name"]/text()')[0]
            location = page.xpath('//div[@data-name="location"]/span/span[@class="location item"]/@title')
            business = page.xpath('//div[@data-name="location"]/span/span[@class="business item"]/@title')
            gendertit = page.xpath('//div[@data-name="location"]/span/span[@class="item gender"]/i/@class')
            # the gender can't be read out directly, so infer it from the icon class
            if len(gendertit) == 0:
                gender = 'notsure'
            elif re.search(r'female', gendertit[0]):
                gender = u'female'
            else:
                gender = u'male'
            employment = page.xpath('//div[@data-name="employment"]/span/span[@class="employment item"]/@title')
            position = page.xpath('//div[@data-name="employment"]/span/span[@class="position item"]/@title')
            education = page.xpath('//div[@data-name="education"]/span/span[@class="education item"]/@title')
            college = page.xpath('//div[@data-name="education"]/span/span[@class="education-extra item"]/@title')
            followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
            follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
            question_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[2]/span/text()')[0])
            answer_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[3]/span/text()')[0])

            # some fields may be left blank by the user, so check for empty results
            location = location[0] if location else None
            business = business[0] if business else None
            employment = employment[0] if employment else None
            position = position[0] if position else None
            education = education[0] if education else None
            college = college[0] if college else None

            # insert into the database and commit
            item = (userid, username, gender, followee, follower,
                    location, business, employment, position, education,
                    college, question_num, answer_num)
            print userid, username
            has_in = self.cur.execute("insert into userstb values(?,?,?,?,?,?,?,?,?,?,?,?,?)", item)
            self.conn.commit()
            if has_in:
                print u"stored successfully"
                self.r2.set(userid, True)
            else:
                print u"storing failed"
        except requests.exceptions.RequestException:
            print u'connection error'
            self.main()
        except Exception:
            # mark the user as done anyway so the loop doesn't get stuck on it
            self.r2.set(userid, True)
            self.main()

    # main loop: pull userids out of redis that haven't been queried yet
    def main(self):
        while True:
            for key in self.r1.keys():
                if self.r2.exists(key):
                    continue
                else:
                    self.getinfo(key)

if __name__ == '__main__':
    begin = GetInfo()
    begin.main()
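
Once get-info.py has been running for a while, the harvest can be checked straight from sqlite. A small sketch against the userstb table that zhihusp.py created:

import sqlite3

conn = sqlite3.connect('zhihuuser.db')
cur = conn.cursor()
cur.execute("select count(*) from userstb")
print "users collected:", cur.fetchone()[0]
# the ten most-followed users scraped so far
cur.execute("select userid, username, follower from userstb "
            "order by follower desc limit 10")
for row in cur.fetchall():
    print row
conn.close()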

GG
