這裏給出新浪微博電腦端(http://weibo.com)和手機端(http://weibo.cn)的爬蟲代碼。
新浪微博電腦端和手機端的登錄機制並不相同,本文不作詳細介紹,只給出代碼。
新浪微博手機端(http://weibo.cn)
#!/usr/bin/env python
# coding=utf-8
"""Log in to the mobile Sina Weibo site (weibo.cn) and fetch pages.

Python 2 script: relies on urllib2/cookielib and the third-party lxml package.
"""
from __future__ import print_function

import cookielib
import time
import urllib
import urllib2

import lxml.html as HTML


class Fetcher(object):
    """Cookie-aware HTTP client that signs in through the Sina WAP SSO form."""

    def __init__(self, username=None, pwd=None, cookie_filename=None):
        """Create the cookie jar and install a global cookie-handling opener.

        Args:
            username: Account name, posted as the ``mobile`` form field.
            pwd: Account password.
            cookie_filename: Optional LWP cookie file to preload cookies from.
        """
        self.cj = cookielib.LWPCookieJar()
        if cookie_filename is not None:
            self.cj.load(cookie_filename)
        self.cookie_processor = urllib2.HTTPCookieProcessor(self.cj)
        self.opener = urllib2.build_opener(self.cookie_processor,
                                           urllib2.HTTPHandler)
        # Installing the opener makes every later urllib2.urlopen() call
        # go through the cookie processor above.
        urllib2.install_opener(self.opener)
        self.username = username
        self.pwd = pwd
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) '
                          'Gecko/20100101 Firefox/14.0.1',
            'Referer': '',
            'Content-Type': 'application/x-www-form-urlencoded',
        }

    def get_rand(self, url):
        """Download the login form and extract its dynamic fields.

        Args:
            url: Address of the login page.

        Returns:
            A ``(rand, passwd, vk)`` tuple: the form's ``action`` attribute,
            the ``name`` of the password input, and the value of the ``vk``
            input.

        Raises:
            IndexError: If any of the three elements is missing from the page.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows;U;Windows NT 5.1;zh-CN;'
                          'rv:1.9.2.9)Gecko/20100824 Firefox/3.6.9',
            'Referer': '',
        }
        req = urllib2.Request(url, urllib.urlencode({}), headers)
        login_page = urllib2.urlopen(req).read()
        # Parse the page once and run all three queries on the same tree.
        doc = HTML.fromstring(login_page)
        rand = doc.xpath("//form/@action")[0]
        passwd = doc.xpath("//input[@type='password']/@name")[0]
        vk = doc.xpath("//input[@name='vk']/@value")[0]
        return rand, passwd, vk

    def login(self, username=None, pwd=None, cookie_filename=None):
        """Submit the login form and follow the first link of the response.

        Args:
            username: Used only when no complete username/password pair was
                given to the constructor.
            pwd: See ``username``.
            cookie_filename: If given, cookies are saved to this file after
                the login requests complete.
        """
        if self.username is None or self.pwd is None:
            self.username = username
            self.pwd = pwd
        assert self.username is not None and self.pwd is not None, \
            'username and pwd are required'

        url = 'http://3g.sina.com.cn/prog/wapsite/sso/login.php?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt='
        rand, passwd, vk = self.get_rand(url)
        # The password field name is dynamic, so it is used as a dict key.
        data = urllib.urlencode({
            'mobile': self.username,
            passwd: self.pwd,
            'remember': 'on',
            'backURL': 'http://weibo.cn/',
            'backTitle': '新浪微博',
            'vk': vk,
            'submit': '登陸',
            'encoding': 'utf-8',
        })
        url = 'http://3g.sina.com.cn/prog/wapsite/sso/' + rand
        req = urllib2.Request(url, data, self.headers)
        page = urllib2.urlopen(req).read()

        link = HTML.fromstring(page).xpath("//a/@href")[0]
        if not link.startswith('http://'):
            link = 'http://weibo.cn/%s' % link
        req = urllib2.Request(link, headers=self.headers)
        urllib2.urlopen(req)

        if cookie_filename is not None:
            self.cj.save(filename=cookie_filename)
        elif self.cj.filename is not None:
            # NOTE(review): the jar is created without a filename and load()
            # does not record one, so this branch looks unreachable as the
            # class stands -- confirm whether cookies should be written back.
            self.cj.save()
        print('login success!')

    def fetch(self, url, timeout):
        """Return the raw body of ``url``, waiting at most ``timeout`` seconds."""
        req = urllib2.Request(url, headers=self.headers)
        return urllib2.urlopen(req, None, timeout).read()


if __name__ == '__main__':
    username = 'xx@xx.com'
    password = 'xxxx'
    fetcher = Fetcher(username, password)
    fetcher.login()
    seed_url = 'http://weibo.cn/xiaomishouji?filter=0'
    try:
        htmlContent = fetcher.fetch(seed_url, 3)
    except Exception as e:
        # Report the real cause instead of labelling every failure a timeout.
        print('fetch failed (timeout or network error): %s' % e)
新浪微博電腦端
#!/usr/bin/env python
# coding=utf-8
"""Log in to the desktop Sina Weibo site (weibo.com) and fetch pages.

Python 2 script: relies on urllib2/cookielib and the third-party rsa package.
"""
from __future__ import print_function

import base64
import binascii
import cookielib
import json
import re
import urllib
import urllib2

import rsa


def sServerData(serverData):
    """Extract the login parameters from the prelogin response.

    Args:
        serverData: Response text containing a JSON object wrapped in
            parentheses.

    Returns:
        A ``(serverTime, nonce, pubkey, rsakv)`` tuple; ``serverTime`` is
        converted to ``str``.

    Raises:
        AttributeError: If no parenthesised payload is found.
        ValueError: If the payload is not valid JSON.
        KeyError: If an expected key is missing.
    """
    p = re.compile(r'\((.*)\)')
    jsonData = p.search(serverData).group(1)
    data = json.loads(jsonData)
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    pubkey = data['pubkey']
    rsakv = data['rsakv']
    print("Server time is:", serverTime)
    print("Nonce is:", nonce)
    return serverTime, nonce, pubkey, rsakv


def sRedirectData(text):
    """Return the URL passed to ``location.replace(...)`` in ``text``.

    Raises:
        AttributeError: If ``text`` contains no ``location.replace`` call.
    """
    p = re.compile(r'location\.replace\([\'"](.*?)[\'"]\)')
    loginUrl = p.search(text).group(1)
    print('loginUrl:', loginUrl)
    return loginUrl


def PostEncode(userName, passWord, serverTime, nonce, pubkey, rsakv):
    """Build the URL-encoded POST body for the login request.

    The user name is encoded with ``GetUserName`` and the password is
    encrypted with ``get_pwd`` before being placed in the form.
    """
    encodedUserName = GetUserName(userName)
    encodedPassWord = get_pwd(passWord, serverTime, nonce, pubkey)
    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'prelt': '115',
        'rsakv': rsakv,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }
    return urllib.urlencode(postPara)


def GetUserName(userName):
    """Return the user name URL-quoted and then base64-encoded."""
    userNameTemp = urllib.quote(userName)
    # b64encode emits no line breaks; encodestring()[:-1] only stripped the
    # final newline and left embedded ones in place for long inputs.
    return base64.b64encode(userNameTemp)


def get_pwd(password, servertime, nonce, pubkey):
    """Encrypt the password with the server-supplied RSA public key.

    The plaintext is ``servertime + '\\t' + nonce + '\\n' + password``; the
    key uses ``pubkey`` (a hex string) as modulus and 65537 as exponent.

    Returns:
        The ciphertext as a hex string.
    """
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
    passwd = rsa.encrypt(message, key)
    return binascii.b2a_hex(passwd)


class Fetcher(object):
    """HTTP client that signs in through the Sina SSO login endpoint."""

    def __init__(self, user, pwd, enableProxy=False):
        """Store credentials and endpoint URLs; no network access happens here.

        Args:
            user: Account name.
            pwd: Account password.
            enableProxy: When true, ``Login`` routes HTTP through the proxy
                configured in ``EnableCookie``.
        """
        print("Initializing WeiboLogin...")
        self.userName = user
        self.passWord = pwd
        self.enableProxy = enableProxy
        self.serverUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.11)&_=1379834957683"
        self.loginUrl = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.11)"
        self.postHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}

    def Login(self):
        """Run the full login sequence.

        Returns:
            True when the redirect URL was found and opened, False when the
            prelogin data could not be parsed or the redirect step failed.
        """
        self.EnableCookie(self.enableProxy)

        serverInfo = self.GetServerTime()
        if serverInfo is None:
            # GetServerTime already reported the problem; unpacking None
            # here would raise TypeError.
            return False
        serverTime, nonce, pubkey, rsakv = serverInfo

        postData = PostEncode(self.userName, self.passWord, serverTime,
                              nonce, pubkey, rsakv)
        print("Post data length:\n%d" % len(postData))
        req = urllib2.Request(self.loginUrl, postData, self.postHeader)
        print("Posting request...")
        text = urllib2.urlopen(req).read()

        try:
            loginUrl = sRedirectData(text)
            urllib2.urlopen(loginUrl)
        except Exception as e:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            print('Login error!', e)
            return False
        print('Login sucess!')
        return True

    def EnableCookie(self, enableProxy):
        """Install a global opener with cookie support and optional proxy."""
        cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cookiejar)
        if enableProxy:
            proxy_support = urllib2.ProxyHandler({'http': 'http://xxxxx.pac'})
            opener = urllib2.build_opener(proxy_support, cookie_support,
                                          urllib2.HTTPHandler)
            print("Proxy enabled")
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

    def GetServerTime(self):
        """Fetch and parse the prelogin data.

        Returns:
            ``(serverTime, nonce, pubkey, rsakv)`` on success, or None when
            the response cannot be parsed.
        """
        print("Getting server time and nonce...")
        serverData = urllib2.urlopen(self.serverUrl).read()
        print(serverData)
        try:
            return sServerData(serverData)
        except (AttributeError, ValueError, KeyError):
            # These are the failures sServerData raises on malformed input.
            print('Get server time & nonce error!')
            return None

    def fetch(self, url, timeout):
        """Return the raw body of ``url``, waiting at most ``timeout`` seconds."""
        req = urllib2.Request(url, headers=self.postHeader)
        return urllib2.urlopen(req, None, timeout).read()


if __name__ == '__main__':
    username = 'xx@xx.com'
    passwd = 'xxxx'
    fetcher = Fetcher(username, passwd)
    if fetcher.Login():
        print("Login success!")
        seed_url = 'http://weibo.com/p/1006061771925961/weibo'
        try:
            htmlContent = fetcher.fetch(seed_url, 3)
        except Exception as e:
            # Report the real cause instead of labelling every failure a timeout.
            print('fetch failed (timeout or network error): %s' % e)