Java模擬登錄新浪微博抓取數據【轉載】

  1 package com.shiyimm.crawler.weibo;
  2 
  3 import java.io.FileNotFoundException;
  4 import java.io.FileReader;
  5 import java.io.IOException;
  6 import java.io.UnsupportedEncodingException;
  7 import java.net.URLDecoder;
  8 import java.net.URLEncoder;
  9 import java.util.ArrayList;
 10 import java.util.Date;
 11 import java.util.List;
 12 import java.util.regex.Matcher;
 13 import java.util.regex.Pattern;
 14 
 15 import javax.script.Invocable;
 16 import javax.script.ScriptEngine;
 17 import javax.script.ScriptEngineManager;
 18 import javax.script.ScriptException;
 19 
 20 import net.sf.json.JSONObject;
 21 
 22 import org.apache.commons.codec.binary.Base64;
 23 import org.apache.http.NameValuePair;
 24 import org.apache.http.client.ClientProtocolException;
 25 import org.apache.http.client.HttpClient;
 26 import org.apache.http.impl.client.DefaultHttpClient;
 27 import org.apache.http.message.BasicNameValuePair;
 28 
 29 import com.shiyimm.crawler.util.MyUrlUtil;
 30 import com.shiyimm.crawler.util.UrlUtil;
 31 
 32 public class SinaWeibo {
 33     private HttpClient client;
 34     private String username;    //登陸賬號(明文)
 35     private String password;    //登陸密碼(明文)
 36     private String su;            //登陸賬號(Base64加密)
 37     private String sp;            //登陸密碼(各類參數RSA加密後的密文)
 38     private long servertime;    //初始登陸時,服務器返回的時間戳,用以密碼加密以及登陸用
 39     private String nonce;        //初始登陸時,服務器返回的一串字符,用以密碼加密以及登陸用
 40     private String rsakv;        //初始登陸時,服務器返回的一串字符,用以密碼加密以及登陸用
 41     private String pubkey;        //初始登陸時,服務器返回的RSA公鑰
 42     
 43     private String errInfo;        //登陸失敗時的錯誤信息
 44     private String location;    //登陸成功後的跳轉鏈接
 45     
 46     private String url;
 47     
 48     public SinaWeibo(String username,String password){
 49         client = new DefaultHttpClient();
 50         this.username = username;
 51         this.password = password;
 52     }
 53     
 54     
 55     /**
 56      * 初始登陸信息<br>
 57      * 返回false說明初始失敗
 58      * @return
 59      */
 60     public boolean preLogin(){
 61         boolean flag = false;
 62         try {
 63             su = new String(Base64.encodeBase64(URLEncoder.encode(username, "UTF-8").getBytes()));
 64             String url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&rsakt=mod&checkpin=1&" +
 65                     "client=ssologin.js(v1.4.5)&_="+getTimestamp();
 66             url += "&su="+su;
 67             String content;
 68             content = HttpTools.getRequest(client, url);
 69             //System.out.println(content);
 70             System.out.println("content------------"+content);
 71             JSONObject json = JSONObject.fromObject(content);
 72             System.out.println(json);
 73             servertime = json.getLong("servertime");
 74             nonce = json.getString("nonce");
 75             rsakv = json.getString("rsakv");
 76             pubkey = json.getString("pubkey");
 77             flag = encodePwd();
 78         } catch (UnsupportedEncodingException e) {
 79             // TODO Auto-generated catch block
 80             //e.printStackTrace();
 81         } catch (ClientProtocolException e) {
 82             // TODO Auto-generated catch block
 83             //e.printStackTrace();
 84         } catch (IOException e) {
 85             // TODO Auto-generated catch block
 86             //e.printStackTrace();
 87         }
 88         return flag;
 89     }
 90     
 91     /**
 92      * 登陸
 93      * @return true:登陸成功
 94      */
 95     public boolean login(){
 96         if(preLogin()){
 97             String url = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";
 98             List<NameValuePair> parms = new ArrayList<NameValuePair>();
 99             parms.add(new BasicNameValuePair("entry", "weibo"));
100             parms.add(new BasicNameValuePair("geteway", "1"));
101             parms.add(new BasicNameValuePair("from", ""));
102             parms.add(new BasicNameValuePair("savestate", "7"));
103             parms.add(new BasicNameValuePair("useticket", "1"));
104             parms.add(new BasicNameValuePair("pagerefer", "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%2F"));
105             parms.add(new BasicNameValuePair("vsnf", "1"));
106             parms.add(new BasicNameValuePair("su", su));
107             parms.add(new BasicNameValuePair("service", "miniblog"));
108             parms.add(new BasicNameValuePair("servertime", servertime+""));
109             parms.add(new BasicNameValuePair("nonce", nonce));
110             parms.add(new BasicNameValuePair("pwencode", "rsa2"));
111             parms.add(new BasicNameValuePair("rsakv", rsakv));
112             parms.add(new BasicNameValuePair("sp", sp));
113             parms.add(new BasicNameValuePair("encoding", "UTF-8"));
114             parms.add(new BasicNameValuePair("prelt", "182"));
115             parms.add(new BasicNameValuePair("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"));
116             parms.add(new BasicNameValuePair("returntype", "META"));
117             try {
118                 String content = HttpTools.postRequest(client, url, parms);
119                 System.out.println("content----------"+content);
120                 String regex = "location\\.replace\\(\"(.+?)\"\\);";
121                 Pattern p = Pattern.compile(regex);
122                 Matcher m = p.matcher(content);
123                 if(m.find()){
124                     location = m.group(1);
125                     if(location.contains("reason=")){
126                         errInfo = location.substring(location.indexOf("reason=")+7);
127                         errInfo = URLDecoder.decode(errInfo, "GBK");
128                     }else{
129                         String result = HttpTools.getRequest(client, location);
130                         System.out.println("result--------------"+result);
131                         return true;
132                     }
133                 }
134             } catch (ClientProtocolException e) {
135                 // TODO Auto-generated catch block
136                 e.printStackTrace();
137             } catch (IOException e) {
138                 // TODO Auto-generated catch block
139                 e.printStackTrace();
140             }
141 //            url = "http://www.weibo.com/hm";
142 //            System.out.println(MyUrlUtil.getResource(url));
143         }
144         return false;
145     }
146     
147     /**
148      * 密碼進行RSA加密&lt;br&gt;
149      * 返回false說明加密失敗
150      * @return
151      */
152     private boolean encodePwd(){
153         ScriptEngineManager sem = new ScriptEngineManager();
154         ScriptEngine se = sem.getEngineByName("javascript");
155         try {
156             FileReader fr = new FileReader("E:\\encoder.js");
157             se.eval(fr);
158             Invocable invocableEngine = (Invocable) se;
159             String callbackvalue = (String) invocableEngine.invokeFunction("encodePwd",pubkey,servertime,nonce,password);
160             sp = callbackvalue;
161             return true;
162         } catch (FileNotFoundException e) {
163             // TODO Auto-generated catch block
164             System.out.println("加密腳本encoder.sj未找到");
165         } catch (ScriptException e) {
166             // TODO Auto-generated catch block
167             //e.printStackTrace();
168         } catch (NoSuchMethodException e) {
169             // TODO Auto-generated catch block
170             //e.printStackTrace();
171         }
172         errInfo = "密碼加密失敗!";
173         return false;
174     }
175     
176     public String getErrInfo() {
177         return errInfo;
178     }
179     
180     /**
181      * 獲取時間戳
182      * @return
183      */
184     private long getTimestamp(){
185         Date now = new Date();
186         return now.getTime();
187     }
188     
189     public static void main(String[] args) throws ClientProtocolException, IOException {
190         SinaWeibo weibo = new SinaWeibo("帳號", "密碼");
191         if(weibo.login()){
192             System.out.println("登錄成功!");
193             String url = "http://www.weibo.com/hm";
194 //            String source = MyUrlUtil.getResource(url);
195 //            System.out.println(source);
196         }else{
197             System.out.println("登陸失敗!");
198         }
199     }
200 }
 201 package com.shiyimm.crawler.weibo;
202 
203 import java.io.IOException;
204 import java.util.List;
205 
206 import org.apache.http.HttpEntity;
207 import org.apache.http.HttpResponse;
208 import org.apache.http.NameValuePair;
209 import org.apache.http.client.ClientProtocolException;
210 import org.apache.http.client.HttpClient;
211 import org.apache.http.client.entity.UrlEncodedFormEntity;
212 import org.apache.http.client.methods.HttpGet;
213 import org.apache.http.client.methods.HttpPost;
214 import org.apache.http.util.EntityUtils;
215 
216 public class HttpTools {
217     /**
218      * 正常GET方式HTTP請求
219      * @param client
220      * @param url
221      * @return
222      * @throws ClientProtocolException
223      * @throws IOException
224      */
225     public static String getRequest(HttpClient client,String url) throws ClientProtocolException, IOException{
226         HttpGet get = new HttpGet(url);
227         get.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
228         //get.addHeader(&quot;Referer&quot;, &quot;http://2013.weibo.com/&quot;);
229         HttpResponse response = client.execute(get);
230         HttpEntity entity = response.getEntity();
231         String content = EntityUtils.toString(entity,"GBK");
232         //System.out.println(content);
233         /*EntityUtils.consume(entity);*/
234         return content;
235     }
236     
237     /**
238      * 正常POST方式HTTP請求
239      * @param client
240      * @param url
241      * @param parms
242      * @return
243      * @throws ClientProtocolException
244      * @throws IOException
245      */
246     public static String postRequest(HttpClient client,String url,List<NameValuePair> parms) throws ClientProtocolException, IOException{
247         HttpPost post = new HttpPost(url);
248         post.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
249         post.addHeader("Content-Type", "application/x-www-form-urlencoded");
250         //post.addHeader(&quot;Referer&quot;, &quot;http://2013.weibo.com/&quot;);
251         UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(parms, "UTF-8");
252         post.setEntity(postEntity);
253         HttpResponse response = client.execute(post);
254         HttpEntity entity = response.getEntity();
255         String content = EntityUtils.toString(entity,"GBK");
256         /*EntityUtils.consume(entity);*/
257         return content;
258     }
259 }
相關文章
相關標籤/搜索