如下代碼保存爲utf8文本格式html
環境:ActivePerl v5.16 built for MSWin32-x86cookie
兩個要調整的地方:less
for my $i (17..45) { 這裏改爲本身對應的頁碼,好比發了30頁的閃存,就改爲1..30post
還有就是這部分改爲本身對應的ID和密碼,中文ID還未測試過測試
tbUserName=>"ID",
tbPassword=>"密碼",ui
抓取的結果參考截圖:spa
=info code: vicyang mail: 523066680@163.com date: 2015-02-07 =cut #!/usr/bin/perl use v5.16; use strict; use utf8; use Encode; use IO::Handle; use LWP::UserAgent; binmode(STDOUT, ":encoding(gbk)"); STDOUT->autoflush(1); my $ua = LWP::UserAgent->new; $ua->cookie_jar( {} ); my $randserial=getRandHex(32); my $response = $ua->get( 'http://passport.cnblogs.com/BotDetectCaptcha.ashx?get=image&c=c_login_logincaptcha&t='.$randserial ); print "$randserial\n"; if ($response->is_success) { # 抓取驗證碼圖片 # open my $check_code_image, "> verifycode.BMP" #默認驗證圖片的位置在當前目錄 or die "$!"; binmode($check_code_image); print $check_code_image $response->content; close $check_code_image; #關閉print對句柄的控制 system("start verifycode.BMP"); } else { print "wrong\n"; } print "please input verifycode:"; my $inp=<STDIN>; $inp=~s/\r?\n$//; print "[$inp]\n"; my $res= $ua->post('http://passport.cnblogs.com/login.aspx?ReturnUrl=http://home.cnblogs.com/ing/', [ __EVENTTARGET=>"", __EVENTARGUMENT=>"", __VIEWSTATE=>"/wEPDwUKLTM1MjEzOTU2MGQYAQUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFC2Noa1JlbWVtYmVy4b/ZXiH+8FthXlmKpjSEgi7XBNU=", __VIEWSTATEGENERATOR=>"C2EE9ABB", __EVENTVALIDATION=>"/wEdAAYIqCk3Gcmu25zI9fQWqoC7hI6Xi65hwcQ8/QoQCF8JIahXufbhIqPmwKf992GTkd2Mxo6xcg+Ng5CZxsqMUGnVMKtTyqevv9cjRp4Oh+9VMaKeKEbp39eHc9mbdvkCgxCM74oSoIAJofLsQdCCbtmog/0fDw==", tbUserName=>"改爲本身的ID", tbPassword=>"改爲本身的密碼", chkRemember=>"on", LBD_VCID_c_login_logincaptcha=>$randserial, LBD_BackWorkaround_c_login_logincaptcha=>1, CaptchaCodeTextBox=>$inp, btnLogin=>"登++錄", #txtReturnUrl=>"http://home.cnblogs.com/u/paktc/" ], ); my $fh; our $main='http://home.cnblogs.com/ing'; our $limit=70; my $all; my @ech; for my $i (17..45) { print STDOUT "Code: $i\n"; open $fh, ">:encoding(gbk)", "ing$i.txt"; select $fh; $fh->autoflush(1); $res = $ua->get("http://home.cnblogs.com/u/paktc/feed/$i.html") or warn "$!"; unless ($res->is_success) { print "wrong\n"; } $all=$res->content; @ech=($all=~/GetIngRecentComments\((\d+)/gi); for (@ech) { &getDetail($_); } close $fh; } print STDERR "over"; <STDIN>; sub getDetail { our $main; our $limit; my $pagecode=shift; $res = $ua->get($main."/".$pagecode) or warn "$!"; my @all=split( /\r?\n/, decode('utf8', $res->content) ); my ( $p_time, $message, $comment_count, $comment_author, $comment_text, $comment_time ); for (my $i=0; $i<=$#all; $i++) { if ($all[$i]=~/\d+-\d+-\d+ \d+:\d+$/) { $p_time=$&; print "\n$p_time\n"; } if ($all[$i]=~/ing_detail_body">(.*)/i) { $message=$1; while (not $message=~s/<\/div>//i) { $message .= $all[++$i]; } #case <div id="ing_detail_body"><a href="/ing/tag/..." class="ing_tag">[組圖]</a> # [標籤] $message=~s/<a href=".*class="ing_tag">([^<]+)<\/a>/$1/i; #case 幸運閃 其餘狀況還未考慮 $message=~s/<img src=.*alt="([^"]+)" title=".*"\/>/☆/i; #case 連接 $message=~s/<a href="([^"]+)" .*<\/a>/<$1>/gi; &charResume(\$message); #必須傳入一個引用,不然會崩潰 my @msg=splitText2($message, $limit, 2); print " ", "┈"x($limit/2), "\n"; print decode('gbk', join("", @msg)); print " ", "┈"x($limit/2), "\n"; print STDOUT decode('gbk', join("", @msg)), "\n"; } if ($all[$i]=~/ing_comment_count">(.*)(<\/div>)?/i) { $comment_count=$1; } if ($all[$i]=~/comment_author_.*ing\/">(.*)<\/a>/) { $comment_author = $1; ($comment_text = $all[$i+1]) =~ s/^ +//; $comment_text =~s/<a href="([^"]+)" .*<\/a>/<$1>/gi; #替換連接 &charResume(\$comment_text); $all[$i+2] =~/(\d+-\d+-\d+ \d+:\d+)/; $comment_time = $1; printf(" <%s> ", $comment_author); my $name_length = length(encode('gbk', $comment_author))+7; #length " <%s> " my @msg = splitText2($comment_text, ($limit - $name_length), 0); print decode('gbk', shift @msg); for (@msg) { print " "x$name_length . decode('gbk', $_); } print "\n"; $i+=2; } } } sub splitText2 { my ($text, $limit, $indent) = @_; my ($a, $b, @arr); $b = encode('gbk', $text); while (length($b) > $limit) { ($a, $b) = &cut_gbk($limit, $b); push (@arr, " "x$indent . $a ."\n"); } # At last push (@arr, " "x$indent . $b ."\n"); return @arr; } sub cut_gbk { my ($limit, $gstr) = @_; my $i; my $cut = 0; my $gstr_a; my $gstr_b; foreach $i ( split("", decode('gbk', $gstr)) ) { if (ord($i) < 128) { $cut+=1; } else { $cut+=2; } if ($cut >= $limit) { $gstr_a = substr($gstr, 0, $cut); $gstr_b = substr($gstr, $cut); last; } } return ($gstr_a, $gstr_b); } sub charResume { #傳入unicode格式的字符串 my $ref = shift; my $char; my $count=0; ${$ref}=~s/\</</gi; ${$ref}=~s/\>/>/gi; ${$ref}=~s/\"/"/gi; ${$ref}=~s/\ / /gi; ${$ref}=~s/\ / /gi; ${$ref}=~s/\&/\&/gi; ${$ref}=~s/\&/\&/gi; while (${$ref}=~/\&#(\d+);/) { $count++; $char=chr($1); ${$ref}=~s/\&#$1;/$char/g; if ($count > 100) { print STDOUT "DEEP LOOP \n"; sleep 1.0; exit; } } } sub getRandHex { my $str=""; for my $i (1 .. $_[0]) { $str .= sprintf("%x", int(rand(16))); } return $str; }