最近 @甜瓜 (QQ:1069629945) 開發了一套NBA數據採集腳本, 我覺得很贊. 經他允許發佈出來和大家分享一些經驗:
球員球隊: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30
在1到30的循環中抓取球隊信息和球員信息, 並用id將其關聯起來, 腳本如下:
/// <summary>
/// Crawls the Sohu NBA team pages (teamid 1..30). For every team it stores one "TEAM"
/// record, then one "player" record per roster row, linked to the team through teamid.
/// </summary>
public void Run()
{
    Logger.ClearAll();

    // Every team detail item lives in the same <ul>; only the li index differs.
    const string infoItem = "div.blockB>div.left>div.pt>ul>li";

    for (int i = 1; i <= 30; i++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
        Default.Ready();

        var teamid = i.ToString();

        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        Logger.Log(teamname);

        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        Logger.Log(teamurl);

        // Each remaining item is "<label>:<value>"; strip the label to keep only the value.
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主場所在城市:", "");
        Logger.Log(teamcity);

        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主體育館:", "");
        Logger.Log(gym);

        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容納人數:", "");
        Logger.Log(peoplenum);

        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA時間:", "");
        Logger.Log(intonba);

        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("獲總冠軍次數:", "");
        Logger.Log(champion);

        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("現任主教練:", "");
        Logger.Log(coach);

        // Fix: the home city used to be scraped and logged but never stored, while its
        // label-stripping Replace was applied to teamurl by mistake. Store both fields.
        DataManager.AppendData("TEAM", DataEntry.Create()
            .Set("teamid", teamid)
            .Set("teamname", teamname)
            .Set("teamurl", teamurl)
            .Set("teamcity", teamcity)
            .Set("gym", gym)
            .Set("peoplenum", peoplenum)
            .Set("intonba", intonba)
            .Set("champion", champion)
            .Set("coach", coach)
        );
        Logger.Log(teamid);

        foreach (var player in Default.SelectNodes("div.tab>table tr"))
        {
            var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");
            if (link.IsEmpty())
            {
                // Rows without a player link (presumably the table header) carry no
                // player data; skip them instead of feeding a missing href to Regex.
                continue;
            }

            var num = player.SelectSingleNode("TD:eq(0)");
            // The player id is the first run of digits in the profile link.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;
            var playerimageurl = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A");
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)");
            var height = player.SelectSingleNode("TD:eq(3)");
            var weight = player.SelectSingleNode("TD:eq(4)");
            var birth = player.SelectSingleNode("TD:eq(5)");
            var college = player.SelectSingleNode("TD:eq(6)");

            Logger.Log(playerimageurl.Text());
            Logger.Log(playername.Text());
            Logger.Log(position.Text());
            Logger.Log(height.Text());
            Logger.Log(weight.Text());
            Logger.Log(birth.Text());
            Logger.Log(college.Text());
            Logger.Log(playerimageurl.Attr("src"));
            Logger.Log(playerid);

            DataManager.AppendData("player", DataEntry.Create()
                .Set("playerid", playerid)
                .Set("teamid", teamid)
                .Set("playername", playername.Text())
                .Set("position", position.Text())
                .Set("height", height.Text())
                .Set("weight", weight.Text())
                .Set("birth", birth.Text())
                .Set("college", college.Text())
                .Set("num", num.Text())
                .Set("playerimageurl", playerimageurl.Attr("src"))
            );
        }
    }
}
比賽信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012
腳本如下:
/// <summary>
/// Crawls one month of the Sohu NBA schedule, opens the box-score page of every listed
/// game and stores two "GAMESTATIC" records per game: status "0" for the team of the
/// first logo (the away side in the original script's naming) and status "1" for the
/// team of the second logo (home side).
/// </summary>
public void Run()
{
    Logger.ClearAll();

    const string scheduleUrl = "http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012";
    Default.Navigate(scheduleUrl);
    Default.Ready();

    // Collect every box-score link first: the loop below navigates Default away from
    // the schedule page. Links are resolved against the page that was actually loaded
    // (the original resolved them against a different month/season URL).
    var scheduleUri = new Uri(scheduleUrl);
    var urls = new List<string>();
    foreach (var g in Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技術統計\")"))
    {
        urls.Add(new Uri(scheduleUri, g.Attr("href")).ToString());
    }

    // Fix: the original pattern "(?<id>d+).jpg" matched literal 'd' characters, not
    // digits, so the team id was never extracted from a logo such as ".../12.jpg".
    var logoId = new Regex(@"(?<id>\d+)\.jpg");

    foreach (var url in urls)
    {
        if (!Default.Available)
        {
            return;
        }

        Default.Navigate(url);
        Default.Ready();

        var totals = Default.SelectNodes("table.tab04 tr");       // row 1 = first team, row 2 = second team
        var quarters = Default.SelectNodes("table.tab02 tr>td");  // cells 0-3 = first team, 4-7 = second team
        var logos = Default.SelectNodes("td.logo img");
        var gametime = Default.SelectSingleNode("div.center>h2").Text().Replace("開始比賽", "");

        // Overtime scores, up to four periods per team; "" when a period was not played.
        var awayOvertime = new[] { "", "", "", "" };
        var homeOvertime = new[] { "", "", "", "" };
        var overtimeMarker = Default.SelectSingleNode("table.tabBig td:contains(\"加時賽\")");
        if (!overtimeMarker.IsEmpty())
        {
            // One <th> per overtime period in the element following div.more.
            var periods = Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count;
            // Only four columns are stored; unlike the original if/else chain, a game
            // with more than four periods keeps its first four instead of none.
            for (int k = 0; k < periods && k < awayOvertime.Length; k++)
            {
                awayOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(" + k + ")").Text();
                homeOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(" + k + ")").Text();
            }
        }

        // side 0 = first logo / away, side 1 = second logo / home.
        for (int side = 0; side < 2; side++)
        {
            var overtime = side == 0 ? awayOvertime : homeOvertime;
            var firstQuarter = side * 4;

            DataManager.AppendData("GAMESTATIC", DataEntry.Create()
                .Set("teamid", logoId.Match(logos[side].Attr("src")).Groups["id"].Value)
                .Set("gametime", gametime)
                .Set("score1", quarters[firstQuarter].Text())
                .Set("score2", quarters[firstQuarter + 1].Text())
                .Set("score3", quarters[firstQuarter + 2].Text())
                .Set("score4", quarters[firstQuarter + 3].Text())
                .Set("score", totals[side + 1].Text())
                .Set("gameid", url)
                .Set("status", side.ToString())
                .Set("jiashiscore1", overtime[0])
                .Set("jiashiscore2", overtime[1])
                .Set("jiashiscore3", overtime[2])
                .Set("jiashiscore4", overtime[3])
            );
        }
    }
}
這裏的亮點是要看48, 49兩行, 這裏對加時賽也進行了處理. 不是所有的比賽都有加時賽, 就算有也可能打多場(1-4場). 所以甜瓜非常細心地對這塊也做了處理. 個人感覺這塊代碼還是有優化的餘地, 但是這種處理也非常簡單直白, 一目瞭然, 也是很不錯的.
最後運行起來:
文中開發工具Spider Studio (採集工作站)下載地址: http://www.gdtsearch.com/products.spiderstudio.htm. 安裝後運行, 將腳本複製進去點"運行"即可看到效果.
Spider Studio QQ羣: 45995410