示例 - C#腳本代碼採集搜狐NBA球員, 球隊和比賽實況

最近 @甜瓜 (QQ:1069629945) 開發了一套NBA數據採集腳本, 我認爲很贊. 經他允許發佈出來和大家分享一些經驗:

球員球隊: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循環中抓取球隊信息和球員信息, 並用id將其關聯起來, 腳本如下:

// Scrapes the team pages teamid=1..30 on data.sports.sohu.com.
// For every team it appends one "TEAM" row, then one "player" row per roster
// table row; both row kinds carry the same teamid so they can be joined later.
public void Run()
{
    Logger.ClearAll();

    // Common prefix of the team-info list items; each field is picked by position.
    const string infoItem = "div.blockB>div.left>div.pt>ul>li";

    for (int teamid = 1; teamid <= 30; teamid++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + teamid);
        Default.Ready();

        // Resolve every field once, with its label prefix stripped, so the
        // logged value and the stored value can never drift apart.
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主場所在城市:", "");
        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主體育館:", "");
        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容納人數:", "");
        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA時間:", "");
        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("獲總冠軍次數:", "");
        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("現任主教練:", "");

        Logger.Log(teamname);
        Logger.Log(teamurl);
        Logger.Log(teamcity);
        Logger.Log(gym);
        Logger.Log(peoplenum);
        Logger.Log(intonba);
        Logger.Log(champion);
        Logger.Log(coach);

        // The home city used to be scraped and logged but never stored (its
        // label strip had been pasted onto teamurl instead); it is now saved
        // under its own "teamcity" key.
        DataManager.AppendData("TEAM",
                               DataEntry.Create()
                               .Set("teamid", teamid + "")
                               .Set("teamname", teamname)
                               .Set("teamurl", teamurl)
                               .Set("teamcity", teamcity)
                               .Set("gym", gym)
                               .Set("peoplenum", peoplenum)
                               .Set("intonba", intonba)
                               .Set("champion", champion)
                               .Set("coach", coach)
                              );
        Logger.Log(teamid.ToString());

        foreach (var player in Default.SelectNodes("div.tab>table tr"))
        {
            // Rows without a player link (presumably the table header -- confirm
            // against the live page) carry no player id, so they are skipped.
            var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");
            if (link.IsEmpty())
            {
                continue;
            }

            // The player id is the first run of digits in the profile link.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;

            var portrait = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playerimageurl = portrait.Attr("src");
            var num = player.SelectSingleNode("TD:eq(0)").Text();
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
            var height = player.SelectSingleNode("TD:eq(3)").Text();
            var weight = player.SelectSingleNode("TD:eq(4)").Text();
            var birth = player.SelectSingleNode("TD:eq(5)").Text();
            var college = player.SelectSingleNode("TD:eq(6)").Text();

            Logger.Log(portrait.Text());
            Logger.Log(playername);
            Logger.Log(position);
            Logger.Log(height);
            Logger.Log(weight);
            Logger.Log(birth);
            Logger.Log(college);
            Logger.Log(playerimageurl);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                                   DataEntry.Create()
                                   .Set("playerid", playerid)
                                   .Set("teamid", teamid + "")
                                   .Set("playername", playername)
                                   .Set("position", position)
                                   .Set("height", height)
                                   .Set("weight", weight)
                                   .Set("birth", birth)
                                   .Set("college", college)
                                   .Set("num", num)
                                   .Set("playerimageurl", playerimageurl)
                                  );
        }
    }
}

 

比賽信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

腳本如下:

// Scrapes one month of the NBA schedule on data.sports.sohu.com.
// It collects every "技術統計" (box score) link on the schedule page, visits
// each one, and appends two "GAMESTATIC" rows per game: status "0" for the
// team whose logo comes first and status "1" for the other. Each row holds the
// four quarter scores, the total, and up to four overtime scores.
public void Run()
{
    // One URL is used both for navigation and as the base for resolving the
    // relative box-score links, so the two can no longer disagree.
    const string scheduleUrl = "http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012";

    // Team id = the digits right before ".jpg" in the logo image URL.
    // Both the digit class and the dot must be escaped; without the
    // backslashes the pattern looks for literal 'd' characters and never matches.
    const string logoIdPattern = @"(?<id>\d+)\.jpg";

    Logger.ClearAll();
    Default.Navigate(scheduleUrl);
    Default.Ready();

    // Collect all links first: navigating away invalidates the schedule page.
    var scheduleUri = new Uri(scheduleUrl);
    List<string> urls = new List<string>();
    foreach (var g in Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技術統計\")"))
    {
        urls.Add(new Uri(scheduleUri, g.Attr("href")).ToString());
    }

    foreach (var url in urls)
    {
        if (Default.Available == false) return;
        Default.Navigate(url);
        Default.Ready();

        var scores = Default.SelectNodes("table.tab04 tr");
        var scoreslist = Default.SelectNodes("table.tab02 tr>td");
        var logos = Default.SelectNodes("td.logo img");

        // Everything below indexes into these lists by position; a page that
        // lacks the expected cells is logged and skipped instead of aborting the run.
        if (logos.Count < 2 || scores.Count < 3 || scoreslist.Count < 8)
        {
            Logger.Log("Skipped (unexpected page layout): " + url);
            continue;
        }

        var awayid = Regex.Match(logos[0].Attr("src"), logoIdPattern).Groups["id"].Value;
        var homeid = Regex.Match(logos[1].Attr("src"), logoIdPattern).Groups["id"].Value;
        var gametime = Default.SelectSingleNode("div.center>h2").Text().Replace("開始比賽", "");

        // Overtime scores stay empty unless the page has a "加時賽" cell.
        var awayOvertime = new string[] { "", "", "", "" };
        var homeOvertime = new string[] { "", "", "", "" };
        var overtimeCell = Default.SelectSingleNode("table.tabBig td:contains(\"加時賽\")");
        if (!overtimeCell.IsEmpty())
        {
            // The number of header cells after div.more is the number of
            // overtime periods; only the first four have a column to go into.
            var periodCount = Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count;
            for (int p = 0; p < awayOvertime.Length && p < periodCount; p++)
            {
                awayOvertime[p] = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(" + p + ")").Text();
                homeOvertime[p] = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(" + p + ")").Text();
            }
        }

        // First team: quarter cells 0-3, total from row 1 of table.tab04.
        DataManager.AppendData("GAMESTATIC",
                               DataEntry.Create()
                               .Set("teamid", awayid)
                               .Set("gametime", gametime)
                               .Set("score1", scoreslist[0].Text())
                               .Set("score2", scoreslist[1].Text())
                               .Set("score3", scoreslist[2].Text())
                               .Set("score4", scoreslist[3].Text())
                               .Set("score", scores[1].Text())
                               .Set("gameid", url)
                               .Set("status", "0")
                               .Set("jiashiscore1", awayOvertime[0])
                               .Set("jiashiscore2", awayOvertime[1])
                               .Set("jiashiscore3", awayOvertime[2])
                               .Set("jiashiscore4", awayOvertime[3])
                              );

        // Second team: quarter cells 4-7, total from row 2 of table.tab04.
        DataManager.AppendData("GAMESTATIC",
                               DataEntry.Create()
                               .Set("teamid", homeid)
                               .Set("gametime", gametime)
                               .Set("score1", scoreslist[4].Text())
                               .Set("score2", scoreslist[5].Text())
                               .Set("score3", scoreslist[6].Text())
                               .Set("score4", scoreslist[7].Text())
                               .Set("score", scores[2].Text())
                               .Set("gameid", url)
                               .Set("status", "1")
                               .Set("jiashiscore1", homeOvertime[0])
                               .Set("jiashiscore2", homeOvertime[1])
                               .Set("jiashiscore3", homeOvertime[2])
                               .Set("jiashiscore4", homeOvertime[3])
                              );
    }
}

 

這裏的亮點是查找"加時賽"單元格並判斷其是否爲空的那兩行 (原腳本第48, 49行), 這裏對加時賽也進行了處理. 不是所有的比賽都有加時賽, 就算有也可能打多場(1-4場). 所以甜瓜非常細心地對這塊也做了處理. 個人感覺這塊代碼還有優化的餘地, 但是這種處理也非常簡單直白, 一目瞭然, 也是很不錯的.

最後運行起來:

 

 

文中開發工具Spider Studio (採集工作站)下載地址: http://www.gdtsearch.com/products.spiderstudio.htm . 安裝後運行, 將腳本複製進去點"運行"即可看到效果.

Spider Studio QQ羣: 45995410

相關文章
相關標籤/搜索