python之網絡爬蟲

時間 2019-11-09

標籤 python 網絡爬蟲欄目 Python 简体版

原文鏈接

1、演繹自己的北愛

踏上北漂的航班，開始演奏我自己的北京愛情故事

2、爬蟲1

一、網絡爬蟲的思路

首先：指定一個url，然後打開這個url地址，讀取其中的內容。

其次：從讀取的內容中過濾關鍵字；這一步是關鍵，關鍵字可以通過查看網頁源代碼的方式獲取。

最後：下載獲取的html的url地址，或者圖片的url地址保存到本地

二、針對指定的url來網絡爬蟲

分析：

第一步：大約共有4300個下一頁。

第二步：一個頁面上有10個個人頭像

第三步：一個頭像內大約有100張左右的個人圖片

指定的淘寶mm的url爲：http://mm.taobao.com/json/request_top_list.htm?type=0&page=1

這個頁面默認是沒有下一頁按鈕的，我們可以通過修改其url地址中的page參數來查看下一個頁面

最後一頁的url地址和頁面展示如下圖所示：

點擊任意一個頭像來進入個人的主頁，如下圖

三、定製的腳本

 
        #!/usr/bin/env python
        #coding:utf-8
        # Author: Allentuns
        # Email: zhengyansheng@hytyi.com
        """Download profile photos from the Taobao MM top-list pages.

        For every listing page the script saves the first thumbnail, then walks
        the profile links on that page and downloads the images found on each
        profile page whose URL does not contain "photo".

        Targets Python 2: it relies on ``urllib.urlopen`` / ``urllib.urlretrieve``.
        ``print`` is always called with one pre-formatted argument so the file
        parses on both Python 2 and Python 3.
        """

        import urllib
        import os
        import sys
        import time

        # Literal markers used to locate links and images in the raw HTML.
        LINK_OPEN = '<a href="'
        HTTP_LINK_OPEN = '<a href="h'
        HTM_SUFFIX = ".htm"
        STYLED_IMG_OPEN = "<img style"
        JPG_SUFFIX = ".jpg"
        IMG_SRC_OPEN = '<img src="'

        LIST_URL = "http://mm.taobao.com/json/request_top_list.htm?type=0&page=%d"
        SAVE_DIR = "/www/src/temp/image/taobaomm"

        # Adjustable; the author states the maximum is 4300 and used 3 while testing.
        MAX_PAGES = 4300
        # Upper bound on profile links examined per listing page.
        LINKS_PER_PAGE = 18
        # The per-profile image counter starts here, so the download loop runs
        # (number of "<img style" tags - 20) times.
        FIRST_IMAGE_INDEX = 20


        def image_src(tag_text):
            """Return the text that follows ``src="`` in *tag_text*.

            The slice starts five characters past the first "src". When "src" is
            absent ``find`` yields -1, so the slice starts at index 4.
            """
            return tag_text[tag_text.find("src") + 5:]


        def find_image(html, opener, start=0):
            """Locate the next image tag in *html* and return its URL.

            Searches for *opener* from *start*, then for the next ".jpg" after it.

            Returns:
                ``(url, jpg_at)`` where *url* runs from after ``src="`` through
                ".jpg" and *jpg_at* is the index of that ".jpg", usable as the
                *start* of the following search.
            """
            tag_at = html.find(opener, start)
            jpg_at = html.find(JPG_SUFFIX, tag_at)
            return image_src(html[tag_at:jpg_at + len(JPG_SUFFIX)]), jpg_at


        def next_profile_link(listing, start):
            """Return the next ``<a href="h...">`` target ending in ".htm".

            Returns:
                ``(url, htm_at)`` with *htm_at* the index of the matched ".htm",
                or ``None`` when no further link is found. Without this check a
                -1 from ``find`` would be sliced into a meaningless URL.
            """
            href_at = listing.find(HTTP_LINK_OPEN, start)
            if href_at == -1:
                return None
            htm_at = listing.find(HTM_SUFFIX, href_at)
            if htm_at == -1:
                return None
            # len(LINK_OPEN) rather than len(HTTP_LINK_OPEN): keep the leading "h".
            return listing[href_at + len(LINK_OPEN):htm_at + len(HTM_SUFFIX)], htm_at


        def download_profile_images(page, url):
            """Fetch the profile page at *url* once and download its images.

            Files are saved as ``SAVE_DIR/<page>-<name>`` where *name* is the image
            URL from its first "T" onwards. Candidates containing "<" are skipped.

            Raises:
                IOError: propagated from ``urllib`` on a failed fetch or download.
                ValueError: when an image URL contains no "T".
            """
            profile = urllib.urlopen(url).read()
            image_url, jpg_at = find_image(profile, STYLED_IMG_OPEN)
            print(image_url)

            image_total = profile.count(STYLED_IMG_OPEN)
            index = FIRST_IMAGE_INDEX
            while index < image_total:
                # Continue from the previous ".jpg"; the page body is reused
                # instead of being downloaded again on every iteration.
                image_url, jpg_at = find_image(profile, STYLED_IMG_OPEN, jpg_at)
                print("%s %s" % (index, image_url))
                time.sleep(1)
                if "<" not in image_url:
                    name = image_url[image_url.index("T"):]
                    urllib.urlretrieve(image_url, "%s/%s-%s" % (SAVE_DIR, page, name))
                index += 1


        def crawl_page(page):
            """Process one listing page: save its first thumbnail, then each profile."""
            listing = urllib.urlopen(LIST_URL % page).read()

            # First link on the page: printed only, never crawled.
            href_at = listing.find(LINK_OPEN)
            htm_at = listing.find(HTM_SUFFIX)
            print(listing[href_at + len(LINK_OPEN):htm_at + len(HTM_SUFFIX)])

            thumb_at = listing.find(IMG_SRC_OPEN, htm_at)
            jpg_at = listing.find(JPG_SUFFIX, thumb_at)
            thumb_url = listing[thumb_at + len(IMG_SRC_OPEN):jpg_at + len(JPG_SUFFIX)]
            print(thumb_url)
            # Same target file for every page, so each page overwrites the last.
            urllib.urlretrieve(thumb_url, SAVE_DIR + "/allentuns.jpg")

            for s in range(LINKS_PER_PAGE):
                link = next_profile_link(listing, htm_at)
                if link is None:
                    # Fewer than LINKS_PER_PAGE links on this page.
                    break
                url, htm_at = link
                print("%s %s" % (s, url))
                thumb_url, _ = find_image(listing, IMG_SRC_OPEN, htm_at)
                print("%s %s" % (s, thumb_url))

                if "photo" in url:
                    continue
                try:
                    download_profile_images(page, url)
                except IOError:
                    print('\nWhy did you do an EOF on me?')
                    # Abandon the rest of this page, skipping the banner below.
                    return
                except Exception:
                    # Exception, not a bare except, so Ctrl-C still stops the crawl.
                    print('\nSome error/exception occurred.')

            print("---------------{< 20;1 page hava 10 htm and pic  }-------------------------}")


        def main():
            """Crawl listing pages 0 .. MAX_PAGES - 1 in order."""
            for page in range(MAX_PAGES):
                crawl_page(page)
                print("****************%s page*******************************" % (page + 1))
            print("Download Finshed.")


        if __name__ == "__main__":
            main()

四、圖片展現(部分圖片)

五、查看下載的圖片數量

2、爬蟲2

一、首先來分析url

第一步：總共有7個頁面；

第二步：每一個頁面有20篇文章

第三步:查看後總共有317篇文章

二、python腳本

腳本的功能：通過給定的url將這個博客裏面的全部文章下載到本地

 
        #!/usr/bin/env python
        #coding: utf-8
        """Download every article linked from a Sina blog's article-list pages.

        The script first collects article URLs from list pages 1 .. LAST_PAGE,
        then fetches each article and writes it under SAVE_DIR, using the last
        NAME_LENGTH characters of the URL as the file name.

        Targets Python 2: it relies on ``urllib.urlopen``. ``print`` is always
        called with one pre-formatted argument so the file parses on both
        Python 2 and Python 3.
        """

        import urllib
        import time

        LIST_URL = "http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html"
        LAST_PAGE = 7
        # Safety cap on the number of links collected.
        MAX_LINKS = 350
        SAVE_DIR = r"/tmp/hanhan/"
        NAME_LENGTH = 26


        def article_links(content, limit=None):
            """Return the article URLs found in *content*, in document order.

            Each link is located by searching for ``<a title``, then the next
            ``href=``, then the next ``.html``. The URL is the text from six
            characters past ``href=`` through the end of ``.html``.

            Args:
                content: raw HTML of one article-list page.
                limit: maximum number of URLs to return; ``None`` means no limit.

            Returns:
                A list of URL strings. Scanning stops at the first failed search,
                so no empty placeholder entry is ever produced.
            """
            links = []
            pos = 0
            while limit is None or len(links) < limit:
                title_at = content.find("<a title", pos)
                if title_at == -1:
                    break
                href_at = content.find("href=", title_at)
                if href_at == -1:
                    break
                html_at = content.find(".html", href_at)
                if html_at == -1:
                    break
                links.append(content[href_at + 6:html_at + 5])
                pos = html_at
            return links


        def local_name(url):
            """Return the file name used for *url*: its last NAME_LENGTH characters."""
            return url[-NAME_LENGTH:]


        def main():
            """Collect all article links, then download each article to SAVE_DIR."""
            links = []
            for page in range(1, LAST_PAGE + 1):
                content = urllib.urlopen(LIST_URL % page).read()
                for url in article_links(content, MAX_LINKS - len(links)):
                    print("%s %s" % (len(links), url))
                    links.append(url)
                print("Link address Finshed.")
                print("This is %s page" % (page))

            # Debug output; guarded because fewer than 51 links may be found.
            if len(links) > 50:
                print("spage= %s" % links[50])
            print(links[:51])
            print("All links address Finshed.")

            for j, url in enumerate(links):
                content = urllib.urlopen(url).read()
                # NOTE(review): 'a+' appends, so re-running the script adds a second
                # copy of the page to an existing file -- confirm this is intended.
                with open(SAVE_DIR + local_name(url), 'a+') as out:
                    out.write(content)
                print("%2s is finshed." % (j))
                # Optional throttle, disabled in the original: time.sleep(1)
            print("Write to file End.")


        if __name__ == "__main__":
            main()