Python 3.6.3
myConfig.py
'''
Settings
'''
# Site to be crawled
homePageUrl = 'http://bbs.fengniao.com'

# Path for storing the images
imgFolder = 'C:/L/workspace/FirstPython/src/1809 - PetPhoto/img/'

# Forum (board) number
forumNum = '30'

# Start page and end page
pageBegin = 1    # start page
pageEnd = 2      # end page
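All of the other modules read these settings with import myConfig. As a minimal optional sketch (not part of the original scripts, and assuming the forward-slash Windows path above), the image root folder can be checked or created once at start-up:

import os
import myConfig

# Optional start-up check (an assumption, not in the original project):
# create the image root folder if it does not exist yet, so the later
# os.makedirs() calls only have to create the per-post subfolders.
os.makedirs( myConfig.imgFolder, exist_ok=True )
print( 'images will be stored under: ' + myConfig.imgFolder )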
index.py
'''
# Crawl the pet photos from the "Pet Photography" board of bbs.fengniao.com

- - - - - - - - - - - - - - - - - - - - -
For local testing, do not use the browser's "Save page as".
Instead open the page, right-click: View Source -> copy and paste.
'''
import myConfig
import myList

''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# main
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
if __name__ == '__main__':

    # Loop over all list pages
    for pageNum in range( myConfig.pageBegin, myConfig.pageEnd + 1 ):

        # String inserted into the URL pattern.
        # The first page and the following pages use slightly different URL formats.
        if( pageNum == 1 ):
            s = ''
        else:
            s = '_%s_lastpost' % str( pageNum )

        # URL
        url = 'http://bbs.fengniao.com/forum/forum_%s%s.html' % ( myConfig.forumNum, s )
        # Examples of the generated URLs:
        # http://bbs.fengniao.com/forum/forum_30.html
        # http://bbs.fengniao.com/forum/forum_30_2_lastpost.html

        # Process each list page
        myList.eachList( url )
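Because page 1 and the following pages use different URL formats, it can help to print the URLs the loop will request before actually crawling. A throwaway sketch (not part of the project; forumNum is copied from myConfig):

# Hypothetical sanity check: print the URLs the main loop above would request.
forumNum = '30'    # same value as myConfig.forumNum
for pageNum in range( 1, 4 ):
    s = '' if pageNum == 1 else '_%s_lastpost' % str( pageNum )
    print( 'http://bbs.fengniao.com/forum/forum_%s%s.html' % ( forumNum, s ) )

# Expected output:
# http://bbs.fengniao.com/forum/forum_30.html
# http://bbs.fengniao.com/forum/forum_30_2_lastpost.html
# http://bbs.fengniao.com/forum/forum_30_3_lastpost.html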
myList.py
'''
myList (my list)
- - - - - - - - - -
- A "list" is usually the summary page of the data pages, e.g. an article list or a post list.
- On this page we collect the URLs of all the posts, so the next processing step is myPage (my page).
- The upstream step is usually index (the entry script, which mainly does the preparation work).
- There should be a node that supports debugging with a local file, usually named pageHtml( a, b, htmlCode ).
- 2018-0928 Livon
'''
import re
import os
import urllib.request
import myConfig
import myPage

'''
Process one list page
'''
def eachList( listPageUrl ):

    print( listPageUrl )

    # Fetch one list page
    htmlResponse = urllib.request.urlopen( listPageUrl )
    html = htmlResponse.read()
    html = html.decode('utf8')

    # Parse the html code
    listPageHtml( html )

'''
Parse the html code of a list page
'''
def listPageHtml( html ):

    # Post-list areas
    arr_post_list_ul = re.findall("<ul class=\"txtList\">((?:.|\n)*?)</ul>", html )

    if( len( arr_post_list_ul ) < 1 ):
        print('No list area found, stopping.')
        return

    # Print every area, for inspection/debugging only
    for i in range( 0, len( arr_post_list_ul )):
        print('arr_post_list_ul : ' + arr_post_list_ul[i] )

    print('Number of list areas on this page:')
    print( len( arr_post_list_ul ) )

    # Page 1 has 2 areas, later pages have only 1; in every case the last area is the one we want
    ul = arr_post_list_ul[ len(arr_post_list_ul) - 1 ]

    # Post list items
    arr_post_list_li = re.findall("<li >((?:.|\n)*?)</li>", ul )

    # Process each post
    for i in range( 0, len( arr_post_list_li ) ):
        post_list_li( i, arr_post_list_li[i] )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Process one post
Parameters:
    i: index; li: each post sits in its own li element
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def post_list_li( i, li ):

    # List holding the URLs of all pages of this post
    postPageList = []

    # Index
    print( str( i + 1 ) + ' - arr_post_list_li : ' )
    print( li )

    # Time
    time = re.findall('<span class="time">(.*?)</span>', li )
    print( ' - time : ' + time[0] )

    # Author
    author = re.findall('<a class="username" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a>', li )
    print( ' - author : ' + author[0][1] )
    print( ' - home page : ' + author[0][0] )

    # Title
    title = re.findall('<a href="(.*?)" target="_blank" title="(.*?)" class="tit(.*?)" style=\'\'>(.*?)</a>', li )
    print( ' - post Url : ' + myConfig.homePageUrl + title[0][0] )
    print( ' - post Title : ' + title[0][1] )

    # Add the post URL
    postPageList.append( myConfig.homePageUrl + title[0][0] )

    # Handle the sub-URLs (one post may have several pages, i.e. several URLs)
    sub_post = re.findall('<span>\(</span>(.*?)<span>\)</span>', li )

    # Is there more than one sub-page?
    if( len(sub_post) > 0 ):

        # Yes
        postPageList.clear()  # clear the list and append again (1st time)
        print( ' - sub-page URLs (the first sub-page has the same content as the post page): ' )

        # Find all sub-URLs
        arr_url = re.findall('<a href="(.*?)" target="_blank">(.*?)</a>', sub_post[0] )
        for url in arr_url:
            print( url[1] + ' - ' + myConfig.homePageUrl + url[0] )

            # Add each sub-URL
            postPageList.append( myConfig.homePageUrl + url[0] )

            # Is there a "last page" ('最後一頁') link?
            if( url[1] == '最後一頁' ):

                # Yes: discard all URLs collected so far and rebuild the list
                postPageList.clear()  # clear the list and append again (2nd time)

                # How many pages in total?
                pageCount = int( url[0][16:-5] )
                print( ' total pages: ' + str(pageCount) )
                for pageNum in range( 0, pageCount ):
                    print( myConfig.homePageUrl + url[0][:16] + str( pageNum ) + url[0][-5:] )
                    # Add each sub-URL
                    postPageList.append( myConfig.homePageUrl + url[0][:16] + str( pageNum ) + url[0][-5:] )

    # Thumbnails on the list page (the list page only shows the first 5)
    # regularExpress = 'style="background-image\:url\((.*?)\?imageView2/2/w/400/q/90/ignore-error/1/\)"></a>'
    # arr_pic = re.findall( regularExpress, li )
    # print( ' - post images on the list page (first 5): ' )
    # for pic in arr_pic:
    #     print( ' - ' + pic )

    # Folder for storing the images
    folderName = '%s《%s》- %s' % ( convertTime( time[0] ), title[0][1], author[0][1] )
    # folderName example: 2018-0924-0642《萌萌噠土撥鼠》-美時美攝

    path = myConfig.imgFolder + folderName
    print( 'storage path ====> ' + path )

    # If the directory already exists, the post was crawled before; delete the directory manually to crawl it again
    if( os.path.exists( path ) ):
        # The directory already exists
        print( 'ERROR : the directory already exists, so this post was probably crawled before. Delete it manually to crawl again.' )
    else:
        # The directory does not exist, create it
        os.makedirs( path )

    # Process all pages of this post, i.e. fetch the images for this folder
    myPage.postPageList( folderName, postPageList )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Convert the time format
From: 2018-09-29 10:15:25
To:   2018-0929-1015
Purpose: produce a valid directory name
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def convertTime( t ):

    t = '%s-%s-%s' % ( t[:4], t[4:10].replace('-',''), t[11:16].replace( ':', '' ))
    return t


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# main
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
if __name__ == '__main__':
    print( convertTime('2018-09-29 10:15:25'))
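The least obvious step in post_list_li() is pageCount = int( url[0][16:-5] ): it assumes the href of the '最後一頁' (last page) link consists of a fixed 16-character prefix, the page number, and the 5-character suffix '.html'. A small worked sketch with a made-up href (the real prefix depends on the site's current URL scheme) shows how the per-page URLs are rebuilt from those three slices:

# Hypothetical href of a "last page" link; the prefix length is an assumption.
href = '/post/pp1234567_12.html'    # exactly 16 characters before the page number

pageCount = int( href[16:-5] )      # '12' -> 12 pages in total
for pageNum in range( 0, pageCount ):
    # Same reconstruction as in post_list_li(); the loop starts at 0,
    # exactly as in the original code.
    print( 'http://bbs.fengniao.com' + href[:16] + str( pageNum ) + href[-5:] )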
myPage.py
'''
myPage (my page)
- - - - - - - - - -
- A "page" is generally the page holding the actual data: for articles it is the article page, for a forum it is the post page (one post usually has several pages).
- This page contains the data we want, so the next processing step is usually myData (my data).
- The upstream step is myList (my list).
- This step is usually entered with N page URLs, which have to be processed one by one.
- There should be a node that supports debugging with a local file, usually named pageHtml( a, b, htmlCode ).
- The last step is usually the method that processes each data object; here it processes each image.
- 2018-0928 Livon
'''

import re
import urllib.request
import myConfig
import myData


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
Every post has at least one postPage (post page)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def postPageList( folderName, postPageList ):

    # Process the URL of each page
    for pageNum in range( 0, len( postPageList ) ):

        # Request the post page; it may be the only page or one of several sub-pages
        htmlResponse = urllib.request.urlopen( postPageList[pageNum] )
        html = htmlResponse.read()
        html = html.decode('utf8')

        # Process the html code that was returned
        postPageHtml( folderName, pageNum, html )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
Process the page html
When testing with a local html file, call this method directly
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def postPageHtml( folderName, pageNum, html ):

    # All image URLs on the page
    regularExpression = '<img src="(.*?)\?imageView2/2/w/1024/q/90/ignore-error/1/">'
    arr_picUrl = re.findall( regularExpression, html )

    # Process each image in turn
    for i in range( 0, len( arr_picUrl )):
        print( 'index: %s - image url: %s' % ( str(i+1), arr_picUrl[i] ))
        eachImg( folderName, pageNum, i, arr_picUrl[i] )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
Process one image at a time
Parameters: folder name, Nth page of the post, Nth image on that page, image URL
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def eachImg( folderName, pageNum, i, imgUrl ):

    # Original image
    file = '%s%s/%s-%s.jpg' % ( myConfig.imgFolder, folderName, str( pageNum ), str(i))
    myData.crawl( file, imgUrl )  # crawl it

    # Thumbnail
    file = '%s%s/%s-%s_small.jpg' % ( myConfig.imgFolder, folderName, str( pageNum ), str(i))
    smallImgUrl = imgUrl + '?imageView2/2/w/1024/q/90/ignore-error/1/'
    myData.crawl( file, smallImgUrl )  # crawl it
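As the docstring says, postPageHtml() is the entry point for debugging with a local file. A minimal sketch, assuming the page source was saved as a UTF-8 file named test_post.html (a placeholder name) and that the folder myConfig.imgFolder + 'local-test' already exists:

# Hypothetical local test: feed a saved page into postPageHtml()
# instead of requesting it from the site.
import myPage

with open( 'test_post.html', 'r', encoding='utf8' ) as f:
    html = f.read()

# 'local-test' and 0 are only used to build the output file names;
# the target folder must exist before the images are written.
myPage.postPageHtml( 'local-test', 0, html )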
myData.py
import urllib.request
import urllib.error

'''
Crawl the data
- - - - - - - - - - - - - - - -
Parameters: file (path + file name), image URL
'''
def crawl( file, url ):

    try:
        img = urllib.request.urlopen( url )
    except urllib.error.HTTPError as e:
        print( e.reason )
    else:
        fp = open( file, 'wb' )  # write as bytes (binary)
        fp.write( img.read() )
        fp.close()
        print( 'Data crawled successfully' )


'''
Test
'''
# if __name__ == '__main__':
#
#     postNum = '001'
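The commented-out test block can be replaced by a direct call; a hedged usage sketch (the path and image URL below are placeholders, not project data):

# Hypothetical standalone test of crawl(); adjust the path and the URL first.
import myData

myData.crawl( 'C:/tmp/test.jpg', 'http://example.com/some-image.jpg' )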