1. Shell crawler example
[root@db01 ~]# vim pa.sh
#!/bin/bash
www_link=http://www.cnblogs.com/clsn/default.html?page=
for i in {1..8}
do
    # "@" is a delimiter I chose myself; this line extracts each post title and its URL
    a=`curl ${www_link}${i} 2>/dev/null|grep homepage|grep -v "ImageLink"|awk -F "[><\"]" '{print $7"@"$9}' >>bb.txt`
done
# After filtering, only the title@URL lines are kept in one file
egrep -v "pager" bb.txt >ma.txt
# Strip the spaces from the file: the for loop splits on whitespace, so a line containing spaces
# becomes several variables instead of one. This pitfall cost me a lot of time.
b=`sed "s# ##g" ma.txt`
for i in $b
do
    c=`echo $i|awk -F @ '{print $1}'`    # c = post URL
    d=`echo $i|awk -F @ '{print $2}'`    # d = post title
    # cc.txt holds the generated <a> tags
    echo "<a href='${c}' target='_blank'>${d}</a> " >>cc.txt
done
Crawler output: the crawled results of 慘綠少年's posts from the archive pages.
Note: when pasting the crawled results into a blog post, add a space (&nbsp;) or some other character after each <a> tag; by default cnblogs (博客園) does not display the string otherwise.
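As an illustration of that note, here is a minimal Python sketch that appends &nbsp; after every generated tag. It assumes the tags were written to cc.txt by the script above; the output file name cc_padded.txt is just for this example.

# Read the <a> tags produced by pa.sh and add "&nbsp;" after each one,
# so the blog engine keeps a visible separator between the links.
with open('cc.txt', encoding='utf-8') as src:
    tags = [line.rstrip('\n') for line in src if line.strip()]

# cc_padded.txt is a hypothetical output file used only in this sketch
with open('cc_padded.txt', 'w', encoding='utf-8') as dst:
    for tag in tags:
        dst.write(tag + '&nbsp;\n')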
2. Python crawler
2.1 Learning Python crawlers
Crawl this web page:
import urllib.request

# URL to crawl
url = "http://10.0.0.6/"
# Build the request
request = urllib.request.Request(url)
# Fetch the page
response = urllib.request.urlopen(request)
data = response.read()
# Decode the bytes
data = data.decode('utf-8')
# Print the result
print(data)
Result:
E:\python\python\python.exe C:/python/day2/test.py
<html>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<body>
<h1>魔降風雲變的第一個標題</h1>
<p>魔降風雲變的第一個段落。</p>
</body>
</html>
print(type(response))
Result:
<class 'http.client.HTTPResponse'>
print(response.geturl())
Result:
http://10.0.0.6/
print(response.info())
Result:
Server: nginx/1.12.2
Date: Fri, 02 Mar 2018 07:45:11 GMT
Content-Type: text/html
Content-Length: 184
Last-Modified: Fri, 02 Mar 2018 07:38:00 GMT
Connection: close
ETag: "5a98ff58-b8"
Accept-Ranges: bytes
print(response.getcode())
Result:
200
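The calls shown above (type, geturl, info, getcode) can be wrapped in one small helper. A minimal sketch, assuming the same test URL; the helper name fetch_and_inspect and the urllib.error handling are additions for illustration, not part of the original walkthrough.

import urllib.request
import urllib.error

def fetch_and_inspect(url):
    """Fetch a URL and print the response metadata shown above."""
    try:
        response = urllib.request.urlopen(urllib.request.Request(url))
    except urllib.error.URLError as exc:
        # Covers HTTP error statuses as well as unreachable hosts
        print("request failed:", exc)
        return None
    print(type(response))          # <class 'http.client.HTTPResponse'>
    print(response.geturl())       # final URL after any redirects
    print(response.info())         # response headers
    print(response.getcode())      # HTTP status code, e.g. 200
    return response.read().decode('utf-8')

print(fetch_and_inspect("http://10.0.0.6/"))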
2.2 Crawl a page and save it to a local file
import urllib.request

# URL to crawl
url = "http://10.0.0.6/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36'}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()
data = data.decode('utf-8')
# Print the crawled content
print(data)
Result:
<html>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<body>
<h1>魔降風雲變的第一個標題</h1>
<p>魔降風雲變的第一個段落。</p>
</body>
</html>
import urllib.request

# Define the save function
def saveFile(data):
    # ------------------------------------------
    path = "E:\\content.txt"
    f = open(path, 'wb')
    f.write(data)
    f.close()
    # ------------------------------------------

# URL to crawl
url = "http://10.0.0.6/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36'}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()
# The crawled content can also be saved to a file
saveFile(data)
# ******************************************
data = data.decode('utf-8')
# Print the crawled content
print(data)
Result of adding the file-saving function and running the script (the crawled page is written to E:\content.txt):
# Print various information about the crawled page
print(type(res))
print(res.geturl())
print(res.info())
print(res.getcode())
Result:
<class 'http.client.HTTPResponse'>
http://10.0.0.6/
Server: nginx/1.12.2
Date: Fri, 02 Mar 2018 08:09:56 GMT
Content-Type: text/html
Content-Length: 184
Last-Modified: Fri, 02 Mar 2018 07:38:00 GMT
Connection: close
ETag: "5a98ff58-b8"
Accept-Ranges: bytes

200
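Before moving on to images, one side note on the saveFile function from 2.2: the same write can be done with a context manager, so the file is closed even if the write fails. A minimal sketch using the same path as the example above:

def saveFile(data, path="E:\\content.txt"):
    # "with" closes the file automatically, even if the write raises an exception
    with open(path, 'wb') as f:
        f.write(data)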
2.3 Crawl images
import urllib.request, socket, re, sys, os

# Directory in which the downloaded files are saved
targetPath = "E:\\"

def saveFile(path):
    # Make sure the target directory exists
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)
    # Build the local file name from the last part of the image URL
    pos = path.rindex('/')
    t = os.path.join(targetPath, path[pos + 1:])
    return t

# An if __name__ == '__main__' guard could be used here to check whether this .py file is run directly

# URL to crawl
url = "http://10.0.0.6/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/51.0.2704.63 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()
print(data)

# Find every image URL in the page source ([^\s] = any non-whitespace character)
for link, t in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', str(data))):
    print(link)
    try:
        urllib.request.urlretrieve(link, saveFile(link))
    except:
        print('failed')
2.31 Print the raw request, response, and page bytes
import urllib.request, socket, re, sys, os
url = "http://10.0.0.6/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/51.0.2704.63 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()
print(req)
print(res)
print(data)
print(str(data))
Result:
<urllib.request.Request object at 0x0000000001ECD9E8>
<http.client.HTTPResponse object at 0x0000000002D1B128>
b'<html>\n<meta http-equiv="Content-Type" content="text/html;charset=utf-8">\n<body>\n<img src="http://10.0.0.6/ma1.png" />\n<img src="http://10.0.0.6/ma2.jpg" />\n<h1>\xe9\xad\x94\xe9\x99\x8d\xe9\xa3\x8e\xe4\xba\x91\xe5\x8f\x98\xe7\x9a\x84\xe7\xac\xac\xe4\xb8\x80\xe4\xb8\xaa\xe6\xa0\x87\xe9\xa2\x98</h1>\n<p>\xe9\xad\x94\xe9\x99\x8d\xe9\xa3\x8e\xe4\xba\x91\xe5\x8f\x98\xe7\x9a\x84\xe7\xac\xac\xe4\xb8\x80\xe4\xb8\xaa\xe6\xae\xb5\xe8\x90\xbd\xe3\x80\x82</p>\n</body>\n</html>\n'
b'<html>\n<meta http-equiv="Content-Type" content="text/html;charset=utf-8">\n<body>\n<img src="http://10.0.0.6/ma1.png" />\n<img src="http://10.0.0.6/ma2.jpg" />\n<h1>\xe9\xad\x94\xe9\x99\x8d\xe9\xa3\x8e\xe4\xba\x91\xe5\x8f\x98\xe7\x9a\x84\xe7\xac\xac\xe4\xb8\x80\xe4\xb8\xaa\xe6\xa0\x87\xe9\xa2\x98</h1>\n<p>\xe9\xad\x94\xe9\x99\x8d\xe9\xa3\x8e\xe4\xba\x91\xe5\x8f\x98\xe7\x9a\x84\xe7\xac\xac\xe4\xb8\x80\xe4\xb8\xaa\xe6\xae\xb5\xe8\x90\xbd\xe3\x80\x82</p>\n</body>\n</html>\n'
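Note that print(data) and print(str(data)) both show the raw bytes, which is why the Chinese text appears as \xe9... escape sequences. A minimal sketch of decoding instead, assuming the page is UTF-8 as its meta tag declares:

# Decoding the bytes gives readable text instead of the b'...\xe9...' escapes above
text = data.decode('utf-8')
print(text)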
2.32 re.findall with two groups returns (url, extension) tuples
import urllib.request, socket, re, sys, os
url = "http://10.0.0.6/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/51.0.2704.63 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()
# Each match is a (url, extension) tuple because the pattern has two capture groups
for link in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', str(data))):
    print(link)
Result:
('http://10.0.0.6/ma2.jpg', 'jpg')
('http://10.0.0.6/ma1.png', 'png')
2.33 Unpack the tuples to get just the URLs
# Unpacking each (url, extension) tuple gives just the image URL
for link, t in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', str(data))):
    print(link)
Result:
http://10.0.0.6/ma1.png
http://10.0.0.6/ma2.jpg
2.34 Full script: download the matched images
import urllib.request, socket, re, sys, os

# Directory in which the downloaded files are saved
targetPath = "E:\\"

def saveFile(path):
    # Make sure the target directory exists
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)
    # Build the local file name from the last part of the image URL
    pos = path.rindex('/')
    t = os.path.join(targetPath, path[pos + 1:])
    return t

# An if __name__ == '__main__' guard could be used here to check whether this .py file is run directly

# URL to crawl
url = "http://10.0.0.6/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/51.0.2704.63 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
data = res.read()

# Find every image URL and download it next to targetPath
for link, t in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', str(data))):
    print(link)
    try:
        urllib.request.urlretrieve(link, saveFile(link))
    except:
        print('failed')
Result:
http://10.0.0.6/ma2.jpg
http://10.0.0.6/ma1.png
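For comparison only (not part of the original walkthrough), the image URLs could also be extracted with the standard-library HTML parser instead of a regular expression. A minimal sketch, assuming data is the page bytes read in 2.34 and that the src attributes are absolute URLs, as on this test page:

from html.parser import HTMLParser

class ImgSrcParser(HTMLParser):
    """Collect the src attribute of every <img> tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src' and value:
                    self.links.append(value)

parser = ImgSrcParser()
parser.feed(data.decode('utf-8'))   # data is the bytes read in 2.34
print(parser.links)                 # e.g. ['http://10.0.0.6/ma1.png', 'http://10.0.0.6/ma2.jpg']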
2.4 Log in to Zhihu (知乎)