1. Acting Out My Own "Beijing Love Story"
Boarding the flight for my drift north to Beijing, I began to play out my own Beijing love story.
2. Crawler 1
1) The idea behind a web crawler
First: specify a URL, open it, and read its contents.
Next: filter the content you read for keywords; this is the key step, and the right keywords can be found by viewing the page source.
Finally: download the pages at the harvested URLs, or save the images at the harvested image URLs, to local disk.
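As a minimal sketch of these three steps (Python 2, to match the scripts below; the marker string and save path here are placeholders, not taken from the original):

#!/usr/bin/env python
# Three-step crawl in miniature: open a URL, filter for a marker, save the hit.
import urllib

url = "http://mm.taobao.com/json/request_top_list.htm?type=0&page=1"
content = urllib.urlopen(url).read()              # step 1: open the URL, read it

marker = '<img src="'                             # step 2: keyword from the page source
start = content.find(marker)
end = content.find(".jpg", start)
imgurl = content[start + len(marker) : end + len(".jpg")]

urllib.urlretrieve(imgurl, "/tmp/sample.jpg")     # step 3: save the resource locally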
2) Crawling the specified URL
Analysis:
Step 1: there are roughly 4300 listing pages in total.
Step 2: each page shows 10 profile avatars.
Step 3: each profile holds roughly 100 personal photos.
The Taobao MM URL is: http://mm.taobao.com/json/request_top_list.htm?type=0&page=1
By default this page has no "next page" button; we can view the next page by editing the page parameter in the URL, as sketched after the figures below.
The URL of the last page, and how it renders, are shown in the figure below:
Click any avatar to open that profile's homepage, as shown below.
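The paging trick in isolation, as a sketch (the real script below iterates page up to 4300; three pages here just to show the shape):

#!/usr/bin/env python
# Page through the listing by rewriting the page parameter in the URL.
import urllib

for page in range(1, 4):        # the full run goes to 4300
    mmurl = "http://mm.taobao.com/json/request_top_list.htm?type=0&page=%d" % page
    content = urllib.urlopen(mmurl).read()
    print page, len(content)    # each listing page carries 10 profile entries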
3) The custom script
#!/usr/bin/env python
#coding:utf-8
#Author:Allentuns
#Email:zhengyansheng@hytyi.com
import urllib
import os
import sys
import time

# Markers used to locate links and images in the raw HTML.
ahref = '<a href="'
ahrefs = '<a href="h'
ahtml = ".htm"
atitle = "<img style"
ajpg = ".jpg"
btitle = '<img src="'

page = 0
while page < 4300:    # Adjustable; the maximum is 4300. I used 3 while testing.
    mmurl = "http://mm.taobao.com/json/request_top_list.htm?type=0&page=%d" % (page)
    content = urllib.urlopen(mmurl).read()

    # First profile link and cover image on this listing page.
    href = content.find(ahref)
    html = content.find(ahtml)
    url = content[href + len(ahref) : html + len(ahtml)]
    print url
    imgtitle = content.find(btitle, html)
    imgjpg = content.find(ajpg, imgtitle)
    littleimgurl = content[imgtitle + len(btitle) : imgjpg + len(ajpg)]
    print littleimgurl
    urllib.urlretrieve(littleimgurl, "/www/src/temp/image/taobaomm/allentuns.jpg")

    # Walk the remaining profile entries on this listing page.
    s = 0
    while s < 18:
        href = content.find(ahrefs, html)
        html = content.find(ahtml, href)
        url = content[href + len(ahref) : html + len(ajpg)]
        print s, url
        imgtitle = content.find(btitle, html)
        imgjpg = content.find(ajpg, imgtitle)
        littleimgurl = content[imgtitle : imgjpg + len(ajpg)]
        littlesrc = littleimgurl.find("src")
        tureimgurl = littleimgurl[littlesrc + 5:]
        print s, tureimgurl

        if url.find("photo") == -1:
            # Open the profile page and grab its first image.
            content01 = urllib.urlopen(url).read()
            imgtitle = content01.find(atitle)
            imgjpg = content01.find(ajpg, imgtitle)
            littleimgurl = content01[imgtitle : imgjpg + len(ajpg)]
            littlesrc = littleimgurl.find("src")
            tureimgurl = littleimgurl[littlesrc + 5:]
            print tureimgurl

            # Then fetch the rest of the images on the profile page.
            imgcount = content01.count(atitle)
            i = 20
            try:
                while i < imgcount:
                    content01 = urllib.urlopen(url).read()
                    imgtitle = content01.find(atitle, imgjpg)
                    imgjpg = content01.find(ajpg, imgtitle)
                    littleimgurl = content01[imgtitle : imgjpg + len(ajpg)]
                    littlesrc = littleimgurl.find("src")
                    tureimgurl = littleimgurl[littlesrc + 5:]
                    print i, tureimgurl
                    time.sleep(1)
                    if tureimgurl.count("<") == 0:
                        imgname = tureimgurl[tureimgurl.index("T"):]
                        urllib.urlretrieve(tureimgurl, "/www/src/temp/image/taobaomm/%s-%s" % (page, imgname))
                    else:
                        pass
                    i += 1
            except IOError:
                print '\nWhy did you do an EOF on me?'
                break
            except:
                print '\nSome error/exception occurred.'
        s += 1
    else:
        print "---------------{ s < 18; 1 page has 10 htm and pic }---------------"
    page = page + 1
    print "**************** %s page *******************************" % (page)
else:
    print "Download Finished."
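The chained find()/slice scans above are brittle: one unexpected attribute shifts every offset. As a hedged alternative sketch, here is the same image extraction with a regular expression (the pattern assumes markup of the form <img src="...jpg">, just as the script does):

#!/usr/bin/env python
# Pull every .jpg URL from a page with one regex instead of chained find() calls.
import re
import urllib

listing = "http://mm.taobao.com/json/request_top_list.htm?type=0&page=1"
content = urllib.urlopen(listing).read()
for imgurl in re.findall(r'<img src="([^"]+?\.jpg)"', content):
    print imgurl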
4) Image showcase (a sample of the downloads)
5) Checking how many images were downloaded
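One quick way to count them from Python (assuming the same save directory the script writes to):

#!/usr/bin/env python
# Count the downloaded images in the script's target directory.
import os

target = "/www/src/temp/image/taobaomm"     # save path used by the script above
jpgs = [f for f in os.listdir(target) if f.endswith(".jpg")]
print len(jpgs), "images downloaded"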
3. Crawler 2
1) First, analyze the URL
Step 1: there are 7 pages in total;
Step 2: each page lists 20 articles;
Step 3: checking shows 317 articles in all.
2) The Python script
What the script does: given the URL, it downloads every article on this blog to local disk; a small link-harvesting sketch follows, then the full script.
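A sketch of the link-harvesting step the script performs, done here with a regex; the <a title ... href="...html"> pattern is my assumption about the listing markup, which the full script instead scans with find():

#!/usr/bin/env python
# Harvest article links from one listing page (regex variant of the script's scan).
import re
import urllib

listing = "http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html"
content = urllib.urlopen(listing).read()
links = re.findall(r'<a title[^>]*?href="([^"]+?\.html)"', content)
print len(links), "article links found on this page"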
#!/usr/bin/env python
#coding: utf-8
import urllib
import time

list00 = []        # collected article URLs
i = j = 0
page = 1

# Phase 1: harvest article links from the 7 listing pages.
while page < 8:
    pageurl = "http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html" % (page)
    content = urllib.urlopen(pageurl).read()
    title = content.find(r"<a title")
    href = content.find(r"href=", title)
    html = content.find(r".html", href)
    url = content[href + 6 : html + 5]
    urlfilename = url[-26:]
    list00.append(url)
    print i, url
    while title != -1 and href != -1 and html != -1 and i < 350:
        title = content.find(r"<a title", html)
        href = content.find(r"href=", title)
        html = content.find(r".html", href)
        url = content[href + 6 : html + 5]
        urlfilename = url[-26:]
        list00.append(url)
        i = i + 1
        print i, url
    else:
        print "Link addresses finished."
        print "This is page %s" % (page)
    page = page + 1
else:
    print "spage=", list00[50]
    print list00[:51]
    print list00.count("")
    print "All link addresses finished."

# Phase 2: strip the empty strings left behind by failed find() scans.
x = list00.count('')
a = 0
while a < x:
    y1 = list00.index('')
    list00.pop(y1)
    print a
    a = a + 1
print list00.count('')

# Phase 3: download every article and append it to a local file.
listcount = len(list00)
while j < listcount:
    content = urllib.urlopen(list00[j]).read()
    open(r"/tmp/hanhan/" + list00[j][-26:], 'a+').write(content)
    print "%2s is finished." % (j)
    j = j + 1
    #time.sleep(1)
else:
    print "Write to file: done."
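A design note on the cleanup phase: the count()/index()/pop() loop removes empty strings one at a time and leaves duplicate links in place. A sketch of the same cleanup in one pass that also drops duplicates (variable names here are mine, not from the original script):

# One-pass cleanup: drop empty strings and duplicates, keeping original order.
seen = set()
clean = []
for u in list00:
    if u and u not in seen:
        seen.add(u)
        clean.append(u)
list00 = clean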
3) Screenshots after downloading the articles