Python 簡單爬蟲功能實現

時間 2020-10-23

標籤 html python 數據庫 bash 服務器網絡 ide 函數網站 url 欄目 Python 简体版

原文原文鏈接

當 Google 創始人用 Python 寫下他們第一個簡陋的爬蟲，運行在同樣簡陋的服務器上的時候，
沒有多少人能夠想象，在接下來的數十年間，他們是怎樣地顛覆了互聯網乃至於人類的世界。

有網絡的地方就有爬蟲，爬蟲的英文名稱是 spider。它是用來抓取網站數據的程序。比如：我們通過一段程序，定期去抓取類似百度糯米、大衆點評上的數據，將這些信息存儲到數據庫裏，然後加上展示頁面，一個團購導航站就問世了。毫無疑問，爬蟲是很多網站的初期數據來源。

1、第一個爬蟲功能的實現

——查看博文目錄第一篇文章的 URL

首先需要引入 urllib 模塊，使用 find 函數查找 url，經過字符處理就得到了需要的 URL。

#!/usr/bin/env python
import urllib
url = ['']*40
i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url = con[href +6 :html +5 ]
print url

2、查看博文目錄第一頁全部文章的 URL

A：

#!/usr/bin/env python
import urllib
url = ['']*40
i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url[0] = con[href +6 :html +5 ]
print url
while title != -1 and href != -1 and html != -1 and i < 40:
    url[i] = con[href +6 :html +5 ]
    print url[i]
    title = con.find(r'<a title=',html)
    href = con.find(r'href=',title)
    html = con.find(r'.html',href)
    i = i +1

或者B：

#!/usr/bin/env python
import urllib
i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url = con[href +6 :html +5 ]
while title != -1 and href != -1 and html != -1 and i < 50:
    title = con.find(r'<a title=',html)
    href = con.find(r'href=',title)
    html = con.find(r'.html',href)
    url = con[href +6 :html +5 ]
    print url
    i = i + 1

3、下載博文目錄第一頁全部的文章

A：

#!/usr/bin/env python
import urllib
i = 0
url = ['']*40
con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()
target = con.find(r'<a target="_blank')
base = con.find(r'href=',target)
end = con.find('>',base)
url[0] = 'http://www.zhihu.com' + con[target +25 :end - 1]
print url[0]
while i < 20:
  url[0] = 'http://www.zhihu.com' + con[target +25 :end - 1]
  print url[0]
  target = con.find(r'<a target="_blank',end)
  base = con.find(r'href=',target)
  end = con.find('>',base)
  i = i + 1
while j < 30:
    content = urllib.urlopen(url[j]).read()
    print url[0]
    open(r'zhihu/'+url[j],'w+').write(content)
    print 'downloading',
    j = j + 1
    time.sleep(15)

或者B：

#!/usr/bin/env python
import time
import urllib
i = 0
j = 0
url = ['']*30
name = ['']*30
con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()
target = con.find(r'<a target="_blank')
base = con.find(r'href=',target)
end = con.find('>',base)
url[0] = 'http://www.zhihu.com' + con[target +25 :end - 1]
while target != -1 and base != -1 and end != -1 and i < 30:
  url[0] = 'http://www.zhihu.com' + con[target +25 :end - 1]
  name[0] =  con[base +16 :end - 1]
  target = con.find(r'<a target="_blank',end)
  base = con.find(r'href=',target)
  end = con.find('>',base)
  content = urllib.urlopen(url[0]).read()
  open(r'zhihu/'+name[0]+'.html','w+').write(content)
  print 'downloading',name[0]
  time.sleep(5)
  i = i + 1

4、下載全部文章

A：

import time
import urllib
page = 1
url = ['']*350
i = 0
link = 1
while page <= 7:
  con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()
  title = con.find(r'<a title=')
  href = con.find(r'href=',title)
  html = con.find(r'.html',href)
  while title != -1 and href != -1 and html != -1 and i < 350:
    url[i] = con[href +6 :html +5 ]
    print link,url[i]
    title = con.find(r'<a title=',html)
    href = con.find(r'href=',title)
    html = con.find(r'.html',href)
    link = link + 1
    i = i +1
  else:
    print 'find end!'
  page = page + 1
else:
    print 'all find end'
j = 0
while j < 50:
    content = urllib.urlopen(url[j]).read()
    open(r'tmp/'+url[j][-26:],'w+').write(content)
    j = j + 1
    time.sleep(5)
else:
    print 'Download over!'

B：

#!/usr/bin/env python
import time
import urllib
i = 0
link = 1
page = 1
url = ['']*350
while page <= 7:
  con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()
  title = con.find(r'<a title=')
  href = con.find(r'href=',title)
  html = con.find(r'.html',href)
  while title != -1 and href != -1 and html != -1 and i < 350:
    url[i] = con[href +6 :html +5 ]
    print link,url[i]
    title = con.find(r'<a title=',html)
    href = con.find(r'href=',title)
    html = con.find(r'.html',href)
    content = urllib.urlopen(url[i]).read()
    open(r'/tmp/sina/'+url[i][-26:],'w+').write(content)
    time.sleep(5)
    link = link + 1
    i = i +1
  page = page + 1
else:
    print 'Download Over!'

運行結果：