從urllib和urllib2基礎到一個簡單抓取網頁圖片的小爬蟲

時間 2019-11-16

標籤 urllib urllib2 基礎一個簡單抓取網頁圖片爬蟲欄目 HTML 简体版

原文原文鏈接

urllib 最經常使用的兩大功能（我的理解：urllib 用於輔助 urllib2）

1. urllib.urlopen()

2. urllib.urlencode() # 適當的編碼，可用於後面以 POST 方式提交數據

# Demo: urllib.urlencode() converts a dict of fields into a URL-encoded
# string that can later be sent as the body of a POST request.
import urllib
Dict = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python'}
# Print the encoded form of the dict.
print urllib.urlencode(Dict)

urllib2 經常使用的函數

1.最基本的打開讀取一個網頁函數

# Open a URL directly and read the whole response body into `html`.
import urllib2
response = urllib2.urlopen('http://www.baidu.com/')
html = response.read()

2.用地址建立一個 Request 對象

# Build a Request object from the address first, then hand it to urlopen();
# the response body is read into `the_page`.
req = urllib2.Request('http://www.baidu.com/')
response = urllib2.urlopen(req)
the_page = response.read()

3.Data數據利用post方式提交網站

value={'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python'}
data = urllib.urlencode(values)
request = urllib2.Request(url,data)
#request= urllib2.Request(url, data, headers)  Request對象共有三個參數
response = urllib2.urlopen(request)
print response.read()

4.在 HTTP Request 中加入特定的 Header

# Attach a custom header to a request before sending it.
import urllib2
request = urllib2.Request('http://www.baidu.com/')
# Replace the User-Agent that would otherwise be sent with 'fake-client'.
request.add_header('User-Agent', 'fake-client')
response = urllib2.urlopen(request)
print response.read()

5.Cookie

# Collect the cookies a site sets: an opener built with HTTPCookieProcessor
# stores them in the CookieJar passed to it.
import urllib2
import cookielib
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
# The request must go through `opener` for the jar to be filled.
response = opener.open('http://www.baidu.com')
# Print the name and value of every cookie held in the jar.
for item in cookie:
    print 'Name = '+item.name
    print 'Value = '+item.value

6.獲得 HTTP 的返回碼

import urllib2
try:
    response = urllib2.urlopen('http://bbs.csdn.net/why')
except urllib2.HTTPError, e:
    print e.code

7.Timeout 設置

# Pass `timeout` (in seconds) so the call does not block indefinitely.
import urllib2
response = urllib2.urlopen('http://www.baidu.com/', timeout=10)

8.Redirect 動作

import urllib2
my_url = 'http://www.google.cn'
response = urllib2.urlopen(my_url)
redirected = response.geturl() == my_url
print redirected
my_url = 'http://rrurl.cn/b1UZuP'
response = urllib2.urlopen(my_url)
redirected = response.geturl() == my_url
print redirected

9.使用 HTTP 的 PUT 和 DELETE 方法

# Send a PUT (or DELETE) request by overriding the method reported by the
# Request object.
# NOTE(review): `uri` and `data` are not defined in this snippet; they must
# be supplied by the surrounding code.
import urllib2
request = urllib2.Request(uri, data=data)
request.get_method = lambda: 'PUT' # or 'DELETE'
response = urllib2.urlopen(request)

10.Debug Log

# Turn on debug output for both HTTP and HTTPS traffic.
import urllib2
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)
# Install the opener globally so plain urllib2.urlopen() calls use it too.
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.google.com')

11.表單的處理

# -*- coding: utf-8 -*-
# Submit a login form: the fields are URL-encoded and sent as the request
# body, then the page returned by the server is printed.
# NOTE(review): the credentials below are hard-coded sample values.
import urllib
import urllib2
postdata=urllib.urlencode({
    'username':'汪小光',
    'password':'why888',
    'continueURI':'http://www.verycd.com/',
    'fk':'',
    'login_submit':'登陸'
})
# Supplying `data` to Request makes this a POST instead of a GET.
req = urllib2.Request(
    url = 'http://secure.verycd.com/signin',
    data = postdata
)
result = urllib2.urlopen(req)
print result.read()

最後附上一段抓取某網站妹子圖片的代碼

import urllib
import urllib2
import os


def url_open(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a Firefox User-Agent header instead of the
    urllib2 default (presumably because the target site rejects the default
    client string -- not verifiable from this file).

    Args:
        url: Address to download.

    Returns:
        The response body as a byte string.

    Raises:
        urllib2.URLError: If the request fails (urllib2.HTTPError for HTTP
            error statuses).
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0')
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even when read() fails; the
        # original never closed the response.
        response.close()


def get_page(url):
    """Return the current comment-page number shown on the page at *url*.

    The value is the text found between the 'current-comment-page' marker
    (plus three further characters) and the next ']'.

    Args:
        url: Address of the page to inspect.

    Returns:
        The page number as a string; it is not checked to be numeric.

    Raises:
        ValueError: If the marker or the closing ']' is not present, instead
            of silently returning an unrelated slice of the page.
    """
    html = url_open(url).decode('utf-8')

    marker = 'current-comment-page'
    pos = html.find(marker)
    if pos == -1:
        raise ValueError("'current-comment-page' marker not found in %s" % url)

    # 23 = the 20-character marker + 3 characters that precede the number
    # (presumably the '">[' that closes the attribute and opens the bracket).
    a = pos + 23
    b = html.find(']', a)
    if b == -1:
        raise ValueError("closing ']' of the page number not found in %s" % url)

    return html[a:b]


def find_imgs(url):
    """Return the addresses of the .jpg images referenced on the page at *url*.

    Every occurrence of 'img src=' is examined. The address is taken from
    just after the opening quote up to and including the first '.jpg' found
    within the next 255 characters.

    Unlike a plain windowed search, the '.jpg' lookup stops at the closing
    quote of the attribute, so a non-jpg image followed closely by a jpg one
    no longer yields a bogus address spanning several tags.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image address strings, in page order (duplicates kept).
    """
    html = url_open(url).decode('utf-8')
    img_addrs = []

    tag = 'img src='
    a = html.find(tag)

    while a != -1:
        # The value starts after the tag text and one quote character.
        start = a + len(tag) + 1
        limit = a + 255

        # Do not search beyond the end of this attribute's value.
        quote = html[a + len(tag):start]
        if quote in ('"', "'"):
            close = html.find(quote, start)
            if close != -1:
                limit = min(limit, close)

        b = html.find('.jpg', a, limit)
        if b != -1:
            img_addrs.append(html[start:b + 4])
        else:
            # No jpg here: resume scanning just past this tag.
            b = start

        a = html.find(tag, b)

    return img_addrs


def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* into the current directory.

    Args:
        folder: Not used by this function; files are written relative to the
            current working directory (download_mm() changes into the target
            folder before calling). Kept for interface compatibility.
        img_addrs: Iterable of image addresses; the file name is the part of
            each address after the last '/'.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file, so a failed request neither
        # leaves an empty file behind nor truncates an existing one.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)


def download_mm(folder='OOXX', pages=10):
    """Download the images from the newest *pages* comment pages.

    The newest page number is read from the site's front page; pages are
    then visited in descending order, one step at a time.

    Side effect: the process's working directory is changed to *folder*.

    Args:
        folder: Directory to save into; created if it does not exist yet.
        pages: Maximum number of pages to fetch, starting from the newest.
    """
    # Allow re-running: a bare os.mkdir() raises if the folder already exists.
    if not os.path.isdir(folder):
        os.mkdir(folder)
    os.chdir(folder)

    url = "http://jandan.net/ooxx/"
    newest = int(get_page(url))

    for i in range(pages):
        # Step back exactly one page per iteration. The original did
        # `page_num -= i`, which accumulates (N, N-1, N-3, N-6, ...) and
        # skips pages.
        page_num = newest - i
        if page_num < 1:
            # Fewer pages exist than were requested.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)

# When run as a script, start the download with the default folder name and
# page count.
if __name__ == '__main__':
    download_mm()

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。