初學-BeautifulSoup爬取豆瓣頁面

時間 2019-11-12

標籤：初學 beautifulsoup 豆瓣頁面 简体版

原文：原文鏈接

# -*- coding: utf-8 -*-
import os
import urllib
import urllib2

from bs4 import BeautifulSoup

# Request headers sent with every page fetch in this script.
#
# Field names and values are written without embedded spaces: an HTTP header
# name must be a single token, so a name such as 'Accept - Language' is not
# the 'Accept-Language' header.
#
# 'Content-Length' and 'Content-Type' are intentionally absent: the request
# built from this dict carries no body, and announcing a 125-byte body that
# is never sent would leave the server waiting for it.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # NOTE(review): the X-* fields below look like response-side headers;
    # kept with their original values -- confirm they are needed on requests.
    'X-Content-Type-Options': 'nosniff',
    'X-DAE-Node': 'daisy2b',
    'X-Douban-Mobileapp': '0',
    'X-Xss-Protection': '1; mode=block',
}

def parse(html, downloader_Function):
    """Scan *html* for ``rel="nofollow"`` elements and handle each one.

    An element without a ``src`` attribute has its ``href`` printed; an
    element with a ``src`` is handed to *downloader_Function* as
    ``(src, alt)``.

    Args:
        html: Markup of the page to scan.
        downloader_Function: Callable taking ``(path, name)``; invoked once
            for every matching element that has a ``src`` attribute.

    Raises:
        KeyError: If an element with ``src`` has no ``alt`` attribute.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all(rel="nofollow"):
        if 'src' not in a.attrs:
            # Not every rel="nofollow" element carries an href; skip those
            # instead of aborting the whole scan with a KeyError.
            href = a.get('href')
            if href is not None:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(href)
        else:
            path = a['src']
            name = a['alt']
            downloader_Function(path, name)

def htmlContent(url):
    """Fetch *url* using the module-level ``headers`` and return its body.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``read()``.
    """
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        # Release the connection even if read() raises.
        resp.close()

def fileDownloader(path, fileName):
    """Download *path* into ``<cwd>/download/<fileName>.png``.

    Args:
        path: URL of the resource to download.
        fileName: Base name (without extension) for the saved file. Path
            separators in it are replaced with ``_`` so the file always
            lands inside the download directory.
    """
    currentDir = os.path.join(os.getcwd(), 'download')
    # urlretrieve does not create missing directories, so make sure the
    # target directory exists before the first download.
    if not os.path.isdir(currentDir):
        os.makedirs(currentDir)

    # fileName comes from scraped markup; a separator in it would point the
    # output at a different (possibly non-existent) directory.
    safeName = fileName.replace('/', '_').replace('\\', '_')
    filePath = os.path.join(currentDir, '%s.png' % safeName)
    urllib.urlretrieve(path, filePath)

def start():
    """Fetch the Douban movie front page, print it, then parse it.

    The page markup is passed to ``parse`` with ``fileDownloader`` as the
    download callback.
    """
    htmlText = htmlContent('https://movie.douban.com/')
    # Single-argument print() behaves the same on Python 2 and 3.
    print(htmlText)
    parse(htmlText, fileDownloader)

# Run the scraper when the file is executed, then list the attributes
# exposed by the BeautifulSoup class.
start()
print(dir(BeautifulSoup))

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。