Reference: http://www.freebuf.com/news/special/96763.html
Related reading: http://www.jb51.net/article/65287.htm
1. Installing BeautifulSoup on Python 3 (Windows 7)
BeautifulSoup Chinese documentation: http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
BeautifulSoup download: http://www.crummy.com/software/BeautifulSoup/
Unzip the archive, open cmd, and run: python setup.py install
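After installation, a quick sanity check from the Python prompt confirms the package can be imported (a minimal check, nothing project-specific assumed):

import bs4
from bs4 import BeautifulSoup

print(bs4.__version__)  # prints the installed BeautifulSoup 4 version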
2. Import the BeautifulSoup library: from bs4 import BeautifulSoup
Pass in the data and create the object: soup = BeautifulSoup(data)
Work with soup to do the parsing you need.
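Putting the three steps together, a minimal sketch (the HTML string here is a made-up sample, used only for illustration):

from bs4 import BeautifulSoup

data = '<html><body><a href="http://example.com">link</a></body></html>'  # hypothetical sample markup
soup = BeautifulSoup(data, 'html.parser')  # build the parse tree
print(soup.a['href'])                      # -> http://example.com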
3. Example code:
from bs4 import BeautifulSoup
from urllib import request
import re

web = request.urlopen('http://www.freebuf.com')
# If no parser is named explicitly, bs4 picks the one it considers best,
# and that choice can differ from environment to environment.
# Omitting 'html.parser' therefore triggers a warning about automatic parser selection.
soup = BeautifulSoup(web.read(), 'html.parser')
tags_a = soup.find_all(name='a', attrs={'href': re.compile('^https?://')})

for tag_a in tags_a:
    print(tag_a['href'])
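A side note on the parser choice: 'html.parser' ships with the standard library, while 'lxml' (a third-party package, assumed to be installed) is generally faster and more tolerant of malformed markup. A sketch of the same fetch with lxml:

from urllib import request
from bs4 import BeautifulSoup

web = request.urlopen('http://www.freebuf.com')
# Same as above, but delegating parsing to lxml (requires: pip install lxml)
soup = BeautifulSoup(web.read(), 'lxml')
print(soup.title.string if soup.title else 'no <title> found')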
4. Using BeautifulSoup to build a sitemap of a website:
# coding:utf-8
# Build a sitemap of an entire website

import urllib.request
import urllib.error
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import time
import datetime

url = input('Enter the URL to scan: ')
domain = input('Enter the domain the URLs must contain: ')
sites = set()


# Collect all URLs found on a single page
def get_local_pages(url, domain):
    pages = set()
    global sites
    repeat_time = 0

    # Parse the incoming URL so relative paths can be joined to it later
    parse_url = urlparse(url)

    # Guard against the request hanging: retry up to 5 times
    while True:
        try:
            print('Ready to Open the web!')
            time.sleep(1)
            print('Opening the web : %s' % url)
            web = urllib.request.urlopen(url=url, timeout=20)
            print('Success to Open the web!')
            break
        except urllib.error.URLError as e:
            print('Open Url Error:', e)
            print('Open url Failed!!!Repeat!')
            time.sleep(1)
            repeat_time += 1
            if repeat_time == 5:
                return

    soup = BeautifulSoup(web.read(), 'html.parser')
    tags = soup.find_all(name='a')

    for tag in tags:
        # Skip tags that have no href attribute
        try:
            ret = tag['href']
        except KeyError:
            print('Maybe not the attr : href')
            continue

        parse_page = urlparse(ret)

        # 1. Scheme, netloc and path are all empty: not a usable URL
        if parse_page[0] == '' and parse_page[1] == '' and parse_page[2] == '':
            print('Bad Page (scheme, netloc and path all empty): %s' % ret)
            continue

        # 2. Scheme is present but is not http(s)
        if parse_page[0] != '' and 'http' not in parse_page[0]:
            print('Bad Page (scheme is not http): %s' % ret)
            continue

        # 3. Netloc is present but does not contain the target domain
        if parse_page[1] != '' and domain not in parse_page[1]:
            print('Bad Page (netloc does not contain %s): %s' % (domain, ret))
            continue

        # 4. Scheme empty but netloc present (e.g. //caipiao.taobao.com): prepend the scheme
        if parse_page[0] == '' and parse_page[1] != '':
            print('Fix page (netloc only): %s' % ret)
            newpage = parse_url[0] + ':' + ret
            if newpage not in sites:
                print('Add Fix Page (prepended scheme): %s' % newpage)
                pages.add(newpage)
            continue

        # 5. Scheme and netloc empty, path present: join the path to the base URL
        if parse_page[0] == '' and parse_page[1] == '':
            print('Fix page (path only): %s' % ret)
            temp_page = parse_url[0] + '://' + parse_url[1] + '/' + ret
            # Keep the URL clean: collapse duplicate slashes after the scheme
            newpage = temp_page[:8] + temp_page[8:].replace('//', '/')
            if newpage not in sites:
                print('Add Fix Page (joined path): %s' % newpage)
                pages.add(newpage)
            continue

        # Everything else is already a full URL
        newpage = ret
        if newpage not in sites:
            print('Add New Page: %s' % newpage)
            pages.add(newpage)

    return pages


# DFS over the whole site (usable for small and medium sites; still to be improved)
def dfs(pages, domain):
    global sites
    if not pages:
        return

    for page in pages:
        if page not in sites:
            sites.add(page)
            get_pages = get_local_pages(page, domain)
            dfs(get_pages, domain)
    return


t1 = datetime.datetime.now()
pages = get_local_pages(url, domain)
dfs(pages, domain)
text_name = domain + '_full_site_scan.txt'
with open(text_name, 'a') as f:
    f.write('\n' + str(datetime.datetime.now()) + '\n')
    for i in sites:
        f.write(i + '\n')
    f.write('\nElapsed: ' + str(datetime.datetime.now() - t1) + '\n')
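The recursive DFS above can hit Python's recursion limit on larger sites. A minimal iterative variant of the same traversal, assuming the get_local_pages function and the global sites set from the script above (a sketch, not a drop-in replacement; the name crawl is made up):

def crawl(start_pages, domain):
    # Iterative depth-first traversal using an explicit stack instead of recursion
    global sites
    stack = list(start_pages or [])
    while stack:
        page = stack.pop()
        if page in sites:
            continue
        sites.add(page)
        new_pages = get_local_pages(page, domain)
        if new_pages:
            stack.extend(new_pages)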
5. Basics
Basic use of the bs4 API. Rather than covering BeautifulSoup in full, I only introduce the methods used in the scripts here (a short example follows this list):
soup = BeautifulSoup(data)
# build the parse tree
tags = soup.find_all(name, attrs)
The two parameters of find_all are what matter here: name and attrs.
name: the tag name; pass a tag name and every tag with that name is returned.
attrs: a dict of the attributes to match; the tags whose attributes match are returned.
tag.children yields all child tags of tag.
tag.string returns the string inside tag directly (when the tag contains a single string), without indexing down level by level.
tag.attrs[key] returns the value of the attribute of tag whose key is key.
tag.img returns the first child tag of tag whose name is img (a single tag).
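A short demonstration of these accessors on a small, made-up HTML fragment:

from bs4 import BeautifulSoup

html = '<div class="box"><p>hello</p><img src="a.png"/></div>'  # hypothetical markup
soup = BeautifulSoup(html, 'html.parser')

div = soup.find_all(name='div', attrs={'class': 'box'})[0]
print(list(div.children))      # all child tags of the div
print(div.p.string)            # 'hello' -- the string inside the <p> tag
print(div.img.attrs['src'])    # 'a.png' -- attribute value looked up by key
print(div.img)                 # first child tag named img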
6. Using BeautifulSoup to extract specific fields from a 58.com listing page (Python 2.7)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
from bs4 import BeautifulSoup

url = 'http://ny.58.com/zufang/24584108096437x.shtml?qq-pf-to=pcqq.c2c'

rp = urllib.urlopen(url)
html = rp.read()
soup = BeautifulSoup(html, 'html.parser')

# Title
title = soup.find_all(name='h1', attrs={'class': 'main-title font-heiti'})
for data in title:
    data_title = data.get_text()
    print data_title

# Rent
primary = soup.find_all(name='em', attrs={'class': 'house-price'})
for data in primary:
    data_primary = data.get_text()
    print data_primary

# Apartment layout
house_type = soup.find_all(name='div', attrs={'class': 'fl house-type c70'})
for data in house_type:
    temp_type = data.get_text().replace('-', ' ')
    temp_type = ' '.join(temp_type.split())
    print temp_type

# Neighbourhood
xiaoqu = soup.find_all(name='div', attrs={'class': 'fl xiaoqu c70'})
for data in xiaoqu:
    data_xiaoqu = data.get_text().strip()
    print data_xiaoqu

# Amenities
config = soup.find_all(name='li', attrs={'class': 'house-primary-content-li clearfix person-config'})
for data in config:
    data_config = data.div.get_text().replace('-', ' ')
    data_config = ' '.join(data_config.split())
    print data_config

# Contact person
contact = soup.find_all(name='li', attrs={'class': 'house-primary-content-li clearfix person-contact'})
for data in contact:
    data_contact = data.div.span.get_text()
    print data_contact


# Write to file
# with open('58_test1.txt', 'w') as f:
#     f.write('Title: ' + data_title.decode('gbk'))
#     f.write('Rent: ' + data_primary)
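For reference, the fetch-and-parse portion under Python 3 would use urllib.request instead of urllib/urllib2. A sketch (the CSS class name is taken from the script above and may change whenever 58.com updates its page layout):

from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'http://ny.58.com/zufang/24584108096437x.shtml?qq-pf-to=pcqq.c2c'
html = urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Title (same selector as the Python 2.7 script above)
for tag in soup.find_all(name='h1', attrs={'class': 'main-title font-heiti'}):
    print(tag.get_text())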