Basics
```
# coding=utf-8
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
    'Cookie': ''
}
url = 'http://www.tripadvisor.cn/Hotels-g60763-New_York_City_New_York-Hotels.html'
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.content, 'lxml')
titles = soup.select('a.property_title')
# Select every image whose height attribute is 200; selecting by class
# also works, e.g. imgs = soup.select('img.photo_image')
imgs = soup.select('img[height="200"]')
cates = soup.select('div.clickable_tags')
for title, img, cate in zip(titles, imgs, cates):
    data = {
        'title': title.get_text(),   # the title text
        'img': img.get('src'),       # the image URL from the src attribute
        'cate': cate.get_text(',')   # the tag text, joined with ","
    }
    print(data)
```
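One thing worth knowing about the `zip()` used above: it stops at the shortest input, so if the page has fewer category tags than titles, the trailing hotels are silently dropped. A tiny demonstration:

```
titles = ['hotel A', 'hotel B', 'hotel C']
cates = ['tag1', 'tag2']          # one fewer than titles
print(list(zip(titles, cates)))   # [('hotel A', 'tag1'), ('hotel B', 'tag2')] - 'hotel C' is lost
```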
The same scrape, split into functions:
```
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
    'Cookie': ''
}
url_saves = 'http://www.tripadvisor.cn/Hotels-g60763-New_York_City_New_York-Hotels.html'

def get_attractions(url, data=None):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('a.property_title')
    imgs = soup.select('img.photo_image')
    mates = soup.select('div.clickable_tags')
    for title, img, mate in zip(titles, imgs, mates):
        data = {
            'title': title.get_text(),
            'img': img.get('src'),
            'mate': mate.get_text()
        }
        print(data)

def get_favs(url, data=None):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('a.property_title')
    imgs = soup.select('img.photo_image')
    mates = soup.select('div.clickable_tags')
    if data is None:
        for title, img, mate in zip(titles, imgs, mates):
            data = {
                'title': title.get_text(),
                'img': img.get('src'),
                'mate': mate.get_text()
            }
            print(data)

get_favs(url_saves)
```
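Rather than printing, it is often handier to collect the rows and return them; a small variant sketch of `get_attractions` (the `results` list is an addition, not part of the original):

```
def get_attractions_list(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    results = []
    for title, img, mate in zip(soup.select('a.property_title'),
                                soup.select('img.photo_image'),
                                soup.select('div.clickable_tags')):
        results.append({'title': title.get_text(),
                        'img': img.get('src'),
                        'mate': mate.get_text()})
    return results  # caller decides whether to print, save, etc.
```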
For sites where the page number is controlled through the URL, you can generate every page's URL up front and fetch them all:
urls = ['http://tieba.baidu.com/p/4900705159?pn={}'.format(str(i)) for i in range(1, 6)]

for single_url in urls:
    get_attractions(single_url)  # loop over the pages, calling the function defined above on each
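When looping over many pages it is polite to pause between requests; a small variant of the loop above (the 2-second delay is an arbitrary choice, not from the original):

```
import time

for single_url in urls:
    get_attractions(single_url)
    time.sleep(2)  # arbitrary pause so we don't hammer the server
```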
For images that are rendered via JS, you can set the User-Agent to mimic a mobile client and then fetch the images from the mobile page.
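A minimal sketch of that idea; the User-Agent string below is just one example of a mobile UA, and whether the mobile page exposes plain `<img>` tags depends on the site:

```
import requests
from bs4 import BeautifulSoup

mobile_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) '
                  'AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
}
wb_data = requests.get(url, headers=mobile_headers)  # url: the page whose images are JS-rendered
soup = BeautifulSoup(wb_data.text, 'lxml')
imgs = [img.get('src') for img in soup.select('img')]  # image URLs now present in the static HTML
```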
A fuller example: scraping 58.com tablet listings (list page, detail page, and the view-counter API):

from bs4 import BeautifulSoup
import requests

url = 'http://bj.58.com/pingbandiannao/24604629984324x.shtml'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')

def get_links_from(who_sells):
    urls = []
    list_view = 'http://bj.58.com/pbdn/{}/pn2/'.format(str(who_sells))
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('td.t a.t'):  # links matching <a class="t"> inside <td class="t">
        urls.append(link.get('href').split('?')[0])
    return urls

def get_views_from(url):
    # Split the URL on '/', take the last segment, then strip the
    # trailing 'x.shtml' characters to leave the numeric listing id.
    info_id = url.split('/')[-1].strip('x.shtml')
    # The id is dynamic, so insert it with .format(); this is 58.com's
    # view-counter endpoint (if interfaces like this are new to you,
    # the Sina Weibo API docs are a gentle introduction).
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(api)
    views = js.text.split('=')[-1]  # split the response on '=' and take the last part
    return views

def get_item_info(who_sells=0):
    urls = get_links_from(who_sells)
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        data = {
            'title': soup.title.text,               # the page title text, taken directly
            'price': soup.select('.price')[0].text,
            # Area may be missing; .stripped_strings drops extra
            # whitespace and blank lines. Listings without an area get None.
            'area': list(soup.select('.c_25d')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None,
            'date': soup.select('.time')[0].text,
            'cate': '个人' if who_sells == 0 else '商家',  # individual seller vs merchant
            # 'views': get_views_from(url)
        }
        print(data)

get_links_from(1)
get_item_info()
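Note that `str.strip('x.shtml')` removes *any* of the characters `x . s h t m l` from both ends rather than the literal suffix; it happens to work for these numeric ids but is fragile. A safer sketch using a regular expression (`extract_id` is a hypothetical helper, not part of the original code):

```
import re

def extract_id(url):
    # 'http://bj.58.com/pingbandiannao/24604629984324x.shtml' -> '24604629984324'
    m = re.search(r'/(\d+)x\.shtml$', url)
    return m.group(1) if m else None
```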
Checking the HTTP status code to skip dead pages:

wb_data = requests.get(url)
if wb_data.status_code == 404:
    # Use requests' status_code to read the page's HTTP status and
    # decide whether it is a 404 (e.g. a deleted listing).
    pass
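One way to fold that check into a reusable fetch helper (a sketch; `get_soup` is a hypothetical name, and it assumes the `headers` dict defined earlier is in scope):

```
def get_soup(url):
    wb_data = requests.get(url, headers=headers)
    if wb_data.status_code != 200:  # deleted listings come back as 404
        return None
    return BeautifulSoup(wb_data.text, 'lxml')
```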
Picking a random proxy from a list (addresses taken from http://cn-proxy.com/):

import random

proxy_list = [
    'http://117.177.250.151:8081',
    'http://111.85.219.250:3129',
    'http://122.70.183.138:8118',
]
proxy_ip = random.choice(proxy_list)  # pick a proxy IP at random
proxies = {'http': proxy_ip}
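requests accepts that mapping directly, so routing a request through the chosen proxy is one extra argument:

```
wb_data = requests.get(url, headers=headers, proxies=proxies)
```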
Extracting fields from a listing's detail page:

data = {
    'title': soup.title.text.strip(),
    'price': soup.select('.f22.fc-orange.f-type')[0].text.strip(),
    'pub_date': soup.select('.pr-5')[0].text.strip().split(' ')[0],
    # map() applies the lambda's .text to every element that
    # soup.select() matched; map returns a lazy iterator, so wrap it
    # in list() to materialize the values.
    'area': list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(3) > a'))),
    'cates': list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
    'url': url
}
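The `map(lambda ...)` call is equivalent to a list comprehension, which is usually considered the more idiomatic Python:

```
area = [a.text for a in soup.select('ul.det-infor > li:nth-of-type(3) > a')]
```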
Resuming an interrupted crawl by diffing the URLs already scraped (here `url_list` and `item_info` are database collections, e.g. pymongo collections, holding the collected links and the scraped details):

db_urls = [item['url'] for item in url_list.find()]      # every URL we collected
index_urls = [item['url'] for item in item_info.find()]  # every URL already scraped
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y  # set difference: the URLs still left to scrape
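A sketch of the resume loop itself, assuming a `get_item_info(url)`-style detail scraper that takes a single URL:

```
for url in rest_of_urls:
    get_item_info(url)  # only fetch pages we have not scraped yet
```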