1.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 10:30:26 2019
@author: Office
"""
import urllib.request
# The site to crawl
url = "http://www.baidu.com/"
# response: send a request to the given url and get back the http response data (a file-like object)
response = urllib.request.urlopen(url)
# Read the content as bytes
data = response.read()  # read the whole body and bind it to one (bytes) variable
#data = response.readline()   # read a single line; printing everything this way needs a loop
#data = response.readlines()  # read the whole body as a list of lines
#print(data)
#print(type(data))
# Convert the fetched bytes into a string
str_data = data.decode("utf-8")
#print(str_data)
#print(type(str_data))
# Write the crawled page to a file
# Method 1
with open("baidu.html","w",encoding="utf-8") as f:  # write to the file as str
    f.write(str_data)
# Method 2: urlretrieve leaves some cache behind as it runs, which should be cleared
#urllib.request.urlretrieve(url,"baidu2.html")
#urllib.request.urlcleanup()  # clear the cache; note urlcleanup() takes no arguments
# Useful response attributes
#print(response.info())     # information about the current environment (response headers)
#print(response.getcode())  # status code; remember 200 (OK), 304 (not modified, i.e. served from cache), 400 (bad request, e.g. malformed syntax), 500 (internal server error)
#print(response.geturl())   # the URL actually being crawled
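A minimal sketch of the urlretrieve route mentioned in the comments above, with the corrected no-argument urlcleanup call (same URL assumed):

import urllib.request

url = "http://www.baidu.com/"
# urlretrieve downloads the resource at url straight into a local file.
urllib.request.urlretrieve(url, "baidu2.html")
# urlcleanup() takes no arguments; it clears the temporary cache urlretrieve leaves behind.
urllib.request.urlcleanup()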
2.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 15:09:34 2019
@author: Office
"""
import urllib.request
url = "http://www.baidu.com/"
# Spoof a browser User-Agent
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# Build a Request object carrying the headers
req=urllib.request.Request(url,headers=headers)
# Send the request
response=urllib.request.urlopen(req)
data=response.read().decode('utf-8')
print(data)
3.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 15:17:49 2019
@author: Office
"""
import urllib.request
import random
url = "http://www.baidu.com/"
# Pool of User-Agent strings to spoof
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
# Pick a random User-Agent
agentStr=random.choice(agentlist)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent':agentStr,
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
# Build a Request object
req=urllib.request.Request(url,headers=headers)
# Send the request
response=urllib.request.urlopen(req)
data=response.read().decode('utf-8')
#print(data)
print(req.get_full_url())  # get the full url
print(req.get_header('User-agent'))  # get the User-Agent; capitalize the first word only, lower-case the rest
# Second approach: attach the User-Agent afterwards with add_header()
url = "http://www.baidu.com/"
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
user_angent_list=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
end_user_angent=random.choice(user_angent_list)
req=urllib.request.Request(url,headers=headers)
req.add_header('User-Agent',end_user_angent)
response=urllib.request.urlopen(req)
data=response.read().decode('utf-8')
print(data)
4.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 16:10:42 2019
@author: Office
"""
import urllib.request
url = "http://www.baidu.com/"
# If the page doesn't respond for too long, the system treats it as a timeout and the crawl fails
for i in range(1,100):
    try:
        response=urllib.request.urlopen(url,timeout=0.2)
        print(len(response.read().decode('utf-8')))
    except Exception:
        print("Request timed out, moving on to the next crawl")
5.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 16:24:45 2019
@author: Office
"""
# http usage: for passing messages between client and server
# GET: pass information through the URL; the data to send can be appended directly to the URL
# POST: submit data to the server; a popular and relatively secure way to transfer data
# PUT: ask the server to store a resource, usually at a specified location
# DELETE: ask the server to delete a resource
'''
GET request
Trait: the data is appended to the request path and passed to the server
Pros: fast
Cons: small payload, insecure
'''
import urllib.request
import urllib.parse
import string
import random
# Single query parameter (commented-out version)
#url='http://www.baidu.com/s?wd='
#
#wd='圖片'
#wd=urllib.parse.quote(wd)
#end_url=url+wd
#
#headers={
# 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
# 'X-REQUESTED-With':'XMLHttpRequest',
# 'Content-Type':'application/x-www-form-urlencoded'
# }
#
#user_angent_list=[
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
# ]
#
#end_user_angent=random.choice(user_angent_list)
#
#req=urllib.request.Request(end_url,headers=headers)
#req.add_header('User-Agent',end_user_angent)
#
#response=urllib.request.urlopen(req)
#data=response.read().decode('utf-8')
#print(data)
# Multiple query parameters
url='https://www.baidu.com/s?'
da_ta={
'wd':'風景',
'key':'zhang',
'value':'san'
}
final_da_ta=urllib.parse.urlencode(da_ta)
final_url=url+final_da_ta
end_url=urllib.parse.quote(final_url,safe=string.printable)
print(end_url)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
user_angent_list=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
end_user_angent=random.choice(user_angent_list)
headers['User-Agent']=end_user_angent
req=urllib.request.Request(end_url,headers=headers)
response=urllib.request.urlopen(req)
data=response.read().decode('utf-8')
print(data)
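A quick sanity check of the encoding helpers used above; the printed values are the standard UTF-8 percent-encodings:

import urllib.parse

print(urllib.parse.quote('風景'))                  # %E9%A2%A8%E6%99%AF
print(urllib.parse.unquote('%E9%A2%A8%E6%99%AF'))  # 風景
print(urllib.parse.urlencode({'wd': '風景'}))      # wd=%E9%A2%A8%E6%99%AF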
6.
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 16:50:51 2019
@author: Office
"""
'''
POST request
Trait: the parameters are packed up and transmitted separately
Pros: large payload, more secure (recommended when modifying data on the server)
Cons: slow
'''
import urllib.parse
import urllib.request
url='http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Referer':'http://fanyi.youdao.com/?keyfrom=dict2.index',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
'X-Requested-With':'XMLHttpRequest'
}
# Assemble the data to send into a dict
# Find the dict keys in the page source; usually the value of an input tag's name attribute
key=input("Enter the text to translate: ")
data={
'i' : key,
'from' : 'AUTO',
'to' : 'AUTO',
'smartresult' : 'dict',
'client' : 'fanyideskweb',
'salt': '15564473252080',
'sign': 'b6f44d14938df7391a28b66252a461aa',
'doctype' : 'json',
'version' : '2.1',
'keyfrom' : 'fanyi.web',
'action' : 'FY_BY_CLICKBUTTION'
}
# Pack the data to send; remember to encode it
da_ta=urllib.parse.urlencode(data).encode('utf-8')
# Send the request
end_data=urllib.request.urlopen(url,da_ta).read().decode('utf-8')
print(end_data)
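Since the request asks for doctype=json, the body should parse as JSON. Youdao's exact payload layout can change, so the field name below is an assumption; .get() keeps it from crashing:

import json

result = json.loads(end_data)
# 'translateResult' is the field this endpoint has historically returned.
print(result.get('translateResult'))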
7.
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 29 11:02:48 2019
@author: Office
"""
from bs4 import BeautifulSoup
import urllib.request
# Parse a local file
soup = BeautifulSoup(open("soup_text.html",encoding="utf-8"),'lxml')
# Look up by tag name
#print(soup.a)  # only finds the first matching tag
#print(soup.div)
# Get attributes
#print(soup.a["href"])        # get the href attribute
#print(soup.a.attrs)          # get attributes and values, returned as a dict
#print(soup.a.attrs["href"])  # this also works
# Get content
#print(soup.a.text)
#print(soup.a.string)
#print(soup.a.get_text())
# Difference between the three: if the tag contains nested tags, string returns None, while the other two can still fetch the text content
#print(soup.div.text)
#print(soup.div.string)
#print(soup.div.get_text())
#print(soup.div.get_text().split()[0])  # grab an element inside
# find: always returns the first matching tag
#print(soup.find('a'))              # first matching a
#print(soup.find('a',title="qin"))  # narrow the search with a second condition, title="qin"
#print(soup.find('a',class_="du"))  # class is a keyword, so it needs a trailing underscore
#print(soup.find('a',id="feng"))
# find works not only on soup: ordinary tag objects work too; it searches inside the given object for matching nodes.
# Search level by level, downwards
#div=soup.find('div',class_='tang')
#print(div.find('a',alt="qi"))
#print(div.find('a',class_="du"))  # if two match, it still returns the first one
# find_all
#lt=soup.find_all('a')  # find all a tags
#print(lt,len(lt))
#div=soup.find('div',class_='tang')
#print(div.find_all('a'))
#print(div.find_all(['i','b']))    # find_all also accepts several tags at once, as a list
#print(div.find_all('a',limit=2))  # find all, keep only the first 2
# select: pick out content with CSS selectors
# Common selectors: tag, class, id, combination, hierarchy, attribute
# A selector always returns a list; index into it to pull out the object, then read its attributes and nodes
#print(soup.select('div > ul > li > a'))      # there must be spaces around the > combinator
#print(soup.select('div > ul > li > a')[0])   # take the first one
#print(soup.select('.tang > ul > li > a')[0]) # same result as the line above
#print(soup.select('#du'))                    # ids work like this
#print(soup.select('#feng')[0].text)          # a list comes back: index into it first, then call a content getter
#print(soup.select('#feng')[0]['href'])       # returns the value of href
# select can also be called on an ordinary tag object; it finds all matching nodes under that object
#div=soup.find('div',class_='tang')
#print(div.select('.du'))
#print(soup.select('.du'))
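The calls above depend on the local soup_text.html; a self-contained sketch with made-up inline markup shows the same find/select pattern:

from bs4 import BeautifulSoup

demo = '<div class="tang"><ul><li><a href="http://example.com" id="feng">qin shi ming yue</a></li></ul></div>'
soup_demo = BeautifulSoup(demo, 'lxml')
print(soup_demo.find('a', id='feng')['href'])  # http://example.com
print(soup_demo.select('.tang a')[0].text)     # qin shi ming yue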
8.
# -*- coding: utf-8 -*-
"""
Created on Wed May 1 11:05:33 2019
@author: admin
"""
import urllib.request
import urllib.parse
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
url='http://www.renren.com/970622703/profile'
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36',
'Cookie':'anonymid=jv4jjsmt8luy21; ln_uact=17767258153; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; __guid=238633222.311801873786504100.1556674290342.3481; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291856; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291858; wp_fold=0; depovince=GW; _r01_=1; JSESSIONID=abcnRiMszrXoLbNlVdXPw; ick_login=4c390ed0-4fe6-4264-b9b2-610a614ac13c; first_login_flag=1; jebecookies=989247e8-b114-48f9-9592-aec3cd10e92b|||||; _de=7266BDD6184F288A5EF7AB01E3CFE338; p=38e98cbf34016e9010c9f1f73791f2423; t=3b04ed4095e7a4b7612203f7169bbc843; societyguester=3b04ed4095e7a4b7612203f7169bbc843; id=970622703; xnsid=8ebbfe1f; ver=7.0; loginfrom=null; monitor_count=9',
}
req=urllib.request.Request(url,headers=headers)
response=urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
9.
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 15 08:52:30 2018
@author: T0001
"""
html='''<tr>
<td class="posterColumn">
<span name="rk" data-value="1"></span>
<span name="ir" data-value="9.216510839765467"></span>
<span name="us" data-value="7.791552E11"></span>
<span name="nv" data-value="1868842"></span>
<span name="ur" data-value="-1.7834891602345326"></span>
<div class="unseeable">NOT YET RELEASED</div>
</td>
<td class="titleColumn">
1.
<a href="/title/tt0111161" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman" >The Shawshank Redemption</a>
<span class="secondaryInfo">(1994)</span>
</td>
<td class="ratingColumn imdbRating">
<strong title="9.2 based on 1,868,842 user ratings">9.2</strong>
</td>
<td class="ratingColumn">
<div class="seen-widget seen-widget-tt0111161 pending" data-titleid="tt0111161">
<div class="inline">
<div class="pending">3.2</div>
<div class="unseeable">NOT YET RELEASED</div>
<div class="unseen">4.5</div>
<div class="rating"></div>
<div class="seen">Seen</div>
</div>
</div>
</td>
<td class="watchlistColumn">
<div class="wlb_ribbon" data-tconst="tt0111161" data-recordmetrics="true"></div>
</td>
</tr>
'''
from lxml import etree
# Open a local file
#tree=etree.parse("filename")
# Open from the network
#tree=etree.HTML("page string")
imdb=etree.HTML(html)
# Locate by attribute
#print(imdb.xpath('//span[@name="ir"]'))
#print(imdb.xpath('//div[@data-tconst]'))
# Mixed hierarchy-and-index locating
#print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[1]'))  # indexing starts at 1
#print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[@class="unseeable"]'))  # locating by attribute works too
#print(imdb.xpath('//td[@class="ratingColumn"]//div'))  # every div under <td class="ratingColumn">
#print(imdb.xpath('//td[@class="ratingColumn"]//div[@class="seen"]'))  # the tail can also be narrowed by attribute
#result1=imdb.xpath('//div[@class="inline"]/div[last()-2]')
# Logical operators
#print(imdb.xpath('//div[@class="wlb_ribbon" and @data-tconst="tt0111161"]'))  # if one attribute can't pin the node down, add another joined with "and"
# Fuzzy matching
#print(imdb.xpath('//div[contains(@class,"un")]'))    # every div whose class attribute contains "un"
#print(imdb.xpath('//div[contains(text(),4)]'))       # every node whose text contains 4
#print(imdb.xpath('//div[starts-with(@class,"r")]'))  # every div whose class attribute starts with "r"
# Get text content
#print(imdb.xpath('//div[@class="inline"]/div[5]/text()'))  # get a node's text
# Get attributes
#print(imdb.xpath('//div[@class="inline"]/div[2]/@class'))
#print(imdb.xpath('//div[@class="inline"]//text()'))  # pull out all the tag-free text under <div class="inline">
#print(imdb.xpath('//div[@class="inline"]/div[last()-1]/@class'))  # another way to get it
# Extracting the leading "1."
#s=imdb.xpath('//td[@class="titleColumn"]/text()')
#a=[]
#for i in s:
#    if i.strip() != "":
#        a.append(i.strip())
#s=imdb.xpath('//td[@class="titleColumn"]')
#k=s[0].xpath('string(.)')
#l=k.replace('\n', '').replace('\t', '')
#print(l.strip().split()[0])
#for i in result:
#    print(etree.tostring(i))
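A worked example against the html string above, pulling the three fields a top-250 scraper usually wants:

# Runs against the imdb tree parsed above.
title = imdb.xpath('//td[@class="titleColumn"]/a/text()')[0]
year = imdb.xpath('//span[@class="secondaryInfo"]/text()')[0]
rating = imdb.xpath('//td[@class="ratingColumn imdbRating"]/strong/text()')[0]
print(title, year, rating)  # The Shawshank Redemption (1994) 9.2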
10.
# -*- coding: utf-8 -*-
"""
Created on Wed May 1 11:13:30 2019
@author: admin
"""
import urllib.request
import urllib.parse
url='http://www.baidu.com/'
proxy={
'http':'222.135.92.68:38094'
}
# Create a handler that routes traffic through the proxy
handler=urllib.request.ProxyHandler(proxy)
# Create an opener from the handler
opener=urllib.request.build_opener(handler)
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
}
req=urllib.request.Request(url,headers=headers)
response=opener.open(req)
print(response.read().decode('utf-8'))
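Free proxies like the one hard-coded above go stale quickly; a hedged variant of the final request that fails gracefully instead of raising:

import urllib.error

try:
    response = opener.open(req, timeout=5)
    print(response.read().decode('utf-8'))
except (urllib.error.URLError, OSError) as e:
    print('Proxy request failed:', e)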
11.
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 15 11:37:22 2018
@author: T0001
"""
from lxml import etree
import numpy as np
import pandas as pd
import urllib.request
import random
url='http://news.ceic.ac.cn/'
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
headers['User-Agent']=agentStr
req=urllib.request.Request(url,headers=headers)
response=urllib.request.urlopen(req).read().decode('utf-8')
earth=etree.HTML(response)
result=earth.xpath('//td[@align="center"]/text()')
result1=earth.xpath('//td[@align="left"]/a/text()')
data=np.array(result).reshape((-1,5))
c=np.column_stack((data,result1))
# Descriptive column names for the CEIC quake table (the original used placeholder names)
pd.DataFrame(c,columns=['magnitude','time','latitude','longitude','depth_km','location']).to_csv('dz.csv',index=False)
12.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 19:06:01 2019
@author: Office
"""
import urllib.request
import ssl
import random
import json
import pandas as pd
from sqlalchemy import create_engine
# Pool of User-Agent strings to spoof
agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
# Pick a random User-Agent
agentStr=random.choice(agentlist)
def ajaxCrawler(url):
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    req=urllib.request.Request(url,headers=headers)
    # Use ssl to create an unverified context
    context=ssl._create_unverified_context()
    response=urllib.request.urlopen(req,context=context)
    jsonStr=response.read().decode('utf-8')
    jsonData=json.loads(jsonStr)
    return jsonData
title=[]
score=[]
release_date=[]
vote_count=[]
for i in range(1,100):
    url='https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start='+str(i*20)+'&limit=20'
    info=ajaxCrawler(url)
    if not info:  # the API returns an empty list once the chart is exhausted
        break
    for j in info:
        title.append(j["title"])
        score.append(j['score'])
        release_date.append(j['release_date'])
        vote_count.append(j['vote_count'])
# Build a DataFrame
data=pd.DataFrame({'score':score,'title':title,'release_date':release_date,'vote_count':vote_count},columns=['score','title','release_date','vote_count'])
# Save to CSV
#data.to_csv('dy.csv')
# Save to MySQL
engine=create_engine('mysql+pymysql://root:123456@localhost/demo')
data.to_sql('douban',engine,if_exists="replace")
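A quick read-back (assuming the same local MySQL credentials) verifies that the rows landed:

# Read the table back with pandas to confirm the write.
check = pd.read_sql('douban', engine)
print(check.shape)
print(check.head())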
13.
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 20:12:39 2019
@author: wqq
"""
import urllib.request
import re
import random
import gzip
import numpy as np
import pandas as pd
url="http://esf.hz.fang.com/"
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
headers['User-Agent']=agentStr
req=urllib.request.Request(url,headers=headers)
response=urllib.request.urlopen(req)
#print(response.info().get('Content-Encoding'))
string=gzip.decompress(response.read()).decode('gbk')
phone_reg=r'''<span class="red"><b>(.*?)</b>'''
phone_pat=re.compile(phone_reg)
z_jia=re.findall(phone_pat,string)
phone_reg=r'''<span>(.*?)元/㎡</span>'''
phone_pat=re.compile(phone_reg)
d_jia=re.findall(phone_pat,string)
phone_reg=r'''<p class="tel_shop">(.*?)<span class="people_name">'''
phone_pat=re.compile(phone_reg,re.S)
match=re.findall(phone_pat,string)
g_ju=[]    # layout (rooms/halls)
m_ji=[]    # floor area
l_ceng=[]  # storey
c_xiang=[] # orientation
n_dai=[]   # year built
for i in match:
    k=(i.split())
    g_ju.append(k[0])
    m_ji.append(k[1].split("<i>|</i>")[1])
    if "<i>|</i>" not in k[2]:
        l_ceng.append(k[2])
    else:
        l_ceng.append(k[2].split("<i>|</i>")[1])
    if "<i>|</i>" not in k[3]:
        c_xiang.append(k[3])
    else:
        c_xiang.append(k[3].split("<i>|</i>")[1])
    if "<i>|</i>" not in k[4]:
        n_dai.append(k[4])
    else:
        n_dai.append(k[4].split("<i>|</i>")[1])
phone_reg=r'''<a target="_blank" href="/house-xm\d+/" title=(.*?)>'''
phone_pat=re.compile(phone_reg)
g_yu_name=re.findall(phone_pat,string)
phone_reg=r'''<span class="tit_shop">(.*?)</span>'''
phone_pat=re.compile(phone_reg)
title=re.findall(phone_pat,string)
phone_reg=r'''<span>(.*?)</span>'''
phone_pat=re.compile(phone_reg)
d_duan=re.findall(phone_pat,string)[::2]
d_duan.remove(d_duan[-1])
pd.DataFrame({'title':title,'g_ju':g_ju,
'm_ji':m_ji,'l_ceng':l_ceng,
'c_xiang':c_xiang,'n_dai':n_dai,
'z_jia(萬)':z_jia,'d_jia(元/m2)':d_jia,
'g_yu_name':g_yu_name,'d_duan':d_duan},
columns=['title','g_ju','m_ji','l_ceng','c_xiang','n_dai','z_jia(萬)','d_jia(元/m2)','g_yu_name','d_duan']).to_csv("二手房.csv",index=False)
14.
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 29 08:32:04 2019
@author: Office
"""
import urllib.request
import random
import re
def handle_request(url,page=None):
    if page != None:
        url=url+str(page)+".html"
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    # Pick a random User-Agent
    agentStr=random.choice(agentlist)
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    request=urllib.request.Request(url,headers=headers)
    return request
def get_text(a_href):
    # Build the request object via the helper
    request = handle_request(a_href)
    # Send the request and read the response
    content = urllib.request.urlopen(request).read().decode('utf-8')
    # Parse the content
    pattern = re.compile(r'<div class="neirong">(.*?)</div>',re.S)
    lt = pattern.findall(content)
    text = lt[0]
    # A regex that strips every image tag out of the content
    pat=re.compile(r'<img .*?>')
    text=pat.sub('',text)
    return text
def parse_content(content):
    # The regex
    pattern=re.compile(r'<h3><a href="(/lizhi/qianming/\d+\.html)">(.*?)</a></h3>')
    # findall returns a list of tuples: the first element of each tuple is what
    # the first capture group matched, the second element is what the second
    # capture group matched
    lt=pattern.findall(content)
    # Walk the list
    for href_title in lt:
        # Link to the article
        a_href = 'http://www.yikexun.cn' + href_title[0]
        # The title
        title = href_title[-1]
        # Request a_href and fetch the body text
        text = get_text(a_href)
        # Append to an html file
        string = '<h1>%s</h1>%s' % (title,text)
        with open ('lizhi.html', 'a' , encoding='utf8') as f:
            f.write(string)
def main():
    url='http://www.yikexun.cn/lizhi/qianming/list_50_'
    start_page=int(input('Enter the start page: '))
    end_page=int(input('Enter the end page: '))
    for page in range(start_page,end_page+1):  # +1 so the end page is included
        # Build the request for this url and page
        request=handle_request(url,page)
        content=urllib.request.urlopen(request).read().decode('utf-8')
        # Parse the content
        parse_content(content)
main()
15.
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 15 14:16:22 2018
@author: T0001
"""
# Crawl images
import urllib.request
from lxml import etree
import random
url="https://www.ivsky.com/tupian/ziranfengguang/"
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent':agentStr,
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
proxy=[
{'http':'http://61.164.39.66:53281'} ,
{'http':'http://116.209.57.18:9999'},
{'http':'http://183.148.133.77:9999'},
{'http':'http://211.23.149.29:80'},
{'http':'http://39.137.69.10:8080'}
]
end_proxy=random.choice(proxy)
proxy_handler=urllib.request.ProxyHandler(end_proxy)
opener=urllib.request.build_opener(proxy_handler)
req=urllib.request.Request(url,headers=headers)
response=opener.open(req)
html=response.read().decode("utf-8")
html=etree.HTML(html)
a=html.xpath('//div[@class="il_img"]/a/@href')
import os
os.makedirs('pic', exist_ok=True)  # make sure the output folder exists
for i in a:
    url_new="https://www.ivsky.com"+i
    req1=urllib.request.Request(url_new,headers=headers)  # request the album page (the original mistakenly reused the start url)
    response1=opener.open(req1)
    html1=response1.read().decode("utf-8")
    html_pic=etree.HTML(html1)
    pic=html_pic.xpath('//div[@class="il_img"]/a/img/@src')
    for j in pic:
        end_url="https:"+j
        req2=urllib.request.Request(end_url,headers=headers)
        response2=opener.open(req2)
        html2=response2.read()
        with open('pic/'+j.split('/')[-1],'wb') as f:
            f.write(html2)
16.
# -*- coding: utf-8 -*-
"""
Created on Wed May 1 17:33:25 2019
@author: admin
"""
import urllib.request
url='http://www.baidu.com/'
# Create a plain HTTP handler
handler=urllib.request.HTTPHandler()
# Create an opener from the handler
opener=urllib.request.build_opener(handler)
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
}
req=urllib.request.Request(url,headers=headers)
response=opener.open(req)
print(response.read().decode('utf-8'))
17.
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 08:37:26 2019
@author: Office
"""
import urllib.request
import random
import re
url="https://www.qiushibaike.com/text/page/1/"
proxy=[
{'http':'http://61.164.39.66:53281'} ,
{'http':'http://116.209.57.18:9999'},
{'http':'http://183.148.133.77:9999'},
{'http':'http://211.23.149.29:80'},
{'http':'http://39.137.69.10:8080'}
]
end_proxy=random.choice(proxy)
proxy_handler=urllib.request.ProxyHandler(end_proxy)
opener=urllib.request.build_opener(proxy_handler)
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent':agentStr,
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
req=urllib.request.Request(url,headers=headers)
response=opener.open(req)
html=response.read().decode('utf-8')
print(html)
pat=r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
re_joke=re.compile(pat,re.S)
divsList=re_joke.findall(html)
dic={}
for i in divsList:
    # Username
    re_u=re.compile(r'<h2>(.*?)</h2>',re.S)
    username=re_u.findall(i)
    username=username[0]
    # The joke itself
    re_d=re.compile(r'<div class="content">\n<span>(.*?)</span>',re.S)
    duanzi=re_d.findall(i)
    duanzi=duanzi[0]
    dic[username]=duanzi
print(dic)
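One caveat: a dict keyed by username silently drops jokes when the same user appears twice on a page; a sketch that keeps every record in a list instead:

jokes = []
for i in divsList:
    username = re.findall(r'<h2>(.*?)</h2>', i, re.S)[0].strip()
    duanzi = re.findall(r'<div class="content">\n<span>(.*?)</span>', i, re.S)[0].strip()
    jokes.append({'user': username, 'joke': duanzi})
print(len(jokes), 'records')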
18.
# -*- coding: utf-8 -*-
"""
Created on Wed May 1 08:50:22 2019
@author: admin
"""
import urllib.request
import urllib.parse
import http.cookiejar
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# Truly mimic a browser: after the post request is sent, keep the cookie in code
# Create a cookiejar object
cj=http.cookiejar.CookieJar()
# Create a handler from the cookiejar
handler=urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler
opener=urllib.request.build_opener(handler)
url='http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201943946542'
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
}
fromdata={
'email':'17767258153',
'icode':'' ,
'origURL':'http://www.renren.com/home',
'domain':'renren.com',
'key_id':'1',
'captcha_type':'web_login',
'password':'204b8409cfb80c1d46a7134d150cd281a1808d1c0429eb7334a3fa8f4c6ae327',
'rkey':'b8871697112ad27ac3a61f5e85ebf5b4',
'f':'http%3A%2F%2Fwww.renren.com%2F970622703',
}
fromdata=urllib.parse.urlencode(fromdata).encode('utf-8')
req=urllib.request.Request(url,headers=headers)
response=opener.open(req,data=fromdata)
#print(response.read().decode('utf-8'))
get_url="http://www.renren.com/970622703/profile"
req=urllib.request.Request(get_url,headers=headers)
response=opener.open(req)
print(response.read().decode('utf-8'))
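To keep the session across runs, the CookieJar can be swapped for a MozillaCookieJar saved to disk; the file name here is an assumption:

import http.cookiejar
import urllib.request

cj2 = http.cookiejar.MozillaCookieJar('renren_cookies.txt')  # hypothetical file name
opener2 = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj2))
# ... perform the login POST with opener2 exactly as above, then:
cj2.save(ignore_discard=True, ignore_expires=True)
# A later run can restore the session instead of logging in again:
cj2.load(ignore_discard=True, ignore_expires=True)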
19.
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 11:28:33 2019
@author: Office
"""
import urllib.request
from lxml import etree
import random
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
url='http://tubiao.17mcp.com/Ssq/index-500.html'
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent':agentStr,
'X-REQUESTED-With':'XMLHttpRequest',
'Content-Type':'application/x-www-form-urlencoded'
}
req=urllib.request.Request(url,headers=headers)
response=urllib.request.urlopen(req).read().decode('utf-8')
html=etree.HTML(response)
data=html.xpath('//tr/td[@style="color:White"]/text()')
qihao=html.xpath('//tr[@style="height: 25px"]/td[1]/text()')
da_ta=np.array(data).reshape(-1,7)
qi_hao=np.array(qihao)
end_data=np.column_stack((qi_hao,da_ta))
finnal_data=pd.DataFrame(end_data,columns=['qihao','one','two','three','four','five','six','seven'])
# Save to CSV
finnal_data.to_csv('雙色球.csv',index=False)
# Save to MySQL
engine=create_engine('mysql+pymysql://root:123456@localhost/demo')
finnal_data.to_sql('shungseqiu',engine,if_exists="replace")
20.
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 15:36:41 2019
@author: Office
"""
import urllib.request
import random
import re
keyname="chakra bracelet"
key=urllib.request.quote(keyname)
for i in range(1,2):
    try:
        print("-------- crawling page "+str(i)+" ------------")
        url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
        proxy=[
            {'http':'http://61.164.39.66:53281'} ,
            {'http':'http://116.209.57.18:9999'},
            {'http':'http://183.148.133.77:9999'},
            {'http':'http://211.23.149.29:80'},
            {'http':'http://39.137.69.10:8080'}
        ]
        end_proxy=random.choice(proxy)
        proxy_handler=urllib.request.ProxyHandler(end_proxy)
        opener=urllib.request.build_opener(proxy_handler)
        agentlist=[
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]
        agentStr=random.choice(agentlist)
        headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'User-Agent':agentStr,
            'X-REQUESTED-With':'XMLHttpRequest',
            'Content-Type':'application/x-www-form-urlencoded'
        }
        req=urllib.request.Request(url,headers=headers)
        response=opener.open(req)
        data=response.read().decode("utf-8","ignore")
        pat='"pic_url":"//(.*?)"'
        imglist=re.compile(pat).findall(data)
        for j in range(0,len(imglist)):
            try:
                thisimg=imglist[j]
                thisimgurl="http://"+thisimg
                localfile="D:/"+str(i)+"_"+str(j)+".jpg"
                urllib.request.urlretrieve(thisimgurl,filename=localfile)
            except Exception as err:
                pass  # skip any image that fails to download
    except Exception as err:
        pass  # skip any page that fails to load
21.
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 18:12:39 2019
@author: wqq
"""
import urllib.request
import urllib.parse
import ssl
import random
from lxml import etree
import pandas as pd
ssl._create_default_https_context = ssl._create_unverified_context
url = 'https://veromoda.tmall.com/p/rd609297.htm?spm=a1z10.10672-b-s.w5001-17277175636.16.7b822b67cHKn8X&scene=taobao_shop'
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0" ,
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"]
agentstr = random.choice(agentlist)
headers = {
'user-agent':agentstr,
'Accept': 'image/webp,*/*',
'Cookie': 'cq=ccp%3D1; cna=OA95FVY8Iw4CAXAKF/liwJ5M; isg=BI2N3dH67G6QcEhAxVcwy0Dzn6nHwsFXFVAU088SySSTxq14l7rRDNtcMJoFHdn0; l=bBNzmI9HqQPbVy7kBOCwquI8aG7OSIOYYuPRwNqXi_5ay1T_qsQOkjo1oe96Vs5RsXTB4mxQgLp9-etks; hng=""; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; t=2e15a61bdd752ef76d25e931fbd573ee; lid=%E4%BD%8E%E8%B0%83leoalan; _tb_token_=e1b6ee565fbb5; cookie2=1f4e270456996b258181536824f34637'
}
req = urllib.request.Request(url,headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('gbk')
imdb = etree.HTML(data)
title = imdb.xpath('//span[@class="user_name"]/text()')
adress = imdb.xpath('//div[@class="user_w990"]//a[@target="_blank"]/@href')
price = imdb.xpath('//span/strong[@class="user_pricetit"]/text()')
#oldprice = imdb.xpath('//span/span[@class="user_ft14 user_yj"]/text()')
# Prefix each relative link with the https scheme
adress = ['https:' + i for i in adress]
pd.DataFrame({
'商品名稱':title,
'商品連接':adress,
'商品價格':price
},
columns=['商品名稱','商品連接','商品價格']
).to_excel('D:/天貓商品.xls')
22.
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 19:37:12 2019
@author: Office
"""
import urllib.request
import urllib.parse
from lxml import etree
import time
import random
import os
def handle_request(url,page):
    # The first page follows a different URL pattern from the later ones, so branch on it
    if page == 1:
        url = url.format('')
    else:
        url = url.format('_' + str(page))
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    user_angent_list=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    end_user_angent = random.choice(user_angent_list)
    headers['User-Agent'] = end_user_angent
    request = urllib.request.Request(url,headers=headers)
    return request
# Parse the page content
def parse_content(content):
    tree = etree.HTML(content)
    # The page lazy-loads images, so the real URL lives in src2 rather than src
    image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    # Walk the list and download each image
    for image_src in image_list:
        download_image(image_src)
def download_image(image_src):
    dirpath = 'xinggan'
    # Create the folder if it doesn't exist yet
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    # Derive a file name
    filename = os.path.basename(image_src)
    # Full path for the image
    filepath = os.path.join(dirpath,filename)
    # Send the request and save the image
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    user_angent_list=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    end_user_angent = random.choice(user_angent_list)
    headers['User-Agent'] = end_user_angent
    request = urllib.request.Request(image_src,headers=headers)
    response = urllib.request.urlopen(request)
    with open(filepath,'wb') as f:
        f.write(response.read())
def main():
    # {} is the slot handle_request fills with '' or '_<page>' (the placeholder was missing from the original URL)
    url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page,end_page+1):
        request = handle_request(url,page)
        content = urllib.request.urlopen(request).read().decode('utf-8')
        parse_content(content)
        time.sleep(2)
if __name__ == '__main__':
    main()
23.
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 30 21:07:14 2018
@author: Chen
"""
import pydotplus
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
import pandas as pd
# Read the csv file into a dataframe
df = pd.read_csv('./data.csv')
#print(df.head())  # handy while testing
df = df[['weather','temperature','humidity','wind','sports']]
df['weather'] = df['weather'].map({'晴': 0, '陰': 1, '雨': 2})
df['temperature'] = df['temperature'].map({'炎熱': 0, '適中': 1, '寒冷': 2})
df['wind'] = df['wind'].map({'弱': 0, '強': 1})
# Split into a feature table and a label column
df = df.dropna()
X = df.drop('sports', axis=1)
Y = df['sports']
'''
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)
'''
from sklearn import tree
model = tree.DecisionTreeClassifier()
# Fit on the training split
#model.fit(X_train, y_train)
# Fit on the full dataset
model.fit(X, Y)
'''
# Check the model's accuracy on the test set
y_predict = model.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)
'''
# Render the tree as a graphic
dot_data = tree.export_graphviz(model.tree_, out_file=None,
feature_names=X.columns,
class_names=['no','yes'],
filled=True, rounded=True, # leaves_parallel=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
nodes = graph.get_node_list()
for node in nodes:
    if node.get_label():
        values = [int(ii) for ii in node.get_label().split('value = [')[1].split(']')[0].split(',')]
        color = {0: [255,255,224], 1: [255,224,255], 2: [224,255,255]}
        values = color[values.index(max(values))]  # print(values)
        color = '#{:02x}{:02x}{:02x}'.format(values[0], values[1], values[2])  # print(color)
        node.set_fillcolor(color)
graph.write_pdf("tree.pdf")
graph.write_png("tree.png")
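A short usage check on the fitted model; it reuses the first training row, since data.csv itself isn't shown here:

# X.iloc[[0]] keeps the 2-D shape that predict() expects.
sample = X.iloc[[0]]
print(model.predict(sample))  # -> the predicted sports label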