import urllib.request
# Build a Request object for the target Tieba page, fetch it, and decode
# the body as UTF-8 text.  The response is used as a context manager so the
# underlying socket is always closed (the original never closed it).
url = "https://tieba.baidu.com/p/6310762577"
request = urllib.request.Request(url)
with urllib.request.urlopen(request) as response:
    # read() returns raw bytes; decode with the page's UTF-8 encoding
    html = response.read().decode("utf-8")
Out[6]:
#導入正則表達式包 import re
# Build a regex and extract every .jpg image URL from the page HTML.
# Fixes over the original: raw string for the pattern, escaped dot before
# "jpg" (a bare "." matches ANY character, e.g. "xjpg"), and no shadowing
# of the builtin `str`.
img_pattern = re.compile(r'src="(.+?\.jpg)" size')
imglist = img_pattern.findall(html)  # all image URLs found on the page
# Download every image in imglist to the target directory, one per second.
import os
import time

save_dir = "C:/Users/1/Desktop"
# The original left directory creation commented out (and with a typo,
# "imagrss"); create the actual save directory if it does not exist.
os.makedirs(save_dir, exist_ok=True)
for i, img in enumerate(imglist, start=1):
    time.sleep(1)  # throttle: wait 1 second between downloads
    urllib.request.urlretrieve(img, "{}/{}.jpg".format(save_dir, i))
print("爬蟲結束")
import urllib.request
# On Taobao, switch the browser's device emulation to iPhone 6 Plus and find
# this suggest endpoint under the JS/XHR requests in the F12 network panel.
url = ("https://suggest.taobao.com/sug?q=python+%E7%88%AC%E8%99%AB"
       "&code=utf-8&area=c2c&nick=&sid=null"
       "&callback=jsonp157311230486278746")
# Context manager guarantees the connection is closed (the original leaked it).
with urllib.request.urlopen(url) as response:
    html = response.read().decode("utf8")
# Convert the JSON string into a Python dict.
# The response body is JSONP: callback_name( {...} ).  Strip the wrapper by
# locating the parentheses instead of relying on `a`, which the original
# only assigned on a LATER line (NameError in this file's ordering).
# NOTE: JSON requires double quotes; single-quoted strings fail to parse.
import json

a = html[html.index("(") + 1 : html.rindex(")")]
dic = json.loads(a)
Out[42]:
# Strip the JSONP wrapper "jsonp...( ... )" to get the bare JSON payload.
# Locating the parentheses is robust, where the original hard-coded slice
# html[26:452] breaks as soon as the callback name or body length changes.
a = html[html.index("(") + 1 : html.rindex(")")]
# Print each suggestion title with the <b>/</b> highlight tags stripped out.
for entry in dic["result"]:
    title = entry[0]
    for tag in ("<b>", "</b>"):
        title = title.replace(tag, "")  # str.replace removes every occurrence
    print(title)
# Python ships its own tool for percent-encoding text for use in URLs:
# quote() UTF-8-encodes the characters and escapes each byte as %XX.
# Renamed the result variable — the original shadowed the builtin `str`.
encoded_keyword = urllib.request.quote("牛仔衣")
# Query Taobao's suggest API with a raw (unencoded) Chinese keyword —
# requests takes care of the URL encoding — and print every suggestion.
import requests
import json

keyword = '連衣裙'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url)
html = response.text  # response body as text
dic = json.loads(html)
for item in dic['result']:
    print(item[0])
連衣裙女秋冬 連衣裙2019新款秋 連衣裙女 連衣裙2019秋款新 連衣裙夏 連衣裙女春秋 連衣裙長款秋冬 連衣裙收腰顯瘦 氣質 連衣裙兩件套秋冬 連衣裙長裙女秋冬
# Fetch the two-level suggestion menu for a keyword: print each first-level
# suggestion, then query the API again with it and print its own suggestions
# indented by one tab.
import requests
import json


def replace_str(html):
    """Strip the ``<b>`` highlight markup from the raw JSON text.

    The closing tag arrives JSON-escaped (backslash before the slash), so a
    raw string is used to match the backslash literally — the original
    ``'<\\/b>'`` without ``r`` relied on an invalid escape sequence, which is
    a SyntaxWarning on modern Python.
    """
    html = html.replace('<b>', '')
    html = html.replace(r'<\/b>', '')
    return html


keyword = '連衣裙'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url)
dic = json.loads(response.text)
for item in dic['result']:
    print(item[0])
    url2 = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(item[0])
    response2 = requests.get(url2)
    content = replace_str(response2.text)
    dic2 = json.loads(content)
    for item2 in dic2['result']:
        print('\t' + item2[0])
連衣裙2018款新款 連衣裙2018款新款女 連衣裙2018款新款女 雪紡 連衣裙2018夏新款女 中長款 修身 裙子女夏2018新款 中長款連衣裙 裙子夏女2018新款 款連衣裙 氣質 連衣裙夏女2018新款 中長款 氣質 女裙子2018新款 中長款連衣裙 女夏2018新款連衣裙長 款35歲 女裝2018新款中長連衣裙 中長款 連衣裙女夏2018新款拉鍊款 連衣裙夏季新款 連衣裙夏季新款女 2018 氣質 a字 連衣裙夏季新款 雪紡 2018 連衣裙夏季新款 寬鬆 韓版 2018 連衣裙夏季新款女 名媛 連衣裙夏季新款 2018韓版孕婦裝 連衣裙夏季新款2018顯瘦超仙 連衣裙夏季新款女裝歐美中裙 連衣裙夏季新款女歐洲站2018 連衣裙夏季新款女韓版 棉麻 連衣裙夏季新款 時尚氣質
反爬蟲機制 (anti-crawler countermeasures)
# Anti-crawler countermeasure: send a real browser User-Agent (copy it from
# the browser's F12 network panel).  requests expects headers as a dict.
header = {
    "user-agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) "
        "AppleWebKit/604.1.38 (KHTML, like Gecko) "
        "Version/11.0 Mobile/15A372 Safari/604.1"
    ),
}
keyword = '羽絨服'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url, headers=header)
dic = json.loads(response.text)
for item in dic['result']:
    print(item[0])
羽絨服女中長款 修身顯瘦 羽絨服女輕薄款 羽絨服中長款女 羽絨服女長款2017新款 韓版 潮 羽絨服 棗紅色 羽絨服女收腰 羽絨服男輕薄 羽絨服女長款 冬季 中長款 羽絨服套裝 羽絨服女長款收腰過膝
# Anti-crawler countermeasure: route the request through a proxy IP
# (search the web for free proxy lists; replace with a live one).
# requests looks proxies up by LOWERCASE URL scheme, so the original key
# "HTTP" was never matched and the proxy was silently ignored; the target
# URL is https, so an https mapping is needed as well.
proxies = {
    "http": "122.114.31.177:808",
    "https": "122.114.31.177:808",
}
header = {
    "user-agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) "
        "AppleWebKit/604.1.38 (KHTML, like Gecko) "
        "Version/11.0 Mobile/15A372 Safari/604.1"
    ),
}
keyword = '西裝'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url, headers=header, proxies=proxies)
dic = json.loads(response.text)
for item in dic['result']:
    print(item[0])
西裝男套裝 青少年 西裝短褲套裝女 西裝 修身連衣裙 西裝領長袖女 西裝 短袖 西裝v領 連衣裙 收腰 西裝喇叭褲高腰 西裝褲 夏 女 西裝春秋女 西裝熱褲女
# Crawl the two-level suggestion menu and save every entry to a CSV file.
import requests
import json
import pandas as pd


def replace_str(html):
    """Strip the ``<b>`` highlight markup from the raw JSON text.

    Uses a raw string for the JSON-escaped closing tag — the original
    ``'<\\/b>'`` without ``r`` was an invalid escape sequence.
    """
    html = html.replace('<b>', '')
    html = html.replace(r'<\/b>', '')
    return html


keyword = '連衣裙'
# requests matches proxies by lowercase scheme; the original "HTTP" key was
# never looked up, so the proxy was silently ignored.  Replace the address
# with a live proxy.
proxies = {
    "http": "122.114.31.177:808",
    "https": "122.114.31.177:808",
}
header = {
    "user-agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) "
        "AppleWebKit/604.1.38 (KHTML, like Gecko) "
        "Version/11.0 Mobile/15A372 Safari/604.1"
    ),
}
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url, headers=header, proxies=proxies)
dic = json.loads(response.text)
lst = []
for item in dic['result']:
    lst.append([item[0], 1])  # level-1 suggestion
    url2 = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(item[0])
    response2 = requests.get(url2, headers=header, proxies=proxies)
    content = replace_str(response2.text)
    dic2 = json.loads(content)
    for item2 in dic2['result']:
        lst.append([item2[0], 2])  # level-2 suggestion
# Persist as CSV: one row per suggestion with its menu level.
data = pd.DataFrame(lst, columns=['title', 'level'])
data.to_csv('./lyq.csv', index=False, header=True)
print('運行結束')
# Scraper using POST: submit a word to Youdao's web translate endpoint.
import requests
import json


def translate(word):
    """POST *word* to Youdao translate; print and return the translation.

    NOTE(review): the salt/sign/Cookie values were captured from one browser
    session; this ``translate`` endpoint currently ignores them, but they may
    need refreshing if the API changes.
    """
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
    # The original hard-coded "Content-Length": "254", which is only correct
    # for the one captured request — requests computes the real length from
    # the payload, so that header is removed here.
    header = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded;",
        "Cookie": "OUTFOX_SEARCH_USER_ID=-2118096325@10.168.8.63; OUTFOX_SEARCH_USER_ID_NCOO=529585232.72911924; fanyi-ad-id=44547; fanyi-ad-closed=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abcm1o-ND2W6Y9HCq9vqw; _ntes_nnid=a7d939a02cd5c3865942d2a2051b410e,1529376772962; ___rl__test__cookies=1529398225385",
        "Host": "fanyi.youdao.com",
        "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    payload = {
        "i": word,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "1529398225392",
        "sign": "bf09bc9795dfc7863516162c961fd97e",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_CLICKBUTTION",
        "typoResult": "false",
    }
    response = requests.post(url, data=payload, headers=header)
    dic = json.loads(response.text)
    result = dic['translateResult'][0][0]['tgt']
    print(result)
    return result  # backward-compatible addition: callers can now use the value


if __name__ == '__main__':
    translate('中國')
China