jupyter notebook 開啓服務
反反爬機制:
- robots.txt 協議: 遵循或者不遵循
# Fetch the Sogou homepage and save it to a local HTML file.
# Steps: build the URL -> GET the page -> read the body text -> write it out.
import requests

url = 'https://www.sogou.com'
response = requests.get(url=url)
# Pin the detected charset before reading .text: when the server omits a
# charset header, requests falls back to ISO-8859-1 and the saved Chinese
# page would be mojibake.
response.encoding = response.apparent_encoding
page_text = response.text
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Download the Sogou search-result page for a user-supplied keyword and
# store the raw bytes as '<keyword>.html' (binary write, so no decoding
# issues can corrupt the file).
import requests

url = 'https://www.sogou.com/web'
wd = input('enter a word:')
response = requests.get(url=url, params={'query': wd})
with open(wd + '.html', 'wb') as fp:
    fp.write(response.content)
print('over')
# Query Baidu Fanyi's suggestion endpoint (an AJAX POST) for a word and
# print the JSON payload it returns.  Relies on `requests` imported in an
# earlier cell.
url = 'https://fanyi.baidu.com/sug'
word = input('enter a word:')
print(requests.post(url=url, data={'kw': word}).json())
# Pull one slice of the Douban movie chart through its JSON API and print
# the decoded payload.  Relies on `requests` imported in an earlier cell.
url = 'https://movie.douban.com/j/chart/top_list'
query = {
    'type': '5',
    'interval_id': '100:90',
    'action': '',
    'start': '60',
    'limit': '100',
}
print(requests.get(url=url, params=query).json())
# Scrape the CFDA licence site: page through the list endpoint to collect
# record IDs, then POST each ID to the detail endpoint and print the JSON.
import requests

url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

# Phase 1: collect the record IDs from pages 1..10 of the list API.
id_list = []
for page in range(1, 11):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    json_data = requests.post(url=url, data=data, headers=headers).json()
    for dic in json_data['list']:
        id_list.append(dic['ID'])  # 'ID' keys the record for the detail API

# Phase 2: fetch the detail record for each collected ID.
detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for record_id in id_list:  # renamed from `id`: avoid shadowing the builtin
    detail_data = {'id': record_id}
    detail_json = requests.post(url=detail_url, data=detail_data, headers=headers).json()
    print(detail_json)
# Two ways to save an image: requests (.content gives the raw bytes) and
# urllib.request.urlretrieve.  `headers` is reused from the cell above.
import urllib.request  # bare `import urllib` does NOT expose urllib.request

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551785494&di=d9329f74ebdc5bd6158447daf4d5a783&imgtype=jpg&er=1&src=http%3A%2F%2Fimg.biaoche.org%2F%3Fimg%3D03.imgmini.eastday.com%2Fmobile%2F20180616%2F0e1faa7f78e9c172db3c73d0cc1be192_wmk.jpeg'
img_data = requests.get(url=url, headers=headers).content
with open('./xiaohua.jpg', 'wb') as fp:
    fp.write(img_data)

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551191414490&di=8db3ee6e5b31215f03cf77e7deaa2077&imgtype=0&src=http%3A%2F%2F00imgmini.eastday.com%2Fmobile%2F20180918%2F20180918171154_1d58954e0491887b39e7122bdd1a9506_2.jpeg'
urllib.request.urlretrieve(url=url, filename='lulaoye.jpg')
# Demo of the two regex flags re.M and re.S.  The multiline string
# literals were reconstructed from the documented results; findall()
# outputs are bound to names so they can be inspected/tested.
import re

# --- re.M (MULTILINE): '^' anchors at the start of EVERY line, so one
# pattern can match several lines independently. ---
string = '''fall in love with you
i love you very much
i love she
i love her
'''
lines_starting_with_i = re.findall('^i.*', string, re.M)
# -> ['i love you very much', 'i love she', 'i love her']

# --- re.S (DOTALL): '.' also matches '\n', so '.*' swallows the whole
# text as ONE match (findall appends a final '' for the empty match at
# end-of-string). ---
string1 = """細思極恐
你的隊友在看書
你的敵人在磨刀
你的閨蜜在減肥
隔壁老王在練腰

"""
whole_text_match = re.findall('.*', string1, re.S)
# -> ['細思極恐\n你的隊友在看書\n你的敵人在磨刀\n你的閨蜜在減肥\n隔壁老王在練腰\n\n', '']
# Crawl qiushibaike's picture section: for each requested page, regex out
# the <img> URLs inside <div class="thumb"> blocks and download every
# image into ./qiutu/.
import os
import re
import urllib.request  # bare `import urllib` does NOT expose urllib.request

import requests

url = 'https://www.qiushibaike.com/pic/page/%d/?s=5170552'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

start_page = int(input('enter a start pageNum:'))
end_page = int(input('enter a end pageNum:'))
for page in range(start_page, end_page + 1):
    new_url = url % page  # the original's format() wrapper was a no-op
    page_text = requests.get(url=new_url, headers=headers).text
    # re.S lets .*? span the newlines between the div's tags and the <img>.
    img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', page_text, re.S)
    for img_url in img_url_list:
        img_url = 'https:' + img_url  # page uses protocol-relative //-URLs
        img_name = img_url.split('/')[-1]
        img_path = os.path.join('qiutu', img_name)  # portable path join
        urllib.request.urlretrieve(url=img_url, filename=img_path)
        print(img_path, '下載成功!')
print('over!!!')
# Download the full text of "Romance of the Three Kingdoms" from
# shicimingju: scrape the chapter list, then fetch each chapter page and
# append its title + body to sanguo.txt.
import requests  # original read 'mport requests' — a SyntaxError
from bs4 import BeautifulSoup

url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')

# `with` guarantees the file gets closed — the original called `fp.close`
# without parentheses, which never actually closed the handle.
with open('sanguo.txt', 'w', encoding='utf-8') as fp:
    for a in a_list:
        title = a.string  # .string: only the tag's own direct text
        detail_url = 'http://www.shicimingju.com' + a['href']
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        content = detail_soup.find('div', class_='chapter_content').text
        fp.write(title + '\n' + content)
        print(title, '下載完成')
print('over')