AJAX 指異步 JavaScript 及 XML(Asynchronous JavaScript And XML)。
AJAX 不是一門編程語言,而是利用 JavaScript 在保證頁面不被刷新、頁面鏈接不改變的情況下,與服務器交換數據並更新部分網頁的技術。
打開今日頭條,並搜索「街拍」。
網址:https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D
切換到 Network --> XHR,能夠看到紅筆圈出的:x-requested-with: XMLHttpRequest
切換到 Preview,從 data 中得到圖片地址等信息,每條都是一一對應的。
再切換到 Headers,能夠看到請求地址的各個參數。
不斷往下拉,發現變化的參數只有 offset,每次加載一次加十。
一、get_page(offset) 請求頁面函數:發送請求,將得到的 response 利用 json() 方法轉化爲 JSON 格式,而後返回。
二、get_image(json) 解析函數:提取每一張圖片鏈接,將圖片鏈接和圖片所屬的標題一起返回,此時能夠構造一個生成器。
三、save_image(item)保存文件函數:根據item的title建立文件夾,將得到的圖片以二進制的形式寫入文件,圖片文件名用MD5值,能夠去重
四、主函數:構造一個 offset 數組,遍歷 offset,提取圖片鏈接,並將其下載即可。
import os
import re
from hashlib import md5
from multiprocessing.pool import Pool  # used by the script entry point below
from urllib.parse import urlencode

# Characters that cannot (or should not) appear in a single directory name:
# path separators, Windows-reserved punctuation and ASCII control characters.
_UNSAFE_DIR_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def get_page(offset):
    """Request one page of Toutiao search results for the keyword '街拍'.

    Args:
        offset: Value sent as the ``offset`` query parameter (pagination).

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` on any other status, on a network error, or when the
        body is not valid JSON.
    """
    # Imported here so the pure parsing helper (get_image) stays importable
    # on machines where the third-party ``requests`` package is missing.
    import requests

    headers = {
        'x-requested-with': 'XMLHttpRequest',
        # The header name must be exactly 'user-agent'; a key with spaces
        # is sent as an unrelated header and the default UA is used instead.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        # NOTE(review): hard-coded cookie captured from a browser session;
        # presumably it expires and must be refreshed -- confirm.
        'cookie': '__tasessionId=ma7hh1k0i1564919983167; csrftoken=a2318e2879cb7daac5e3169ac6731316; tt_webid=6721280117165032963; UM_distinctid=16c5c7fee055e9-09bb3e770de9ee-7a1437-144000-16c5c7fee0685f; CNZZDATA1259612802=745403741-1564916538-%7C1564916538; s_v_web_id=7a3ff2239d94842e143caf217478bc62',
    }
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    base_url = 'https://www.toutiao.com/api/search/content/?'
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts;
        # ValueError covers a body that is not valid JSON.
        return None
    return None


def get_image(json):
    """Yield one record per image found in a search-result page.

    Args:
        json: Decoded response as returned by ``get_page``; may be ``None``.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``. Entries
        without an ``image_list`` contribute nothing.
    """
    if not json:
        return
    for entry in json.get('data') or []:
        title = entry.get('title')
        # 'image_list' is absent (or null) on some result entries.
        for image in entry.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }


def _safe_dirname(title):
    """Turn an article title into a name usable as a single directory."""
    cleaned = _UNSAFE_DIR_CHARS.sub('_', str(title or '')).strip(' .')
    return cleaned or 'untitled'


def save_image(item):
    """Download one image into a folder named after its article title.

    The file is named after the MD5 of its content, so downloading the same
    picture twice does not create a duplicate.

    Args:
        item: Dict with the keys ``'image'`` (URL) and ``'title'``, as
            yielded by ``get_image``.
    """
    import requests

    folder = _safe_dirname(item.get('title'))
    try:
        # exist_ok avoids a crash when two pool workers create the same
        # folder at the same time.
        os.makedirs(folder, exist_ok=True)
        response = requests.get(item.get('image'), timeout=_REQUEST_TIMEOUT)
        if response.status_code == 200:
            file_name = '{0}.{1}'.format(md5(response.content).hexdigest(), 'jpg')
            file_path = os.path.join(folder, file_name)
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except (requests.RequestException, OSError) as e:
        # Best effort: one failed image must not stop the rest of the page.
        print('Failed to Save Image', e)

def main(offset):
    """Download every image listed on one search-result page.

    Args:
        offset: Pagination offset passed to ``get_page``.
    """
    page = get_page(offset)
    # get_page may return None (failed request or non-200 status); skip the
    # page instead of letting get_image raise inside a pool worker.
    if not page:
        return
    for item in get_image(page):
        print(item)
        save_image(item)


# Range of page groups to crawl; each group n is requested at offset n * 20.
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()