"""Crawl hotel listings from hotel.elong.com and store one CSV file per city.

Input:  ``elong.json``     - one ``city_code,city_name,city_letter`` per line.
State:  ``has_elong.json`` - one city code per line for every city already
                             started, so that a re-run skips them.
Output: ``藝龍/<city_name>.csv``.
"""
import csv
import logging
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from threadpool import ThreadPool, makeRequests

logger = logging.getLogger(__name__)

# Seconds to wait for the server before giving up on a single HTTP request.
REQUEST_TIMEOUT = 30
# Upper bound on the number of hotels taken from one listing page.
MAX_HOTELS_PER_PAGE = 25
# Directory that receives the per-city CSV files.
OUTPUT_DIR = '藝龍'


def _build_list_params(city_code, city_name, page):
    """Return the form fields sent with the listing request for one page."""
    return {
        "code": "7140144",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": "1",
        "listRequest.cardNo": "192928",
        "listRequest.checkInDate": "2019-03-02 00:00:00",  # check-in time
        "listRequest.checkOutDate": "2019-03-03 00:00:00",  # check-out time
        "listRequest.cityID": city_code,
        "listRequest.cityName": city_name,  # e.g. Beijing
        "listRequest.customLevel": "11",
        "listRequest.distance": "20",
        "listRequest.endLat": "0",
        "listRequest.endLng": "0",
        "listRequest.facilityIds": "",
        "listRequest.highPrice": "0",
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": "0",
        "listRequest.language": "cn",
        "listRequest.listType": "0",
        "listRequest.lowPrice": "0",
        "listRequest.orderFromID": "50",
        "listRequest.pageIndex": page,  # pagination
        "listRequest.pageSize": "20",
        "listRequest.payMethod": "0",
        "listRequest.personOfRoom": "0",
        "listRequest.poiId": "0",
        "listRequest.promotionChannelCode": "0000",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": "0",
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": "1",
        "listRequest.seoHotelStar": "0",
        "listRequest.sortDirection": "1",
        "listRequest.sortMethod": "1",
        "listRequest.starLevels": "",
        "listRequest.startLat": "0",
        "listRequest.startLng": "0",
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        "listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
        "listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61"
    }


def _build_headers(city_letter, user_agent):
    """Return the HTTP headers sent with the listing request."""
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Content-Length': '1599',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie':'……61b8-48a1-b398-8b9ec1903f05……',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://hotel.elong.com/%s/' % city_letter,
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest'
    }


def request_url(city_code, city_name, city_letter):
    """Request the listing pages of one city and write its hotels to a CSV.

    Args:
        city_code: City id as a string (as read from ``elong.json``).
        city_name: City name; used in the request and as the CSV file name.
        city_letter: URL path segment of the city on hotel.elong.com.

    The city code is appended to ``has_elong.json`` *before* crawling starts,
    so a city counts as handled even when some of its pages fail.
    """
    with open('has_elong.json', 'a+', encoding='utf-8') as hs:
        hs.write(city_code + '\n')

    # Codes below 1000 are sent with a leading zero.
    if city_code and int(city_code) < 1000:
        city_code = '0' + str(city_code)
    else:
        city_code = str(city_code)

    # Several worker threads may get here at once; exist_ok makes that safe.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # newline='' is what the csv module expects; without it every row is
    # followed by a blank line on Windows.
    with open('藝龍/%s.csv' % city_name, 'w+', encoding='utf-8-sig',
              newline='') as f:
        cs = csv.writer(f, dialect='excel')
        # [hotel name, price, address, star level, theme, services, hotel info]
        cs.writerow(['酒店名稱', '價格', '地址', '星級', '主題', '可供服務', '酒店信息'])

        # One generator per city; a fresh random agent is drawn for each page.
        agent_source = UserAgent(verify_ssl=False)
        url = 'http://hotel.elong.com/%s/' % city_letter

        # Pages 1 through 88 (the end of range() is exclusive).
        for n in range(1, 89):
            data = _build_list_params(city_code, city_name, n)
            headers = _build_headers(city_letter, agent_source.random)
            try:
                # Random pause so the requests are not evenly spaced.
                time.sleep(random.randint(1, 4))
                res = requests.get(url, data=data, headers=headers,
                                   timeout=REQUEST_TIMEOUT)
                for row in get_info_and_req_details(res.text):
                    cs.writerow(row)
            except Exception:
                # Best effort: a failed page must not abort the whole city,
                # but it should not disappear without a trace either.
                logger.warning('Skipping page %s of %s', n, city_name,
                               exc_info=True)
                continue


def _parse_hotel(hotel):
    """Turn one listing element into a CSV row.

    Returns ``None`` when one of the required fields (name, price, address)
    cannot be found in the element.
    """
    try:
        hotel_name = hotel.find('div', attrs={'class': 'h_info_pic'}).find('img').get('alt')
        hotel_price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
        hotel_add = (hotel.find('p', attrs={'class': 'h_info_b2'}).find('a')
                     .get_text().replace('[', '').replace(']', ''))
        hotel_ress = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')
    except (AttributeError, TypeError):
        # A missing node makes find() return None; such entries are skipped.
        return None

    # Star level; entries without a star icon get the fallback label.
    stars = hotel.find('b', attrs={'class': 'icon_stars'})
    hotel_grade = stars.get('title') if stars is not None else '經濟型'

    # Theme tags, one per line in the page, joined with commas.
    tags = hotel.find('div', attrs={'class': 'tagList'})
    hotel_theme = tags.get_text().replace('\n', ',') if tags is not None else ''

    # Detail page: optional, any failure just leaves the two fields empty.
    try:
        hotel_link = hotel.find('div', attrs={'class': 'h_info_pic'}).find('a').get('href')
        time.sleep(random.randint(1, 3))
        detail_html = requests.get('http://hotel.elong.com%s#hotelContent' % hotel_link,
                                   timeout=REQUEST_TIMEOUT)
        server, hotel_info = get_details(detail_html.text)
    except Exception:
        logger.debug('No detail page for %r', hotel_name, exc_info=True)
        server = ''
        hotel_info = ''

    return [hotel_name, hotel_price, str(hotel_add) + str(hotel_ress),
            hotel_grade, hotel_theme, server, hotel_info]


def get_info_and_req_details(html):
    """Clean the data of one listing page and request each hotel's detail page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of rows, each
        ``[hotel name, price, address, star level, theme, services, hotel info]``.
        At most ``MAX_HOTELS_PER_PAGE`` rows are returned.
    """
    bs = BeautifulSoup(html, "lxml")
    page_list = []
    for hotel in bs.find_all('div', attrs={'class': 'h_item'}):
        if len(page_list) >= MAX_HOTELS_PER_PAGE:
            break
        row = _parse_hotel(hotel)
        if row is not None:
            page_list.append(row)
    return page_list


def get_details(detail_html):
    """Clean the data of a hotel detail page.

    Args:
        detail_html: HTML text of the detail page.

    Returns:
        ``(server, hotel_info)`` - the text of the service icon list and of
        the info section with line breaks (and, for the info, tabs) replaced
        by commas. A field whose node is missing is returned as ``''``; when
        the service list is missing both fields are ``''``.
    """
    detail = BeautifulSoup(detail_html, 'lxml')
    server = ''
    hotel_info = ''
    try:
        server = detail.find('ul', attrs={'class': 'dview_icon_list'}).get_text().replace('\n', ',')
        hotel_info = (detail.find('div', attrs={'class': 'dview_info'})
                      .get_text().replace('\n', ',').replace('\t', ','))
    except AttributeError:
        # find() returned None; keep whatever was extracted so far.
        pass
    return server, hotel_info


def _load_finished_codes(path):
    """Return the set of city codes recorded in *path* (empty if no file yet)."""
    finished = set()
    try:
        with open(path, encoding='utf-8') as fh:
            for raw in fh:
                raw = raw.strip()
                if raw:
                    finished.add(int(raw))
    except FileNotFoundError:
        # First run: nothing has been crawled yet.
        pass
    return finished


def main():
    """Queue every city of ``elong.json`` that is not yet recorded as done."""
    # For how the city list is obtained, refer to the Ctrip hotel-info crawler.
    # Read the progress file once; a set makes the membership test O(1).
    has_num = _load_finished_codes('has_elong.json')

    req_list = []
    with open('elong.json', encoding='utf-8') as cities:
        for line in cities:
            line = line.replace("\n", "")
            if not line.strip():
                continue
            line_list = line.split(',')
            if int(line_list[0]) in has_num:
                continue
            # request_url(line_list[0], line_list[1], line_list[2])
            # threadpool convention: (positional args, keyword args).
            req_list.append((line_list, None))

    pool = ThreadPool(3)
    for req in makeRequests(request_url, req_list):
        pool.putRequest(req)
    pool.wait()


if __name__ == '__main__':
    main()