原創文章,歡迎轉載。轉載請註明:轉載自IT人故事會,謝謝!
原文鏈接地址:「docker實戰篇」python的docker爬蟲技術-python腳本app抓取(13)
上次已經分析出來具體的app的請求鏈接了,本次主要說說python的開發,抓取APP裏面的信息。源碼:github.com/limingios/d…
查看分析
解析出來的header
夜神配置
python代碼,爬取分類
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import requests
#header內容比較多,由於各個廠家的思路不一樣,
#fiddler爬取出來的字段比較多,有些內容應該是非必填的,只能在實際的時候嘗試註釋一些來試。
def handle_request(url, data, timeout=10):
    """POST form data to the API using the headers captured from the app.

    The header set is large because it was copied from captured traffic;
    some fields are probably optional, which can only be found by trying.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Request the flat recipe catalogue and print the raw response text."""
    catalog_url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    form_fields = dict(
        [
            ("client", "4"),
            ("_session", "1547000257341354730010002552"),
            ("v", "1503650468"),
            ("_vs", "0"),
        ]
    )
    print(handle_request(catalog_url, form_fields).text)
handle_index()
複製代碼
爬取詳情,信息通過分類找到裏面的詳情
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
#建立隊列
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST form data to the API using the headers captured from the app.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the recipe catalogue and queue one search payload per entry.

    The catalogue response is walked three levels deep
    (``result.cs`` -> ``cs`` -> ``cs``); for every third-level entry a
    form payload keyed on the entry's ``name`` is put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # Key was "_vs " (trailing space), which sent a form
                    # field the API does not use; the captured name is "_vs".
                    "_vs": "400",
                }
                queue_list.put(data_2)
handle_index()
print(queue_list.qsize())
複製代碼
分類菜譜內部的詳情信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
#建立隊列
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST form data to the API using the headers captured from the app.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the recipe catalogue and queue one search payload per entry.

    The catalogue response is walked three levels deep
    (``result.cs`` -> ``cs`` -> ``cs``); for every third-level entry a
    form payload keyed on the entry's ``name`` is put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # Key was "_vs " (trailing space), which sent a form
                    # field the API does not use; the captured name is "_vs".
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one queued keyword and print each type-13 hit.

    Args:
        data: Form payload taken from ``queue_list``; its ``keyword``
            field is echoed into every printed record as ``shicai``.
    """
    print("當前的食材:", data["keyword"])
    search_url = "http://api.douguo.net/recipe/s/0/20"
    search_result = json.loads(handle_request(search_url, data).text)
    for entry in search_result["result"]["list"]:
        # Only entries whose "type" is 13 are turned into records.
        if entry["type"] != 13:
            continue
        recipe = entry["r"]
        record = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        print(record)
handle_index()
handle_caipu_list(queue_list.get())
複製代碼
菜品內部的詳情信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
#建立隊列
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST form data to the API using the headers captured from the app.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the recipe catalogue and queue one search payload per entry.

    The catalogue response is walked three levels deep
    (``result.cs`` -> ``cs`` -> ``cs``); for every third-level entry a
    form payload keyed on the entry's ``name`` is put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # Key was "_vs " (trailing space), which sent a form
                    # field the API does not use; the captured name is "_vs".
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each hit's detail, print it.

    For every search result whose ``type`` is 13 a record is built from
    the list entry, enriched with ``tips`` and ``cookstep`` from the
    detail endpoint, and printed as JSON.

    Args:
        data: Form payload taken from ``queue_list``; must contain
            ``keyword``.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes quotes/backslashes in the keyword so "ext"
        # stays valid JSON; ensure_ascii=False leaves non-ASCII text as-is,
        # so ordinary keywords produce exactly the string built before.
        ext = (
            '{"query": {"kw": '
            + json.dumps(data["keyword"], ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(caipu_info["shicai_id"])
            + '}}'
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        print(json.dumps(caipu_info))
handle_index()
handle_caipu_list(queue_list.get())
複製代碼
vagrant up
複製代碼
ip 192.168.66.100
su -
#密碼:vagrant
docker
複製代碼
hub.docker.com/r/bitnami/m…
默認端口:27017
docker pull bitnami/mongodb:latest
複製代碼
mkdir bitnami
cd bitnami
mkdir mongodb
docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest
#關閉防火牆
systemctl stop firewalld
複製代碼
用第三方工具連接
連接mongodb的工具
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/11 0:53
# @Author : liming
# @Site :
# @File : handle_mongo.py
# @url : idig8.com
# @Software: PyCharm
import pymongo
from pymongo.collection import Collection
class Connect_mongo(object):
    """Small wrapper around the MongoDB database that stores scraped items."""

    def __init__(self):
        # Host/port of the MongoDB instance (27017 is MongoDB's default port).
        self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Insert one document into the ``dou_guo_mei_shi_item`` collection.

        Args:
            item: Dict to store.
        """
        db_collection = self.db_data['dou_guo_mei_shi_item']
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        db_collection.insert_one(item)
# 暴露出來
mongo_info = Connect_mongo()
複製代碼
python爬取的數據通過mongo的工具保存到centos7的docker鏡像中
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
#建立隊列
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST form data to the API using the headers captured from the app.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the recipe catalogue and queue one search payload per entry.

    The catalogue response is walked three levels deep
    (``result.cs`` -> ``cs`` -> ``cs``); for every third-level entry a
    form payload keyed on the entry's ``name`` is put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # Key was "_vs " (trailing space), which sent a form
                    # field the API does not use; the captured name is "_vs".
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each hit's detail, store it.

    For every search result whose ``type`` is 13 a record is built from
    the list entry, enriched with ``tips`` and ``cookstep`` from the
    detail endpoint, and handed to ``mongo_info.insert_item``.

    Args:
        data: Form payload taken from ``queue_list``; must contain
            ``keyword``.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes quotes/backslashes in the keyword so "ext"
        # stays valid JSON; ensure_ascii=False leaves non-ASCII text as-is,
        # so ordinary keywords produce exactly the string built before.
        ext = (
            '{"query": {"kw": '
            + json.dumps(data["keyword"], ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(caipu_info["shicai_id"])
            + '}}'
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        mongo_info.insert_item(caipu_info)
handle_index()
handle_caipu_list(queue_list.get())
複製代碼
引用線程池
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
#建立隊列
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST form data to the API using the headers captured from the app.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection, which
            would also tie up the calling worker thread indefinitely.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the recipe catalogue and queue one search payload per entry.

    The catalogue response is walked three levels deep
    (``result.cs`` -> ``cs`` -> ``cs``); for every third-level entry a
    form payload keyed on the entry's ``name`` is put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # Key was "_vs " (trailing space), which sent a form
                    # field the API does not use; the captured name is "_vs".
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each hit's detail, store it.

    For every search result whose ``type`` is 13 a record is built from
    the list entry, enriched with ``tips`` and ``cookstep`` from the
    detail endpoint, and handed to ``mongo_info.insert_item``.

    Args:
        data: Form payload taken from ``queue_list``; must contain
            ``keyword``.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes quotes/backslashes in the keyword so "ext"
        # stays valid JSON; ensure_ascii=False leaves non-ASCII text as-is,
        # so ordinary keywords produce exactly the string built before.
        ext = (
            '{"query": {"kw": '
            + json.dumps(data["keyword"], ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(caipu_info["shicai_id"])
            + '}}'
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        mongo_info.insert_item(caipu_info)
# Fill the queue with search payloads, then process them on a thread pool.
handle_index()
# The with-block waits for all submitted jobs and shuts the pool down.
with ThreadPoolExecutor(max_workers=20) as pool:
    futures = []
    while queue_list.qsize() > 0:
        futures.append(pool.submit(handle_caipu_list, queue_list.get()))
    # An exception raised in a worker is stored on its future and never
    # shown unless asked for; report failures instead of losing them.
    for future in futures:
        error = future.exception()
        if error is not None:
            print("handle_caipu_list failed:", repr(error))
複製代碼
當app運維人員發現我們一直在請求他們的服務器,很可能就把我們的ip給封了,所以要通過代理ip的方式隱藏自己。
一個小時1元,我申請了一個小時,我們一起使用下。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/11 2:40
# @Author : Aries
# @Site :
# @File : handle_proxy.py
# @Software: PyCharm
#60.17.177.187 代理出來的ip
import requests
# Presumably an endpoint that echoes the caller's public IP, used here to
# confirm that traffic really leaves through the proxy.
url = 'http://ip.hahado.cn/ip'
# NOTE(review): the proxy username/password are hard-coded (and published
# with this script); rotate them and load them from configuration instead.
proxy = {'http':'http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030'}
# A timeout keeps the check from hanging forever if the proxy is down.
response = requests.get(url=url, proxies=proxy, timeout=10)
print(response.text)
複製代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
#建立隊列
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST form data to the API through the HTTP proxy.

    Uses the headers captured from the app and routes the request through
    the configured proxy so the target does not see the local IP.

    Args:
        url: Endpoint to post to.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection or an
            unresponsive proxy.

    Returns:
        The ``requests.Response`` for the call.

    Raises:
        requests.exceptions.Timeout: If no answer arrives in time.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    # NOTE(review): the proxy username/password are hard-coded (and published
    # with this script); rotate them and load them from configuration instead.
    proxy = {'http': 'http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030'}
    response = requests.post(
        url=url, headers=header, data=data, proxies=proxy, timeout=timeout
    )
    return response
def handle_index():
    """Fetch the recipe catalogue and queue one search payload per entry.

    The catalogue response is walked three levels deep
    (``result.cs`` -> ``cs`` -> ``cs``); for every third-level entry a
    form payload keyed on the entry's ``name`` is put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # Key was "_vs " (trailing space), which sent a form
                    # field the API does not use; the captured name is "_vs".
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each hit's detail, store it.

    For every search result whose ``type`` is 13 a record is built from
    the list entry, enriched with ``tips`` and ``cookstep`` from the
    detail endpoint, and handed to ``mongo_info.insert_item``.

    Args:
        data: Form payload taken from ``queue_list``; must contain
            ``keyword``.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes quotes/backslashes in the keyword so "ext"
        # stays valid JSON; ensure_ascii=False leaves non-ASCII text as-is,
        # so ordinary keywords produce exactly the string built before.
        ext = (
            '{"query": {"kw": '
            + json.dumps(data["keyword"], ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(caipu_info["shicai_id"])
            + '}}'
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        caipu_info["tips"] = detail_reponse_dic["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail_reponse_dic["result"]["recipe"]["cookstep"]
        mongo_info.insert_item(caipu_info)
# Fill the queue with search payloads, then process them on a thread pool.
handle_index()
# The with-block waits for all submitted jobs and shuts the pool down.
with ThreadPoolExecutor(max_workers=2) as pool:
    futures = []
    while queue_list.qsize() > 0:
        futures.append(pool.submit(handle_caipu_list, queue_list.get()))
    # An exception raised in a worker is stored on its future and never
    # shown unless asked for; report failures instead of losing them.
    for future in futures:
        error = future.exception()
        if error is not None:
            print("handle_caipu_list failed:", repr(error))
複製代碼
PS:本次是app數據抓取的入門。首先是通過模擬器的代理服務,連到本地的電腦(安裝fiddler),這樣fiddler就可以抓取數據了。分析數據這塊要憑藉自己的經驗找到對應的url,如果能分析到url,基本上爬蟲就寫完一半了。然後封裝請求頭,請求頭是通過fiddler獲取的,裏面header內容比較多,可以嘗試刪除到最簡化,這也是應對反爬蟲的一種策略:有的數據放進去反而容易被發現是爬蟲,例如cookies等等,但是有的爬蟲爬取數據又需要cookies。通過代理的方式設置代理ip,防止爬取過程中同一個ip一直請求一個接口而被發現是爬蟲。引入隊列的目的是爲了使用線程池的時候方便提取任務。然後把數據放入mongodb中。這樣使用多線程抓取app數據就完成了。