Python 使用協程爬取淘女郎並存到本地

# -*- coding: utf-8 -*-
from gevent import monkey
monkey.patch_all()
import urllib.request
from gevent.pool import Pool
import json
import datetime
import random
import time
import urllib.request
from urllib.request import Request
import urllib.parse
import os
import re
import requests
# Module-level state: the moment the script was started, and the pool of
# User-Agent header values that requests pick from at random.
starttime = datetime.datetime.now()

user_agent = [
    (
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; '
        'SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; '
        'Media Center PC 6.0; .NET4.0C; .NET4.0E)'
    ),
    (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    ),
]
def getMMData(url, currentPage=0, timeout=10):
    """Fetch one page of search results and return the list of records.

    POSTs the search form to *url*, decodes the response body as GBK, parses
    it as JSON and returns the value found under ``data -> searchDOList``.

    Args:
        url: Address of the search endpoint.
        currentPage: Page number sent in the ``currentPage`` form field.
        timeout: Seconds to wait on the network before giving up, so that a
            stalled server cannot block the calling worker indefinitely.

    Returns:
        The ``searchDOList`` value of the decoded JSON reply.

    Raises:
        OSError: if the request fails or times out (``urllib.error.URLError``
            is a subclass).
        ValueError: if the body is not valid GBK text or not valid JSON.
        KeyError: if the reply has no ``data`` / ``searchDOList`` entry.
    """
    form_fields = {
        'q': '',
        'viewFlag': 'A',
        'sortType': 'default',
        'searchStyle': '',
        'searchRegion': 'city:',
        'searchFansNum': '',
        'currentPage': currentPage,
        'pageSize': '100',
    }
    payload = urllib.parse.urlencode(form_fields).encode('utf-8')
    headers = {'User-Agent': random.choice(user_agent)}

    request = Request(url, data=payload, headers=headers)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()

    reply = json.loads(raw.decode('gbk'))
    return reply['data']['searchDOList']
# Build identifier strings
def getMMID(data):
    """Return one concatenated identifier string per search-result record.

    For every record the ``userId``, ``avatarUrl``, ``realName``, ``city``,
    ``height`` and ``weight`` values are joined, in that order, without a
    separator.  Each value is converted with ``str()`` first: ``userId`` is
    numeric in the fetched data, and adding it directly to the string fields
    raised ``TypeError``.

    Args:
        data: Iterable of mappings that contain the six keys listed above.

    Returns:
        A list of strings, one per record, in input order.

    Raises:
        KeyError: if a record lacks one of the six keys.
    """
    fields = ('userId', 'avatarUrl', 'realName', 'city', 'height', 'weight')
    return [''.join(str(record[key]) for key in fields) for record in data]
# Pattern searched for in each profile page: an 11-digit number whose prefix is
# 13x, 14[579], 15x (not 154), 17x (not 174/179) or 18x -- presumably a mobile
# phone number.  Compiled once here instead of once per record.
_PHONE_PATTERN = re.compile(r'(13\d|14[579]|15[^4\D]|17[^49\D]|18\d)\d{8}')


def _safe_dir_name(name):
    """Return *name* usable as a single path component (no separators)."""
    cleaned = re.sub(r'[\\/]', '_', name)
    return '_' if cleaned in ('', '.', '..') else cleaned


def _save_profile(folder, file_stem, record, homepage):
    """Write the record's basic details to ``<folder>/<file_stem>.txt``."""
    with open(os.path.join(folder, file_stem + ".txt"), "w", encoding="utf-8") as fh:
        fh.write("姓名:" + record['realName'] + "\n" + "城市: " + record['city'] + "\n" + "我的主頁:" + homepage + "\n"
                 + "身高:" + record['height'] + "\n" + "體重:" + record['weight'])


def _save_avatar(folder, avatar_url):
    """Download the avatar image into *folder* under a time-based file name."""
    with urllib.request.urlopen("http:" + avatar_url, timeout=10) as response:
        image = response.read()
    # Current time plus one random letter keeps names from colliding when
    # several images are saved within the same second.
    file_name = time.strftime('%H-%M-%S') + random.choice(
        'qwertyuiopasdfghjklzxcvbnm') + ".jpg"
    with open(os.path.join(folder, file_name), 'wb') as fh:
        fh.write(image)


def _save_contact(folder, homepage):
    """Fetch the profile page and store the phone-pattern search result."""
    profile_page = requests.get(homepage, timeout=10)
    tel = _PHONE_PATTERN.search(profile_page.text)
    with open(os.path.join(folder, "聯繫方式.txt"), "w", encoding="utf-8") as fh:
        # str(tel) is the textual form of the match object, or "None" when
        # nothing matched; the output format is kept as it was.
        fh.write("mathch內就是聯繫電話" + str(tel))


def getgirl(page):
    """Crawl one page of search results and save every record to disk.

    For each record returned by ``getMMData`` a directory named after the
    record's ``realName`` is created under the current working directory.  A
    text file with the basic details is written there, then the avatar image
    is downloaded and the profile page is searched for a phone-like number.

    A failure in the avatar/profile step of one record is reported and the
    loop moves on to the next record.  Any other failure is reported and ends
    this page, but does not propagate, so the remaining pages still run.

    Args:
        page: Page number to request from the search endpoint.

    Returns:
        None.
    """
    url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
    try:
        for record in getMMData(url, page):
            homepage = ("https://mm.taobao.com/self/aiShow.htm?spm=719.7763510.1998643336.2.2WOZLp&userId=%d" % record['userId'])
            print(record['realName'])

            # realName comes from the remote server, so strip path separators
            # before using it as a directory and file name.
            safe_name = _safe_dir_name(record['realName'])
            # os.getcwd() is the current working directory, which is not
            # necessarily the directory that holds this script.
            folder = os.path.join(os.getcwd(), safe_name)
            os.makedirs(folder, exist_ok=True)

            _save_profile(folder, safe_name, record, homepage)

            try:
                _save_avatar(folder, record['avatarUrl'])
                _save_contact(folder, homepage)
            except Exception as exc:
                # Best effort per record: report it and carry on.  Catching
                # Exception (not everything) lets Ctrl-C and greenlet control
                # exceptions through.
                print("拋出頁面%s" % homepage, exc)

    except Exception as exc:
        # Worker boundary: one broken page must not abort the whole pool.
        print("程序出現錯誤", exc)

    finally:
        print("如今跑到%d" % page)


def run(pool_size=2, first_page=1, last_page=45):
    """Crawl a range of result pages using a gevent pool.

    Calls ``getgirl`` once for every page number from *first_page* to
    *last_page* inclusive.  The defaults reproduce the previous hard-coded
    behaviour (pool of 2, pages 1 through 45).

    Args:
        pool_size: Size passed to ``gevent.pool.Pool``.
        first_page: First page number to crawl.
        last_page: Last page number to crawl (inclusive).

    Returns:
        None.
    """
    pool = Pool(pool_size)
    pool.map(getgirl, range(first_page, last_page + 1))

if __name__ == '__main__':
    # Start crawling only when executed as a script; run() used to sit outside
    # this guard, so merely importing the module kicked off the crawl.
    print("開始啓動程序")
    run()
相關文章
相關標籤/搜索