import re
import time
import random

import requests
from bs4 import BeautifulSoup  # the "lxml" parser used below requires lxml to be installed
import pandas
import pymysql.cursors
from selenium import webdriver

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='123',
                             db='asd',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# HTTP request headers. The original passed these as query params, where the
# server ignores them; they must go in headers= for the cookie to take effect.
headers = {
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
    "Host": "tanggulake.tmall.com",
    "Referer": "https://tanggulake.tmall.com/search.htm?spm=a220o.1000855.w5002-15900729481.1.b3kpys&search=y",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# Load the list of competitor shops (name + listing-page URL) to crawl.
with connection.cursor() as cursor:
    sql = "SELECT * FROM 競店"
    cursor.execute(sql)
    q = cursor.fetchall()
connection.commit()


def id_s():
    """Scrape every listing page of each shop and insert items not yet stored."""
    for dress in q:
        url = dress["地址"]
        # Page 1 carries the page counter (e.g. "1/7"); take the total.
        url_re = requests.get(url + "1", headers=headers)
        soup = BeautifulSoup(url_re.text, "lxml")
        pig = soup.select("div > div > div > div > span:nth-of-type(1)")
        get_pig = pig[2].text.split("/")[1]  # total number of pages
        print(get_pig)

        ids = []
        pigg = []
        dates1 = []
        for pij in range(1, int(get_pig) + 1):
            time.sleep(random.randrange(1, 5))  # throttle between page requests
            ur1 = dress["地址"] + str(pij)
            url_re1 = requests.get(ur1, headers=headers)
            soup = BeautifulSoup(url_re1.text, "lxml")

            # Item ids live in the data-id attribute of each item <dl>.
            for spid in soup.select("div > div > div > dl"):
                data_id = spid.get("data-id")
                if data_id:
                    ids.append(re.sub(r"\D", "", data_id))

            # Item images: normalise protocol-relative src values to https.
            for imgasd in soup.select("img"):
                w = imgasd.get("src")
                p = re.match(r".*//(.*?\.jpg)", w) if w else None
                if p:
                    pigg.append("https://" + p.group(1))

            # Title and price: stripped_strings drops whitespace-only strings;
            # filter out the bare currency sign so b[0] is the title and b[2]
            # the price.
            for i in soup.select("dl"):
                c = list(i.stripped_strings)
                b = [elem for elem in c if elem != '¥']
                dates1.append([b[0], b[2]])

        shuju1 = pandas.DataFrame(dates1).rename(columns={0: "標題", 1: "價格"})
        shuju2 = pandas.DataFrame(pigg).rename(columns={0: "圖片連接"})
        shuju3 = pandas.DataFrame(ids).rename(columns={0: "id"})
        result = pandas.concat([shuju1, shuju2, shuju3], axis=1)

        with connection.cursor() as cursor:
            # Fetch the ids already stored for this shop so only new items
            # are inserted.
            sql = "SELECT id FROM " + dress["店鋪名稱"]
            cursor.execute(sql)
            existing_ids = [row["id"] for row in cursor.fetchall()]
            for w in result.values:
                # The original tested `w[3] not in q` (the shop list), which is
                # always true; comparing against existing_ids is the intended
                # duplicate check.
                if w[3] not in existing_ids:
                    sql = ("INSERT INTO " + dress["店鋪名稱"]
                           + " (`id`, 圖片連接, 價格, 標題) VALUES (%s, %s, %s, %s)")
                    cursor.execute(sql, (w[3], w[2], w[1], w[0]))
        # The connection is not autocommit by default, so commit to save changes.
        connection.commit()


id_s()

# PhantomJS renders each item page (requires the phantomjs binary on PATH).
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true',
                                           '--load-images=false'])

with connection.cursor() as cursor:
    for shop in q:
        sql = "SELECT id FROM " + shop["店鋪名稱"]
        cursor.execute(sql)
        # The original reassigned `q` here, clobbering the shop list while
        # iterating over it; use a separate variable instead.
        rows = cursor.fetchall()
        ids = [row["id"] for row in rows]
        for ids_find in ids:
            id_dress = "http://item.taobao.com/item.htm?id=" + str(ids_find)
            driver.get(id_dress)
            time.sleep(10)  # wait for the page to render
            soup = BeautifulSoup(driver.page_source, "lxml")

            # The colour is listed as either 顏色分類 or 主要顏色 in the
            # attribute list; pick whichever group matched.
            matches = re.findall(r'<li title="(.*)">顏色分類|li title="(.*)">主要顏色',
                                 str(soup.select(".attributes-list")))
            color = [c for c in matches[0] if c] if matches else []
            leimu = soup.select(".tb-pine")[0].get("data-catid")  # category id
            # Chinese characters of the page title (computed but never written back).
            title = "".join(re.findall(r"[\u4e00-\u9fa5]", driver.title))

            with connection.cursor() as cursors:
                # Parameterised values avoid the quoting and injection problems
                # of the original string-formatted UPDATE; the table name still
                # has to be interpolated, since identifiers cannot be bound.
                sql = ("UPDATE " + shop["店鋪名稱"]
                       + " SET 顏色=%s, 類目=%s, 商品地址=%s WHERE id=%s")
                print(sql)
                cursors.execute(sql, (color[0] if color else "", leimu,
                                      id_dress, ids_find))
                connection.commit()

driver.quit()
connection.commit()
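
# --- Schema sketch (assumption) ---------------------------------------------
# The script assumes a 競店 table listing each shop, plus one item table per
# shop named after its 店鋪名稱 value; neither definition appears above. The
# helper below is a minimal, hypothetical reconstruction inferred from the
# columns the script reads and writes (地址/店鋪名稱, and id/標題/價格/
# 圖片連接/顏色/類目/商品地址) — it is not the author's actual DDL, and the
# column types are guesses.
def create_tables(shop_name):
    with connection.cursor() as cursor:
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS 競店 ("
            " 店鋪名稱 VARCHAR(64) NOT NULL,"
            " 地址 VARCHAR(255) NOT NULL"
            ") DEFAULT CHARSET=utf8mb4")
        # One item table per shop; shop_name must come from trusted config,
        # since table names cannot be passed as query parameters.
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS " + shop_name + " ("
            " id VARCHAR(32) PRIMARY KEY,"
            " 標題 VARCHAR(255),"
            " 價格 VARCHAR(32),"
            " 圖片連接 VARCHAR(255),"
            " 顏色 VARCHAR(64),"
            " 類目 VARCHAR(32),"
            " 商品地址 VARCHAR(255)"
            ") DEFAULT CHARSET=utf8mb4")
    connection.commit()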