本文從 https://www.rankedftw.com/ladder/lotv/1v1/mmr/ 爬取了星際2天梯數據,並存儲到本地的MySQL數據庫中,共爬得32萬條數據。
"""Scrape the StarCraft II 1v1 ladder pages of rankedftw.com into a MySQL table.

Each ladder page is downloaded, its result table is parsed row by row, and
every row is inserted into the table named by ``tabel`` (from ``config``).
Pages are processed in parallel by a ``multiprocessing.Pool``.
"""
from multiprocessing import Pool

import pymysql.cursors
import requests
from bs4 import BeautifulSoup

# Presumably supplies host, user, password, db, port and tabel (the names used
# below that are not defined in this file) -- verify against config.py.
from config import *  # noqa: F401,F403

LADDER_URL = 'https://www.rankedftw.com/ladder/lotv/1v1/mmr/?offset={}'
ROWS_PER_PAGE = 100
PAGE_COUNT = 3254
REQUEST_TIMEOUT = 30  # seconds

# The connection is opened lazily, once per process (see _ensure_connection).
# Opening it at import time would make every forked pool worker share a single
# socket to the server, which interleaves their protocol traffic.
connection = None
cursor = None


def _ensure_connection():
    """Open this process's MySQL connection on first use and return (connection, cursor)."""
    global connection, cursor
    if connection is None:
        # Keyword arguments: recent PyMySQL releases no longer accept
        # positional connection parameters.
        connection = pymysql.connect(
            host=host,
            user=user,
            password=password,
            database=db,
            port=port,
        )
        cursor = connection.cursor()
    return connection, cursor


def get_html(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text


def save_to_db(values):
    """Insert one row into the table named by ``tabel``.

    The values are passed as query parameters so the driver escapes them;
    player names may contain quotes or backslashes. A failed insert is rolled
    back and reported on stdout rather than raised.

    Args:
        values: Column values, in the table's column order.

    Raises:
        pymysql.MySQLError: If the connection itself cannot be opened.
    """
    conn, cur = _ensure_connection()
    placeholders = ', '.join(['%s'] * len(values))
    # The table name cannot be a bound parameter; it comes from config.
    insert_ = 'INSERT INTO {} VALUES ({})'.format(tabel, placeholders)
    try:
        if cur.execute(insert_, tuple(values)):
            conn.commit()
    except pymysql.MySQLError:
        conn.rollback()
        print('插入失敗', values)


def _parse_row(row):
    """Extract one ladder entry from a table row as an ordered column -> value dict."""
    numbers = row.find_all('td', class_='number')
    images = row.find_all('td', class_='img')

    league_img = images[1].find('img', class_='league')
    if league_img is not None:
        # File name of the icon with its last 10 characters removed
        # (presumably a fixed suffix plus extension -- verify against the site).
        league = league_img.get('src').split('/')[-1][:-10]
    else:
        league = '0'

    return {
        'Rank': int(numbers[0].text),
        'Name': row.find('span', class_='name').text,
        'MMR': int(numbers[1].text),
        'Points': int(numbers[2].text),
        'Wins': int(numbers[3].text),
        'Losses': int(numbers[4].text),
        # Stored as a constant 0; the page value is not read.
        'Played': 0,
        'WinRate': numbers[6].text,
        'Age': numbers[7].text,
        'Region': images[0].find('img').get('src')[-12:-10],
        'League': league,
        'Tier': int(images[2].text),
        'Race': row.find('img', class_='race').get('src').split('/')[-1][:-10],
    }


def get_info(html):
    """Parse one ladder page and store every entry it contains.

    Rows that cannot be parsed are reported and skipped so that one malformed
    row does not discard the rest of the page.

    Args:
        html: Markup of a ladder page.

    Raises:
        AttributeError: If the page has no ``team-size-1`` table.
    """
    soup = BeautifulSoup(html, 'lxml')
    rows = soup.find('table', class_='team-size-1').find_all('tr')
    # The first row is skipped (the header row).
    for row in rows[1:]:
        try:
            record = _parse_row(row)
        except (AttributeError, IndexError, TypeError, ValueError) as exc:
            print('skipping unparseable row:', exc)
            continue
        save_to_db(list(record.values()))


def main(offset):
    """Fetch and store the ladder page with index *offset*.

    Any failure is printed instead of raised so that one bad page does not
    abort the whole pool run.
    """
    try:
        url = LADDER_URL.format(offset * ROWS_PER_PAGE)
        get_info(get_html(url))
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Worker connections are not closed explicitly: every insert is committed
    # on its own, so nothing is pending once map() has returned.
    with Pool() as pool_:
        pool_.map(main, range(PAGE_COUNT))
    if connection is not None:
        connection.close()