Case 1: HTML
Focus: front end
1. Use bs4 to crawl the page
2. Write the data to a local txt file
from bs4 import BeautifulSoup
import requests

url = "http://maoyan.com/board"
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
rsq = requests.get(url=url, headers=header).text
soup = BeautifulSoup(rsq, "lxml")

# All the information sits inside <dd></dd> tags, so extract those first
items = soup.select('dd')

# Create the local txt file
with open("D://maoyan.txt", "w", encoding="utf-8") as f:
    for item in items:
        # Extract the title
        title = item.select('p a[data-act="boarditem-click"]')[0].get_text()
        # Extract the cast
        star = item.select('p[class="star"]')[0].get_text().replace("\n", "").strip(" ")
        # Extract the score
        score = item.select('p[class="score"]')[0].get_text().strip('\n').strip(' ')
        # Extract the release time
        releasetime = item.select('p[class="releasetime"]')[0].get_text()
        # Combine the fields into one record
        datas = title + " " + releasetime + " " + star + " " + score + "\n"
        print(datas)
        # Write each record to the local file inside the for loop
        f.write(datas)
# The with block closes the file automatically, so no f.close() is needed
print("Successful")
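The selectors above are easier to check offline before hitting the site. The following is a minimal sketch that runs the same select() calls against a made-up <dd> fragment shaped like one Maoyan board entry; the markup, title and values are assumptions for illustration, not the real page.

from bs4 import BeautifulSoup

# Hypothetical fragment that mimics one <dd> entry of the board (assumed markup)
snippet = """
<dd>
  <p><a data-act="boarditem-click">Some Movie Title</a></p>
  <p class="star">Stars: Actor A, Actor B</p>
  <p class="releasetime">Release time: 2019-10-01</p>
  <p class="score">9.5</p>
</dd>
"""

item = BeautifulSoup(snippet, "lxml").select('dd')[0]

# Same selectors as the crawler above, applied to the local snippet
print(item.select('p a[data-act="boarditem-click"]')[0].get_text())  # title
print(item.select('p[class="star"]')[0].get_text().strip())          # cast
print(item.select('p[class="score"]')[0].get_text().strip())         # score
print(item.select('p[class="releasetime"]')[0].get_text())           # release time

Running it prints the four fields in the order the crawler writes them, which makes it easy to test a selector change without re-downloading the page.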
Optimized version: SQL
Focus: database
1. Connect to the database and create the table
2. Crawl the page online and write into the database table
# The request below needs to be made with requests; do not use "from urllib import request"
from bs4 import BeautifulSoup
import requests
import time
import pymysql

# Record the start time
start_time = time.time()

print("Trying to connect to the server")
try:
    # Connect to the server
    connect = pymysql.connect(host='host address', user='user', passwd='password', db='database name', port=3306)
    # Create a cursor to operate on the data
    cursor = connect.cursor()
    # Drop any existing table with the same name
    cursor.execute('DROP TABLE IF EXISTS maoyan_datas')
    # Create the table with an SQL statement
    sql1 = """
    create table maoyan_datas(
        title CHAR(100),
        star CHAR(200),
        score CHAR(50),
        releasetime CHAR(200)
    )"""
    # Execute the SQL statement above
    cursor.execute(sql1)
    # Commit the change
    connect.commit()
    print("Connected to the database and created the table")
except Exception as e:
    print("Failed to connect to the database and create the table: " + str(e))

url = "http://maoyan.com/board"
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
html = requests.get(url=url, headers=header).text
# Parse the page
soup = BeautifulSoup(html, "lxml")
# Extract the <dd></dd> part of the page structure, since it holds all the information
items = soup.select('dd')
print("Fetched the site data")

sql2 = 'insert into maoyan_datas(title,star,score,releasetime) values(%s,%s,%s,%s)'
i = 0
for item in items:
    # Extract the title
    title = item.select('p a[data-act="boarditem-click"]')[0].get_text()
    # Extract the cast
    star = item.select('p[class="star"]')[0].get_text().replace("\n", "").strip(" ")
    # Extract the score
    score = item.select('p[class="score"]')[0].get_text().strip('\n').strip(' ')
    # Extract the release time
    releasetime = item.select('p[class="releasetime"]')[0].get_text()
    # Assemble the row
    all = [title, str(star), str(score), str(releasetime)]
    # Print the movie record just fetched
    print(all)
    # Write the movie record into the database
    cursor.execute(sql2, all)
    i = i + 1
    print("Wrote %s rows" % i)

# Commit the inserts so they are actually persisted, then close the connection
connect.commit()
connect.close()
print("done, time taken: %f s" % (time.time() - start_time))
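Since this version is presented as the optimized one, a further optional step in the same direction is to batch the inserts: collect the rows in a list and send them with cursor.executemany() in one call, then read a few rows back to confirm the write. The sketch below assumes the maoyan_datas table created above; the connection parameters are placeholders and the sample rows are made up.

import pymysql

# Placeholder credentials, same schema as maoyan_datas above
connect = pymysql.connect(host='host address', user='user', passwd='password', db='database name', port=3306)
cursor = connect.cursor()

sql = 'insert into maoyan_datas(title,star,score,releasetime) values(%s,%s,%s,%s)'
# In the crawler this list would be filled inside the for loop, e.g. rows.append([title, star, score, releasetime])
rows = [
    ["Sample Title A", "Stars: Actor A", "9.5", "Release time: 2019-10-01"],
    ["Sample Title B", "Stars: Actor B", "9.4", "Release time: 2019-10-02"],
]

# One executemany call instead of one execute per row
cursor.executemany(sql, rows)
connect.commit()

# Read a few rows back to confirm the data landed in the table
cursor.execute('select title, score from maoyan_datas limit 3')
for title, score in cursor.fetchall():
    print(title, score)

connect.close()

executemany lets the driver group the rows into fewer round trips to the server, which tends to matter more as the number of inserted rows grows.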