Python3爬取影片入庫
1、服務器說明
[root@openshift maoyan]# cat /etc/redhat-release
CentOS Linux release 7.4.1708 (Core)
[root@openshift maoyan]# python -V
Python 3.6.3 :: Anaconda, Inc.
2、爬取電影入庫數據庫
首頁頁面分地址分析
子頁面數據獲取,四個字段的數據寫入
3、mysql數據庫鏈接
import pymysql
pymysql.install_as_MySQLdb()
class Sql(object):
    """Holder for the MySQL connection used by the scraper.

    The connection is opened once, when the class body is evaluated, and is
    exposed as the class attribute ``conn``.
    """

    # NOTE(review): host and credentials are hard-coded; consider loading
    # them from configuration or the environment instead.
    conn = pymysql.connect(
        host="127.0.0.1",
        port=3306,
        user='root',
        passwd='123456',
        db="movies",
        charset="utf8"
    )
4、源代碼編寫
[root@openshift maoyan]# cat maoyan2.py
# coding:utf-8
import requests,os,sys,django
from bs4 import BeautifulSoup
import re,urllib
import pymysql
pymysql.install_as_MySQLdb()
import datetime
# HTTP request headers passed to every requests.get() call in this script.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'maoyan.com',
    'Upgrade-Insecure-Requests': '1',
    # Adjacent string literals are concatenated into a single value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/55.0.2883.87 Safari/537.36'
    ),
}
class Sql(object):
    """Scrape the maoyan.com film listing and insert each film into MySQL.

    All of the work happens in the class body, so it runs once, when the
    class statement is evaluated: the listing page is fetched, every linked
    film page is fetched and parsed, and one row per film is inserted into
    the ``userscore_movie`` table and committed.
    """

    # NOTE(review): host and credentials are hard-coded; consider loading
    # them from configuration or the environment instead.
    conn = pymysql.connect(
        host="127.0.0.1",
        port=3306,
        user='root',
        passwd='123456',
        db="movies",
        charset="utf8"
    )

    url = 'http://maoyan.com/films?showType=3'
    #url = 'https://maoyan.com/films?showType=3&offset=30'
    wbdata = requests.get(url, headers=headers)
    soup = BeautifulSoup(wbdata.content, 'html5lib')
    movie_list = soup.select('div.movie-item > a')

    # Placeholders are filled in by the driver, which escapes each value.
    # Scraped text (e.g. a film title containing a quote) must never be
    # spliced into the SQL string directly.
    insert_sql = (
        "insert into userscore_movie"
        "(name,movie_cate,viewed,created,release_date,movie_img) "
        "VALUES(%s,%s,%s,%s,%s,%s)"
    )

    for movie in movie_list:
        # Each listing entry links to a detail page holding the fields we store.
        m_url = 'http://maoyan.com' + movie.get('href')
        m_data = requests.get(m_url, headers=headers)
        m_soup = BeautifulSoup(m_data.content, 'html5lib')

        name = m_soup.select_one('div.movie-brief-container > h3.name').get_text()
        # Query the <li> list once; item 0 is stored as the category and the
        # first 10 characters of item 2 as the release date.
        brief_items = m_soup.select("div.movie-brief-container > ul > li")
        movie_cate = brief_items[0].get_text()
        release_date = brief_items[2].get_text()[0:10]
        movie_img = m_soup.select_one('div.avatar-shadow > img').get('src')
        created = datetime.datetime.now()
        viewd = 1

        cur = conn.cursor()
        try:
            cur.execute(
                insert_sql,
                (name, movie_cate, viewd, created, release_date, movie_img),
            )
            print('正在爬取電影: '+name)
        finally:
            # Release the cursor even when the insert fails.
            cur.close()
        conn.commit()


Sql()
5、執行腳本,爬取數據過程
6、數據庫查看
自此,完成了Python3爬取影片入庫過程。