Web Scraping (7): Scraping the Maoyan Top 100 Movies

1: Analyzing the site's HTML

Target site and target data
Target URL: http://maoyan.com/board/4?offset=20
Target data: the movie list on the target page, including each movie's title, poster image, starring actors, release date, and score.
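For reference, each scraped movie ends up as a flat record like the one below. This is an illustrative sketch: the values (the poster URL in particular) are placeholders, but the keys match the parser defined in step (4).

record = {
    'index': '1',                               # rank on the board
    'title': '霸王別姬',                         # movie title
    'actor': '張國榮,張豐毅,鞏俐',                # starring actors
    'score': '9.6',                             # rating
    'pub_time': '1993-01-01',                   # release date
    'img_url': 'http://example.com/poster.jpg'  # poster image (placeholder URL)
}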

2: The code

(1): Import the required packages

import requests
from requests.exceptions import RequestException  # for handling request exceptions
import re
import pymysql
import json
from multiprocessing import Pool

(2): Analyze the page

Inspecting the page shows that the content we need lives inside the <dd> tags. Paging through the list reveals how the offset parameter in the URL changes from page to page.
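Concretely, the board pages in steps of 10 through the offset query parameter, so all ten page URLs can be generated up front (this is the same list the main program builds later):

urls = ['http://maoyan.com/board/4?offset={}'.format(i) for i in range(0, 100, 10)]
# ['http://maoyan.com/board/4?offset=0',
#  'http://maoyan.com/board/4?offset=10',
#  ...
#  'http://maoyan.com/board/4?offset=90']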

(3): Fetch the HTML page

# Fetch one page of data
def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:  # status code 200 means success
            return response.text
        else:
            return None
    except RequestException:  # requests raises exceptions on network errors
        return None
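A quick sanity check of the fetcher (without a browser-like User-Agent header the site is likely to reject the request, which is why one is set above):

html = get_one_page('http://maoyan.com/board/4?offset=0')
if html is None:
    print('request failed')  # network error or non-200 status
else:
    print(len(html))  # length of the raw HTML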

(4): Extract the required information with a regular expression

# Parse the page content
def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?class="name"><a.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)  # re.S lets . match any character, including newlines
    items = re.findall(pattern, html)  # pull out the contents of the capture groups
    for item in items:
        yield {  # build a generator of movie dicts
            'index': item[0].strip(),
            'title': item[2].strip(),
            'actor': item[3].strip()[3:],     # drop the leading "主演:"
            'score': ''.join([item[5].strip(), item[6].strip()]),
            'pub_time': item[4].strip()[5:],  # drop the leading "上映時間:"
            'img_url': item[1].strip(),
        }
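To see what the regex matches, here is a simplified <dd> fragment modeled on the board's markup (reconstructed from the capture groups above, not copied verbatim from the live page), fed through the parser:

sample = '''
<dd>
    <i class="board-index board-index-1">1</i>
    <img data-src="http://example.com/poster.jpg" alt=""/>
    <p class="name"><a href="/films/1203">霸王別姬</a></p>
    <p class="star">主演:張國榮,張豐毅,鞏俐</p>
    <p class="releasetime">上映時間:1993-01-01</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>
'''

for movie in parse_one_page(sample):
    print(movie)
# {'index': '1', 'title': '霸王別姬', 'actor': '張國榮,張豐毅,鞏俐',
#  'score': '9.6', 'pub_time': '1993-01-01', 'img_url': 'http://example.com/poster.jpg'}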

(5): Store the results in a MySQL database

# Connect to the database; the database must be created locally first
def commit_to_sql(dic):
    conn = pymysql.connect(host='localhost', port=3306, user='mydb', passwd='123456',
                           db='maoyantop100', charset='utf8')
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)  # use a dict cursor
    # Parameterized query: the driver escapes quotes inside titles etc. for us
    sql = 'insert into movies_top_100(mid, title, actor, score, pub_time, img_url) values (%s, %s, %s, %s, %s, %s)'
    cursor.execute(sql, (dic['index'], dic['title'], dic['actor'],
                         dic['score'], dic['pub_time'], dic['img_url']))
    conn.commit()   # commit the transaction
    cursor.close()  # close the cursor
    conn.close()    # close the connection
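The insert statement assumes a movies_top_100 table already exists. Below is a minimal one-time setup sketch; the column types and sizes are assumptions chosen generously, and the credentials must match your local MySQL instance:

# One-time setup: create the database and table the crawler writes into
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='mydb', passwd='123456', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('create database if not exists maoyantop100 character set utf8')
    cursor.execute('''
        create table if not exists maoyantop100.movies_top_100 (
            id       int primary key auto_increment,
            mid      varchar(8),    -- rank on the board
            title    varchar(128),
            actor    varchar(256),
            score    varchar(8),
            pub_time varchar(64),
            img_url  varchar(512)
        ) character set utf8
    ''')
conn.commit()
conn.close()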

(6): Main program and execution

def main(url):
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        commit_to_sql(item)


if __name__ == '__main__':
    urls = ['http://maoyan.com/board/4?offset={}'.format(i) for i in range(0, 100, 10)]
    # use multiple processes
    pool = Pool()
    pool.map(main, urls)
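pool.map blocks until every page has been processed, with each URL handed to a separate worker process. On Python 3.3+ the pool can also be used as a context manager, which cleans up the worker processes automatically; an equivalent sketch:

if __name__ == '__main__':
    urls = ['http://maoyan.com/board/4?offset={}'.format(i) for i in range(0, 100, 10)]
    with Pool() as pool:
        pool.map(main, urls)  # blocks until all ten pages are done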

(7): The final result

 

Complete code:

# -*- coding: utf-8 -*-
# @Author : FELIX
# @Date   : 2018/4/4 9:29

import requests
from requests.exceptions import RequestException
import re
import pymysql
import json
from multiprocessing import Pool


# Connect to the database and insert one record
def commit_to_sql(dic):
    conn = pymysql.connect(host='localhost', port=3306, user='wang', passwd='123456',
                           db='maoyantop100', charset='utf8')
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)  # use a dict cursor
    # Parameterized query: the driver escapes quotes inside values for us
    sql = 'insert into movies_top_100(mid, title, actor, score, pub_time, img_url) values (%s, %s, %s, %s, %s, %s)'
    cursor.execute(sql, (dic['index'], dic['title'], dic['actor'],
                         dic['score'], dic['pub_time'], dic['img_url']))
    conn.commit()   # commit the transaction
    cursor.close()  # close the cursor
    conn.close()    # close the connection


# Fetch one page of data
def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:  # status code 200 means success
            return response.text
        else:
            return None
    except RequestException:  # requests raises exceptions on network errors
        return None


# Parse the page content
def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?class="name"><a.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)  # re.S lets . match any character, including newlines
    items = re.findall(pattern, html)  # pull out the contents of the capture groups
    for item in items:
        yield {  # build a generator of movie dicts
            'index': item[0].strip(),
            'title': item[2].strip(),
            'actor': item[3].strip()[3:],     # drop the leading "主演:"
            'score': ''.join([item[5].strip(), item[6].strip()]),
            'pub_time': item[4].strip()[5:],  # drop the leading "上映時間:"
            'img_url': item[1].strip(),
        }


# Append each record to result.txt as one JSON object per line
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


ii = 0  # note: each worker process gets its own copy, so this counter is per-process


def main(url):
    html = get_one_page(url)
    for item in parse_one_page(html):
        global ii
        print(ii, item)
        ii = ii + 1
        commit_to_sql(item)
        write_to_file(item)


if __name__ == '__main__':
    urls = ['http://maoyan.com/board/4?offset={}'.format(i) for i in range(0, 100, 10)]
    # use multiple processes
    pool = Pool()
    pool.map(main, urls)
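Besides MySQL, this full version also appends every record to result.txt via json.dumps with ensure_ascii=False, one JSON object per line. A line would look roughly like this (illustrative values, placeholder poster URL):

{"index": "1", "title": "霸王別姬", "actor": "張國榮,張豐毅,鞏俐", "score": "9.6", "pub_time": "1993-01-01", "img_url": "http://example.com/poster.jpg"}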