crawler_exa3

時間 2019-11-06
標籤 crawler exa3 exa 简体版
原文原文鏈接
優化中...php
#! /usr/bin/env python
# -*- coding:utf-8 -*-
# Author: Tdcqma

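'''
v1.0:
    The target site's structure may change. Whenever the crawled pages change,
    the regular expressions stop matching and the crawler breaks.
    To address this, the crawler has been re-architected into three parts:
    [1] Information gathering: if the site structure changes, only the regular
        expressions in this part need to be updated. The gathered data must be
        stored in a nested list.
    [2] Information filtering: needs no change even if the site structure changes.
    [3] Information delivery: needs no change even if the site structure changes.

'''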

import urllib.request
import ssl,re
import smtplib,email
import datetime

# ---------------------------------------------
# [1] Information gathering: regex-match the site's listing pages for the
#     date, title, URL, etc. of each advisory and store every record in
#     the sec_all_list list.
# ---------------------------------------------

# Search key: the date (YYYY-MM-DD) whose advisories should be collected.
#today = str(datetime.date.today())
today = "2017-09-25"    # temporarily pinned to a fixed date for testing
str_domain = "http://www.nsfocus.net"
sec_all_list = []   # one [title, url, affected_versions] list per advisory

# NOTE(security): this disables TLS certificate verification for every HTTPS
# request urllib makes in this process. It is kept because the original script
# relied on it, but it only needs to be set once, not on every loop iteration.
ssl._create_default_https_context = ssl._create_unverified_context

# Patterns are raw strings (a bare "\d" in a normal string is an invalid
# escape sequence) and are compiled once instead of once per matched line.
_line_re = re.compile("<.*" + re.escape(today) + ".*")
# Capture the anchor text instead of slicing at a fixed offset, so the title
# is still extracted correctly when the numeric advisory id is not 5 digits.
_title_re = re.compile(r"/vulndb/\d+[^>]*>(.*)</a>")
_url_re = re.compile(r"/vulndb/\d+")
_affected_re = re.compile(r"<blockquote>(.*)</blockquote>", re.S)


def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    with urllib.request.urlopen(urllib.request.Request(url)) as response:
        return response.read().decode('utf-8')


# One day's advisories may span several listing pages, so a fixed range of
# pages is scanned; the first 10 pages are read by default.
for page in range(1, 11):
    url = "http://www.nsfocus.net/index.php?act=sec_bug&type_id=&os=&keyword=&page=%s" % page
    data = _fetch_html(url)

    if today not in data:
        continue

    # Every HTML line that carries the target date is a candidate advisory row.
    for line in _line_re.findall(data):
        title_match = _title_re.search(line)
        url_match = _url_re.search(line)
        if title_match is None or url_match is None:
            # The date showed up on a line that is not an advisory link;
            # skip it instead of crashing with IndexError.
            continue

        title = title_match.group(1)
        sub_url = str_domain + url_match.group(0)

        # The affected versions are listed on the advisory's detail page.
        vul_data = _fetch_html(sub_url)
        affected_match = _affected_re.search(vul_data)
        if affected_match is None:
            # Keep the advisory even when the detail page has no blockquote.
            aff_ver = ""
        else:
            aff_ver = affected_match.group(1).replace("<br />", "")

        # Store the record in the summary list sec_all_list.
        sec_all_list.append([title, sub_url, aff_ver])

# ---------------------------------------------
# [2] Information filtering
# ---------------------------------------------

# The filtered advisories are accumulated in msg, which is the report body.
msg = ""


def get_sec_info(vul, entry=None):
    """Append an advisory to the global report ``msg`` if its title mentions *vul*.

    Args:
        vul: Name of the target system or application; matched as a
            case-sensitive substring of the advisory title.
        entry: Advisory record indexed as ``[title, url, affected_versions]``.
            When omitted, the module-level loop variable ``line`` is used,
            which keeps existing one-argument calls working.

    Returns:
        True if the advisory matched and was appended to ``msg``, else False.
    """
    global msg
    if entry is None:
        # Legacy call style: the record is the loop variable of the caller.
        entry = line
    if vul not in entry[0]:
        return False
    msg += "\n漏洞名稱：" + entry[0] + "\n漏洞連接：" + entry[1] + "\n受影響的版本：\n" + entry[2] + "\n"
    return True

# Report every collected advisory whose title mentions one of the watched
# products. The loop variable must stay named `line`, because get_sec_info
# reads the current record from that module-level name.
for line in sec_all_list:
    for keyword in ("Apache", "Cisco", "EMC", "Samba"):
        if keyword in line[0]:
            get_sec_info(keyword)
            # Stop after the first hit so a title naming several watched
            # products is not added to the report more than once.
            break

# To guard against data loss, the filtered report is also written to the text
# file secInfo-lvmeng.txt. The with-block guarantees the file is flushed and
# closed, and write() emits the string in one call (writelines() on a str
# iterates it character by character).
with open("secInfo-lvmeng.txt", 'w', encoding='utf-8') as f:
    f.write(msg)

# ---------------------------------------------
# [3] Information delivery
# ---------------------------------------------

# Imported here because this section is the only user of these helpers.
from email.header import Header
from email.mime.text import MIMEText

# NOTE(security): the addresses and the password below are placeholders kept
# from the original script; real credentials should not be hard-coded.
mail_from = "from_mail@163.com"
mail_to = "to_mail@163.com"

# Build a MIME message so that the UTF-8 body and subject carry explicit
# charset/encoding headers instead of being sent as undeclared raw bytes.
mail = MIMEText(msg, 'plain', 'utf-8')
mail['From'] = mail_from
mail['To'] = mail_to
mail['Subject'] = Header("[爬蟲安全通告-綠盟]", 'utf-8')

# Send the report through the 163 SMTP server, using the filtered alert text
# as the mail body. The with-block sends QUIT and closes the connection even
# if login or sending fails.
with smtplib.SMTP("smtp.163.com") as smtp:
    smtp.login(mail_from, "from_mail_pass")
    # The envelope recipient must be the full address used in the To header.
    smtp.sendmail(mail_from, [mail_to], mail.as_string())
    print('mail send success!')
每日一句
每一个你不满意的现在，都有一个你没有努力的曾经。