First, here's the link, so you can all take a look and decide whether it's to your taste:
工控行業系統漏洞 (industrial control system vulnerabilities, http://ics.cnvd.org.cn/)
As you can see, the page is static HTML, which makes the problem very simple.
All we need to do is fetch the pages with requests.
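If you want to confirm that for yourself before reading the whole script, a minimal sketch like the one below fetches the first list page with a plain GET and prints the detail links straight out of the markup. The XPath mirrors the site's table structure at the time of writing, so treat it as an assumption rather than a stable contract.

import requests
from lxml import etree

# Minimal check that the list page is static HTML: a plain GET, no
# JavaScript, and the vulnerability links are already in the markup.
url = 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0'
headers = {'User-Agent': 'Mozilla/5.0'}  # browser-like UA, in case bare clients are rejected
html = etree.HTML(requests.get(url, headers=headers).text)
for row in html.xpath('//tbody[@id="tr"]/tr'):  # one <tr> per vulnerability
    print(row.xpath('td/a/@href')[0])           # the detail-page URL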
Without further ado, here's the code:
import requests
from urllib.parse import urlencode
from lxml import etree
import time
import xlwt
import xlrd


def makeurl():
    # List pages look like: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    baseurl = 'http://ics.cnvd.org.cn/?'
    params = {
        'tdsourcetag': 's_pctim_aiomsg',
        'max': '20'
    }
    for page in range(MAX_PAGE):
        params['offset'] = page * 20
        url = baseurl + urlencode(params)
        print('url is ', url)
        yield url


def get_page_urllist(url):
    # Fetch one list page (20 vulnerability links per page)
    headers = {
        'Host': 'ics.cnvd.org.cn',
        'Referer': 'http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=40',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_urllist(content):
    # Extract the detail-page link from each table row of a list page
    html = etree.HTML(content)
    for li in html.xpath('//tbody[@id="tr"]/tr'):
        yield li.xpath('td/a/@href')[0]


def get_page(url):
    # Fetch a vulnerability detail page (hosted on www.cnvd.org.cn)
    headers = {
        'Host': 'www.cnvd.org.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text


def parse_page(content, url):
    # Each field is a table row whose first cell is the label, so the
    # value lives in the label cell's first following sibling.
    html = etree.HTML(content)
    item = {}
    item['url'] = url
    item['標題'] = str(html.xpath('//div[@class="blkContainerSblk"]/h1/text()')[0])
    item['CNVD_ID'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CNVD-ID"]/following-sibling::*[1]//text()')])
    item['公開日期'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="公開日期"]/following-sibling::*[1]//text()')])
    item['危害級別'] = ''.join([i.strip().replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
                            for i in html.xpath('//tbody/tr/td[text()="危害級別"]/following-sibling::*[1]//text()')])
    item['影響產品'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="影響產品"]/following-sibling::*[1]//text()')])
    try:
        item['BUGTRAQ_ID'] = ''.join(
            [i.strip() for i in html.xpath('//tbody/tr/td[text()="BUGTRAQ ID"]/following-sibling::*[1]//text()')])
    except Exception:
        item['BUGTRAQ_ID'] = ''
    item['CVE_ID'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//text()')]) + ' ' + ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="CVE ID"]/following-sibling::*[1]//@href')])
    item['漏洞描述'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞描述"]/following-sibling::*[1]//text()')])
    item['漏洞類型'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞類型"]/following-sibling::*[1]//text()')])
    item['參考連接'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="參考連接"]/following-sibling::*[1]//text()')])
    item['漏洞解決方案'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞解決方案"]/following-sibling::*[1]//text()')])
    item['廠商補丁'] = ''.join(
        [i.strip() for i in html.xpath(
            '//tbody/tr/td[text()="廠商補丁"]/following-sibling::*[1]//text()')]) + ' http://www.cnvd.org.cn' + ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="廠商補丁"]/following-sibling::*[1]//@href')])
    item['驗證信息'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="驗證信息"]/following-sibling::*[1]//text()')])
    item['報送時間'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="報送時間"]/following-sibling::*[1]//text()')])
    item['收錄時間'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="收錄時間"]/following-sibling::*[1]//text()')])
    item['更新時間'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="更新時間"]/following-sibling::*[1]//text()')])
    item['漏洞附件'] = ''.join(
        [i.strip() for i in html.xpath('//tbody/tr/td[text()="漏洞附件"]/following-sibling::*[1]//text()')])
    return item


def save_data(index, item, workbook):
    sheet = workbook.get_sheet('sheet1')  # reuse the sheet created in excel_prepare()
    for col, value in enumerate(item.values()):
        sheet.write(index, col, value)
    workbook.save(filename)
    print('saved successfully')


def excel_prepare(heads):
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)  # create the sheet
    for col, value in enumerate(heads):
        sheet.write(0, col, value)  # header row
    return workbook


def urlisexist(url, urlset):
    return url in urlset


def getallurl(filename):
    # URLs already saved: column 0, skipping the header row
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    results = sheet1.col_values(0, 1)
    return results


def read_old(filename):
    # Read every existing row so old data survives the rewrite
    workbook = xlrd.open_workbook(filename)
    sheet1 = workbook.sheet_by_name('sheet1')
    alloldset = []
    for index in range(sheet1.nrows):
        alloldset.append(sheet1.row_values(index))
    return alloldset, sheet1.nrows


def save_old(index, olditem):
    # Relies on the module-level workbook and filename set in __main__
    sheet = workbook.get_sheet('sheet1')
    for col, value in enumerate(olditem):
        sheet.write(index, col, value)
    workbook.save(filename)


if __name__ == '__main__':
    # List pages: http://ics.cnvd.org.cn/?tdsourcetag=s_pctim_aiomsg&max=20&offset=0
    TIMESLEEP = 0  # seconds to sleep between requests
    filename = '工程控制系統漏洞.xls'
    MAX_PAGE = 96
    heads = ['url', '標題', 'CNVD_ID', '公開日期', '危害級別', '影響產品',
             'BUGTRAQ_ID', 'CVE_ID', '漏洞描述', '漏洞類型', '參考連接',
             '漏洞解決方案', '廠商補丁', '驗證信息', '報送時間', '收錄時間',
             '更新時間', '漏洞附件']
    try:
        alloldset, length = read_old(filename)
    except Exception:
        # No previous workbook: start right below the header row
        alloldset = []
        length = 1
    workbook = excel_prepare(heads)
    for index, olditem in enumerate(alloldset):
        save_old(index, olditem)
    try:
        urlset = getallurl(filename)
    except Exception:
        urlset = []
    index = length
    for urlofpage in makeurl():
        pagelistcontent = get_page_urllist(urlofpage)
        for url in parse_urllist(pagelistcontent):
            print('url is >>>', url)
            if not urlisexist(url, urlset):
                time.sleep(TIMESLEEP)
                result = get_page(url)
                item = parse_page(result, url)
                print('item is >>>', item)
                save_data(index, item, workbook)
                index = index + 1
    workbook.save(filename)
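Two notes on the design. The following-sibling::*[1] pattern in parse_page works because every field on a detail page is a table row whose first cell is the label, so the value sits in the element immediately after that cell. And the read_old/save_old dance exists because xlwt can only write brand-new files: to "append", the script re-reads the old .xls with xlrd and rewrites every old row into the fresh workbook before adding new ones. Before launching the full crawl, it's worth sanity-checking the XPaths against a single detail page; the URL below is only a placeholder, so substitute any link yielded by parse_urllist:

# Sanity check: parse one detail page and eyeball the extracted fields.
# The URL is a hypothetical placeholder; use a real link from parse_urllist().
test_url = 'http://www.cnvd.org.cn/flaw/show/CNVD-XXXX-XXXXX'
item = parse_page(get_page(test_url), test_url)
for key, value in item.items():
    print(key, '=>', value[:80])  # truncate long fields for readability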
If anything is unclear, ask in the comments below.