urllib2抓取HTML存入Excel

時間 2019-11-20

標籤 urllib2 urllib 抓取 html 存入 excel 欄目 HTML 简体版

原文鏈接

通過urllib2抓取HTML網頁，然後過濾出包含特定字符的行，並寫入Excel文件：

# -*- coding: utf-8 -*-

import sys
#import urllib
import urllib2

from xlwt import Workbook

def getdata(keywords, line):
    """Return the text between the first '>' and the next '</' of a keyword line.

    Args:
        keywords: Substring that must occur somewhere in ``line``.
        line: One line of HTML source.

    Returns:
        ``False`` when ``line`` does not contain ``keywords``.  Otherwise the
        text that follows the first ``>`` up to the next ``</``.  If the line
        has no ``>`` the text starts at the beginning of the line; if there is
        no ``</`` after that point the text runs to the end of the line, minus
        any trailing line break.  Note that the keyword itself may lie outside
        the returned text (for example inside an attribute value).
    """
    if keywords not in line:
        return False

    # find() yields -1 when there is no '>', so adding 1 gives index 0.
    text_start = line.find('>') + 1
    text_end = line.find('</', text_start)
    if text_end == -1:
        # No closing tag on this line.  Slicing with -1 would silently chop
        # off the last character, so take the rest of the line instead.
        return line[text_start:].rstrip('\r\n')
    return line[text_start:text_end]

def FetchDataByUrllib(checkUrl):
    """Fetch a page and save the text of every keyword line to ``simple.xls``.

    The page at ``checkUrl`` is read line by line.  Every line that contains
    the keyword is passed to ``getdata`` and the extracted text is written to
    column 1 of sheet ``mySheet``, one row per match, starting at row 1
    (row 0 and column 0 are left empty).

    Args:
        checkUrl: URL of the page to download.

    Returns:
        None.  If the page cannot be opened the error is printed and no
        workbook is saved.
    """
    # NOTE(review): xlwt is expected to use ``encoding`` only for byte
    # strings; the cells written below are already decoded text.
    book = Workbook(encoding='gbk')
    # A sheet refuses to overwrite a cell unless this is enabled explicitly.
    sheet = book.add_sheet('mySheet', cell_overwrite_ok=True)

    try:
        checkFile = urllib2.urlopen(checkUrl)
    except Exception as e:
        # Best effort: report the failure and give up without saving.
        print(e)
        return

    row = 1
    try:
        for line in checkFile:
            # Decode once at the I/O boundary and keep working with text, so
            # the keyword test and the workbook do not depend on the local
            # filesystem encoding.  The page is assumed to be UTF-8 (use
            # "GBK" here for GBK pages); undecodable bytes are replaced
            # instead of aborting the whole run.
            line = line.decode('UTF-8', 'replace')

            # Keep only lines mentioning the keyword; getdata returns False
            # for every other line.
            targetStr = getdata(u'體育', line)
            if targetStr is not False:
                sheet.write(row, 1, targetStr)
                row += 1
    finally:
        # Release the HTTP connection even if a line fails to process.
        checkFile.close()

    book.save('simple.xls')
    print('finish!')

# Script entry: announce the start, then scrape the configured page.
print('開始...')

# Page whose matching lines FetchDataByUrllib writes to simple.xls.
myUrl = 'http://www.sina.com.cn'
FetchDataByUrllib(myUrl)

輸出結果：

相關標籤/搜索