一個爬蟲例子：抓取糗事百科（糗百）上的文字笑話。

# -*- coding: utf-8 -*-
"""Scrape the text listing pages of m.qiushibaike.com into a local file.

Runs on both Python 2 and Python 3.
"""
import io
import re
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, urlopen
except ImportError:  # Python 3
    from urllib.request import Request, urlopen

# Listing-page URL template; the page number is substituted for %s.
PAGE_URL = 'http://m.qiushibaike.com/text/page/%s?s=4784059'

# The request is sent with a browser-style User-Agent instead of the
# library default.
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# Captures the inner HTML of every <div class="content"> element.
# re.S lets '.' match newlines, so multi-line entries are captured whole.
CONTENT_RE = re.compile(r'<div class="content">(.*?)</div>', re.S)


def getimage(pagenum):
    """Fetch one listing page and return the content snippets found on it.

    Despite its name (kept so existing callers keep working), this function
    extracts text, not images.

    Args:
        pagenum: Page number substituted into the listing URL.

    Returns:
        A list of unicode strings, one per ``<div class="content">`` element
        on the page, in document order. Empty if the page has none.

    Raises:
        URLError/HTTPError: if the request fails.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    request = Request(PAGE_URL % pagenum, headers={'User-Agent': USER_AGENT})
    # closing() guarantees the response is released even if read/decode
    # raises; the Python 2 response object is not a context manager itself.
    with closing(urlopen(request)) as response:
        html = response.read().decode('utf-8')
    return CONTENT_RE.findall(html)


def getinfo(path='E:\\qiubai.txt', num=35):
    """Download pages 1..num and write every snippet to *path* as UTF-8.

    Args:
        path: Output file; it is created or truncated. Defaults to the
            location the original script used.
        num: Number of listing pages to fetch, starting from page 1.
            With ``num <= 0`` no page is fetched and an empty file is written.

    Returns:
        The total number of snippets written (also printed to stdout).
    """
    total = 0
    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and accepts the unicode strings returned by getimage() directly.
    with io.open(path, 'w', encoding='utf-8') as out:
        for page in range(1, num + 1):
            snippets = getimage(page)
            total += len(snippets)
            # Snippets are written back to back with no separator added,
            # exactly as captured from the page.
            out.writelines(snippets)
    print('total download [%d] ' % total)
    return total


if __name__ == '__main__':
    getinfo()
相關文章
相關標籤/搜索