Python爬蟲

1.簡單爬蟲抓取圖片

# 1. Read the page source that was saved to source.txt beforehand
#    (the page was fetched manually and pasted into the text file).
#    `with` closes the file even if reading raises.
with open('source.txt', 'r') as f:
    html = f.read()

# 2. Extract every image URL and download each picture.
#    The regex targets <img> tags carrying the "lessonimg" class.
pic_url = re.findall('<img src="(.*?)" class="lessonimg"', html)
for i, each in enumerate(pic_url):
    pic = requests.get(each)
    # Binary write; files are numbered 0.jpg, 1.jpg, ...
    with open('pic\\' + str(i) + '.jpg', 'wb') as fp:
        fp.write(pic.content)

2.網頁爬蟲

# 2. Crawl a listing page directly.
# The User-Agent header makes the request look like a normal browser visit.
import re
import requests

header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
html = requests.get('http://jp.tingroom.com/yuedu/', headers=header)
# Use Response.text (already-decoded str) rather than Response.content
# (raw bytes); this removes the need for the manual .decode('utf-8') below.
filter_txt = re.findall('<li><span class="f_r px11">(.*?)</li>', html.text)
for each in filter_txt:
    link = re.findall('href="(.*?)"', each)[0]
    title = re.findall('title="(.*?)"', each)[0]
    print('title:' + title + 'href:' + link)

3.單線程爬蟲

//實戰,對極客學院課程的獲取
# -*-coding:gbk-*-
import requests
import re
import sys
# NOTE(review): Python 2 only — reload() on sys and sys.setdefaultencoding
# were removed in Python 3. This hack forces the implicit str<->unicode
# default encoding to UTF-8 so mixed-type string operations don't raise;
# avoid it in new code (declared source coding above says gbk — verify).
reload(sys)
sys.setdefaultencoding("utf-8")

class spider(object):
    """Single-threaded crawler for jikexueyuan.com course listings."""

    def __init__(self):
        # BUG FIX: the original defined __int__ (typo), so this constructor
        # body never ran and the message was never printed.
        print('開啓爬取內容...')

    # Fetch a page and return its decoded source text.
    def getsource(self, url):
        html = requests.get(url)
        return html.text

    # Build the list of page URLs from the page number embedded in *url*
    # up to and including *total_page*.
    def changepage(self, url, total_page):
        now_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # BUG FIX: the original passed re.S as re.sub's 4th positional
            # argument, which is *count* (max replacements), not flags.
            link = re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    # Cut the page source into one HTML chunk per course entry.
    def geteveryclass(self, source):
        everyclass = re.findall(r'(<li deg="".*?</li>)', source, re.S)
        return everyclass

    # Extract the fields we need from one course chunk.
    # Returns a dict with keys: title, content, classtime, classlevel, learnnum.
    def getinfo(self, eachclass):
        info = {}
        info['title'] = re.search(r'target="_blank">(.*?)</a>', eachclass, re.S).group(1)
        info['content'] = re.search(r'</h2><p>(.*?)</p>', eachclass, re.S).group(1)
        # First <em> is the duration, second is the difficulty level.
        timeandlevel = re.findall(r'<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search(r'"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    # Append every course dict in *classinfo* to info.txt.
    def saveinfo(self, classinfo):
        # `with` guarantees the file is closed even if a write fails.
        with open('info.txt', 'a') as f:
            for each in classinfo:
                f.writelines('title:' + each['title'] + '\n')
                f.writelines('content:' + each['content'] + '\n')
                f.writelines('classtime:' + each['classtime'] + '\n')
                f.writelines('classlevel:' + each['classlevel'] + '\n')
                f.writelines('learnnum:' + each['learnnum'] + '\n\n')

if __name__ == '__main__':
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    myspider = spider()
    all_links = myspider.changepage(url, 20)
    for link in all_links:
        # print() call form so the script also runs on Python 3.
        print('正在處理頁面:' + link)
        html = myspider.getsource(link)
        everyclass = myspider.geteveryclass(html)
        for each in everyclass:
            info = myspider.getinfo(each)
            classinfo.append(info)
    # BUG FIX: save once after all pages are collected. The original called
    # saveinfo inside the loop, appending the whole accumulated list every
    # iteration and duplicating earlier pages' records in info.txt.
    myspider.saveinfo(classinfo)

4.XPath的使用

ps:能夠在Chrome開發者工具中直接複製元素的XPath

# encoding=utf8
from lxml import etree


# Sample document used to demonstrate basic XPath extraction.
html = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="content">
        <ul id="useful">
            <li>這是第一條信息</li>
            <li>這是第二條信息</li>
            <li>這是第三條信息</li>
        </ul>
        <ul id="useless">
            <li>這不是第一條信息</li>
            <li>這不是第二條信息</li>
            <li>這不是第三條信息</li>
        </ul>
    </div>

    <div id="ul1">
        <a href="http://www.baidu.com">百度</a>
        <a href="http://www.taobao.com">淘寶</a>
    </div>

    <div class="xxx">
        <p>哈哈</p>
    </div>
</body>
</html>'''

# Extract text: the <li> contents of the ul with id="useful".
# Parse once and reuse the selector (the original re-parsed the same
# document a second time below).
selector = etree.HTML(html)
content = selector.xpath('//ul[@id="useful"]/li/text()')
for each in content:
    # print() call form so the snippet also runs on Python 3.
    print(each)

# Text of the <p> under the div with class "xxx".
p = selector.xpath('//div[@class="xxx"]/p/text()')
print(p)

# Extract attributes: every href inside the div with id="ul1".
link = selector.xpath('//div[@id="ul1"]/a/@href')
print(link)

5.XPath的特殊用法

1).以相同標籤開頭的元素

#[starts-with(xxxxx)]表示xxxx標籤開頭的全部標籤

# encoding=utf8
from lxml import etree

# starts-with(@id,"test") matches every element whose id begins with "test".
html1 = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test-1">須要的內容1</div>
    <div id="test-2">須要的內容1</div>
    <div id="test-3">須要的內容1</div>
</body>
</html>'''

selector1 = etree.HTML(html1)
test_data = selector1.xpath('//div[starts-with(@id,"test")]/text()')
for each in test_data:
    # print() call form so the snippet also runs on Python 3
    # (the original used the Python-2-only print statement).
    print(each)

2).提取嵌套標籤內的全部文本內容

#data.xpath('string(.)')

# encoding=utf8
from lxml import etree

# Demonstrates XPath string(.): flatten all descendant text of one element.
html = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test3">
        我左青龍,
        <span id="tiger">
            右白虎
            <ul>
                上朱雀,
                <li>下玄武</li>
            </ul>
            老牛在當中,
        </span>
        龍頭在胸口
    </div>
</body>
</html>'''

selector = etree.HTML(html)
# First xpath call has no text(), so it returns the element node itself,
# keeping the nested markup available for a second pass.
data = selector.xpath('//div[@id="test3"]')[0]
# Second pass: string(.) concatenates every piece of descendant text.
info = data.xpath('string(.)')
# Drop the newlines and indentation spaces left over from the markup.
content = info.replace('\n', '').replace(' ', '')
print(content)

6.多線程爬蟲

#-*-coding:utf8-*-
from lxml import etree
# Pool from multiprocessing.dummy is a *thread* pool with the
# multiprocessing Pool API (suitable for I/O-bound crawling).
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys
# NOTE(review): Python 2 only — reload() on sys and sys.setdefaultencoding
# were removed in Python 3; this forces the implicit default str encoding
# to UTF-8. Avoid in new code.
reload(sys)
sys.setdefaultencoding('utf-8')

def spider(url):
    """Fetch one course-list page and record each course title + intro.

    Writes through the module-global, already-open file object ``f``.
    """
    # BUG FIX: fetch the url that was passed in. The original ignored its
    # parameter and always requested pageNum=4, so every worker thread
    # scraped the exact same page.
    html = requests.get(url)
    selector = etree.HTML(html.text)
    title = selector.xpath('//div[@class="lesson-infor"]/h2/a/text()')
    introduce = selector.xpath('//div[@class="lesson-infor"]/p/text()')
    # NOTE(review): ``index`` and ``f`` are globals shared by all pool
    # threads — writes can interleave and ``index`` is racy; returning the
    # lines and writing in the main thread would be safer. TODO confirm.
    for i in range(len(title)):
        t = title[i]
        s = introduce[i]
        print(t + " " + s)
        f.writelines(str(index) + '-' + str(i) + '.' + t + " " + s + '\n')

if __name__ == '__main__':
    page = []
    index = 0
    # Four worker threads crawling in parallel.
    pool = ThreadPool(4)
    f = open('shuang.txt', 'a')
    for i in range(1, 20):
        index = i
        newpage = 'http://www.jikexueyuan.com/course/?pageNum=' + str(i)
        page.append(newpage)
    # BUG FIX: map once over the complete url list. The original called
    # pool.map inside the loop over the still-growing list, so page 1 was
    # crawled 19 times, page 2 eighteen times, and so on.
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
相關文章
相關標籤/搜索