# 1. Read the page source that was previously copied into source.txt, then
# 2. find every lesson image URL in it and download each image to pic\<n>.jpg.
import re

import requests

# Context manager guarantees the file is closed (the original relied on a
# manual close and would leak the handle on an exception).
with open('source.txt', 'r') as f:
    html = f.read()

# Non-greedy group captures the URL of every <img ... class="lessonimg"> tag.
pic_url = re.findall('<img src="(.*?)" class="lessonimg"', html)

# enumerate() replaces the original hand-maintained counter variable.
for i, each in enumerate(pic_url):
    pic = requests.get(each)
    # Write the raw response bytes; 'wb' because image data is binary.
    with open('pic\\' + str(i) + '.jpg', 'wb') as fp:
        fp.write(pic.content)
#header是爲了模擬電腦訪問 import re import requests header = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'} html = requests.get('http://jp.tingroom.com/yuedu/',headers = header) fifter_txt = re.findall('<li><span class="f_r px11">(.*?)</li>',html.content) for each in fifter_txt: link = re.findall('href="(.*?)"',each)[0] title = re.findall('title="(.*?)"',each)[0] print 'title:'+title.decode('utf-8')+'href:'+link
//實戰,對極客學院課程的獲取 # -*-coding:gbk-*- import requests import re import sys reload(sys) sys.setdefaultencoding("utf-8") class spider(object): def __int__(self): print('開啓爬取內容...') #獲取網頁源代碼 def getsource(self,url): html = requests.get(url) return html.text #獲取和生產不一樣頁數的連接 def changepage(self,url,total_page): now_page = int(re.search('pageNum=(\d+)',url,re.S).group(1)) page_group = [] for i in range(now_page,total_page+1): link = re.sub('pageNum=\d+','pageNum=%s'%i,url,re.S) page_group.append(link) return page_group #抓取每一個課程塊的信息 def geteveryclass(self,source): everyclass = re.findall('(<li deg="".*?</li>)',source,re.S) return everyclass #從每一個課程塊中提取出咱們須要的信息 def getinfo(self,eachclass): info = {} info['title'] = re.search('target="_blank">(.*?)</a>',eachclass,re.S).group(1) info['content'] = re.search('</h2><p>(.*?)</p>',eachclass,re.S).group(1) timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S) info['classtime'] = timeandlevel[0] info['classlevel'] = timeandlevel[1] info['learnnum'] = re.search('"learn-number">(.*?)</em>',eachclass,re.S).group(1) return info #保存到文件中 def saveinfo(self,classinfo): f = open('info.txt','a') for each in classinfo: f.writelines('title:' + each['title'] + '\n') f.writelines('content:' + each['content'] + '\n') f.writelines('classtime:' + each['classtime'] + '\n') f.writelines('classlevel:' + each['classlevel'] + '\n') f.writelines('learnnum:' + each['learnnum'] +'\n\n') f.close() if __name__ == '__main__': classinfo = [] url = 'http://www.jikexueyuan.com/course/?pageNum=1' myspider = spider() all_links = myspider.changepage(url,20) for link in all_links: print '正在處理頁面:'+link html = myspider.getsource(link) everyclass = myspider.geteveryclass(html) for each in everyclass: info = myspider.getinfo(each) classinfo.append(info) myspider.saveinfo(classinfo)
ps:能夠在 Chrome 開發者工具中直接複製 xpath
# encoding=utf8 from lxml import etree html='''<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <title>Title</title> </head> <body> <div id="content"> <ul id="useful"> <li>這是第一條信息</li> <li>這是第二條信息</li> <li>這是第三條信息</li> </ul> <ul id="useless"> <li>這不是第一條信息</li> <li>這不是第二條信息</li> <li>這不是第三條信息</li> </ul> </div> <div id="ul1"> <a href="http://www.baidu.com">百度</a> <a href="http://www.taobao.com">淘寶</a> </div> <div class="xxx"> <p>哈哈</p> </div> </body> </html>''' #抓取文本 selector = etree.HTML(html) content = selector.xpath('//ul[@id="useful"]/li/text()') for each in content: print each selector = etree.HTML(html) p = selector.xpath('//div[@class="xxx"]/p/text()') print p #抓取屬性 link = selector.xpath('//div[@id="ul1"]/a/@href') print link
1). 提取以相同字符串開頭的標籤(starts-with)
#[starts-with(xxxxx)]表示xxxx標籤開頭的全部標籤 # encoding=utf8 from lxml import etree html1 = '''<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <title>Title</title> </head> <body> <div id="test-1">須要的內容1</div> <div id="test-2">須要的內容1</div> <div id="test-3">須要的內容1</div> </body> </html>''' selector1 = etree.HTML(html1) test_data = selector1.xpath('//div[starts-with(@id,"test")]/text()') for each in test_data: print each
2). 提取嵌套標籤內的全部文本內容
# data.xpath('string(.)') flattens all the text nested inside one element.
# encoding=utf8
from lxml import etree

html = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test3">
        我左青龍,
        <span id="tiger">
            右白虎
            <ul>
                上朱雀,
                <li>下玄武</li>
            </ul>
            老牛在當中,
        </span>
        龍頭在胸口
    </div>
</body>
</html>'''

tree = etree.HTML(html)
# First xpath call: no text() here, so the result is still an element node
# that can be queried again.
node = tree.xpath('//div[@id="test3"]')[0]
# Second xpath call: string(.) concatenates every descendant text node.
raw = node.xpath('string(.)')
# Strip the newlines and spaces left over from the HTML source layout.
content = raw.replace('\n', '').replace(' ', '')
print(content)
#-*-coding:utf8-*-
# Threaded scraper: fetch 19 course-list pages with a 4-worker thread pool
# and append every course title + introduction to shuang.txt.
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


def spider(url):
    """Fetch one course-list page and append its titles/intros to the file."""
    # Bug fix: the original ignored `url` and always fetched pageNum=4, so
    # every one of the 19 worker calls scraped the same page.
    html = requests.get(url)
    selector = etree.HTML(html.text)
    title = selector.xpath('//div[@class="lesson-infor"]/h2/a/text()')
    introduce = selector.xpath('//div[@class="lesson-infor"]/p/text()')
    # Bug fix: the original tagged records with the global `index`, which the
    # main loop had already advanced to its final value before pool.map ran,
    # so every record carried the same number. Derive the page from the URL.
    page_num = url.rsplit('=', 1)[-1]
    for i in range(len(title)):
        t = title[i]
        s = introduce[i]
        print(t + " " + s)
        # NOTE(review): `f` is shared by all worker threads; interleaved
        # writes are possible — confirm whether ordering matters here.
        f.writelines(page_num + '-' + str(i) + '.' + t + " " + s + '\n')


if __name__ == '__main__':
    page = []
    pool = ThreadPool(4)
    f = open('shuang.txt', 'a')
    # Build the work list of page URLs (pages 1 through 19).
    for i in range(1, 20):
        newpage = 'http://www.jikexueyuan.com/course/?pageNum=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()