自己琢磨寫了一個 Python 的小爬蟲,用來爬取學校的招聘信息,以下是代碼。
# -*- coding: utf-8 -*-
"""Small crawler that prints job postings from the USTC job board.

The script was written for Python 2 (``urllib2`` / ``thread``).  The import
fallbacks below bind the Python 3 equivalents to the same names so the file
runs unchanged on either interpreter.
"""
__author__ = 'WCQ'

import re
import time
import urllib

try:  # Python 2 module names
    import thread
    import urllib2
except ImportError:  # Python 3: same APIs under new names
    import _thread as thread
    import urllib.request as urllib2


# ----------- Load the job postings -----------
class Spider_Model:
    """Fetch listing pages, extract the postings and print them."""

    def __init__(self):
        # First listing page that Start() requests.
        self.page = 1
        # Run flag; Start() sets it to True and loops while it stays True.
        self.enable = False
        # Exclusive upper bound: Start() stops before reaching this page.
        self.endPage = 2

    def GetHTML(self, myUrl):
        """Download ``myUrl`` and return its body decoded from GBK.

        The request is sent with a fixed browser-like User-Agent header.
        Errors raised by ``urllib2`` or by the GBK decoding propagate to
        the caller.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myUrl, headers=headers)
        myResponse = urllib2.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            # Always release the connection, even when read() fails.
            myResponse.close()
        # decode() turns the raw GBK bytes into unicode text.
        return myPage.decode("GBK")

    def GetPage(self, page):
        """Return the postings found on listing page number ``page``.

        Each posting is a list ``[link text, absolute URL, "zhiwei" span
        text, "zhuanye" span text]``.  According to the original author's
        notes these are the company, the posting link, the position and
        the publication date.  An empty list is returned when the page
        contains no ``Joplistone`` block.
        """
        myUrl = ("http://www.job.ustc.edu.cn/list.php?trans=7&page="
                 + str(page) + "&MenuID=002002")
        unicodePage = self.GetHTML(myUrl)
        # Find every <div class="Joplistone"> block.
        # re.S lets "." match newlines, so a block may span several lines.
        jobList = re.findall('<div class="Joplistone">(.*?)</div>',
                             unicodePage, re.S)
        if not jobList:
            # No listing block on this page: nothing to extract.
            return []
        # Only the first listing block is scanned for postings.
        jobItems = re.findall(
            '<li><a href="(.*?)" style="color:#">(.*?)</a>'
            '<span class="zhiwei">(.*?)</span>'
            '<span class="zhuanye">(.*?)</span></li>',
            jobList[0], re.S)
        # Regex groups arrive as (href, link text, zhiwei, zhuanye); the
        # result puts the link text first and makes the href absolute.
        return [
            [text, "http://www.job.ustc.edu.cn/" + href, zhiwei, zhuanye]
            for href, text, zhiwei, zhuanye in jobItems
        ]

    def getJobDetail(self, joburl):
        """Return the contents of every ``<div class="textone">`` at ``joburl``."""
        jobHtml = self.GetHTML(joburl)
        return re.findall('<div class="textone">(.*?)</div>', jobHtml, re.S)

    def getJobDetailList(self, jobs):
        """Return each posting from ``jobs`` extended with its detail list.

        ``job[1]`` is used as the detail-page URL, so one HTTP request is
        made per posting.
        """
        return [
            [job[0], job[1], job[2], job[3], self.getJobDetail(job[1])]
            for job in jobs
        ]

    def showJob(self, page):
        """Print every field of every posting on listing page ``page``."""
        jobs = self.GetPage(page)
        for jobDetail in self.getJobDetailList(jobs):
            for item in jobDetail:
                print(item)

    def Start(self):
        """Show pages from ``self.page`` up to, but excluding, ``self.endPage``."""
        self.enable = True
        page = self.page
        # Logical "and" (not bitwise "&") so the test short-circuits and
        # does not depend on operator precedence.
        while self.enable and page < self.endPage:
            self.showJob(page)
            page += 1


# Crawl only when run as a script, so importing this module has no
# network side effects.
if __name__ == '__main__':
    print(u'招聘內容:')
    myModel = Spider_Model()
    myModel.Start()