Python小爬蟲

  自己琢磨寫了一個Python的小爬蟲,用來爬取學校的招聘信息,以下是代碼。

 1 __author__ = 'WCQ'
 2 # -*- coding: utf-8 -*-
 3 
 4 import urllib2
 5 import urllib
 6 import re
 7 import thread
 8 import time
 9 
10 
11 #----------- 加載招聘信息 -----------
class Spider_Model:
    """Crawl job postings from www.job.ustc.edu.cn and print them.

    For every listing page in ``[page, endPage)`` the crawler downloads the
    page, extracts each posting with regular expressions, downloads the
    posting's detail page and prints all collected fields.

    Attributes:
        page: First listing page that ``Start`` crawls.
        enable: Loop flag; ``Start`` sets it to True and keeps crawling
            while it stays truthy.
        endPage: Exclusive upper bound of the page range. With the
            defaults (page=1, endPage=2) only page 1 is crawled.
    """

    # Site root; hrefs taken from the listing page are appended to it.
    BASE_URL = "http://www.job.ustc.edu.cn/"
    # Listing URL is LIST_URL_PREFIX + <page number> + LIST_URL_SUFFIX.
    LIST_URL_PREFIX = BASE_URL + "list.php?trans=7&page="
    LIST_URL_SUFFIX = "&MenuID=002002"

    # re.S lets "." match newlines, so a match may span several lines.
    _LIST_RE = re.compile(r'<div class="Joplistone">(.*?)</div>', re.S)
    _ITEM_RE = re.compile(
        r'<li><a href="(.*?)" style="color:#">(.*?)</a>'
        r'<span class="zhiwei">(.*?)</span>'
        r'<span class="zhuanye">(.*?)</span></li>',
        re.S)
    _DETAIL_RE = re.compile(r'<div class="textone">(.*?)</div>', re.S)

    def __init__(self):
        self.page = 1
        self.enable = False
        self.endPage = 2

    def GetHTML(self, myUrl):
        """Download ``myUrl`` and return its body decoded from GBK.

        Args:
            myUrl: Absolute URL to fetch.

        Returns:
            The response body as a unicode string.

        Raises:
            UnicodeDecodeError: If the body is not valid GBK.
            Whatever ``urllib2.urlopen`` raises on network/HTTP failure.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myUrl, headers=headers)
        myResponse = urllib2.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            # Release the connection even if reading fails.
            myResponse.close()
        # decode() turns the GBK byte string into a unicode string.
        return myPage.decode("GBK")

    def GetPage(self, page):
        """Return the postings found on listing page number ``page``.

        Only the first ``<div class="Joplistone">`` block of the page is
        scanned for ``<li>`` entries.

        Args:
            page: Listing page number; it is converted with ``str``.

        Returns:
            A list with one ``[name, url, position, extra]`` list per
            posting, where ``name`` is the link text, ``url`` is the href
            prefixed with ``BASE_URL``, ``position`` is the text of the
            "zhiwei" span and ``extra`` is the text of the "zhuanye" span.
            An empty list is returned when the page has no "Joplistone"
            block.
        """
        myUrl = self.LIST_URL_PREFIX + str(page) + self.LIST_URL_SUFFIX
        unicodePage = self.GetHTML(myUrl)
        jobList = self._LIST_RE.findall(unicodePage)
        if not jobList:
            # Layout changed or page out of range: nothing to extract.
            return []
        jobs = []
        for href, name, position, extra in self._ITEM_RE.findall(jobList[0]):
            # NOTE(review): the original author labelled `name` as the
            # hiring company and `extra` as the publish date, although the
            # latter comes from the "zhuanye" span -- confirm against the
            # live page.
            jobs.append([name, self.BASE_URL + href, position, extra])
        return jobs

    def getJobDetail(self, joburl):
        """Return the contents of every "textone" div on a posting page.

        Args:
            joburl: Absolute URL of the posting's detail page.

        Returns:
            A list of unicode strings, one per matched div (possibly empty).
        """
        jobHtml = self.GetHTML(joburl)
        return self._DETAIL_RE.findall(jobHtml)

    def getJobDetailList(self, jobs):
        """Attach the detail-page content to each posting.

        Args:
            jobs: Postings as returned by ``GetPage``.

        Returns:
            A new list whose entries are the four original fields followed
            by the result of ``getJobDetail`` for the posting's URL. One
            detail page is downloaded per posting.
        """
        jobDetailList = []
        for job in jobs:
            detail = self.getJobDetail(job[1])
            jobDetailList.append([job[0], job[1], job[2], job[3], detail])
        return jobDetailList

    def showJob(self, page):
        """Fetch listing page ``page`` and print every field of every posting."""
        jobs = self.GetPage(page)
        for jobDetail in self.getJobDetailList(jobs):
            for item in jobDetail:
                # Single-argument parenthesised form prints the same value
                # under the Python 2 print statement and Python 3 print().
                print(item)

    def Start(self):
        """Crawl and print pages ``self.page`` up to, not including, ``self.endPage``."""
        self.enable = True
        page = self.page
        # Logical `and` (not bitwise `&`): short-circuits and works for any
        # truthy/falsy value of self.enable.
        while self.enable and page < self.endPage:
            self.showJob(page)
            page += 1
77 
78 
# Script entry: print the heading ("Job postings:"), then crawl the configured
# page range and print every posting found.
print(u'招聘內容:')
myModel = Spider_Model()
myModel.Start()

參考資料:[Python]網絡爬蟲(八):糗事百科的網絡爬蟲(v0.3)源碼及解析(簡化更新)

相關文章
相關標籤/搜索