import logging

import scrapy

from jobspider.items import JobspiderItem


class JobSpider(scrapy.Spider):
    """Scrape job listings from a Zhaopin search-results page.

    The single start URL is a search for the keyword ``java`` (``kw=java``),
    page 1 (``p=1``).
    """

    name = "job_spider"
    start_urls = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=java&isadv=0&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&p=1"
    ]

    def parse(self, response):
        """Yield one ``JobspiderItem`` per listing table in *response*.

        Each listing is a ``<table class="newlist">`` inside the
        ``newlist_list_content_table`` div. The item fields ``jobname``,
        ``companyname``, ``salary`` and ``workplace`` are read from the
        ``zwmc``, ``gsmc``, ``zwyx`` and ``gzdd`` cells respectively.
        Fields read with ``extract_first()`` are ``None`` when the cell has
        no matching text node.
        """
        # Debugging aid: writing ``response.body`` to a local file makes it
        # easy to inspect the raw HTML these XPath queries run against.
        listing_tables = response.xpath(
            '//div[@id="newlist_list_content_table"]/table[@class="newlist"]'
        )

        # Fields that take the first matching text node of their cell.
        single_text_queries = (
            ('companyname', './/td[@class="gsmc"]/a/text()'),
            ('salary', './/td[@class="zwyx"]/text()'),
            ('workplace', './/td[@class="gzdd"]/text()'),
        )

        # The first matched table is skipped -- presumably it is the
        # column-header table rather than a listing (TODO confirm).
        for table in listing_tables[1:]:
            entry = JobspiderItem()

            # The title is split across nested nodes (e.g. the keyword is
            # wrapped in <b>), so all text fragments are concatenated.
            title_fragments = table.xpath(
                './/td[@class="zwmc"]/div/a//text()'
            ).extract()
            entry['jobname'] = ''.join(title_fragments)

            for field_name, query in single_text_queries:
                entry[field_name] = table.xpath(query).extract_first()

            yield entry
爬取智聯招聘。
智聯 HTML 結構:
<div class="newlist_list_content" id="newlist_list_content_table"> <table class="newlist" width="853" cellspacing="0" cellpadding="0"> <tr> <td class="zwmc" style="width: 250px;"> <input name="vacancyid" data-monitor="CZ751712970J00017764214|3" value="CZ751712970J00017764214_719_1_03_409__1_" onclick="zlapply.uncheckAll('allvacancyid')" type="checkbox"> <div style="width: 224px;*width: 218px; _width:200px; float: left"> <a style="font-weight: bold" par="ssidkey=y&ss=409&ff=03&sg=df4b40a6bfaf49c08ef0cb9e8e2181f2&so=3" href="http://jobs.zhaopin.com/CZ751712970J00017764214.htm" target="_blank"><b>java</b>開發工程師 </a><a href="http://e.zhaopin.com/products/1/detail.do" target="_blank" title="點擊「頂」字,瞭解更多"><img src="/assets/images/top.png" border="0" align="absmiddle"> <img src="/assets/images/jp.gif" border="0" align="absmiddle"></a> </div> </td> <td style="width: 60px;" class="fk_lv"><span>64%</span></td> <td class="gsmc"><a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank">北京中科網聯信息技術研究院(有限合夥)</a> <a href="http://company.zhaopin.com/CZ751712970.htm" target="_blank" style="vertical-align: top;"><img src="//img03.zhaopin.cn/IHRNB/img/souvip1002.png" alt="1002" class="icon_vip" border="0" align="absmiddle"></a></td> <td class="zwyx">4001-6000</td> <td class="gzdd">鄭州</td> <td class="gxsj"><span>置頂</span><a class="newlist_list_xlbtn" href="javascript:;"></a></td> </tr> <tr style="display: none" class="newlist_tr_detail"> <td style="line-height: 0;" colspan="6" width="833px"> <div class="newlist_detail"> <div class="clearfix"> <ul> <li class="newlist_deatil_two"><span>地點:鄭州</span><span>公司性質:民營</span><span>經驗:1-3年</span><span>學歷:不限</span><span>職位月薪:4001-6000元/月</span></li><li class="newlist_deatil_last">...<b>Java</b>開發經驗,熟悉J2EE體系結構,並能熟悉掌握SSH等開源框架; 3. 能熟練掌握和開發Web Service、SOAP、Socket、NIO等開發技術,對http、tcp、udp協議有必定的瞭解; 4. 
精通Ajax、<b>Java</b>Script、HTML5等前...</li> </ul> <dl> <dt> <a href="javascript:zlapply.searchjob.ajaxApplyBrig1('CZ751712970J00017764214_719','ssi','_1_03_409__2_');searchMonitor.logSingleApplyData('CZ751712970J00017764214|3');"> <img src="/assets/images/newlist_sqimg_03.jpg"> </a> </dt> <dd><a href="javascript:zlapply.searchjob.saveOne('CZ751712970J00017764214_719');"><img src="/assets/images/newlist_scimg_06.jpg"></a></dd> </dl> </div> </div> </td></tr> </table> </div>