# -*- coding: utf-8 -*-


import scrapy

from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem

class QianchengSpiderSpider(scrapy.Spider):
    name = 'qiancheng_spider'
    # allowed_domains = ['www.qq.com']
    start_urls = [
        # Keyword: 数据分析师 (data analyst)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',

        # Keyword: 数据挖掘 (data mining)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',

        # Keyword: 算法 (algorithm)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',

        # Keyword: 机器学习 (machine learning)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',

        # Keyword: 深度学习 (deep learning)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',

        # Keyword: 人工智能 (artificial intelligence)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]
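    # The keyword inside each URL is double percent-encoded: 51job expects the
    # UTF-8 percent-encoding to be percent-encoded once more. A minimal sketch
    # of how such a token could be produced with the standard library
    # (the keyword shown is the one from the first URL):
    #
    #     from urllib.parse import quote
    #     token = quote(quote('数据分析师'))
    #     # -> '%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588'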
    # Step 1 (above): the start_urls are the search result pages for each keyword.
    # Step 2: each search page is paginated, so we have to follow the "next page"
    # links and extract the data from every detail page on every page.
    # So first we extract the detail-page URLs from one listing page; each of
    # those links should live in an <a> tag.

    def parse(self, response):
        xpath = "//div[@class='el']"  # select the result rows with this XPath
        items = response.xpath(xpath)  # this also matches <div class="el"> rows that don't meet our criteria
        print(items)
        for item in items:
            # Walk through items and filter out the rows we don't need.
            # How do we pick the if-condition below? Looking at the raw page
            # source, every real job row has a <p class="t1 "> child (note the
            # trailing space after t1), so rows without one get skipped.
            if not item.xpath("./p[@class='t1 ']"):
                continue
            # "." means the current node: underneath this row we check whether
            # there is a <p> whose class is exactly 't1 ' (with the space).
            url = item.xpath("./p[@class='t1 ']//a/@href").extract_first()
            # The <p class="t1 "> holds a single <a> tag; @href takes the
            # detail-page URL from it.
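            # For illustration, a listing row looks roughly like this
            # (simplified from the real page source; the href is an example):
            #
            #     <div class="el">
            #         <p class="t1 "><a href="https://jobs.51job.com/...">...</a></p>
            #         ...
            #     </div>
            #
            # so "./p[@class='t1 ']" keeps only the real job rows, and
            # "./p[@class='t1 ']//a/@href" digs out the detail-page link.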
            yield scrapy.Request(url, callback=self.detail_parse)
        # Pagination: follow the "next page" link, if there is one.
        next_page_url = response.xpath("//a[@id='rtNext']/@href").extract_first()
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse)
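        # Note: if extract_first() ever returned a relative href, Scrapy's
        # response.urljoin() would normalize it against the current page, e.g.:
        #     yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)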

    def detail_parse(self, response):
        item = JobscrawlerQianchengwuyouItem()
        # Job title
        item["job_name"] = response.xpath("//div[@class='cn']/h1/text()").extract_first().strip()
        # strip() gives us the job_name without surrounding whitespace.

        # Job description
        item["job_info"] = "".join(response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()).strip()
        # Salary
        item["job_salary"] = "".join(response.xpath('//div[@class="sp4"]/text()').extract()).strip()
        # Benefits
        item["job_welfare"] = ",".join(response.xpath("//span[@class='sp4']/text()").extract())
        # response.xpath("//span[@class='sp4']/text()").extract() returns a list,
        # but we need a single string, hence the join.
        # Experience requirement
        item["job_exp_require"] = response.xpath('//p[@class="msg ltype"]/text()').extract()[1].strip()
        # Education requirement
        item["job_edu_require"] = response.xpath('//p[@class="msg ltype"]/text()').extract()[2].strip()

        # Company name
        item["company_name"] = response.xpath('//div[@class="com_msg"]//p/text()').extract_first().strip()
        # Company industry
        item["company_industry"] = "".join(response.xpath('//span[@class="i_trade"]/../text()').extract()).strip()
        # Company type
        item["company_nature"] = "".join(response.xpath('//span[@class="i_flag"]/../text()').extract()).strip()
        # ".." steps up to the parent node: we cannot address the parent tag
        # directly, but we can locate its child <span> and climb up with "..",
        # then take the parent's text.
        # "".join(...): extract() returns a list; joining it onto "" turns it
        # into a single str.
        # Removing all the inner whitespace would need a for loop; with this
        # much data, trimming the leading/trailing whitespace with strip() is enough.
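        # A concrete example of the pattern above, assuming markup like
        #     <p><span class="i_trade"></span>互联网/电子商务</p>
        # //span[@class='i_trade']/../text() selects the text nodes of the parent
        # <p>; extract() then yields something like ['互联网/电子商务  '], and
        # "".join(...).strip() collapses that to '互联网/电子商务'.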
        # Company size
        item["company_people"] = "".join(response.xpath('//span[@class="i_people"]/../text()').extract()).strip()
        # Company address
        item["company_location"] = ""
        # Company overview
        item["company_overview"] = "".join(response.xpath('//div[@class="tmsg inbox"]//text()').extract()).strip()

        # Company financing stage
        item["company_financing_stage"] = ""
        yield item
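The spider fills a JobscrawlerQianchengwuyouItem; the item class itself is not shown above, but a minimal items.py matching the fields used here would look like this (the field names are taken from the assignments in detail_parse):

import scrapy

class JobscrawlerQianchengwuyouItem(scrapy.Item):
    job_name = scrapy.Field()
    job_info = scrapy.Field()
    job_salary = scrapy.Field()
    job_welfare = scrapy.Field()
    job_exp_require = scrapy.Field()
    job_edu_require = scrapy.Field()
    company_name = scrapy.Field()
    company_industry = scrapy.Field()
    company_nature = scrapy.Field()
    company_people = scrapy.Field()
    company_location = scrapy.Field()
    company_overview = scrapy.Field()
    company_financing_stage = scrapy.Field()

The spider can then be run from the project directory with, for example, scrapy crawl qiancheng_spider -o jobs.csv.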