import scrapy
from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem
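
# For reference, a minimal sketch of the item class imported above, inferred
# only from the fields detail_parse() fills below. The real definition lives in
# jobscrawler_qianchengwuyou/items.py and may differ:
#
#     class JobscrawlerQianchengwuyouItem(scrapy.Item):
#         job_name = scrapy.Field()
#         job_info = scrapy.Field()
#         job_salary = scrapy.Field()
#         job_welfare = scrapy.Field()
#         job_exp_require = scrapy.Field()
#         job_edu_require = scrapy.Field()
#         company_name = scrapy.Field()
#         company_industry = scrapy.Field()
#         company_nature = scrapy.Field()
#         company_people = scrapy.Field()
#         company_location = scrapy.Field()
#         company_overview = scrapy.Field()
#         company_financing_stage = scrapy.Field()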


class QianchengSpiderSpider(scrapy.Spider):
    name = 'qiancheng_spider'
    # allowed_domains = ['www.qq.com']
    # Each entry below is a 51job search page with the keyword hard-coded and
    # double percent-encoded; see the build_start_urls sketch after the list.
    start_urls = [
        # Keyword: data analyst (数据分析师)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: data mining (数据挖掘)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: algorithm (算法)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: machine learning (机器学习)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: deep learning (深度学习)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: artificial intelligence (人工智能)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]
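
    # A minimal sketch, not part of the original spider: the keyword segment in
    # each URL above is percent-encoded twice, so the same list can be built
    # from plain keywords. The helper name is hypothetical; the query string is
    # copied verbatim from the URLs above.
    @staticmethod
    def build_start_urls(keywords):
        from urllib.parse import quote
        template = (
            'https://search.51job.com/list/000000,000000,0000,00,9,99,{kw},2,1.html'
            '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1'
            '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
            '&specialarea=00&from=&welfare='
        )
        # quote() applied twice reproduces the double encoding seen above,
        # e.g. '算法' -> '%E7%AE%97%E6%B3%95' -> '%25E7%25AE%2597%25E6%25B3%2595'
        return [template.format(kw=quote(quote(kw))) for kw in keywords]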

    # Step 1 is the list above: the search-result URLs for every keyword.
    # Step 2: each search page is paginated, so we follow the pages and scrape
    # the data inside every detail page listed on each page.
    # First, extract the URL of every detail page from one result page; the
    # link should sit in an <a> tag.
    def parse(self, response):
        xpath = "//div[@class='el']"  # the rows matched here still need filtering
        items = response.xpath(xpath)  # also matches 'el' divs that are not job rows
        print(items)  # debug output
        for item in items:
            # Filter out the rows we don't need. How to choose the if-condition?
            # Inspecting the raw page source shows that every valid row contains
            # a <p> tag whose class is 't1' followed by a space ('t1 ').
            # A single-XPath alternative is sketched after this method.
            if not len(item.xpath("./p[@class='t1 ']")):
                continue
            # '.' means the current node, and the class value 't1 ' really does
            # end with a space: under each row we look for <p class="t1 ">.
            url = item.xpath("./p[@class='t1 ']//a/@href").extract_first()
            # The <p class="t1 "> holds a single <a>; @href is the attribute
            # with the detail-page URL.
            yield scrapy.Request(url, callback=self.detail_parse)
        # Pagination: follow the 'next page' link back into parse(), if present.
        next_page_url = response.xpath("//a[@id='rtNext']/@href").extract_first()
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse)
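
    # Equivalent rewrite (a sketch, assuming the same page structure): because
    # only valid rows have a <p class="t1 "> child, the if/continue filter in
    # parse() can be folded into a single XPath:
    #
    #     for url in response.xpath(
    #             "//div[@class='el']/p[@class='t1 ']//a/@href").extract():
    #         yield scrapy.Request(url, callback=self.detail_parse)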

    def detail_parse(self, response):
        item = JobscrawlerQianchengwuyouItem()
        # Job title; strip() removes the surrounding whitespace
        item["job_name"] = response.xpath("//div[@class='cn']/h1/text()").extract_first().strip()
        # Job description
        item["job_info"] = "".join(response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()).strip()
        # Salary
        item["job_salary"] = "".join(response.xpath('//div[@class="sp4"]/text()').extract()).strip()
        # Benefits: the bare xpath(...).extract() returns a list, but we want a
        # single string, so the values are joined with commas.
        item["job_welfare"] = ",".join(response.xpath("//span[@class='sp4']/text()").extract())
        # Experience requirement
        item["job_exp_require"] = response.xpath('//p[@class="msg ltype"]/text()').extract()[1].strip()
        # Education requirement
        item["job_edu_require"] = response.xpath('//p[@class="msg ltype"]/text()').extract()[2].strip()
        # Company name
        item["company_name"] = response.xpath('//div[@class="com_msg"]//p/text()').extract_first().strip()
        # Company industry and company type. '/..' climbs from the marker <span>
        # to its parent: the parent's text() holds the value we want, and while
        # we cannot select that parent directly, we can locate the child at the
        # same level and step up with '..'.
        # "".join(...) folds the extracted list into one string. Removing *all*
        # whitespace would need a loop over every fragment; with this much data
        # we only strip the leading and trailing whitespace.
        item["company_industry"] = "".join(response.xpath('//span[@class="i_trade"]/../text()').extract()).strip()
        item["company_nature"] = "".join(response.xpath('//span[@class="i_flag"]/../text()').extract()).strip()
        # Company size (number of employees)
        item["company_people"] = "".join(response.xpath('//span[@class="i_people"]/../text()').extract()).strip()
        # Company address (left empty here)
        item["company_location"] = ""
        # Company overview
        item["company_overview"] = "".join(response.xpath('//div[@class="tmsg inbox"]//text()').extract()).strip()
        # Company funding stage (left empty here)
        item["company_financing_stage"] = ""
        yield item
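

# The comments in detail_parse() note that removing *all* whitespace would
# require looping over every extracted fragment. A minimal sketch of such a
# cleanup helper (an assumption, not part of the original project):
def clean_whitespace(text):
    """Collapse every run of whitespace in `text` into a single space."""
    return " ".join(text.split())

# Hypothetical usage inside detail_parse():
#     item["job_info"] = clean_whitespace("".join(
#         response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()))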