items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # id
    obj_id=scrapy.Field()
    # job title
    positon_name=scrapy.Field()
    # work location
    work_place=scrapy.Field()
    # publish date
    publish_time=scrapy.Field()
    # salary
    salary=scrapy.Field()
    # work experience
    work_experience=scrapy.Field()
    # education
    education=scrapy.Field()
    # full_time
    full_time=scrapy.Field()
    # tags
    tags=scrapy.Field()
    # company name
    company_name=scrapy.Field()
    # # industry
    # industry=scrapy.Field()
    # job perks
    job_temptation=scrapy.Field()
    # job description
    job_desc=scrapy.Field()
    # company logo URL
    logo_image=scrapy.Field()
    # business field
    field=scrapy.Field()
    # development stage
    stage=scrapy.Field()
    # company size
    company_size=scrapy.Field()
    # company homepage
    home = scrapy.Field()
    # job publisher
    job_publisher=scrapy.Field()
    # investors
    financeOrg=scrapy.Field()
    # crawl time
    crawl_time=scrapy.Field()
spiders > lagou.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from LaGou.items import LagouItem
from LaGou.utils.MD5 import get_md5
from datetime import datetime


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/zhaopin/']
    content_links=LinkExtractor(allow=(r"https://www.lagou.com/jobs/\d+.html"))
    page_links=LinkExtractor(allow=(r"https://www.lagou.com/zhaopin/\d+"))
    rules = (
        Rule(content_links, callback="parse_item", follow=False),
        Rule(page_links, follow=True)
    )

    def parse_item(self, response):
        item=LagouItem()
        # use the MD5 hash of the job page URL as the ID
        item["obj_id"]=get_md5(response.url)
        # company name
        item["company_name"]=response.xpath('//dl[@class="job_company"]//a/img/@alt').extract()[0]
        # job title
        item["positon_name"]=response.xpath('//div[@class="job-name"]//span[@class="name"]/text()').extract()[0]
        # salary
        item["salary"]=response.xpath('//dd[@class="job_request"]//span[1]/text()').extract()[0]
        # work location
        work_place=response.xpath('//dd[@class="job_request"]//span[2]/text()').extract()[0]
        item["work_place"]=work_place.replace("/","")
        # work experience
        work_experience=response.xpath('//dd[@class="job_request"]//span[3]/text()').extract()[0]
        item["work_experience"]=work_experience.replace("/","")
        # education
        education=response.xpath('//dd[@class="job_request"]//span[4]/text()').extract()[0]
        item["education"]=education.replace("/","")
        # full_time
        item['full_time']=response.xpath('//dd[@class="job_request"]//span[5]/text()').extract()[0]
        # tags
        tags=response.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()').extract()
        item["tags"]=",".join(tags)
        # publish_time
        item["publish_time"]=response.xpath('//dd[@class="job_request"]//p[@class="publish_time"]/text()').extract()[0]
        # job perks
        job_temptation=response.xpath('//dd[@class="job-advantage"]/p/text()').extract()
        item["job_temptation"]=",".join(job_temptation)
        # job description
        job_desc=response.xpath('//dd[@class="job_bt"]/div//p/text()').extract()
        item["job_desc"]=",".join(job_desc).replace("\xa0","").strip()
        # job_publisher
        item["job_publisher"]=response.xpath('//div[@class="publisher_name"]//span[@class="name"]/text()').extract()[0]
        # company logo URL
        logo_image=response.xpath('//dl[@class="job_company"]//a/img/@src').extract()[0]
        item["logo_image"]=logo_image.replace("//","")
        # business field
        field=response.xpath('//ul[@class="c_feature"]//li[1]/text()').extract()
        item["field"]="".join(field).strip()
        # development stage
        stage=response.xpath('//ul[@class="c_feature"]//li[2]/text()').extract()
        item["stage"]="".join(stage).strip()
        # investors
        financeOrg=response.xpath('//ul[@class="c_feature"]//li[3]/p/text()').extract()
        if financeOrg:
            item["financeOrg"]="".join(financeOrg)
        else:
            item["financeOrg"]=""
        # company size (the feature list shifts up by one entry when there is no investor item)
        if financeOrg:
            company_size= response.xpath('//ul[@class="c_feature"]//li[4]/text()').extract()
            item["company_size"]="".join(company_size).strip()
        else:
            company_size = response.xpath('//ul[@class="c_feature"]//li[3]/text()').extract()
            item["company_size"] = "".join(company_size).strip()
        # company homepage
        item["home"]=response.xpath('//ul[@class="c_feature"]//li/a/@href').extract()[0]
        # crawl time
        item["crawl_time"]=datetime.now()

        yield item
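The spider imports get_md5 from LaGou.utils.MD5, a small helper module that is not listed in this post. A minimal sketch of what it likely does (hex-digesting the page URL so it can serve as a stable, fixed-length ID) is shown below; this is a hypothetical reconstruction, not the author's original file.

# LaGou/utils/MD5.py -- hypothetical sketch, not the original source
import hashlib


def get_md5(url):
    """Return the hex MD5 digest of a URL for use as a stable primary key."""
    if isinstance(url, str):
        url = url.encode("utf-8")  # hashlib works on bytes
    return hashlib.md5(url).hexdigest()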
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class LagouPipeline(object):

    def process_item(self, item, spider):
        con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801", db="lagou", charset="utf8")
        cur = con.cursor()
        sql = ("insert into lagouwang(obj_id,company_name,positon_name,salary,work_place,work_experience,education,full_time,tags,publish_time,job_temptation,job_desc,job_publisher,logo_image,field,stage,financeOrg,company_size,home,crawl_time)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (item["obj_id"], item["company_name"], item["positon_name"], item["salary"], item["work_place"],
               item["work_experience"], item["education"], item['full_time'], item["tags"], item["publish_time"],
               item["job_temptation"], item["job_desc"], item["job_publisher"], item["logo_image"], item["field"],
               item["stage"], item["financeOrg"], item["company_size"], item["home"], item["crawl_time"])
        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()

        return item
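As written, process_item opens and closes a fresh MySQL connection for every item, which is slow on a large crawl. A common Scrapy pattern is to open the connection once in open_spider and close it in close_spider. A minimal sketch of that variant (same database and table as above, column list shortened for brevity; this is not the author's code) could look like:

import pymysql


class LagouMySQLPipeline(object):
    """Variant of LagouPipeline that reuses one connection for the whole crawl."""

    def open_spider(self, spider):
        # called once when the spider starts
        self.con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801",
                                   db="lagou", charset="utf8")
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        # called once when the spider finishes
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        # column list shortened here; the real insert writes all 20 fields
        sql = ("insert into lagouwang(obj_id,company_name,positon_name,salary) "
               "VALUES (%s,%s,%s,%s)")
        self.cur.execute(sql, (item["obj_id"], item["company_name"],
                               item["positon_name"], item["salary"]))
        self.con.commit()
        return item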
middlewares.py
from scrapy import signals
import random
# from LaGou.settings import USER_AGENTS
from fake_useragent import UserAgent


class RandomUserAgent(object):
    # def __init__(self, crawl):
    #     super(RandomUserAgent, self).__init__()
    #     self.ua = UserAgent()
    def process_request(self, request, spider):
        # useragent = random.choice(USER_AGENTS)
        ua = UserAgent()
        request.headers.setdefault("User-Agent", ua.random)
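Note that process_request builds a new UserAgent() object for every request, which re-loads the fake_useragent data each time; the commented-out __init__ hints at the cheaper approach of creating it once per middleware instance. A minimal sketch of that version, keeping the same class name, would be:

from fake_useragent import UserAgent


class RandomUserAgent(object):
    """Downloader middleware that attaches a random User-Agent to each request."""

    def __init__(self):
        # build the User-Agent pool once instead of on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers.setdefault("User-Agent", self.ua.random)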
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for LaGou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'LaGou'

SPIDER_MODULES = ['LaGou.spiders']
NEWSPIDER_MODULE = 'LaGou.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'LaGou (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# USER_AGENTS = [
#     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
#     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
#     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
#     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
#     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
# ]

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'LaGou.middlewares.LagouSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'LaGou.middlewares.RandomUserAgent': 1,
    # 'LaGou.middlewares.MyCustomDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    # 'LaGou.pipelines.LagouPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
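The scrapy_redis scheduler, dupefilter, and pipeline configured above all need a Redis connection; with no explicit settings, scrapy_redis falls back to a local Redis on the default port. If Redis runs elsewhere, connection settings along the following lines (the host and port shown are placeholders, not values from the original project) would be added to settings.py:

# Redis connection used by scrapy_redis (placeholder values)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# or, equivalently, a single connection URL:
# REDIS_URL = 'redis://127.0.0.1:6379'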
Redis data:
MySQL data:
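With RedisPipeline enabled and LagouPipeline commented out in settings.py, scraped items land in Redis rather than in MySQL directly (scrapy_redis's RedisPipeline pushes JSON-serialized items to a list keyed <spider>:items by default, i.e. lagou:items here), so the MySQL table is presumably filled by a separate transfer step. A minimal sketch of such a script, assuming the default key, the credentials used above, and a shortened column list, might look like:

# Hypothetical helper that drains items from Redis into MySQL; not part of the original post.
import json

import pymysql
import redis

r = redis.Redis(host="127.0.0.1", port=6379)
con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801", db="lagou", charset="utf8")
cur = con.cursor()

while True:
    data = r.lpop("lagou:items")  # RedisPipeline rpushes serialized items onto this list
    if data is None:
        break
    item = json.loads(data)
    # column list shortened here; a real transfer would insert all 20 fields
    cur.execute("insert into lagouwang(obj_id,company_name,positon_name,salary) VALUES (%s,%s,%s,%s)",
                (item["obj_id"], item["company_name"], item["positon_name"], item["salary"]))
    con.commit()

cur.close()
con.close()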
Disclaimer: the above is provided for reference, learning, and exchange only! More at: https://github.com/huwei86/spiderlagouapi