爬蟲從安裝到入門

1 python環境的配置
   1.1 安裝python文件包,放到能夠找到的位置
   1.2 右鍵計算機->屬性->高級環境設置->系統變量->Path->編輯->複製python路徑位置
   1.3 管理員身份打開cmd,輸入python,測試環境是否安裝成功
![](https://img2018.cnblogs.com/blog/1586203/201907/1586203-20190713124545266-1894622414.png)html


2 安裝pycharm
    2.1 安裝pycharm文件包,放到能夠找到的位置
    2.2 新建文件夾,須要設置環境
    2.3 File->Setting->project ...->add->找到python.exepython

 


2.4 爬蟲須要的安裝包
    2.4.1 打開cmd(管理員身份)
    2.4.2 python -m pip install requests
    2.4.3 python -m pip install lxml
    2.4.4 python -m pip install bs4web

 


2.5爬取數據
    2.5.1 打開cmd 輸入scrapy startproject Demo(能夠先進入存放文件的目錄)
    2.5.2 打開pycharm打開文件Demo,新建python文件
    2.5.3 新建python文件begin.py輸入如下命令,運行begin能夠實現爬取數據
  
    2.5.4 打開settings.py設置輸出文件格式和文件位置以及User_agent
             FEED_URI = u'file:///C:/scrapy/test.csv'//輸出目錄
             FEED_FORMAT='CSV'
             FEED_EXPORT_ENCODING="gb18030"cookie


3 如下爲總框架圖片

app

 

4如下爲代碼框架

4.1爲Demo1.pydom

 1 import scrapy
 2 from Demo1.items import  Demo1Item
 3 from scrapy.http import Request
 4 from scrapy.selector import Selector
 5 class UestcInfoSpider(scrapy.spiders.Spider):
 6 
 7     #爬蟲名
 8     name = 'uestc'
 9     #設置訪問域
10     allowed_domains = ['ecar168.cn']
11     #網頁連接、
12     #start_urls = ['http://data.ecar168.cn/car/959/']
13     start_urls = ['http://www.ecar168.cn/xiaoliang/liebiao/60_0.htm']
14 
15     def parse1(self,response):
16         item = Demo1Item()
17         selector = Selector(response)
18         news1 = selector.xpath('(//tr[@class="yuefen_tr"])')
19         for each in news1:
20             item['ranking'] = each.xpath('normalize-space(td[1]/text())').extract()[0]
21             item['manufacturer'] = each.xpath('normalize-space(td[2]/text())').extract()[0]
22             item['vehicle_type'] = each.xpath('normalize-space(td[3]/text())').extract()[0] + each.xpath('normalize-space(td[3]/a/text())').extract()[0]
23             item['monthly_sales_volume'] = each.xpath('normalize-space(td[4]/text())').extract()[0]
24             item['accumulated_this_year'] = each.xpath('normalize-space(td[5]/text())').extract()[0]
25             item['last_month'] = each.xpath('normalize-space(td[6]/text())').extract()[0]
26             item['chain_ratio'] = each.xpath('normalize-space(td[7]/text())').extract()[0]
27             item['corresponding_period_of_last_year'] = each.xpath('normalize-space(td[8]/text())').extract()[0]
28             item['year_on_year'] = each.xpath('normalize-space(td[9]/text())').extract()[0]
29             item['url_title'] = selector.xpath('//div[@id="left"]/div[1]/div[2]/h1/text()').extract()[0]
30             # date1= selector.xpath('//div[@id="left"]/div[1]/div[2]/h1/text()').extract()[0]
31             #item['month'] = (selector.xpath('//div[@id="left"]/div[1]/div[3]/text()').extract()[0])[3:10]
32             # item['web'] = selector.xpath('//div[@id="left"]/div[1]/div[1]/a[3]').extract()[0]
33             #item['url_href'] = selector.xpath('//div[@id="left"]/div[1]/div[1]/a[1]//@href').extract()[0] + selector.xpath('//div[@id="left"]/div[1]/div[1]/a[3]//@href').extract()[0]
34             series = (selector.xpath('//div[@id="left"]/div[1]/div[2]/h1/text()').extract()[0])
35             begin=series.find("")
36             end=series.find("")
37             seriestest = series[begin + 1:end]
38             item['series'] = seriestest
39 
40 
41             item['url_car_detail'] = each.xpath('normalize-space(td[3]/a/@href)').extract()[0]
42             monthdan = selector.xpath('//div[@id="left"]/div[1]/div[2]/h1/text()').extract()[0][0:7]
43 
44             if monthdan.find("")==-1:
45                 month=monthdan+""
46 
47             else: month=monthdan
48             item['month']=month
49 
50             yield item
51 
52     def parse2(self,response):
53         item = Demo1Item()
54         selector = Selector(response)
55         news = selector.xpath('(//ul[@class="biaoti_ul"])')
56         for each in news:
57             url2 = 'http://www.ecar168.cn' + each.xpath('normalize-space(li[1]/a/@href)').extract()[0]
58             print(url2)
59             yield Request(url2,callback=self.parse1)
60 
61 
62     def parse(self,response):
63         #建立item
64         item = Demo1Item()
65         #分析response
66         selector = Selector(response)
67         news = selector.xpath('(//div[@id="jiludaohang"])')
68         #i=0
69         #while i <=180:
70             #url=["http://www.ecar168.cn/xiaoliang/liebiao/60_%s.htm"%i]
71             #print(url)
72         #url='http://www.ecar168.cn/xiaoliang/liebiao/60_0.htm'
73         for each in news:
74             for i in range(3,9) :
75                 url='http://www.ecar168.cn'+each.xpath('ul/li[%s]/a/@href'%i).extract()[0]
76                 print(url)
77                 yield Request(url, callback=self.parse2)
View Code

4.2 爲bigin.pyscrapy

1 from scrapy import cmdline
2 
3 cmdline.execute("scrapy crawl uestc".split())
View Code

4.3 爲settings.pyide

# -*- coding: utf-8 -*-

# Scrapy settings for Demo1 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Demo1'

SPIDER_MODULES = ['Demo1.spiders']
NEWSPIDER_MODULE = 'Demo1.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Demo1 (+http://www.yourdomain.com)'
USER_AGENT ='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Demo1.middlewares.Demo1SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Demo1.middlewares.Demo1DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'Demo1.pipelines.Demo1Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
FEED_URI = u'file:///C:/scrapy/Demotest.csv'
FEED_FORMAT='CSV'
FEED_EXPORT_ENCODING="gb18030"
View Code

4.4 爲item.py測試

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # https://doc.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
11 class Demo1Item(scrapy.Item):
12     # define the fields for your item here like:
13     # name = scrapy.Field()
14     manufacturer = scrapy.Field()
15     ranking = scrapy.Field()
16     vehicle_type = scrapy.Field()
17     monthly_sales_volume = scrapy.Field()
18     accumulated_this_year = scrapy.Field()
19     last_month = scrapy.Field()
20     chain_ratio = scrapy.Field()
21     corresponding_period_of_last_year = scrapy.Field()
22     year_on_year = scrapy.Field()
23     url_title = scrapy.Field()
24     month = scrapy.Field()
25     # url_href = scrapy.Field()
26     series = scrapy.Field()
27     url_car_detail = scrapy.Field()
28 
29     pass
View Code
相關文章
相關標籤/搜索