Scraping xcar (愛卡汽車) car detail pages

A practice crawler built mainly on requests. Since the contents of a script tag have to be analysed, I went straight to re regular expression matching; normally BeautifulSoup could be used as well, which is more convenient for HTML (a quick sketch of that follows below).
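For the image URL and caption, a BeautifulSoup version would only take a few lines; here is a minimal sketch. The zs_img / zs_t class names are the ones matched by the regexes in the full code below, and the selectors are illustrative assumptions, not tested against the live page. Note that "var nextUrl" lives inside a script tag, so a regex would still be needed for that part either way.

# Sketch only: extracting the picture URL and caption with BeautifulSoup
# instead of re. The div class names (zs_img, zs_t) are assumed from the
# regexes used later in this post.
import requests
from bs4 import BeautifulSoup

def parse_with_bs(url):
    r = requests.get(url)
    r.encoding = 'gbk'  # the site serves GBK-encoded pages
    soup = BeautifulSoup(r.text, 'html.parser')
    img = soup.select_one('div.zs_img img')
    title = soup.select_one('div.zs_t')
    pic_url = img['src'] if img else None
    caption = title.get_text(strip=True) if title else ''
    return pic_url, caption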

Approach:

Read the first page, e.g. http://newcar.xcar.com.cn/2674/2015/detail/1.htm; to crawl everything, we always start from the URL ending in 1.htm.

Find "var nextUrl" inside the page's script tag; this address is the next page in the auto-play sequence. At the same time, read the img src, i.e. the image URL, and save the image (here I put the caption and description into the destination file name; if all you want is the images, this is completely unnecessary).

Recursively crawl all the pages.

One extra thing done here: an xcar_lst records the information about every page and image. It is only kept as a record and isn't used for now (a sketch of dumping it to JSON follows below).
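Since the list is only kept as a record, one simple way to make it useful later (not part of the original script, just a sketch) is to dump it to a JSON file once the crawl finishes, e.g. dump_records(xcar._xcar_lst, BASE_FOLDER + 'records.json') after the crawl.

# Sketch: persist the collected records (dicts with pic_url / next_url /
# title / cont / model keys) to a JSON file after the crawl.
import json

def dump_records(records, path):
    # the default ensure_ascii=True keeps the output plain ASCII, so this
    # writes safely under both Python 2 and Python 3
    with open(path, 'w') as f:
        json.dump(records, f, indent=2)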

The code:

# coding:utf-8



__author__ = 'BONFY CHEN'



import requests

import re



PROXIES = None



HEADERS = {

    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36'

    , 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    , 'Accept-Encoding': 'gzip,deflate,sdch'

    , 'Accept-Language': 'zh-CN,zh;q=0.8'

}



BASE_FOLDER = 'D:/xxx_folder/'





class xcarDown(object):



    _base_folder = None

    _proxies = None

    _headers = None

    _website = 'http://newcar.xcar.com.cn'

    _xcar_lst = []



    def set_base_folder(self, base_folder):

        self._base_folder = base_folder



    def set_headers(self, headers):

        self._headers = headers



    def set_proxies(self, proxies):

        self._proxies = proxies



    def __init__(self, base_folder=BASE_FOLDER, proxies=PROXIES, headers=HEADERS):

        self.set_base_folder(base_folder)

        self.set_headers(headers)

        self.set_proxies(proxies)



    def download_image_from_url(self, url, name=None):



        """

        download_image_from_url

        :param url: the resource image url

        :param name: the destination file name

        :return:

        """



        local_filename = name + '_' + url.split('/')[-1]

        r = requests.get(url, proxies=self._proxies, headers=self._headers, stream=True)

        with open(self._base_folder + local_filename, 'wb') as f:

            for chunk in r.iter_content(chunk_size=1024):

                if chunk:

                    f.write(chunk)

                    f.flush()


        return local_filename





    def download_xcar(self, url):



        """



        :param url: the source url in xcar.com.cn

                    http://newcar.xcar.com.cn/2674/2015/detail/1.htm

        :return:

        """



        r = requests.get(url, proxies=self._proxies, headers=self._headers)

        # print r.encoding

        r.encoding = 'gbk'



        # "var nextUrl" in the page script points to the next page in the auto-play sequence
        m1 = re.search(r"var nextUrl = '(?P<n_url>.*\.htm)'", r.text)

        next_url = m1.groupdict()['n_url'] if m1 else None



        # the picture URL sits in an <img> inside the zs_img div
        m2 = re.search(r"<div class=\"zs_img\"><img src=\"(?P<pic_url>.*\.jpg)\"", r.text)

        pic_url = m2.groupdict()['pic_url'] if m2 else None



        # zs_t and zs_c hold the caption title and description; <title> carries the model name
        m3 = re.search(r"<div class=\"zs_t\">(?P<title>.*)</div>", r.text)

        title = m3.groupdict()['title'] if m3 else ''



        m4 = re.search(r"<div class=\"zs_c\">(?P<cont>.*)</div>", r.text)

        cont = m4.groupdict()['cont'] if m4 else ''



        m5 = re.search(r"<title>(?P<model>.*)</title>", r.text)

        model = m5.groupdict()['model'] if m5 else ''



        if pic_url:

            try:

                self.download_image_from_url(pic_url, name='_'.join([model, title, cont]))

                print 'download complete: pic from {} '.format(pic_url)

            except IOError:

                print 'file name IOERROR'

                self.download_image_from_url(pic_url, name=model)

                print 'download complete: pic from {} '.format(pic_url)

            except Exception as e:

                print e



        dct = dict(pic_url=pic_url, next_url=next_url, title=title, cont=cont, model=model)

        self._xcar_lst.append(dct)

        # follow the auto-play chain until nextUrl no longer points at a .htm page
        if next_url and next_url[-4:] == '.htm':

            self.download_xcar(self._website + next_url)





if __name__ == '__main__':

    print("Welcome to the Pic Download for xcar.com")

    print("Downloaded files in the folder: " + BASE_FOLDER )

    print("---------------------------------------")



    id_modell = int(input("Please enter the model id (e.g. 2674): "))

    year = int(input("Please enter the year (e.g. 2015): "))



    url = 'http://newcar.xcar.com.cn/{}/{}/detail/1.htm'.format(id_modell, year)

    xcar = xcarDown()

    xcar.download_xcar(url)
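One caveat: download_xcar calls itself once per page, so a model with a very long detail sequence could in principle run into Python's default recursion limit (around 1000 frames). If that ever happens, the page-following step can be rewritten as a plain loop. A rough sketch is below; parse_page is a hypothetical helper that would wrap the regex extraction from download_xcar and return the pic_url and next_url for one page.

# Sketch: iterative page-following instead of recursion, to avoid deep
# call stacks on long detail sequences. parse_page is a hypothetical
# helper returning (pic_url, next_url) for a single page.
def crawl_iteratively(downloader, start_url, parse_page):
    url = start_url
    seen = set()  # guard against accidental loops between pages
    while url and url not in seen:
        seen.add(url)
        pic_url, next_url = parse_page(downloader, url)
        if pic_url:
            # 'pic' is just a placeholder file-name prefix for this sketch
            downloader.download_image_from_url(pic_url, name='pic')
        url = downloader._website + next_url if next_url and next_url.endswith('.htm') else None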