1.經過python爬蟲循環爬取古詩詞網站古詩名句
2.落地到本地數據庫
首先經過Firebug進行頁面定位:
其次進行源碼定位:
最終生成lxml etree定位div標籤源碼:
# Excerpt of mingJuSpider.parse: parse one listing page with lxml, walk every
# quote container via XPath, and persist (content, origin, createTime) to Mongo.
response = etree.HTML(data)
for row in response.xpath('//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'):
    # Evaluate the anchor-text XPath once per row instead of twice.
    links = row.xpath('a/text()')
    content = links[0]   # first <a>: the quote text
    origin = links[-1]   # last <a>: the poem/author it comes from
    self.db.add_new_row('mingJuSpider', {'content': content, 'origin': origin, 'createTime': str(date.today())})
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Date   : 2017/12/21 12:35
@Author : kaiqing.huang
@File   : mingJuSpider.py
'''
from utils import MySpider, MongoBase
from datetime import date
from lxml import etree
import sys


class mingJuSpider():
    """Crawl the famous-quotes listing pages of so.gushiwen.org and store
    each quote (text + source) as a row in the local Mongo collection
    'mingJuSpider' via the project's MongoBase helper."""

    def __init__(self):
        self.db = MongoBase()      # project-local Mongo persistence helper
        self.spider = MySpider()   # project-local HTTP fetch helper

    def download(self, start_page=1, end_page=116):
        """Fetch listing pages start_page..end_page (inclusive) and parse each.

        Defaults reproduce the original hard-coded range(1, 117); callers may
        narrow or extend the range without changing this method.
        """
        for page_id in range(start_page, end_page + 1):
            url = 'http://so.gushiwen.org/mingju/Default.aspx?p={}&c=&t='.format(page_id)
            print(url)  # parenthesized form works on both Python 2 and 3
            data = self.spider.get(url)
            if data:  # skip pages whose download failed / returned empty
                self.parse(data)

    def parse(self, data):
        """Extract (quote, source) pairs from one listing page HTML and persist them."""
        response = etree.HTML(data)
        for row in response.xpath('//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'):
            # Evaluate the anchor-text XPath once per row (was queried twice),
            # and guard against rows with no anchor text to avoid IndexError.
            links = row.xpath('a/text()')
            if not links:
                continue
            content = links[0]   # first <a>: the quote text
            origin = links[-1]   # last <a>: the poem/author it comes from
            self.db.add_new_row('mingJuSpider', {'content': content,
                                                 'origin': origin,
                                                 'createTime': str(date.today())})


if __name__ == '__main__':
    sys.setrecursionlimit(100000)
    do = mingJuSpider()
    do.download()