import scrapy
import sys
# import io
# Uncomment (together with ``import io`` above) to re-wrap stdout in an
# explicit encoding when the console cannot print the scraped text:
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# NOTE(review): HtmlXPathSelector is unused in this module and has been
# deprecated in favour of Selector -- confirm it still exists in the
# installed Scrapy version before relying on this import.
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    """Spider that prints the text of every entry listed on the start page."""

    name = 'chouti'
    # allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        """Print the stripped link text of each item in the content list.

        Args:
            response: The downloaded page handed to the callback by Scrapy.

        Items that have no matching link text are skipped instead of
        aborting the whole callback.
        """
        # List of selector objects, one per matched "item" <div>.
        items = Selector(response=response).xpath(
            '//div[@id="content-list"]/div[@class="item"]'
        )
        for item in items:
            text = item.xpath(
                './/a[@class="show-content color-chag"]/text()'
            ).extract_first()
            # extract_first() yields None when nothing matched; calling
            # .strip() on it would raise AttributeError.
            if text is None:
                continue
            print(text.strip())


# XPath / selector quick reference:
#   //                 search among all descendants
#   .//                search among the descendants of the current object
#   /                  direct children
#   /div               direct children that are <div> tags
#   /div[@id="i1"]     direct <div> children whose id is "i1"
#   obj.extract()        convert every object in the list to a string => list
#   obj.extract_first()  same conversion, but return only the first element
#   //div/text()       get the text of a tag