guilin.sql: MySQL table definition
-- Table `guilin_ask`: one row per scraped question thread (table comment:
-- "Guilin Q&A table"). The UNIQUE KEY on `question` means two threads with
-- the same title cannot both be stored.
-- NOTE(review): `full_question` is limited to 255 characters and `ask_time`
-- is kept as free text rather than a date type -- confirm both are intended.
-- NOTE(review): charset `utf8` in MySQL is the 3-byte variant; characters
-- outside the BMP (e.g. emoji) will not fit -- confirm utf8mb4 is not needed.
CREATE TABLE `guilin_ask` (
-- Surrogate primary key.
`id` INT(11) NOT NULL AUTO_INCREMENT COMMENT '主鍵',
-- Title of the question.
`question` VARCHAR(255) DEFAULT NULL COMMENT '問題的標題',
-- Detailed text of the question.
`full_question` VARCHAR(255) DEFAULT NULL COMMENT '問題的詳情',
-- Keywords / tags attached to the question.
`keyword` VARCHAR(255) DEFAULT NULL COMMENT '關鍵字',
-- Time the question was asked, stored as text.
`ask_time` VARCHAR(255) DEFAULT NULL COMMENT '提問時間',
-- Answer accepted by the asker.
`accept_answer` TEXT COMMENT '提問者採納的答案',
-- Answer recommended by the site.
`recommend_answer` TEXT COMMENT '旅遊推薦的答案',
-- Answer with the highest number of upvotes.
`agree_answer` TEXT COMMENT '贊同數最高的答案',
PRIMARY KEY (`id`),
UNIQUE KEY `question` (`question`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COMMENT='桂林_問答表'
guilin.py: Scrapy spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from QuestionAndAnswer.items import QuestionandanswerItem
from pyquery import PyQuery as pq
class GuilinSpider(scrapy.Spider):
    """Crawl answered Ctrip Q&A threads found by searching for "桂林" (Guilin).

    The spider starts from the search result list, follows every link it
    finds on each list page to a question detail page, and walks the
    pagination through the ``.nextpage`` link. Each detail page yields one
    ``QuestionandanswerItem`` per ``.detailmain`` container.
    """

    name = 'guilin'
    allowed_domains = ['you.ctrip.com']

    def start_requests(self):
        """Yield the single seed request for the search result list."""
        # Ctrip > guides > Q&A > keyword "桂林" > answered questions.
        ctrip_url = "http://you.ctrip.com/asks/search/?keywords=%e6%a1%82%e6%9e%97&type=2"
        yield Request(ctrip_url, callback=self.list_page)

    def list_page(self, response):
        """Yield a request per question link, then one for the next list page.

        Args:
            response: Scrapy response of a search result list page.

        Yields:
            Requests handled by ``detail_page`` for every ``href`` found on
            ``.cf`` elements, followed by a request handled by ``list_page``
            for the ``.nextpage`` link when the page has one.
        """
        result = pq(response.text)

        # Collect every href up front (before the document is mutated below);
        # elements without an href report None and are skipped.
        hrefs = (node.attr("href") for node in result(".cf").items())
        question_urls = [href for href in hrefs if href is not None]

        for url in question_urls:
            # response.follow resolves relative URLs against the response URL.
            yield response.follow(url, callback=self.detail_page)

        # scrapy.Request needs an absolute URL, so rewrite the links in place
        # before reading the pagination link.
        result.make_links_absolute(base_url="http://you.ctrip.com/")
        next_url = result(".nextpage").attr("href")
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.list_page)

    def detail_page(self, response):
        """Extract one item per ``.detailmain`` container of a question page.

        Args:
            response: Scrapy response of a question detail page.

        Yields:
            ``QuestionandanswerItem`` objects with the question title, body,
            keywords, ask time and the accepted / recommended / first-listed
            answer texts.
        """
        detail = pq(response.text)
        for frame in detail(".detailmain").items():
            ask = QuestionandanswerItem()
            ask["question"] = frame(".ask_title").text()
            ask["full_question"] = frame("#host_asktext").text()
            ask["keyword"] = frame(".asktag_oneline.cf").text()
            ask["ask_time"] = self._clean_ask_time(frame(".ask_time").text())
            ask["accept_answer"] = frame(".bestanswer_con > div > p.answer_text").text()
            ask["recommend_answer"] = frame(".youyouanswer_con > div > p.answer_text").text()
            # First entry of the reply list.
            ask["agree_answer"] = frame("#replyboxid > ul > li:nth-child(1) > div > p.answer_text").text()
            yield ask

    @staticmethod
    def _clean_ask_time(raw):
        """Return *raw* without the leading "發表於" label and outer whitespace.

        ``str.strip("發表於")`` would treat the argument as a set of
        characters and trim any of them from both ends, so the label is
        removed as an exact prefix instead.
        """
        prefix = "發表於"
        text = (raw or "").strip()
        if text.startswith(prefix):
            text = text[len(prefix):]
        return text.strip()
items.py: Scrapy item definition
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class QuestionandanswerItem(scrapy.Item):
    """One scraped question thread together with its highlighted answers.

    The field names match the column names the MySQL pipeline writes to in
    the ``guilin_ask`` table.
    """

    # Title of the question.
    question = scrapy.Field()
    # Detailed text of the question.
    full_question = scrapy.Field()
    # Keywords / tags attached to the question.
    keyword = scrapy.Field()
    # Time the question was asked.
    ask_time = scrapy.Field()
    # Answer accepted by the asker.
    accept_answer = scrapy.Field()
    # Answer recommended by the site.
    recommend_answer = scrapy.Field()
    # Answer with the highest number of upvotes.
    agree_answer = scrapy.Field()
MySQLPipline.py: MySQL storage pipeline
from pymysql import connect
class MySQLPipeline(object):
    """Scrapy item pipeline that stores question items in ``guilin_ask``.

    An item is inserted only when no row with the same ``question`` exists
    yet; items whose question is already stored are passed through without
    being written again.
    """

    # Column order here must match the parameter tuple in process_item.
    _EXISTS_SQL = "select 1 from guilin_ask WHERE question = %s limit 1"
    _INSERT_SQL = (
        "insert into guilin_ask("
        "question, full_question, keyword, ask_time, "
        "accept_answer, recommend_answer, agree_answer) "
        "VALUE (%s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self, host='192.168.1.108', port=3306, db='scrapy',
                 user='root', passwd='Abcdef@123456', charset='utf8',
                 use_unicode=True):
        """Open the MySQL connection and a cursor on it.

        All parameters are forwarded to ``pymysql.connect``. The defaults
        reproduce the values that used to be hard-coded, so constructing the
        pipeline without arguments behaves as before.

        NOTE(review): the default credentials are committed in source; they
        should be supplied from project settings or the environment instead.
        """
        self.connect = connect(
            host=host,
            port=port,
            db=db,
            user=user,
            passwd=passwd,
            charset=charset,
            use_unicode=use_unicode)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert *item* unless its question is already stored.

        Args:
            item: Mapping with the keys ``question``, ``full_question``,
                ``keyword``, ``ask_time``, ``accept_answer``,
                ``recommend_answer`` and ``agree_answer``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines still receive it.

        Raises:
            Exception: Whatever the database driver raised; the open
                transaction is rolled back before the error propagates.
        """
        try:
            # Existence check only; no need to fetch the stored TEXT columns.
            self.cursor.execute(self._EXISTS_SQL, (item['question'],))
            if self.cursor.fetchone() is None:
                self.cursor.execute(
                    self._INSERT_SQL,
                    (item['question'],
                     item['full_question'],
                     item['keyword'],
                     item['ask_time'],
                     item['accept_answer'],
                     item['recommend_answer'],
                     item['agree_answer']))
            # Commit on both paths so no transaction stays open between items.
            self.connect.commit()
        except Exception:
            # Leave the connection in a clean state for the next item.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and then the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            # Release the connection even if closing the cursor failed.
            self.connect.close()