# 安裝虛擬環境
>> pip install virtualenvwrapper-win
# 新建虛擬環境
>> mkvirtualenv dirname
# 退出虛擬環境
>> deactivate
# 查看當前全部的虛擬環境
>> workon
# 進入某個虛擬環境
>> workon dirname
# 設置虛擬環境默認存放位置
>> 在環境變量中新建一個變量:WORKON_HOME,指定路徑便可
複製代碼
個人當前環境默認爲python3,因此新建python3的虛擬環境不用指明python的路徑
# 新建python3虛擬環境
>> mkvirtualenv dirname
複製代碼
# 安裝scrapy
>> pip3 install scrapy
# 安裝pymysql
>> pip3 install pymysql
# 安裝pymongo
>> pip3 install pymongo
複製代碼
安裝Scrapy時報錯:Failed building wheel for Twisted
點這裏,去下載與當前Python版本及系統位數相對應的Twisted whl文件,再用pip安裝便可
>> pip install Twisted-18.9.0-cp37-cp37m-win_amd64.whl
複製代碼
運行時報錯:ModuleNotFoundError: No module named 'win32api'
>> pip3 install pywin32
複製代碼
scrapy startproject Test
複製代碼
>> cd Test
>> cd Test
>> scrapy genspider test www.baidu.com
複製代碼
settings.py文件中的變量定義
# --- MongoDB connection settings ---
MONGODB_HOST = "127.0.0.1"   # server address
MONGODB_PORT = 27017         # server port
MONGODB_DB = "Tencent"       # database name
MONGODB_SET = "jobs"         # collection name

# --- MySQL connection settings ---
MYSQL_HOST = "localhost"     # server address
MYSQL_PORT = 3306            # server port
MYSQL_USER = "root"          # login user
MYSQL_PWD = "123456"         # login password
MYSQL_DB = "Tencent"         # database name
複製代碼
pipelines.py文件
# 這裏的數據庫文件相關的配置變量都定義在settings.py裏面
from Test.settings import *
import pymongo
import pymysql
class MongoPipeline(object):
    """Scrapy item pipeline that stores each scraped item in MongoDB.

    The connection parameters (MONGODB_HOST, MONGODB_PORT, MONGODB_DB,
    MONGODB_SET) are read from the names brought into scope by
    ``from Test.settings import *``.

    The class is named ``MongoPipeline`` so that it matches the
    ``'Test.pipelines.MongoPipeline'`` entry in ITEM_PIPELINES.
    """

    def __init__(self):
        # Client connection to the MongoDB server.
        self.conn = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        # Database handle.
        self.db = self.conn[MONGODB_DB]
        # Collection handle that items are inserted into.
        self.myset = self.db[MONGODB_SET]

    def process_item(self, item, spider):
        """Insert *item* into the collection and pass it on unchanged.

        The item is copied into a plain dict first, so the insert works
        on the copy rather than on the item handed to later pipelines.
        """
        self.myset.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.conn.close()


# Backward-compatible alias for code that still uses the old lowercase name.
mongoPipeline = MongoPipeline
class MysqlPipeline(object):
    """Scrapy item pipeline that stores each scraped item in MySQL.

    The connection parameters (MYSQL_HOST, MYSQL_PORT, MYSQL_USER,
    MYSQL_PWD, MYSQL_DB) are read from the names brought into scope by
    ``from Test.settings import *``.  Rows are written to the ``jobs``
    table.
    """

    def __init__(self):
        # One connection and one cursor are shared by every item.
        self.db = pymysql.connect(
            host=MYSQL_HOST,
            port=MYSQL_PORT,
            user=MYSQL_USER,
            password=MYSQL_PWD,
            database=MYSQL_DB,
        )
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        """Insert *item* as one row of ``jobs`` and pass it on unchanged.

        Raises:
            KeyError: if *item* lacks one of the six expected fields.
            Exception: any database error is re-raised after the
                transaction has been rolled back.
        """
        ins = 'insert into jobs(career,type,number,address,time,link) values(%s,%s,%s,%s,%s,%s)'
        # Values are passed separately so the driver handles quoting.
        params = [
            item['career'],
            item['type'],
            item['number'],
            item['address'],
            item['time'],
            item['link'],
        ]
        try:
            self.cursor.execute(ins, params)
            self.db.commit()
        except Exception:
            # The connection is reused for later items, so do not leave a
            # failed transaction open on it.
            self.db.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.db.close()
        print("Mysql 數據庫斷開鏈接")
複製代碼
# Pipelines enabled for the project, keyed by import path.
# The integer is the pipeline's order value: per the Scrapy documentation,
# items pass through pipelines from the lowest value to the highest.
ITEM_PIPELINES = {
    "Test.pipelines.TestPipeline": 300,
    "Test.pipelines.MongoPipeline": 250,
    "Test.pipelines.MysqlPipeline": 200,
}
複製代碼
# Whether the crawler obeys robots.txt rules.
ROBOTSTXT_OBEY = False

# Default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    # Fixed: the product token was misspelled "Molliza".
    "User-Agent": "Mozilla/5.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Minimum severity of log messages that are emitted.
LOG_LEVEL = "WARNING"
複製代碼