//div[@class="j-r-list-c-desc"]/a/text()
put() : put an item into the queue
get() : take an item out of the queue
Queue.empty() : check whether the queue is empty
Queue.join() : block until the queue is empty, then let the rest of the program run
threading.Thread(target=...) : create a worker thread
import requests
from lxml import etree
from queue import Queue
import threading
import time


class BsSpider:
    def __init__(self):
        self.baseurl = "http://www.budejie.com/"
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.urlQueue = Queue()   # URL queue
        self.resQueue = Queue()   # response queue

    # build the URL queue
    def get_url(self):
        for num in range(1, 51):
            url = self.baseurl + str(num)  # page numbers 1 to 50
            self.urlQueue.put(url)

    # fetch pages and fill the response queue
    def get_html(self):
        while True:
            url = self.urlQueue.get()
            res = requests.get(url, headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text
            # put the page source into the response queue
            self.resQueue.put(html)
            # mark this URL task as done
            self.urlQueue.task_done()

    # parse pages
    def get_content(self):
        while True:
            # take one html source from the response queue
            html = self.resQueue.get()
            parse_html = etree.HTML(html)
            r_list = parse_html.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
            for r in r_list:
                print(r + "\n")
            # mark this parse task as done
            self.resQueue.task_done()

    def main(self):
        # hold all threads
        thread_list = []
        # build the URL queue
        self.get_url()
        # create fetcher threads
        for i in range(3):
            thread_res = threading.Thread(target=self.get_html)
            thread_list.append(thread_res)
        # create parser threads
        for i in range(2):
            thread_parse = threading.Thread(target=self.get_content)
            thread_list.append(thread_parse)
        # start all threads
        for th in thread_list:
            th.daemon = True
            th.start()
        # block until both queues are empty, then continue
        self.urlQueue.join()
        self.resQueue.join()
        print("Finished")


if __name__ == '__main__':
    begin = time.time()
    spider = BsSpider()
    spider.main()
    end = time.time()
    print("Elapsed time:", end - begin)
python -m pip install beautifulsoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.find_all(name="tag_name")   # the name argument filters by tag name
soup.tag_name
: e.g. soup.a, soup.ul
soup.tag_name.string
: text content of that node
find_all()
: returns a list
r_list = soup.find_all(attr_name="attr_value")
r_list = soup.find_all(class="test")
# raises a SyntaxError because class is a Python keyword; use class_ instead,
# or pass the class through attrs:
r_list = soup.find_all("tag_name", attrs={"attr_name": "attr_value"})
r_list = soup.find_all("div", attrs={"class": "test"})
from bs4 import BeautifulSoup

html = '<div id="text">哈哈</div>'
# create the parser object
soup = BeautifulSoup(html, 'lxml')
# find nodes by attribute
r_list = soup.find_all(id="text")
print(r_list)
for r in r_list:
    print(r.get_text())

r_list = soup.find_all("div", attrs={'id': "text"})
print(r_list)

####################################
html = '''<div class="test">你好</div>
<div class="test">再見</div>
<div class="test2">
    <span>第二次</span>
</div>'''

# text of the divs whose class is "test"
soup = BeautifulSoup(html, 'lxml')
divs = soup.find_all("div", attrs={"class": "test"})
print(type(divs))
for div in divs:
    print(div.string)
    print(div.get_text())

# text inside the span under the div whose class is "test2"
divs = soup.find_all("div", attrs={"class": "test2"})
for div in divs:
    print(div.span.string)
scrapy startproject project_name
scrapy genspider spider_name domain
scrapy crawl spider_name
# whether to obey the robots.txt protocol; change it to False
ROBOTSTXT_OBEY = False

# maximum number of concurrent requests, default is 16
CONCURRENT_REQUESTS = 32

# download delay of 3 seconds
DOWNLOAD_DELAY = 3

# default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': "Mozilla/5.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# spider middlewares
SPIDER_MIDDLEWARES = {
    'testspider.middlewares.TestspiderSpiderMiddleware': 543,
}

# downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'testspider.middlewares.TestspiderDownloaderMiddleware': 543,
}

# item pipelines
ITEM_PIPELINES = {
    'testspider.pipelines.TestspiderPipeline': 300,
}
scrapy startproject baidu
cd baidu/baidu
subl items.py    # editing items.py is not needed for this example
cd spiders
scrapy genspider baiduspider baidu.com
subl settings.py
cd spiders
scrapy crawl baiduspider
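After these steps, the spider file created by genspider looks roughly like the sketch below; the parse() body is only an assumed example here (the generated template leaves it as pass).

```python
# spiders/baiduspider.py -- roughly what `scrapy genspider baiduspider baidu.com` creates;
# the parse() body is an assumed example, the real template just contains `pass`
import scrapy


class BaiduspiderSpider(scrapy.Spider):
    name = 'baiduspider'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        # print the page title as a quick check that the crawl works
        print(response.xpath('//title/text()').extract_first())
```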
Running a Scrapy project from PyCharm
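One common approach, shown here as an assumption rather than the only way, is to drop a small launcher script next to scrapy.cfg and run that script from PyCharm; the spider name baiduspider follows the example above.

```python
# begin.py -- run this file from PyCharm to start the crawl
from scrapy import cmdline

# equivalent to typing `scrapy crawl baiduspider` on the command line
cmdline.execute('scrapy crawl baiduspider'.split())
```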
Generators
# Fib.py
def fib(n):
    a, b, s = 0, 1, 0
    while s < n:
        a, b = b, a + b
        s += 1
        yield b


print(fib(5).__next__())

for i in fib(10):
    print(i)
Output: 1 1 2 3 5 8 13 21 34 55 89
Key points
extract()
: gets the text content out of selector objects
response.xpath('.../text()')
    returns a list of selector objects (node text): [<Selector ..., data='text content'>]
extract()
    pulls the text out of those selector objects: ['text content']
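A small illustration inside a spider's parse() method; the XPath simply reuses the one from the threaded example above, and the comments show the typical shape of the output.

```python
def parse(self, response):
    # a SelectorList of Selector objects, e.g. [<Selector ..., data='text content'>, ...]
    titles = response.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
    print(titles)
    # extract() turns it into a plain list of strings, e.g. ['text content', ...]
    print(titles.extract())
```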
start_urls = []
process_item(self, item, spider)
: of course you can also write any other method here, for example one that stores the items into a MongoDB database
MONGODB_HOST =
MONGODB_PORT =
Configure your item pipeline in the settings.py file:
ITEM_PIPELINES = {
    "Daomu.pipelines.DaomumongoPipeline": 100,
}
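A minimal sketch of such a pipeline, assuming MONGODB_HOST and MONGODB_PORT have been filled in in settings.py; the database and collection names below are placeholders, not taken from the original project.

```python
# pipelines.py -- minimal MongoDB pipeline sketch (database/collection names are placeholders)
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class DaomumongoPipeline(object):
    def __init__(self):
        host = settings.get('MONGODB_HOST', 'localhost')
        port = settings.get('MONGODB_PORT', 27017)
        self.client = pymongo.MongoClient(host, port)
        self.db = self.client['daomu']        # placeholder database name
        self.collection = self.db['items']    # placeholder collection name

    def process_item(self, item, spider):
        # store the item as a plain dict; MongoDB needs no predefined schema
        self.collection.insert_one(dict(item))
        return item
```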
self.db.commit()   # when writing to MySQL, remember to commit after executing the insert
Storing the CSDN project into MongoDB and MySQL
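For the MySQL side, a comparable pipeline sketch with pymysql might look like this; the connection parameters, table and column names are assumptions, and the self.db.commit() call matches the note above.

```python
# pipelines.py -- minimal MySQL pipeline sketch (connection, table and column names are assumptions)
import pymysql


class CsdnmysqlPipeline(object):
    def open_spider(self, spider):
        self.db = pymysql.connect(host='localhost', user='root', password='123456',
                                  database='csdn', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        sql = 'insert into articles(title, url) values(%s, %s)'
        self.cursor.execute(sql, [item.get('title'), item.get('url')])
        self.db.commit()   # commit after every insert
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
```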
https://careers.tencent.com/search.html?index=1
https://careers.tencent.com/search.html?index=2
//div[@class="search-content"]
.//h4/text()
.//span[2]/text()
.//span[3]/text()
.//span[4]/text()
.//p[2]/text()
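Putting the base XPath and the relative expressions together, the spider's parse() method could look like the sketch below; which job field each relative XPath corresponds to is an assumption.

```python
def parse(self, response):
    # one base node per job posting, then relative XPath lookups inside each node
    base_list = response.xpath('//div[@class="search-content"]')
    for base in base_list:
        item = {}
        item['name'] = base.xpath('.//h4/text()').extract_first()
        item['category'] = base.xpath('.//span[2]/text()').extract_first()
        item['address'] = base.xpath('.//span[3]/text()').extract_first()
        item['date'] = base.xpath('.//span[4]/text()').extract_first()
        item['duty'] = base.xpath('.//p[2]/text()').extract_first()
        yield item   # the field names above are assumptions
```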
- Fiddler (enable packet capture)
- Install the certificate on the phone
  - Open http://IP:8888 in the phone's browser (the IP is your computer's IP, 8888 is the port configured in Fiddler)
  - Download the FiddlerRoot certificate from that page
  - Downloaded file name: FiddlerRoot.cer, install it directly
- Set the proxy
  - Open the connected Wi-Fi network on the phone, proxy settings -> change to Manual
  - IP address: your computer's IP (ipconfig / ifconfig)
  - Port: 8888
DEFAULT_REQUEST_HEADERS = {"User-Agent": "",}
user_agents = ['','','','','']
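A sketch of how such a list is typically used: a random User-Agent downloader middleware. The class name is a placeholder, and the middleware still has to be registered in DOWNLOADER_MIDDLEWARES.

```python
# middlewares.py -- random User-Agent middleware sketch (class name is a placeholder)
import random

user_agents = ['', '', '', '', '']   # fill in real User-Agent strings


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a different User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(user_agents)
```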
from scrapy.pipelines.images import ImagesPipeline
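A minimal sketch of a custom image pipeline built on ImagesPipeline; the item field image_url and the class name are assumptions, and IMAGES_STORE must also be set in settings.py for the downloaded files to be saved.

```python
# pipelines.py -- image download pipeline sketch (class and field names are assumptions)
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule one download request per image URL carried in the item
        yield scrapy.Request(item['image_url'])
```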
http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0
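This interface returns JSON, so it can be paged by increasing offset until the data list comes back empty; the data field layout below reflects the usual response shape and should be treated as an assumption.

```python
# page through the Douyu room-list API by bumping the offset parameter
import json
import requests

offset = 0
while True:
    url = ('http://capi.douyucdn.cn/api/v1/getVerticalRoom'
           '?limit=20&offset={}'.format(offset))
    data = json.loads(requests.get(url).text).get('data', [])
    if not data:          # an empty list means there are no more pages
        break
    for room in data:
        print(room)       # each entry is a dict describing one live room
    offset += 20
```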
The dont_filter parameter
scrapy.Request(url, callback=..., dont_filter=False)
dont_filter parameter:
    False -> duplicate URLs are filtered out automatically (the default)
    True  -> URLs are not de-duplicated
DOWNLOADER_MIDDLEWARES = {"Jd.middleware.seleniumMiddleware": 20}
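A sketch of what Jd.middleware.seleniumMiddleware could look like; the browser choice and options are assumptions. It renders the page with Selenium and hands an HtmlResponse back to Scrapy, so the built-in downloader is skipped for those requests.

```python
# middleware.py -- Selenium downloader middleware sketch (browser and options are assumptions)
from selenium import webdriver
from scrapy.http import HtmlResponse


class seleniumMiddleware(object):
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def process_request(self, request, spider):
        # let the browser render the page, then return the HTML to Scrapy;
        # returning a Response here skips the default downloader
        self.driver.get(request.url)
        return HtmlResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)
```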
https://sourceforge.net/projects/tesseract-ocr-alt/files/tesseract-ocr-setup-3.02.02.exe/download
sudo apt-get install tesseract-ocr
brew install tesseract
python -m pip install pytesseract
from PIL import Image
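A minimal recognition sketch; the image file name is a placeholder, and the tesseract binary installed above must be on the PATH.

```python
# recognise the text in an image with pytesseract (image file name is a placeholder)
import pytesseract
from PIL import Image

img = Image.open('code.png')              # e.g. a downloaded captcha image
text = pytesseract.image_to_string(img)   # run OCR on the image
print(text)
```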