使用多線程時,常見的問題是該起多少個線程。對於這點,我通常會考慮CPU核心數、各環節消耗的資源,以及該環節是不是瓶頸。
下面我用一個示例大體解釋一下個人思路:
/**
 * Demo of a four-stage crawler pipeline wired together with bounded blocking queues:
 * one query thread feeds URLs into {@code urlQueue}, spider threads crawl them into
 * {@code htmlQueue}, parser threads translate pages into {@code metaQueue}, and writer
 * threads consume the metadata.
 *
 * <p>Shutdown is driven by an end signal per queue: the empty string on {@code urlQueue},
 * and an object whose {@code isEmpty()} returns {@code true} on {@code htmlQueue} and
 * {@code metaQueue}. A worker that takes the end signal puts it back before stopping so
 * that its sibling workers also see it; consequently each queue still holds its end
 * signal after {@link #execute()} returns.
 */
public class Demo {

    /** Capacity of every hand-off queue. */
    private static final int QUEUE_CAPACITY = 1024;
    /** Number of crawling threads. */
    private static final int SPIDER_THREADS = 5;
    /** Number of parsing threads. */
    private static final int PARSER_THREADS = 5;
    /** Number of writing threads. */
    private static final int WRITER_THREADS = 3;

    BlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(QUEUE_CAPACITY);
    BlockingQueue<Html> htmlQueue = new ArrayBlockingQueue<Html>(QUEUE_CAPACITY);
    BlockingQueue<Meta> metaQueue = new ArrayBlockingQueue<Meta>(QUEUE_CAPACITY);

    /**
     * Starts every stage and blocks until the writers have finished.
     *
     * <p>Stages are shut down in pipeline order: once all spiders have stopped, the end
     * signal is put on {@code htmlQueue}; once all parsers have stopped, the end signal
     * is put on {@code metaQueue}.
     *
     * @throws InterruptedException if the calling thread is interrupted while joining a
     *     worker or while putting an end signal
     */
    public void execute() throws InterruptedException {
        new Thread(new QueryThread()).start();

        Thread[] spiders = new Thread[SPIDER_THREADS];
        for (int i = 0; i < spiders.length; i++) {
            spiders[i] = new Thread(new SpiderThread());
            spiders[i].start();
        }

        Thread[] parsers = new Thread[PARSER_THREADS];
        for (int i = 0; i < parsers.length; i++) {
            parsers[i] = new Thread(new ParserThread());
            parsers[i].start();
        }

        Thread[] writers = new Thread[WRITER_THREADS];
        for (int i = 0; i < writers.length; i++) {
            writers[i] = new Thread(new WriteThread());
            writers[i].start();
        }

        // Wait for the spiders, then tell the parsers that no more pages will arrive.
        joinAll(spiders);
        putEndSignalToHtmlQueue();

        // Wait for the parsers, then tell the writers that no more metadata will arrive.
        joinAll(parsers);
        putEndSignalToMetaQueue();

        // Once every writer has stopped, the pipeline is done.
        joinAll(writers);
    }

    /** Joins every thread in {@code threads}, in array order. */
    private void joinAll(Thread[] threads) throws InterruptedException {
        for (Thread thread : threads) {
            thread.join();
        }
    }

    /** Puts the end signal (a {@code Meta} flagged empty) on {@code metaQueue}. */
    private void putEndSignalToMetaQueue() throws InterruptedException {
        Meta endSignal = new Meta();
        endSignal.setEmpty(true);
        metaQueue.put(endSignal);
    }

    /** Puts the end signal (an {@code Html} flagged empty) on {@code htmlQueue}. */
    private void putEndSignalToHtmlQueue() throws InterruptedException {
        Html endSignal = new Html();
        endSignal.setEmpty(true);
        htmlQueue.put(endSignal);
    }

    /**
     * Producer stage: forwards every non-empty URL returned by {@code getUrl()} to
     * {@code urlQueue}, then puts the empty string as the end signal.
     */
    class QueryThread implements Runnable {
        @Override
        public void run() {
            try {
                String url;
                while ((url = getUrl()) != null) {
                    // The empty string is reserved as the end signal, so never forward it.
                    if (url.length() == 0) {
                        continue;
                    }
                    urlQueue.put(url);
                }
                urlQueue.put("");
            } catch (InterruptedException e) {
                // Preserve the interrupt status for whoever owns this thread.
                Thread.currentThread().interrupt();
            }
        }

        /** Stub: returns {@code null}, which ends the query loop immediately. */
        private String getUrl() {
            return null;
        }
    }

    /**
     * Crawl stage: takes URLs from {@code urlQueue} and puts the crawled pages on
     * {@code htmlQueue} until the end signal (empty string) is taken.
     */
    class SpiderThread implements Runnable {
        @Override
        public void run() {
            try {
                while (true) {
                    String url = urlQueue.take();
                    if (url.length() == 0) {
                        // Put the end signal back so the other spiders can see it too.
                        urlQueue.put(url);
                        break;
                    }
                    Html html = crawl(url);
                    if (html == null) {
                        // Crawl failed: skip this URL.
                        continue;
                    }
                    htmlQueue.put(html);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        /** Stub: returns {@code null}, which the caller treats as a failed crawl. */
        private Html crawl(String url) {
            return null;
        }
    }

    /**
     * Parse stage: takes pages from {@code htmlQueue} and puts the translated metadata
     * on {@code metaQueue} until the end signal is taken.
     */
    class ParserThread implements Runnable {
        @Override
        public void run() {
            try {
                while (true) {
                    Html page = htmlQueue.take();
                    if (page.isEmpty()) {
                        // Put the end signal back so the other parsers can see it too.
                        htmlQueue.put(page);
                        break;
                    }
                    Meta meta = translate(page);
                    if (meta == null) {
                        // Parse failed: skip this page. ArrayBlockingQueue rejects null
                        // elements, so forwarding it would kill this thread with an NPE.
                        continue;
                    }
                    metaQueue.put(meta);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        /** Stub: returns {@code null}, which the caller treats as a failed parse. */
        private Meta translate(Html take) {
            // parse data
            return null;
        }
    }

    /**
     * Write stage: takes metadata from {@code metaQueue} and writes it until the end
     * signal is taken.
     */
    class WriteThread implements Runnable {
        @Override
        public void run() {
            try {
                while (true) {
                    Meta meta = metaQueue.take();
                    if (meta.isEmpty()) {
                        // Put the end signal back so the other writers can see it too.
                        metaQueue.put(meta);
                        break;
                    }
                    write(meta);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        /** Stub: does nothing. */
        private void write(Meta take) {
            // write data
        }
    }

    /** A crawled page; {@code empty == true} marks the end signal for {@code htmlQueue}. */
    class Html {
        private boolean empty;

        public boolean isEmpty() {
            return empty;
        }

        public void setEmpty(boolean empty) {
            this.empty = empty;
        }
    }

    /** Parsed metadata; {@code empty == true} marks the end signal for {@code metaQueue}. */
    class Meta {
        private boolean empty;

        public boolean isEmpty() {
            return empty;
        }

        public void setEmpty(boolean empty) {
            this.empty = empty;
        }
    }
}
這是一個簡單的爬取和解析的爬蟲,其中主要分:一、取數據,二、爬取數據,三、解析數據,四、寫數據到硬盤。
首先我會分析各部分所耗資源的點:
一、取數據:消耗資源的是磁盤,佔用讀取速度,這裏不會是瓶頸。
二、爬取數據:消耗的資源是網絡資源,相對於1來說是很大的瓶頸,因此1只用起一個線程足矣。而這部分該起多少線程,當然是越多越好,但是還要考慮爬取站點的通暢性,適度地增加。
三、解析數據:消耗的是CPU資源,那這裏我就會考慮CPU的核心數,通常來說會起和CPU核心數相同的線程數。假如我們多起兩個線程,我們可以想一想,如果線程數比CPU核心數多,必然會出現兩個解析線程爭搶一個CPU核心的資源。而這部分又是消耗CPU資源的,從而導致解析這塊一定有線程處於阻塞狀態,以致降低效率,因此在消耗CPU這塊儘可能保證線程數不超過CPU核心數目。
四、寫數據到硬盤:消耗的資源是硬盤,佔用寫速度,這裏視爬取那塊而定。但通常不會起太多線程,因為寫入速度也是一個瓶頸,起太多不會對效率提高有多大影響。