對 generate.max.count 參數的處理位於 org.apache.nutch.crawl.Generator 的內部類 Selector 中。
org.apache.nutch.crawl.Generator 中相關變量的聲明如下:
// Per-host (or per-domain) bookkeeping used while assigning entries to segments.
// Each value is a two-element array: [0] is the 1-based number of the segment
// currently used for that key, [1] is the number of URLs counted for that key
// in that segment.
private HashMap<String, int[]> hostCounts = new HashMap<String, int[]>();
// Cap on the number of URLs per host/domain; compared against the second
// element of the hostCounts arrays.
private int maxCount;
內部類 Selector 的 config 方法中:
// Load the per-host/domain URL cap from the job configuration under the key
// GENERATOR_MAX_COUNT; -1 is the fallback value handed to getInt.
maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);
reduce 方法中的處理:
/*** 一、獲取 某一主機下的int[] ,若是爲null,聲明一個數組,放入map中,int數組第2個值+1; */ //1 int[] hostCount = hostCounts.get(hostordomain); if (hostCount == null) { hostCount = new int[] { 1, 0 }; hostCounts.put(hostordomain, hostCount); } hostCount[1]++;// increment hostCount //二、檢查是否到了topN的數量,若是hostCount的第一個值大於limit // check if topN reached, select next segment if it is while (segCounts[hostCount[0] - 1] >= limit//segCounts : && hostCount[0] < maxNumSegments) { hostCount[0]++; hostCount[1] = 0; } // reached the limit of allowed URLs per host / domain // see if we can put it in the next segment? if (hostCount[1] >= maxCount) { if (hostCount[0] < maxNumSegments) { hostCount[0]++; hostCount[1] = 0; } else { if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) { LOG.info("Host or domain " + hostordomain + " has more than " + maxCount + " URLs for all " + maxNumSegments + " segments. Additional URLs won't be included in the fetchlist."); } // skip this entry continue; } } entry.segnum = new IntWritable(hostCount[0]); segCounts[hostCount[0] - 1]++;