本博客屬原創文章,歡迎轉載!轉載請務必註明出處:http://guoyunsky.iteye.com/blog/613412
本博客已遷移到本人獨立博客: http://www.yun5u.com/
歡迎加入Heritrix羣(QQ):109148319,10447185 , Lucene/Solr羣(QQ) : 118972724
order.xml是整個Heritrix的核心,裏面的每一個配置都關係到Heritrix的運行狀況.沒讀源碼之前,我只能從有限的渠道去獲知這些配置的運用;讀完之後才知道Heritrix居然有如此靈活的運用,例如可以控制抓取速度,可以優化電腦性能,可以在某一次的抓取上繼續抓取.當然整個order.xml裏我也沒有全部掌握,只知道大部分配置的作用,希望大家指正以及補充,謝謝!
1.<meta></meta> 代表着該抓取JOB的元素,相當於Html的meta
- <meta>
- <name>myheritrix</name>
- <description>my heritrix</description>
- <operator>Admin</operator>
- <organization></organization>
- <audience></audience>
- <date>20090520051654</date>
- </meta>
2.<controller></controller> 跟抓取有關的所有參數,由於內容較多,並且Heritrix也已將它們分成不同模塊,所以這裏我也將它們拆分來說明.
- <controller>
- <string name="settings-directory">settings</string>
- <string name="disk-path"></string>
- <string name="logs-path">logs</string>
- <string name="checkpoints-path">checkpoints</string>
- <string name="state-path">state</string>
- <string name="scratch-path">scratch</string>
- <long name="max-bytes-download">0</long>
- <long name="max-document-download">0</long>
- <long name="max-time-sec">0</long>
- <integer name="max-toe-threads">30</integer>
- <integer name="recorder-out-buffer-bytes">4096</integer>
- <integer name="recorder-in-buffer-bytes">65536</integer>
- <integer name="bdb-cache-percent">0</integer>
- <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- </newObject>
- <map name="http-headers">
- </map>
- <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- </newObject>
- <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"> <!-- Frontier 調度器,等下拆分來說明 -->
- </newObject>
- <map name="uri-canonicalization-rules">
- </map>
- <map name="pre-fetch-processors">
- </map>
- <map name="fetch-processors">
- </map>
- <map name="extract-processors">
- </map>
- <map name="write-processors">
- </map>
- <map name="post-processors">
- </map>
- <map name="loggers">
- </map>
- <newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore">
- </newObject>
- </controller>
3.接下來拆分每一個組件的配置文件一一進行說明,最後對Heritrix主要的配置,也就是我們可以影響抓取的配置進行說明。
3.1:抓取範圍<newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- <boolean name="enabled">false</boolean>
- <string name="seedsfile">seeds.txt</string>
- <boolean name="reread-seeds-on-config">true</boolean>
- <newObject name="decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- <newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule">
- </newObject>
- <newObject name="acceptIfSurtPrefixed" class="org.archive.crawler.deciderules.SurtPrefixedDecideRule">
- <string name="decision">ACCEPT</string>
- <string name="surts-source-file"></string>
- <boolean name="seeds-as-surt-prefixes">true</boolean>
- <string name="surts-dump-file"></string>
- <boolean name="also-check-via">false</boolean>
- <boolean name="rebuild-on-reconfig">true</boolean>
- </newObject>
- <newObject name="rejectIfTooManyHops" class="org.archive.crawler.deciderules.TooManyHopsDecideRule">
- <integer name="max-hops">20</integer>
- </newObject>
- <newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule">
- <integer name="max-trans-hops">3</integer>
- <integer name="max-speculative-hops">1</integer>
- </newObject>
- <newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule">
- <integer name="max-repetitions">2</integer>
- </newObject>
- <newObject name="rejectIfTooManyPathSegs" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule">
- <integer name="max-path-depth">20</integer>
- </newObject>
- <newObject name="acceptIfPrerequisite" class="org.archive.crawler.deciderules.PrerequisiteAcceptDecideRule">
- </newObject>
- </map>
- </newObject>
- </newObject>
3.2: HTTP協議<map name="http-headers">
- <map name="http-headers">
- <string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.14.3 +http://127.0.0.1)</string>
- <string name="from">guoyunsky@hotmail.com</string>
- </map>
3.3:爬蟲協議 <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- <string name="type">classic</string>
- <boolean name="masquerade">false</boolean>
- <text name="custom-robots"></text>
- <stringList name="user-agents">
- </stringList>
- </newObject>
3.4:Frontier 調度器<newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"><!-- Frontier 調度器-->
- <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier">
- <float name="delay-factor">4.0</float>
- <integer name="max-delay-ms">20000</integer>
- <integer name="min-delay-ms">2000</integer>
- <integer name="respect-crawl-delay-up-to-secs">300</integer>
- <integer name="max-retries">30</integer>
- <long name="retry-delay-seconds">900</long>
- <integer name="preference-embed-hops">1</integer>
- <integer name="total-bandwidth-usage-KB-sec">0</integer>
- <integer name="max-per-host-bandwidth-usage-KB-sec">0</integer>
- <string name="queue-assignment-policy">org.archive.crawler.frontier.HostnameQueueAssignmentPolicy</string>
- <string name="force-queue-assignment"></string>
- <boolean name="pause-at-start">false</boolean>
- <boolean name="pause-at-finish">false</boolean>
- <boolean name="source-tag-seeds">false</boolean>
- <boolean name="recovery-log-enabled">true</boolean>
- <boolean name="hold-queues">true</boolean>
- <integer name="balance-replenish-amount">3000</integer>
- <integer name="error-penalty-amount">100</integer>
- <long name="queue-total-budget">-1</long>
- <string name="cost-policy">org.archive.crawler.frontier.ZeroCostAssignmentPolicy</string>
- <long name="snooze-deactivate-ms">300000</long>
- <integer name="target-ready-backlog">50</integer>
- <string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>
- <boolean name="dump-pending-at-close">false</boolean>
- </newObject>
3.5:URL規範化規則,主要用來規範化每一個URL,用Heritrix默認的就行了,這裏不做說明了,其實也就是通過各種規則來完成規範化。
3.6:預先處理鏈組件: <map name="pre-fetch-processors">
- <map name="pre-fetch-processors">
- <newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector">
- <boolean name="enabled">true</boolean>
- <newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="override-logger">false</boolean>
- <boolean name="recheck-scope">true</boolean>
- <boolean name="block-all">false</boolean>
- <string name="block-by-regexp"></string>
- <string name="allow-by-regexp"></string>
- </newObject>
- <newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer">
- <boolean name="enabled">true</boolean>
- <newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <integer name="ip-validity-duration-seconds">86400</integer>
- <integer name="robot-validity-duration-seconds">86400</integer>
- <boolean name="calculate-robots-only">false</boolean>
- </newObject>
- </map>
3.7:獲取組件:<map name="fetch-processors">
- <map name="fetch-processors">
- <newObject name="DNS" class="org.archive.crawler.fetcher.FetchDNS">
- <boolean name="enabled">true</boolean>
- <newObject name="DNS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="accept-non-dns-resolves">false</boolean>
- <boolean name="digest-content">true</boolean>
- <string name="digest-algorithm">sha1</string>
- </newObject>
- <newObject name="HTTP" class="org.archive.crawler.fetcher.FetchHTTP">
- <boolean name="enabled">true</boolean>
- <newObject name="HTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <newObject name="midfetch-decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <integer name="timeout-seconds">1200</integer>
- <integer name="sotimeout-ms">20000</integer>
- <integer name="fetch-bandwidth">0</integer>
- <long name="max-length-bytes">0</long>
- <boolean name="ignore-cookies">false</boolean>
- <boolean name="use-bdb-for-cookies">true</boolean>
- <string name="load-cookies-from-file"></string>
- <string name="save-cookies-to-file"></string>
- <string name="trust-level">open</string>
- <stringList name="accept-headers">
- </stringList>
- <string name="http-proxy-host"></string>
- <string name="http-proxy-port"></string>
- <string name="default-encoding">GB2312</string>
- <boolean name="digest-content">true</boolean>
- <string name="digest-algorithm">sha1</string>
- <boolean name="send-if-modified-since">true</boolean>
- <boolean name="send-if-none-match">true</boolean>
- <boolean name="send-connection-close">true</boolean>
- <boolean name="send-referer">true</boolean>
- <boolean name="send-range">false</boolean>
- <string name="http-bind-address"></string>
- </newObject>
- </map>
3.8:抽取組件<map name="extract-processors"> <!-- 抽取鏈 -->
- <map name="extract-processors">
- <newObject name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP">
- <boolean name="enabled">true</boolean>
- <newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- <newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML">
- <boolean name="enabled">true</boolean>
- <newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="extract-javascript">true</boolean>
- <boolean name="treat-frames-as-embed-links">true</boolean>
- <boolean name="ignore-form-action-urls">true</boolean>
- <boolean name="extract-only-form-gets">true</boolean>
- <boolean name="extract-value-attributes">true</boolean>
- <boolean name="ignore-unexpected-html">true</boolean>
- </newObject>
- </map>
3.9:寫組件<map name="write-processors">
- <map name="write-processors">
- <newObject name="Archiver" class="com.steel.heritrix.extend.MyWriterMirror">
- <boolean name="enabled">true</boolean>
- <newObject name="Archiver#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="case-sensitive">true</boolean>
- <stringList name="character-map" />
- <stringList name="content-type-map" />
- <string name="directory-file">index.html</string>
- <string name="dot-begin">%2E</string>
- <string name="dot-end">.</string>
- <stringList name="host-map" />
- <boolean name="host-directory">true</boolean>
- <string name="path">mirror</string>
- <integer name="max-path-length">1023</integer>
- <integer name="max-segment-length">255</integer>
- <boolean name="port-directory">false</boolean>
- <boolean name="suffix-at-end">true</boolean>
- <string name="too-long-directory">LONG</string>
- <stringList name="underscore-set" />
- </newObject>
- </map>
3.10:請求鏈組件<map name="post-processors">,裏面可以配置自己的調度器
- <map name="post-processors">
- <newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater">
- <boolean name="enabled">true</boolean>
- <newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- <newObject name="LinksScoper" class="org.archive.crawler.postprocessor.LinksScoper">
- <boolean name="enabled">true</boolean>
- <newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="override-logger">false</boolean><!-- 如果啓用則覆蓋這個類的默認日誌器,默認日誌器將日誌打印在控制檯.覆蓋的日誌器將把所有日誌發送到
- 在日誌目錄下的以本類命名的日誌文件中。在heritrix.properties中設置好日誌等級和日誌格式,這個屬性在重啓後只獲取一次. -->
- <boolean name="seed-redirects-new-seed">true</boolean>
- <integer name="preference-depth-hops">-1</integer>
- <newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- <newObject name="Scheduler" class="com.steel.heritrix.extend.MyFrontierScheduler">
- <boolean name="enabled">true</boolean>
- <newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- </map>
3.11:統計跟蹤鏈組件<map name="loggers">
- <map name="loggers">
- <newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker">
- <integer name="interval-seconds">20</integer>
- </newObject>
- </map>