以urlmeta爲例:
在$NUTCH_HOME/src/plugin/urlmeta下使用命令 ls -R 查看目錄結構:
build.xml ivy.xml plugin.xml src ./src: java ./src/java: org ./src/java/org: apache ./src/java/org/apache: nutch ./src/java/org/apache/nutch: indexer scoring ./src/java/org/apache/nutch/indexer: urlmeta ./src/java/org/apache/nutch/indexer/urlmeta: package.html URLMetaIndexingFilter.java ./src/java/org/apache/nutch/scoring: urlmeta ./src/java/org/apache/nutch/scoring/urlmeta: package.html URLMetaScoringFilter.java
plugin.xml文件:
<!-- Declares the plugin's id, name and version, and the name of its provider. -->
<plugin id="urlmeta" name="URL Meta Indexing Filter" version="1.0.0" provider-name="sgonyea">
  <!-- Name of the plugin jar. -->
  <runtime>
    <library name="urlmeta.jar">
      <export name="*"/>
    </library>
  </runtime>
  <!-- Imports the nutch-extensionpoints plugin. -->
  <requires>
    <import plugin="nutch-extensionpoints"/>
  </requires>
  <!-- Declares the extension's id and name, plus the id and class name of its implementation. -->
  <extension id="org.apache.nutch.indexer.urlmeta" name="URL Meta Indexing Filter" point="org.apache.nutch.indexer.IndexingFilter">
    <implementation id="indexer-urlmeta" class="org.apache.nutch.indexer.urlmeta.URLMetaIndexingFilter"/>
  </extension>
  <!-- Second extension: binds URLMetaScoringFilter to the org.apache.nutch.scoring.ScoringFilter extension point. -->
  <extension id="org.apache.nutch.scoring.urlmeta" name="URL Meta Scoring Filter" point="org.apache.nutch.scoring.ScoringFilter">
    <implementation id="scoring-urlmeta" class="org.apache.nutch.scoring.urlmeta.URLMetaScoringFilter" />
  </extension>
</plugin>
build.xml文件:
<!-- Ant build file of the urlmeta plugin: default target is jar-core; the file defines no targets itself and only imports ../build-plugin.xml. -->
<project name="urlmeta" default="jar-core">
  <import file="../build-plugin.xml"/>
</project>
URLMetaIndexingFilter.java 源代碼:
package org.apache.nutch.indexer.urlmeta; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; /** * This is part of the URL Meta plugin. It is designed to enhance the NUTCH-655 * patch, by doing two things: 1. Meta Tags that are supplied with your Crawl * URLs, during injection, will be propagated throughout the outlinks of those * Crawl URLs. 2. When you index your URLs, the meta tags that you specified * with your URLs will be indexed alongside those URLs--and can be directly * queried, assuming you have done everything else correctly. * * The flat-file of URLs you are injecting should, per NUTCH-655, be * tab-delimited in the form of: * * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN] * * Be aware that if you collide with keywords that are already in use (such as * nutch.score/nutch.fetchInterval) then you are in for some unpredictable * behavior. * * Furthermore, in your nutch-site.xml config, you must specify that this plugin * is to be used (1), as well as what (2) Meta Tags it should actively look for. * This does not mean that you must use these tags for every URL, but it does * mean that you must list _all_ of meta tags that you have specified. If you * want them to be propagated and indexed, that is. * * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows: * <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index * -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call * this plugin. * * 2. 
You must also specify the property "urlmeta.tags", who's values are * comma-delimited <value>key1, key2, key3</value> * * TODO: It may be ideal to offer two separate properties, to specify what gets * indexed versus merely propagated. * */ public class URLMetaIndexingFilter implements IndexingFilter { private static final Logger LOG = LoggerFactory .getLogger(URLMetaIndexingFilter.class); private static final String CONF_PROPERTY = "urlmeta.tags"; private static String[] urlMetaTags; private Configuration conf; /** * This will take the metatags that you have listed in your "urlmeta.tags" * property, and looks for them inside the CrawlDatum object. If they exist, * this will add it as an attribute inside the NutchDocument. * * @see IndexingFilter#filter */ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { if (conf != null) this.setConf(conf); if (urlMetaTags == null || doc == null) return doc; for (String metatag : urlMetaTags) { Text metadata = (Text) datum.getMetaData().get(new Text(metatag)); if (metadata != null) doc.add(metatag, metadata.toString()); } return doc; } /** Boilerplate */ public Configuration getConf() { return conf; } /** * handles conf assignment and pulls the value assignment from the * "urlmeta.tags" property */ public void setConf(Configuration conf) { this.conf = conf; if (conf == null) return; urlMetaTags = conf.getStrings(CONF_PROPERTY); } }
激活插件
1,修改nutch-site.xml,添加urlmeta插件
<!-- Enables the urlmeta plugin by appending "urlmeta" to the plugin.includes pattern. -->
<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|urlmeta</value>
  <description>
    As you can see above, I have added urlmeta. Same way you can create Apache
    Nutch plugin according to your needs. Regular expression naming plugin
    directory names to include. Any plugin not matching this expression is
    excluded. In any case you need at least include the nutch-extensionpoints
    plugin. By default Nutch includes crawling just HTML and plain text via
    HTTP, and basic indexing and search plugins.
  </description>
</property>
2,編譯插件
在$NUTCH_HOME/src/plugin/build.xml中添加:
<!-- Runs the "deploy" target of the build file in the urlmeta plugin directory. -->
<ant dir="urlmeta" target="deploy" />
3,運行
# -depth takes its value as a separate argument; "-depth=2" is not recognised as the depth option.
bin/nutch crawl ./urls/url.txt -solr http://localhost:8983/solr -depth 2
運行時會自動編譯插件並執行。