場景介紹:
在處理輸入的文本時,須要將 http://bit.ly/3ynriE 等短鏈接轉換爲真實鏈接 lucene.apache.org/solr
1,實現TokenFilter
package com.url.plugin;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
 * Token filter that replaces shortened URLs (for example {@code http://bit.ly/3ynriE})
 * with the URL they resolve to.
 *
 * <p>Only tokens that match the supplied pattern in full are rewritten; every other
 * token passes through unchanged.
 */
public class ResolveUrlTokenFilter extends TokenFilter {

    private static final Logger LOG = Logger.getLogger(ResolveUrlTokenFilter.class.getName());

    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private final Pattern patternToMatchShortenedUrls;

    /**
     * @param input                       the upstream token stream
     * @param patternToMatchShortenedUrls pattern a whole token must match to be treated
     *                                    as a shortened URL
     */
    public ResolveUrlTokenFilter(TokenStream input, Pattern patternToMatchShortenedUrls) {
        super(input);
        this.patternToMatchShortenedUrls = patternToMatchShortenedUrls;
    }

    /**
     * Advances to the next token and, if it is a shortened URL, replaces its text
     * with the resolved URL.
     *
     * <p>Declared {@code final} because Lucene requires either the class or its
     * {@code incrementToken()} implementation to be final.
     *
     * @return {@code false} when the upstream stream is exhausted, {@code true} otherwise
     * @throws IOException if the upstream stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }

        // Match against the attribute itself (it is a CharSequence limited to the valid
        // term length). The backing buffer() array is usually larger than the term, so
        // using buffer().length would drag stale characters into the comparison.
        if (patternToMatchShortenedUrls.matcher(charTermAttribute).matches()) {
            // Snapshot the term text before clearing the attribute.
            String token = charTermAttribute.toString();
            charTermAttribute.setEmpty().append(resolveUrlToken(token));
        }
        return true;
    }

    /**
     * Maps a shortened URL to its target.
     *
     * @param token the shortened URL
     * @return the resolved URL, or {@code token} itself when it cannot be resolved
     */
    private String resolveUrlToken(String token) {
        // TODO: replace this hard-coded lookup with real resolution logic as needed.
        try {
            if ("http://bit.ly/3ynriE".equals(token)) {
                return "lucene.apache.org/solr";
            } else if ("http://bit.ly/15tzw".equals(token)) {
                return "manning.com";
            }
        } catch (Exception exc) {
            // Rather than failing analysis when the URL cannot be resolved,
            // log the error and fall through to return the unresolved value.
            LOG.log(Level.WARNING, "Could not resolve shortened URL: " + token, exc);
        }
        return token;
    }
}
2,實現TokenFilterFactory
package com.url.plugin;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.util.Map;
import java.util.regex.Pattern;

/**
 * Factory that creates {@link ResolveUrlTokenFilter} instances for an analyzer chain.
 *
 * <p>The regular expression identifying shortened URLs is read from the
 * {@code shortenedUrlPattern} argument and compiled once in the constructor.
 */
public class ResolveUrlTokenFilterFactory extends TokenFilterFactory {

    /** Compiled form of the {@code shortenedUrlPattern} argument. */
    private final Pattern patternToMatchShortenedUrls;

    /**
     * @param args filter arguments; {@code shortenedUrlPattern} is read via {@code require}
     */
    public ResolveUrlTokenFilterFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        // Pull the regular expression out of the configuration arguments and compile it.
        this.patternToMatchShortenedUrls =
                Pattern.compile(require(args, "shortenedUrlPattern"));
    }

    /**
     * Wraps the given stream in a new {@link ResolveUrlTokenFilter}.
     *
     * @param tokenStream the stream to filter
     * @return a filter using this factory's compiled pattern
     */
    @Override
    public TokenFilter create(TokenStream tokenStream) {
        return new ResolveUrlTokenFilter(tokenStream, patternToMatchShortenedUrls);
    }
}
3,將其打成jar包
4,在solr的schema文件中添加以下內容
ui
<!--
  Field type "text_plugin": a solr.TextField with separate index-time and query-time analyzers.
  Index chain: StandardTokenizer, then ResolveUrlTokenFilterFactory (rewrites tokens that match
  shortenedUrlPattern), then StopFilter, then LowerCaseFilter.
  Query chain: StandardTokenizer, then StopFilter, then SynonymFilter, then LowerCaseFilter.
  The URL-resolving filter is applied at index time only.
  NOTE(review): StandardTokenizer is expected to split text at characters such as ':' and '/',
  so a full short URL may never reach the filter as a single token. Please confirm; a tokenizer
  that keeps URLs intact (e.g. solr.UAX29URLEmailTokenizerFactory or
  solr.WhitespaceTokenizerFactory) may be needed here.
-->
<fieldType name="text_plugin" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.StandardTokenizerFactory"/> <filter class="com.url.plugin.ResolveUrlTokenFilterFactory" shortenedUrlPattern="http:\/\/bit.ly\/[\w\-]+" /> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> <filter class="solr.LowerCaseFilterFactory"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.StandardTokenizerFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.LowerCaseFilterFactory"/> </analyzer> </fieldType>
5,在solr的根目錄下建立plugins文件夾(位置同dist、contrib文件夾),並將步驟3生成的jar放入其中
6,在solrconfig.xml中添加
<lib dir="../../../plugins/" regex=".*\.jar" />
7,java -jar start.jar