nutch 存儲到數據庫

時間 2019-11-14

標籤 nutch 存儲數據庫欄目 SQL 简体版

原文原文鏈接

就像我們知道的一樣,nutch是一個架構在lucene之上的網絡爬蟲+搜索引擎.

它是由lucene的作者在lucene基礎之上開發,並整合了hadoop,實現了分佈式雲計算,使用google標準的HDFS文件系統作爲存儲結構,是一款高伸縮性能與高效高併發的網絡爬蟲+搜索引擎.

FaceYe在後臺已經整合了nutch,在適當的時候,就可以開始爲用戶提供高質量的知識索引服務.順便說一下,nutch在生產環境中,並不能在windows下運行,需要在linux下運行,這其中主要是因爲hadoop採用了一些shell腳本,當然,開發平臺還是可以搭建在windows下,但需要安裝cygwin,來模擬shell環境.廢話少說,進入nutch正題.

正像上面說到的,nutch使用HDFS來存儲索引文件,並沒有將爬取來的數據存儲入數據庫,這是由於HDFS是一種比數據庫更高效,更容易實現負載均衡的結構,對於像搜索引擎這樣的應用,使用數據庫將會嚴重制約性能,因此,使用HDFS再加上倒排索引,會取得滿意的性能,HDFS也是目前搜索巨頭google,以及yahoo所正在使用的文件格式.

雖然有了HDFS,但在進行網絡爬取的時候,我們還是希望,可以將爬取的一些數據,比如網頁url,比如網頁標題等關鍵信息存儲到數據庫中,但nutch並沒有提供這樣的功能,怎麼辦?動手發明輪子~

nutch支持強大的plugin機制,這種機制與eclipse中的plugin機制如出一轍,同樣可以方便地進行插拔.

開發將爬取記錄存入數據庫的nutch plugin過程如下.

1.定義這一nutch plugin要實現的主要功能:

在使用nutch爬取網絡資源的同時,將網絡資源的主要信息存儲入數據庫.

2.新建plugin 包:

org.apache.nutch.indexer.store

並開發StoreIndexingFilter工具類如下:

public class StoreIndexingFilter implements IndexingFilter
{
public static final Log LOG = LogFactory.getLog(StoreIndexingFilter.class);

/** A flag that tells if magic resolution must be performed */
private boolean MAGIC;

/** Get the MimeTypes resolver instance. */
private MimeUtil MIME;

public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException
{

IResourceEntityService resourceEntityService = (IResourceEntityService) SpringUtil.getInstance().getBean(「resourceEntityService」);

String _url = doc.getFieldValue(「url」);
String _title = doc.getFieldValue(「title」);
if (StringUtils.isNotEmpty(_url))
{
if (!resourceEntityService.isExists(ResourceEntity.class, 「url」, _url))
{
ResourceEntity resourceEntity = new ResourceEntity();
resourceEntity.setUrl(_url);
if (StringUtils.isNotEmpty(_title))
{
if (_title.length() > 255)
{
_title = _title.substring(0, 254);
}
}
resourceEntity.setName(_title);
resourceEntityService.saveResourceEntity(resourceEntity);
}
}
return doc;
}
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum)
{
long time = -1;

String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
if (lastModified != null)
{ // try parse last-modified
time = getTime(lastModified, url); // use as time
// store as string
doc.add(「lastModified」, Long.toString(time));
}

if (time == -1)
{ // if no last-modified
time = datum.getFetchTime(); // use fetch time
}

SimpleDateFormat sdf = new SimpleDateFormat(「yyyyMMdd」);
sdf.setTimeZone(TimeZone.getTimeZone(「GMT」));
String dateString = sdf.format(new Date(time));

// un-stored, indexed and un-tokenized
doc.add(「date」, dateString);

return doc;
}

private long getTime(String date, String url)
{
long time = -1;
try
{
time = HttpDateFormat.toLong(date);
} catch (ParseException e)
{
// try to parse it as date in alternative format
try
{
Date parsedDate = DateUtils.parseDate(date, new String[] { 「EEE MMM dd HH:mm:ss yyyy」, 「EEE MMM dd HH:mm:ss yyyy zzz」,
「EEE, MMM dd HH:mm:ss yyyy zzz」, 「EEE, dd MMM yyyy HH:mm:ss zzz」, 「EEE,dd MMM yyyy HH:mm:ss zzz」, 「EEE, dd MMM yyyy HH:mm:sszzz」,
「EEE, dd MMM yyyy HH:mm:ss」, 「EEE, dd-MMM-yy HH:mm:ss zzz」, 「yyyy/MM/dd HH:mm:ss.SSS zzz」, 「yyyy/MM/dd HH:mm:ss.SSS」,
「yyyy/MM/dd HH:mm:ss zzz」, 「yyyy/MM/dd」, 「yyyy.MM.dd HH:mm:ss」, 「yyyy-MM-dd HH:mm」, 「MMM dd yyyy HH:mm:ss. zzz」,
「MMM dd yyyy HH:mm:ss zzz」, 「dd.MM.yyyy HH:mm:ss zzz」, 「dd MM yyyy HH:mm:ss zzz」, 「dd.MM.yyyy; HH:mm:ss」, 「dd.MM.yyyy HH:mm:ss」,
「dd.MM.yyyy zzz」 });
time = parsedDate.getTime();
// if (LOG.isWarnEnabled()) {
// LOG.warn(url + 「: parsed date: 」 + date +」 to:」+time);
// }
} catch (Exception e2)
{
if (LOG.isWarnEnabled())
{
LOG.warn(url + 「: can’t parse erroneous date: 」 + date);
}
}
}
return time;
}

// Add Content-Length
private NutchDocument addLength(NutchDocument doc, ParseData data, String url)
{
String contentLength = data.getMeta(Response.CONTENT_LENGTH);

if (contentLength != null)
doc.add(「contentLength」, contentLength);

return doc;
}
private NutchDocument addType(NutchDocument doc, ParseData data, String url)
{
MimeType mimeType = null;
String contentType = data.getMeta(Response.CONTENT_TYPE);
if (contentType == null)
{
mimeType = MIME.getMimeType(url);
} else
{
mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
}

// Checks if we solved the content-type.
if (mimeType == null)
{
return doc;
}

contentType = mimeType.getName();

doc.add(「type」, contentType);

String[] parts = getParts(contentType);

for (String part : parts)
{
doc.add(「type」, part);
}

return doc;
}

static String[] getParts(String mimeType)
{
return mimeType.split(「/」);
}

private PatternMatcher matcher = new Perl5Matcher();

private Configuration conf;
static Perl5Pattern patterns[] = { null, null };
static
{
Perl5Compiler compiler = new Perl5Compiler();
try
{
// order here is important
patterns[0] = (Perl5Pattern) compiler.compile(「//bfilename=['/"](.+)['/"]「);
patterns[1] = (Perl5Pattern) compiler.compile(「//bfilename=(//S+)//b」);
} catch (MalformedPatternException e)
{
// just ignore
}
}

private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url)
{
String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
if (contentDisposition == null)
return doc;

MatchResult result;
for (int i = 0; i < patterns.length; i++)
{
if (matcher.contains(contentDisposition, patterns[i]))
{
result = matcher.getMatch();
doc.add("title", result.group(1));
break;
}
}

return doc;
}

public void addIndexBackendOptions(Configuration conf)
{

LuceneWriter.addFieldOptions("type", LuceneWriter.STORE.NO, LuceneWriter.INDEX.UNTOKENIZED, conf);

LuceneWriter.addFieldOptions("primaryType", LuceneWriter.STORE.YES, LuceneWriter.INDEX.UNTOKENIZED, conf);
LuceneWriter.addFieldOptions("subType", LuceneWriter.STORE.YES, LuceneWriter.INDEX.UNTOKENIZED, conf);

LuceneWriter.addFieldOptions("contentLength", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf);

LuceneWriter.addFieldOptions("lastModified", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf);

// un-stored, indexed and un-tokenized
LuceneWriter.addFieldOptions("date", LuceneWriter.STORE.NO, LuceneWriter.INDEX.UNTOKENIZED, conf);
}

public void setConf(Configuration conf)
{
this.conf = conf;
MIME = new MimeUtil(conf);
}

public Configuration getConf()
{
return this.conf;
}

}

其中最主要的方法爲:

/**
 * Saves the document's URL and title through the {@code resourceEntityService}
 * Spring bean unless a record with the same URL already exists.
 *
 * @param doc     document being indexed; its "url" and "title" fields are read
 * @param parse   parse result (unused)
 * @param url     URL of the page (unused; the "url" field of {@code doc} is used)
 * @param datum   crawl datum (unused)
 * @param inlinks incoming links (unused)
 * @return {@code doc}, unchanged
 * @throws IndexingException declared by the interface; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException
{
    // Longest title that is persisted.
    // NOTE(review): presumably the width of the name column - confirm.
    final int maxTitleLength = 255;

    IResourceEntityService resourceEntityService = (IResourceEntityService) SpringUtil.getInstance().getBean("resourceEntityService");
    String _url = doc.getFieldValue("url");
    String _title = doc.getFieldValue("title");
    if (StringUtils.isNotEmpty(_url))
    {
        if (!resourceEntityService.isExists(ResourceEntity.class, "url", _url))
        {
            ResourceEntity resourceEntity = new ResourceEntity();
            resourceEntity.setUrl(_url);
            if (StringUtils.isNotEmpty(_title) && _title.length() > maxTitleLength)
            {
                // Keep exactly maxTitleLength characters (the end index is exclusive).
                _title = _title.substring(0, maxTitleLength);
            }
            resourceEntity.setName(_title);
            resourceEntityService.saveResourceEntity(resourceEntity);
        }
    }
    return doc;
}
也就是說,要在使用nutch構建document文檔的同時,將這一資源存入數據庫.