自己寫的一個正文提取算法,在三個網站上測試沒問題。
需要使用第三方的 jar:jsoup(程式碼中另外還 import 了 Apache commons-io 的 FileUtils)。
package com.extract; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class ExtractNovel { public static void main(String[] args) throws IOException { //dijiuzww.com String test = FileUtils .readFileToString(new File("C://Users//Administrator//Desktop//sina.com")); // String test = FileUtils // .readFileToString(new File("C://Users//Administrator//Desktop//testextaractContent.txt")); // Document doc = Jsoup.parse(test); doc = denoiseElementForDoc(doc); // System.out.println(doc.text()); int size = doc.text().length(); Element e = doc.getAllElements().get(0); Element target[] = new Element[1]; check(e,size); } public static void check(Element e, float size) { Element son = findRealSon(e, size); System.out.println(son.toString()); System.out.println(son.text()); } public static Element findRealSon(Element e, float size) { Elements els = e.children(); Element son = null; for (Element tempson : els) { float length = tempson.text().length(); if (length / size > 0.75) { Element element = findRealSon(tempson, size); if(element ==null){ son = tempson; return son; }else{ son = element; } } } return son; } public static Document denoiseElementForDoc(Document document) { document.getElementsByTag("script").remove(); document.getElementsByTag("style").remove(); document.getElementsByTag("select").remove(); document.getElementsByTag("link").remove(); document.getElementsByTag("input").remove(); document.getElementsByTag("object").remove(); document.getElementsByTag("textarea").remove(); document.getElementsByTag("ul").remove(); document.getElementsByTag("img").remove(); document.getElementsByTag("a").attr("href", "javascript:void(0)").remove(); document.getElementsByAttributeValue("display", "none").remove(); document.getElementsByAttributeValueStarting("class", "foot").remove(); 
document.getElementsByAttributeValue("class", "settings").remove(); document.getElementsByAttributeValueContaining("style", "display:none").remove(); document.getElementsByAttributeValueContaining("style", "overflow: hidden").remove(); return document; } }