網頁中的腳本通常放在 <script> 標籤中,例如下面這段 JavaScript:
<script type="text/javascript">
// Example of an inline page script: it asks the loader F for three modules
// and hands them to the callback as baidu, searchbox and suggestion.
F.use(["/static/common/ui/tangram/base/base.js","/static/widget/common/searchbox/searchbox.js","/static/common/ui/vs/suggestion/suggestion.js"], function(baidu,searchbox,suggestion){
    // Defer the setup to baidu.dom.ready (presumably fires once the DOM is ready).
    baidu.dom.ready(function(){
        searchbox();
        // Only start suggestion() when cookies are enabled and the cookie string
        // does not match /sug?=0/ (i.e. contains neither "su=0" nor "sug=0").
        if (navigator.cookieEnabled && !/sug?=0/.test(document.cookie)){
            suggestion();
        }
    });
});
</script>
代碼: html
import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class html { /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { Document doc = Jsoup.connect("http://news.baidu.com").get(); //獲取網頁內容 //獲取網頁內容中非腳本信息 getTag(doc); } public static void getTag(Document doc) { Elements el = doc.select("*");//先遍歷整個HTML List<String>list = new ArrayList<String>(); for(Element element:el){ String text = element.tagName(); if(text.endsWith("script"))continue;//刪除HTML中的腳本 else{ if(element.hasText() == true)list.add(element.text()+'\n'); } } System.out.println(list); }