package com.tag;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import toptrack.tools.JQuery;
import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.activation.MimetypesFileTypeMap;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Session;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimePartDataSource;
/**
* MHT file compiler class (packages the HTML text of a web page into an MHT file).
* @author dl
*/
public
class Html2MHTCompiler {
/** Address of the web page; relative links are resolved against it. */
private URL strWeb = null;
/** Text content of the web page. */
private String strText = null;
/** Local file name (presumably the MHT output file; main passes "test.mht" -- TODO confirm). */
private String strFileName = null;
/** Character encoding of the web page. */
private String strEncoding = null;

// Additional information for the MHT format (mail-style message fields).
/**
 * Sender address, handed to {@code new InternetAddress(from)} when the archive is built.
 * Must be a bare address: BBCode-style wrappers such as {@code [email]...[/email]}
 * are not valid e-mail address syntax.
 */
private String from = "dongle2001@126.com";
/** "To" recipients; not set on the message when {@code null}. */
private String to;
/** Subject of the message. */
private String subject = "mht compile";
/** "Cc" recipients; not set on the message when {@code null}. */
private String cc;
/** "Bcc" recipients; not set on the message when {@code null}. */
private String bcc;
/** Value stored under the {@code mail.smtp.host} session property. */
private String smtp = "localhost";
/**
 * Demo entry point: obtains the text of one hard-coded page through
 * {@code JQuery.getHtmlText} and compiles it with the file name {@code test.mht}.
 * Does nothing when no page text is returned.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String encoding = "utf-8";
    final String pageUrl = "http://www.mtime.com/my/tropicofcancer/blog/843555/";

    final String html = JQuery.getHtmlText(pageUrl, encoding, null);
    if (html != null) {
        Html2MHTCompiler compiler = new Html2MHTCompiler(html, pageUrl, encoding, "test.mht");
        compiler.compile();
    }
    //Html2MHTCompiler.mht2html("test.mht", "a.html");
}
/**
 * Creates a compiler for one web page.
 * <p>
 * If {@code strUrl} is not a well-formed URL the stack trace is printed and every
 * field is left unset, in which case {@link #compile()} returns {@code false}.
 *
 * @param strText     text content of the web page
 * @param strUrl      address of the web page
 * @param strEncoding character encoding of the web page
 * @param strFileName local file name
 */
public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
    URL pageUrl;
    try {
        pageUrl = new URL(strUrl);
    } catch (MalformedURLException e) {
        // Without a usable page address nothing is stored at all.
        e.printStackTrace();
        return;
    }
    this.strWeb = pageUrl;
    this.strText = strText;
    this.strEncoding = strEncoding;
    this.strFileName = strFileName;
}
/**
 * Runs the whole conversion: parses the page text, collects script, stylesheet and
 * image links, rewrites the page text through {@code JHtmlClear.replace} for each
 * collected link, and builds the MHT archive.
 * <p>
 * A parse failure is only reported on stderr; processing then continues with an
 * empty node list.
 *
 * @return {@code false} when the page address, text, file name or encoding is missing,
 *         or when building the archive throws; {@code true} otherwise
 */
public boolean compile() {
    boolean initialised = strWeb != null
            && strText != null
            && strFileName != null
            && strEncoding != null;
    if (!initialised) {
        return false;
    }

    NodeList pageNodes = new NodeList();
    try {
        Parser htmlParser = createParser(strText);
        htmlParser.setEncoding(strEncoding);
        pageNodes = htmlParser.parse(null);
    } catch (ParserException e) {
        e.printStackTrace();
    }

    // A BASE tag, if present, replaces the page address before links are resolved.
    extractAllScriptNodes(pageNodes);

    // Maps absolute URL -> link as written in the page.
    HashMap absoluteToOriginal = new HashMap();
    ArrayList scriptUrls = extractAllScriptNodes(pageNodes, absoluteToOriginal);
    ArrayList imageUrls = extractAllImageNodes(pageNodes, absoluteToOriginal);

    // Presumably swaps each original link in the text for its absolute form -- depends on
    // the argument order of JHtmlClear.replace (text, search, replacement); TODO confirm.
    Iterator entries = absoluteToOriginal.entrySet().iterator();
    while (entries.hasNext()) {
        Map.Entry mapping = (Map.Entry) entries.next();
        String absoluteUrl = (String) mapping.getKey();
        String originalUrl = (String) mapping.getValue();
        strText = JHtmlClear.replace(strText, originalUrl, absoluteUrl);
    }

    try {
        createMhtArchive(strText, scriptUrls, imageUrls);
        return true;
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
}
/**
 * Creates an HTML parser over the given page text, using a
 * {@code DefaultParserFeedback} in {@code QUIET} mode.
 *
 * @param inputHTML text content of the web page
 * @return a parser reading from {@code inputHTML}
 */
private Parser createParser(String inputHTML) {
    Page page = new Page(inputHTML);
    Lexer lexer = new Lexer(page);
    DefaultParserFeedback quietFeedback =
            new DefaultParserFeedback(DefaultParserFeedback.QUIET);
    return new Parser(lexer, quietFeedback);
}
/**
 * Extracts the base URL of the page.
 * <p>
 * If the page contains a BASE tag with a non-empty {@code href}, that value becomes the
 * new page address ({@code strWeb}) used to resolve relative links. The {@code href} is
 * resolved against the current page address, so a relative base such as
 * {@code "/static/"} is honoured as well; an absolute {@code href} with its own host
 * is used as written. Only the first BASE tag is considered.
 * <p>
 * If the {@code href} cannot be turned into a URL, the stack trace is printed and the
 * page address is left unchanged.
 *
 * @param nodes parsed nodes of the page
 */
private void extractAllScriptNodes(NodeList nodes) {
    NodeList baseTags = nodes.extractAllNodesThatMatch(new TagNameFilter("BASE"), true);
    if (baseTags == null || baseTags.size() == 0) {
        return;
    }
    Tag baseTag = (Tag) baseTags.elementAt(0);
    String href = baseTag.getAttribute("href");
    if (href == null || href.length() == 0) {
        return;
    }
    try {
        // new URL(context, spec) falls back to the context only for relative specs.
        strWeb = new URL(strWeb, href);
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }
}
/**
 * Collects the external script and stylesheet links of the page.
 * <p>
 * Considered are every {@code script} tag with a non-empty {@code src} and every
 * {@code link} tag with a non-empty {@code href} that is recognised as a stylesheet:
 * its {@code rel} contains "stylesheet" or, when it has no {@code rel} at all, its
 * {@code type} contains "text/css". Both attributes are compared case-insensitively,
 * so {@code rel="Stylesheet"} is recognised too.
 *
 * @param nodes  parsed nodes of the page
 * @param urlMap already known URLs (absolute URL -> link as written in the page);
 *               newly found URLs are added to it
 * @return one two-element list {@code [original link, absolute URL]} for every URL that
 *         was not yet in {@code urlMap}
 */
private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {
    ArrayList urlList = new ArrayList();

    // External scripts.
    NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
    for (int i = 0; i < filtered.size(); i++) {
        registerLinkedResource((Tag) filtered.elementAt(i), "src", urlMap, urlList);
    }

    // External stylesheets.
    filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
    for (int i = 0; i < filtered.size(); i++) {
        Tag tag = (Tag) filtered.elementAt(i);
        String type = tag.getAttribute("type");
        String rel = tag.getAttribute("rel");
        boolean isCssFile = false;
        if (rel != null) {
            // rel decides when present; type is only a fallback for links without rel.
            isCssFile = rel.toLowerCase(Locale.ENGLISH).indexOf("stylesheet") != -1;
        } else if (type != null) {
            isCssFile = type.toLowerCase(Locale.ENGLISH).indexOf("text/css") != -1;
        }
        if (isCssFile) {
            registerLinkedResource(tag, "href", urlMap, urlList);
        }
    }
    return urlList;
}

/**
 * Makes the URL held in one attribute of a tag absolute and records it if it is new.
 * Tags whose attribute is missing or empty are left untouched.
 *
 * @param tag       tag carrying the link
 * @param attribute name of the attribute holding the link ({@code "src"} or {@code "href"})
 * @param urlMap    already known URLs (absolute URL -> original link); updated in place
 * @param urlList   receives {@code [original link, absolute URL]} for a new URL
 */
private void registerLinkedResource(Tag tag, String attribute, HashMap urlMap, ArrayList urlList) {
    String innerURL = tag.getAttribute(attribute);
    if (innerURL == null || innerURL.length() == 0) {
        return;
    }
    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
        urlMap.put(absoluteURL, innerURL);
        ArrayList urlInfo = new ArrayList();
        urlInfo.add(innerURL);
        urlInfo.add(absoluteURL);
        urlList.add(urlInfo);
    }
    tag.setAttribute(attribute, absoluteURL);
}
/**
 * Collects the image links of the page.
 * <p>
 * For every IMG tag with a non-empty {@code src}, the link is made absolute with
 * {@link #makeAbsoluteURL(URL, String)} and written back to the tag's {@code src}
 * attribute. An absolute URL that is not yet a key of {@code urlMap} is added to the
 * map and to the returned list.
 *
 * @param nodes  parsed nodes of the page
 * @param urlMap already known URLs (absolute URL -> link as written in the page);
 *               newly found URLs are added to it
 * @return one two-element list {@code [original link, absolute URL]} for every URL that
 *         was not yet in {@code urlMap}
 */
private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {
    ArrayList imageUrls = new ArrayList();
    NodeList imageTags = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
    for (int index = 0; index < imageTags.size(); index++) {
        Tag image = (Tag) imageTags.elementAt(index);
        String src = image.getAttribute("src");
        if (src == null || src.length() == 0) {
            continue;
        }
        String absoluteURL = makeAbsoluteURL(strWeb, src);
        boolean unseen = absoluteURL != null && !urlMap.containsKey(absoluteURL);
        if (unseen) {
            ArrayList pair = new ArrayList();
            pair.add(src);
            pair.add(absoluteURL);
            urlMap.put(absoluteURL, src);
            imageUrls.add(pair);
        }
        image.setAttribute("src", absoluteURL);
    }
    return imageUrls;
}
/**
 * Converts a link found in a page into an absolute URL.
 * <p>
 * Everything from the first {@code '?'} on is dropped. A link that already starts with
 * {@code http://} or {@code https://} (any letter case) is returned as is; any other
 * link is resolved against {@code strWeb}. The result is printed to standard output.
 *
 * @param strWeb   address of the web page, used as the base for relative links
 * @param innerURL link as written in the page; may be {@code null}
 * @return the absolute URL, or {@code null} when {@code innerURL} is {@code null} or
 *         cannot be resolved into a URL
 */
public static String makeAbsoluteURL(URL strWeb, String innerURL) {
    // Guard first: the link is dereferenced immediately below.
    if (innerURL == null) {
        return null;
    }
    // Remove the query-string suffix.
    int queryStart = innerURL.indexOf('?');
    if (queryStart != -1) {
        innerURL = innerURL.substring(0, queryStart);
    }
    // Require the "://" so that relative names such as "httpfoo.png" are still resolved.
    boolean alreadyAbsolute = innerURL.regionMatches(true, 0, "http://", 0, 7)
            || innerURL.regionMatches(true, 0, "https://", 0, 8);
    if (alreadyAbsolute) {
        System.out.println(innerURL);
        return innerURL;
    }
    URL linkUri;
    try {
        linkUri = new URL(strWeb, innerURL);
    } catch (MalformedURLException e) {
        e.printStackTrace();
        return null;
    }
    String absURL = linkUri.toString();
    // Presumably strips any "../" and "./" still left after resolution -- depends on
    // JHtmlClear.replace being a plain search-and-replace; TODO confirm.
    absURL = JHtmlClear.replace(absURL, "../", "");
    absURL = JHtmlClear.replace(absURL, "./", "");
    System.out.println(absURL);
    return absURL;
}
/**
*<br>Description: builds the MHT file
*<br>Parameters: content - text content of the web page; urlScriptList - collection of script links; urlImageList - collection of image links
*<br>Returns:
*/
private void createMhtArchive(String content, ArrayList urlScriptList, ArrayList urlImageList) throws Exception {
//Instantiate a Multipart object
MimeMultipart mp = new MimeMultipart("related");
Properties props = new Properties();
props.put("mail.smtp.host", smtp);
Session session = Session.getDefaultInstance(props, null);
MimeMessage msg = new MimeMessage(session);
// set mailer
msg.setHeader("X-Mailer", "Code Manager .SWT");
// set from
if (from != null) {
msg.setFrom(new InternetAddress(from));
}
// set subject
if (subject != null) {
msg.setSubject(subject);
}
// to
if (to != null) {
InternetAddress[] toAddresses = getInetAddresses(to);
msg.setRecipients(Message.RecipientType.TO, toAddresses);
}
// cc
if (cc != null) {
InternetAddress[] ccAddresses = getInetAddresses(cc);
msg.setRecipients(Message.RecipientType.CC, ccAddresses);
}
// bcc
if (bcc != null) {
InternetAddress[] bccAddresses = getInetAddresses(bcc);
msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
}