使用java將網頁保存爲mht格式(1)



package com.tag;
     import java.io.BufferedInputStream;
     import java.io.BufferedOutputStream;
     import java.io.BufferedReader;
     import java.io.ByteArrayInputStream;
     import java.io.DataOutputStream;
     import java.io.File;
     import java.io.FileInputStream;
     import java.io.FileOutputStream;
     import java.io.FileWriter;
     import java.io.IOException;
     import java.io.InputStream;
     import java.io.InputStreamReader;
     import java.io.OutputStream;
     import java.io.Reader;
     import java.net.MalformedURLException;
     import java.net.URL;
     import java.util.*;
     import org.htmlparser.Parser;
     import org.htmlparser.Tag;
     import org.htmlparser.filters.TagNameFilter;
     import org.htmlparser.lexer.Lexer;
     import org.htmlparser.lexer.Page;
     import org.htmlparser.util.DefaultParserFeedback;
     import org.htmlparser.util.NodeList;
     import org.htmlparser.util.ParserException;
     import toptrack.tools.JQuery;
     import javax.activation.DataHandler;
     import javax.activation.DataSource;
     import javax.activation.MimetypesFileTypeMap;
     import javax.mail.Message;
     import javax.mail.MessagingException;
     import javax.mail.Multipart;
     import javax.mail.Session;
     import javax.mail.internet.InternetAddress;
     import javax.mail.internet.MimeBodyPart;
     import javax.mail.internet.MimeMessage;
     import javax.mail.internet.MimeMultipart;
     import javax.mail.internet.MimePartDataSource;
     /**
     * mht文件解析類
     * @author dl
     */

     public class Html2MHTCompiler {
         private URL strWeb = null; /**網頁地址*/
         private String strText = null; /**網頁文本內容*/
         private String strFileName = null; /**本地文件名*/
         private String strEncoding = null; /**網頁編碼*/
//mht格式附加信息
         private String from = "[email]dongle2001@126.com[/email]";
         private String to;
         private String subject = "mht compile";
         private String cc;
         private String bcc;
         private String smtp = "localhost";
         public static void main(String[] args) {
            String strUrl = "http://www.mtime.com/my/tropicofcancer/blog/843555/";
            String strEncoding = "utf-8";
            String strText = JQuery.getHtmlText(strUrl, strEncoding, null);
            if (strText == null)
                return;
            Html2MHTCompiler h2t = new Html2MHTCompiler(strText, strUrl, strEncoding, "test.mht");
            h2t.compile();
            //Html2MHTCompiler.mht2html("test.mht", "a.html");
        }
        /**
         *<br>方法說明:初始化
         *<br>輸入參數:strText 網頁文本內容; strUrl 網頁地址; strEncoding 網頁編碼; strFileName 本地文件名
         *<br>返回類型:
         */

        public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
            // TODO Auto-generated constructor stub
            try {
                strWeb = new URL(strUrl);
            } catch (MalformedURLException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return;
            }
            this.strText = strText;
            this.strEncoding = strEncoding;
            this.strFileName = strFileName;
        }
        /**
         *<br>方法說明:執行下載操做
         *<br>輸入參數:
         *<br>返回類型:
         */

        public boolean compile() {
            if (strWeb == null || strText == null || strFileName == null || strEncoding == null)
                return false;
            HashMap urlMap = new HashMap();
            NodeList nodes = new NodeList();
            try {
                Parser parser = createParser(strText);
                parser.setEncoding(strEncoding);
                nodes = parser.parse(null);
            } catch (ParserException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            extractAllScriptNodes(nodes);
            ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
            ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);
            for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {
                Map.Entry entry = (Map.Entry) iter.next();
                String key = (String)entry.getKey();
                String val = (String)entry.getValue();
                strText = JHtmlClear.replace(strText, val, key);
            }
            try {
                createMhtArchive(strText, urlScriptList, urlImageList);
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return false;
            }
            return true;
        }
        /**
         *<br>方法說明:創建HTML parser
         *<br>輸入參數:inputHTML 網頁文本內容
         *<br>返回類型:HTML parser
         */

        private Parser createParser(String inputHTML) {
            // TODO Auto-generated method stub
            Lexer mLexer = new Lexer(new Page(inputHTML));
            return new Parser(mLexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        }
        /**
         *<br>方法說明:抽取基礎URL地址
         *<br>輸入參數:nodes 網頁標籤集合
         *<br>返回類型:
         */

        private void extractAllScriptNodes(NodeList nodes) {
            NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(
                    "BASE"), true);
            if (filtered != null && filtered.size() > 0) {
                Tag tag = (Tag) filtered.elementAt(0);
                String href = tag.getAttribute("href");
                if (href != null && href.length() > 0) {
                    try {
                        strWeb = new URL(href);
                    } catch (MalformedURLException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }
        }
        /**
         *<br>方法說明:抽取網頁包含的css,js連接
         *<br>輸入參數:nodes 網頁標籤集合; urlMap 已存在的url集合
         *<br>返回類型:css,js連接的集合
         */

        private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {
            ArrayList urlList = new ArrayList();
            NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
            for (int i = 0; i < filtered.size(); i++) {
                Tag tag = (Tag) filtered.elementAt(i);
                String src = tag.getAttribute("src");
                // Handle external css file's url
                if (src != null && src.length() > 0) {
                    String innerURL = src;
                    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
                    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
                        urlMap.put(absoluteURL, innerURL);
                        ArrayList urlInfo = new ArrayList();
                        urlInfo.add(innerURL);
                        urlInfo.add(absoluteURL);
                        urlList.add(urlInfo);
                    }
                    tag.setAttribute("src", absoluteURL);
                }
            }
            filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
            for (int i = 0; i < filtered.size(); i++) {
                Tag tag = (Tag) filtered.elementAt(i);
                String type = (tag.getAttribute("type"));
                String rel = (tag.getAttribute("rel"));
                String href = tag.getAttribute("href");
                boolean isCssFile = false;
                if (rel != null) {
                    isCssFile = rel.indexOf("stylesheet") != -1;
                } else if (type != null) {
                    isCssFile |= type.indexOf("text/css") != -1;
                }
// Handle external css file's url
                if (isCssFile && href != null && href.length() > 0) {
                    String innerURL = href;
                    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
                    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
                        urlMap.put(absoluteURL, innerURL);
                        ArrayList urlInfo = new ArrayList();
                        urlInfo.add(innerURL);
                        urlInfo.add(absoluteURL);
                        urlList.add(urlInfo);
                    }
                    tag.setAttribute("href", absoluteURL);
                }
            }
            return urlList;
        }
        /**
         *<br>方法說明:抽取網頁包含的圖像連接
         *<br>輸入參數:nodes 網頁標籤集合; urlMap 已存在的url集合
         *<br>返回類型:圖像連接集合
         */

        private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {
            ArrayList urlList = new ArrayList();
            NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
            for (int i = 0; i < filtered.size(); i++) {
                Tag tag = (Tag) filtered.elementAt(i);
                String src = tag.getAttribute("src");
                // Handle external css file's url
                if (src != null && src.length() > 0) {
                    String innerURL = src;
                    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
                    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
                        urlMap.put(absoluteURL, innerURL);
                        ArrayList urlInfo = new ArrayList();
                        urlInfo.add(innerURL);
                        urlInfo.add(absoluteURL);
                        urlList.add(urlInfo);
                    }
                    tag.setAttribute("src", absoluteURL);
                }
            }
            return urlList;
        }
        /**
         *<br>方法說明:相對路徑轉絕對路徑
         *<br>輸入參數:strWeb 網頁地址; innerURL 相對路徑連接
         *<br>返回類型:絕對路徑連接
         */

        public static String makeAbsoluteURL(URL strWeb, String innerURL) {
            // TODO Auto-generated method stub
            //去除後綴
            int pos = innerURL.indexOf("?");
            if (pos != -1) {
                innerURL = innerURL.substring(0, pos);
            }
            if (innerURL != null
                    && innerURL.toLowerCase().indexOf("http") == 0) {
                System.out.println(innerURL);
                return innerURL;
            }
            URL linkUri = null;
            try {
                linkUri = new URL(strWeb, innerURL);
            } catch (MalformedURLException e) {
                //TODO Auto-generated catch block
                e.printStackTrace();
                return null;
            }
            String absURL = linkUri.toString();
            absURL = JHtmlClear.replace(absURL, "../", "");
            absURL = JHtmlClear.replace(absURL, "./", "");
            System.out.println(absURL);
            return absURL;
        }
        /**
         *<br>方法說明:建立mht文件
         *<br>輸入參數:content 網頁文本內容; urlScriptList 腳本連接集合; urlImageList 圖片連接集合
         *<br>返回類型:
         */

        private void createMhtArchive(String content, ArrayList urlScriptList, ArrayList urlImageList) throws Exception {
            //Instantiate a Multipart object
            MimeMultipart mp = new MimeMultipart("related");
            Properties props = new Properties();
            props.put("mail.smtp.host", smtp);
            Session session = Session.getDefaultInstance(props, null);
            MimeMessage msg = new MimeMessage(session);
            // set mailer
            msg.setHeader("X-Mailer", "Code Manager .SWT");
            // set from
            if (from != null) {
                msg.setFrom(new InternetAddress(from));
            }
            // set subject
            if (subject != null) {
                msg.setSubject(subject);
            }
            // to
            if (to != null) {
                InternetAddress[] toAddresses = getInetAddresses(to);
                msg.setRecipients(Message.RecipientType.TO, toAddresses);
            }
            // cc
            if (cc != null) {
                InternetAddress[] ccAddresses = getInetAddresses(cc);
                msg.setRecipients(Message.RecipientType.CC, ccAddresses);
            }
            // bcc
            if (bcc != null) {
                InternetAddress[] bccAddresses = getInetAddresses(bcc);
                msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
            }



source: http://tech.ddvip.com/2008-12/1229653798101198.html
相關文章
相關標籤/搜索