基於jsoup的Java服務端http(s)代理程序-代理服務器Demo

時間 2020-07-17

標籤基於 jsoup java 服務 http 代理程序服務器 demo 欄目 Java 简体版

原文鏈接

親愛的開發者朋友們，知道百度網址翻譯嗎？他們爲什麼可以翻譯源網頁呢？iframe 是不能跨域操作的，不過可以用代理來實現。直接上代碼：

本 Demo 基於 MVC 編寫，非常簡單，複製過去稍作修改就可以使用。

//Action層
/**
 * 網址翻譯代理服務器接口層
 * @Description: 此接口層可完成對所請求網址的代理,實現同域訪問
 * @author zhanglongping
 * @CreateDate:   2016-8-23 上午10:52:49  
 */

@At("/proxy")
public class ProxyModule {

    /**
     * Fetches the page named by the {@code yeekit_proxy_url} parameter through
     * {@link ProxyUtils#getUrlMap(String, String)} and writes the resulting HTML
     * to the response, so the browser receives it from this application's origin.
     *
     * <p>The response body is written directly by this method; the method itself
     * always returns {@code null}.
     *
     * <p>NOTE(review): {@code url} is caller-controlled, which makes this endpoint
     * an open proxy towards whatever the server can reach. Consider restricting it
     * to an allow-list of hosts before exposing it publicly.
     *
     * @param url      address of the page to fetch
     * @param request  current request, used to build this application's base URL
     * @param response current response, receives the HTML
     * @return always {@code null}
     * @author zhanglongping
     */
    @At("/gethtml")
    @Ok("Raw")
    @Authority("")
    public Object gethtml(@Param("yeekit_proxy_url") String url,HttpServletRequest request, HttpServletResponse response){
        try {
            if (url == null || url.trim().length() == 0) {
                response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing yeekit_proxy_url parameter");
                return null;
            }

            String basePath = request.getScheme() + "://" + request.getServerName() + ":"
                    + request.getServerPort() + request.getContextPath() + "/";

            String html = new ProxyUtils().getUrlMap(url, basePath);
            if (html == null) {
                // getUrlMap signals a failed fetch with null; report it instead of
                // answering with an empty 200 response.
                response.sendError(HttpServletResponse.SC_BAD_GATEWAY, "Unable to fetch the requested page");
                return null;
            }

            // Must be set before getWriter(): without an explicit charset the servlet
            // writer falls back to ISO-8859-1 and non-Latin text is lost.
            response.setContentType("text/html;charset=UTF-8");
            PrintWriter out = response.getWriter();
            out.write(html);
            out.flush();
        } catch (Exception e) {
            // Best-effort endpoint: a failure is reported on stderr only.
            e.printStackTrace();
        }
        return null;
    }

     /**
      * Forwards a GET request to the target server and relays the answer.
      *
      * <p>Targets ending in {@code .htm}, {@code .html} or {@code .shtml} are fetched
      * through {@link ProxyUtils#getUrlMap(String, String)} and returned as HTML.
      * Targets ending in {@code .css}, {@code .js}, {@code .jpg}, {@code .png},
      * {@code .svg} or {@code .gif} are downloaded and streamed back byte for byte.
      * Any other target produces no output.
      *
      * <p>NOTE(review): {@code targetUrl} is caller-controlled, which makes this
      * endpoint an open proxy towards whatever the server can reach. Consider
      * restricting it to an allow-list of hosts before exposing it publicly.
      *
      * @param targetUrl address of the resource to fetch
      * @param request   current request
      * @param response  current response, receives the fetched content
      * @return always {@code null}; the body is written to {@code response} directly
      * @throws IOException if the error response for a missing parameter cannot be
      *                     sent, or the file name cannot be re-encoded
      */
    @At("/forward")
    @Ok("Raw")
    @Authority("")
    public Object urlRedirect(@Param("yeekit_proxy_url") String targetUrl,HttpServletRequest request, HttpServletResponse response) throws IOException {
        if (targetUrl == null) {
            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing yeekit_proxy_url parameter");
            return null;
        }

        if (targetUrl.endsWith(".htm") || targetUrl.endsWith(".html") || targetUrl.endsWith(".shtml")) {
            forwardHtmlPage(targetUrl, request, response);
        } else if (targetUrl.endsWith(".css") || targetUrl.endsWith(".js") || targetUrl.endsWith(".jpg")
                || targetUrl.endsWith(".png") || targetUrl.endsWith(".svg") || targetUrl.endsWith(".gif")) {
            forwardStaticResource(targetUrl, request, response);
        }
        return null;
    }

    /** Fetches an HTML page through ProxyUtils and writes it to the response as UTF-8 text. */
    private void forwardHtmlPage(String targetUrl, HttpServletRequest request, HttpServletResponse response) {
        try {
            String basePath = request.getScheme() + "://" + request.getServerName() + ":"
                    + request.getServerPort() + request.getContextPath() + "/";

            String html = new ProxyUtils().getUrlMap(targetUrl, basePath);
            if (html == null) {
                // getUrlMap signals a failed fetch with null.
                response.sendError(HttpServletResponse.SC_BAD_GATEWAY, "Unable to fetch the requested page");
                return;
            }

            // Must be set before getWriter(): without an explicit charset the servlet
            // writer falls back to ISO-8859-1 and non-Latin text is lost.
            response.setContentType("text/html;charset=UTF-8");
            PrintWriter out = response.getWriter();
            out.write(html);
            out.flush();
        } catch (Exception e) {
            // Best-effort endpoint: a failure is reported on stderr only.
            e.printStackTrace();
        }
    }

    /** Downloads a static resource over HTTP and streams its bytes to the response. */
    private void forwardStaticResource(String targetUrl, HttpServletRequest request, HttpServletResponse response) throws IOException {
        // Last path segment of the target, e.g. "logo.png".
        String fileName = targetUrl.substring(targetUrl.lastIndexOf('/') + 1);
        // Re-read the name as UTF-8 to avoid garbled non-ASCII file names.
        // NOTE(review): this presumes the parameter was decoded as ISO-8859-1 by the
        // container - confirm against the deployment's URI encoding.
        String downloadName = new String(fileName.getBytes("iso8859-1"), "UTF-8");

        InputStream ins = null;
        HttpURLConnection httpURL = null;
        try {
            httpURL = (HttpURLConnection) new URL(targetUrl).openConnection();
            httpURL.setConnectTimeout(3000);
            httpURL.setDoInput(true);
            httpURL.setRequestMethod("GET");

            int responseCode = httpURL.getResponseCode();
            if (responseCode != 200) {
                // Pass upstream client/server errors through; anything else that is not
                // a plain success is reported as a gateway failure.
                response.setStatus(responseCode >= 400 ? responseCode : HttpServletResponse.SC_BAD_GATEWAY);
                return;
            }

            ins = httpURL.getInputStream();

            String contentType = contentTypeFor(fileName);
            if (contentType != null) {
                response.setContentType(contentType);
            }
            response.setHeader("content-disposition", "attachment;filename="
                    + ProxyUtils.encodeChineseDownloadFileName(request, downloadName));

            OutputStream out = response.getOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = ins.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            System.out.println("下載附件圖片出錯！"+targetUrl);
            e.printStackTrace();
        } finally {
            // Release the upstream stream and connection on every path.
            if (ins != null) {
                try {
                    ins.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the upstream stream fails.
                }
            }
            if (httpURL != null) {
                httpURL.disconnect();
            }
        }
    }

    /** Maps a supported file extension to its MIME type, or null when unknown. */
    private static String contentTypeFor(String fileName) {
        if (fileName.endsWith(".css")) {
            return "text/css";
        }
        if (fileName.endsWith(".js")) {
            return "application/javascript";
        }
        if (fileName.endsWith(".jpg")) {
            return "image/jpeg";
        }
        if (fileName.endsWith(".png")) {
            return "image/png";
        }
        if (fileName.endsWith(".svg")) {
            return "image/svg+xml";
        }
        if (fileName.endsWith(".gif")) {
            return "image/gif";
        }
        return null;
    }

工具類

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.http.HttpServletRequest;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Helper for the URL-translation proxy service.
 *
 * <p>Fetches a remote page with jsoup and prepares it for being served from the
 * proxy: a {@code <base>} element is injected so that relative links keep
 * resolving against the source site, and the hover-translation scripts are
 * added. Also contains helpers for download file names and for extracting
 * {@code <img>} sources from HTML text.
 *
 * @author zhanglongping
 */
public class ProxyUtils {

    /** Matches an img tag in its three closing forms; group 2 is the attribute text. */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)");

    /** Matches a quoted src attribute; group 3 is the attribute value. */
    private static final Pattern SRC_ATTR_PATTERN = Pattern.compile("(src|SRC)=(\"|\')(.*?)(\"|\')");

    /** Script tag for the jQuery build the hover-translation script is loaded after. */
    private static final String JQUERY_SCRIPT_TAG =
            "<script src=\"http://cdn.bootcss.com/jquery/3.1.0/jquery.min.js\" type=\"text/javascript\"></script>";

    /**
     * Fetches the page at {@code url} with a GET request and returns its HTML with
     * a {@code <base>} element and the hover-translation scripts injected.
     *
     * <p>The full URL is requested, including port and path. The {@code <base>}
     * element points at the page's own location without query string, or at the
     * site root when the URL has no path.
     *
     * @param url      address of the page to fetch; only http and https are accepted
     * @param basePath base URL of this application, with or without a trailing slash
     * @return the rewritten HTML, or {@code null} when {@code url} is missing, is
     *         not an http(s) URL, or the page cannot be fetched
     * @author zhanglongping
     */
    public String getUrlMap(String url,String basePath){
        if (url == null || url.trim().length() == 0) {
            return null;
        }
        try {
            // Special-case rewriting of known problematic addresses.
            String target = transformation(url.trim());
            URL page = new URL(target);
            if (!isHttpUrl(page)) {
                return null;
            }
            String baseHref = baseHrefOf(page);

            Connection conn = Jsoup.connect(target);
            Document doc = conn.get();
            doc.setBaseUri(baseHref);

            // basePath normally ends with "/"; strip it so the script URL has no "//".
            String hoverScriptTag = "<script src=\"" + escapeAttribute(stripTrailingSlash(basePath))
                    + "/yeekit_translate_url/js/yeekit_hover_trans.js\" type=\"text/javascript\"></script>";
            injectAroundFirstMeta(doc, baseTag(baseHref), JQUERY_SCRIPT_TAG + hoverScriptTag);

            return doc.html();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches the page at {@code url} with a POST request and returns its HTML with
     * a {@code <base>} element injected.
     *
     * @param url address of the page to fetch
     * @return the HTML of the fetched page including the injected base element
     * @throws IOException if the URL is malformed or the request fails
     * @author zhanglongping
     */
    public String get_https_html(String url) throws IOException{
        URL page = new URL(url);
        String baseHref = baseHrefOf(page);

        Connection conn = Jsoup.connect(url);
        Document doc = conn.post();
        doc.setBaseUri(baseHref);

        injectAroundFirstMeta(doc, baseTag(baseHref), "");

        return doc.html();
    }

    /**
     * Encodes a download file name for the Content-Disposition header according to
     * the requesting browser, so that non-ASCII names are not garbled.
     *
     * @param request   current request; its User-Agent header selects the encoding
     * @param pFileName file name to encode
     * @return the encoded name; {@code pFileName} unchanged when it is {@code null}
     *         or the request carries no User-Agent header
     * @throws UnsupportedEncodingException if a required charset is unavailable
     */
    public static String encodeChineseDownloadFileName(HttpServletRequest request, String pFileName) throws UnsupportedEncodingException{
        String agent = request.getHeader("USER-AGENT");
        if (agent == null || pFileName == null) {
            return pFileName;
        }
        if (agent.indexOf("Firefox") != -1) {
            // Firefox: Base64 encoded-word form.
            String base64 = new String(Base64.encodeBase64(pFileName.getBytes("UTF-8")), "US-ASCII");
            return "=?UTF-8?B?" + base64 + "?=";
        }
        if (agent.indexOf("Chrome") != -1) {
            // Chrome: raw UTF-8 bytes carried in a Latin-1 string. The charset is named
            // explicitly so the result does not depend on the platform default.
            return new String(pFileName.getBytes("UTF-8"), "ISO8859-1");
        }
        // Other browsers (IE7+): percent-encoding, with spaces as %20 instead of "+".
        String encoded = URLEncoder.encode(pFileName, "UTF-8");
        return StringUtils.replace(encoded, "+", "%20");
    }

    /**
     * Collects the {@code src} value of every {@code <img>} tag found in the text.
     * Tags without a quoted {@code src} attribute are skipped.
     *
     * @param content HTML text to scan; may be {@code null}
     * @return the src values in document order; empty when none are found
     */
    public  List<String> getImgSrc(String content){
        List<String> sources = new ArrayList<String>();
        if (content == null) {
            return sources;
        }
        Matcher imgMatcher = IMG_TAG_PATTERN.matcher(content);
        while (imgMatcher.find()) {
            // Group 2 holds everything between "<img" and the closing bracket.
            Matcher srcMatcher = SRC_ATTR_PATTERN.matcher(imgMatcher.group(2));
            if (srcMatcher.find()) {
                sources.add(srcMatcher.group(3));
            }
        }
        return sources;
    }

    /**
     * Rewrites addresses that need special handling before being fetched.
     *
     * @param url address to check; may be {@code null}
     * @return the replacement address, or {@code url} unchanged
     * @author zhanglongping
     */
    public String transformation(String url){
        // Redirects for the www.baidu.com sub-domain are problematic; use the bare domain.
        if ("http://www.baidu.com".equals(url)) {
            return "http://baidu.com";
        }
        return url;
    }

    /** Returns true when the URL uses the http or https protocol. */
    private static boolean isHttpUrl(URL page) {
        String protocol = page.getProtocol();
        return "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol);
    }

    /** Builds the value for the injected base element: the page location without query. */
    private static String baseHrefOf(URL page) {
        StringBuilder href = new StringBuilder();
        href.append(page.getProtocol()).append("://").append(page.getHost());
        if (page.getPort() != -1) {
            href.append(':').append(page.getPort());
        }
        String path = page.getPath();
        if (path == null || path.length() == 0) {
            href.append('/');
        } else {
            href.append(path);
        }
        return href.toString();
    }

    /** Renders a base element for the given href. */
    private static String baseTag(String baseHref) {
        return "<base href=\"" + escapeAttribute(baseHref) + "\" />";
    }

    /**
     * Inserts {@code beforeHtml} ahead of the document's first meta element and
     * {@code afterHtml} right behind it. When the page has no meta element both
     * fragments are placed at the start of the head instead.
     */
    private static void injectAroundFirstMeta(Document doc, String beforeHtml, String afterHtml) {
        Elements metas = doc.select("meta");
        if (!metas.isEmpty()) {
            metas.get(0).before(beforeHtml);
            if (afterHtml.length() > 0) {
                metas.get(0).after(afterHtml);
            }
        } else if (doc.head() != null) {
            doc.head().prepend(beforeHtml + afterHtml);
        }
    }

    /** Removes one trailing slash; null is treated as the empty string. */
    private static String stripTrailingSlash(String value) {
        if (value == null) {
            return "";
        }
        return value.endsWith("/") ? value.substring(0, value.length() - 1) : value;
    }

    /** Escapes the characters that would break out of a double-quoted HTML attribute. */
    private static String escapeAttribute(String value) {
        return value.replace("&", "&amp;")
                .replace("\"", "&quot;")
                .replace("<", "&lt;")
                .replace(">", "&gt;");
    }
}