Servlet 網頁去重

package com.distinct.servlet.url;



import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;




@SuppressWarnings("serial")

public class DBServlet extends HttpServlet {

// Servlet configuration object; starts out null.

private ServletConfig config = null;

// JDBC result set holder; starts out null (package-private: no access modifier).

ResultSet selectRes = null;

// JDBC driver class name; empty until init() reads the DRIVERNAME init parameter.

private String driverName = "";

// Database user name; empty until init() reads the USERNAME init parameter.

private String  username = "";

// Database password; empty until init() reads the PASSWORD init parameter.

private String password = "";

// JDBC connection URL; empty until init() reads the DB_URL init parameter.

    private String DB_URL = "";

// Name of the database table to query; empty until init() reads the MYTABLE init parameter.

    private String mysqltable = "";

// JDBC connection holder.

private Connection connect;

// JDBC statement holder (note the field name is spelled "stmtement").

private Statement stmtement;

 

// No-op: the body is empty.
// NOTE(review): the name and parameter match ServletContextListener.contextInitialized, but the
// class declaration in this file only extends HttpServlet, so presumably nothing invokes this
// method as a listener callback. Confirm before relying on it.
public void contextInitialized(ServletContextEvent sce) {

}


/**

 * 

 * @param path 配置文件的路徑

 * @param sc ServletContext對象

 */

public void init(ServletConfig config){

this.driverName = config.getInitParameter("DRIVERNAME"); 

this.DB_URL = config.getInitParameter("DB_URL");

this.username = config.getInitParameter("USERNAME");

this.password = config.getInitParameter("PASSWORD");

this.mysqltable = config.getInitParameter("MYTABLE");

 

    

}


/**
 * GET handler. Obtains the response writer and writes nothing to it.
 *
 * @param request  the incoming request (not read here)
 * @param response the response whose writer is obtained
 * @throws ServletException declared for compatibility with the servlet API
 * @throws IOException      if the writer cannot be obtained
 */
@Override
protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    // The writer is requested but not used; this method produces no output of its own.
    response.getWriter();
}


/**
 * POST handler. Declares the response as {@code text/html}, sets the request body
 * encoding to UTF-8 and then delegates to {@link #doGet}.
 *
 * @param request  the incoming request
 * @param response the response being prepared
 * @throws ServletException propagated from {@code doGet}
 * @throws IOException      propagated from {@code doGet} or the encoding call
 */
@Override
public void doPost(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    // Response format first, then the encoding used to decode request parameters.
    response.setContentType("text/html");
    request.setCharacterEncoding("UTF-8");

    this.doGet(request, response);
}


@Override

protected synchronized void service(HttpServletRequest request, HttpServletResponse response)

throws ServletException, IOException{


doPost(request, response);

//獲取當前時間

long startTime = System.currentTimeMillis();

// 客戶端輸入的參數;

String m_Url ;

m_Url = request.getParameter("url");

System.out.println(m_Url);

// 抓取新聞的標題

String m_Title ;

   m_Title = request.getParameter("title");

System.out.println(m_Title);

// 新聞的發佈時間

String Createtime ;

Createtime = request.getParameter("time");

System.out.println(Createtime);

String m_CreatetimeFormat = "";

// 新聞的抓取來源

String m_Source = request.getParameter("source");

System.out.println(m_Source);

// 新聞的發佈單位

String m_publisher_location = request.getParameter("m_publisher_location");

System.out.println(m_publisher_location);

// 新聞的語言類

String language = request.getParameter("language");

System.out.println(language);

// 默認編碼

// System.out.println(java.nio.charset.Charset.defaultCharset());

//記錄抓取的所有路徑

String t_Url = "";

//數據查詢語句

String selectSql = "SELECT m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,id,language,t_Url FROM "

+ this.mysqltable ;

// 去重邏輯

if (m_Url != null && m_CreatetimeFormat != null && m_Title != null && m_Title != "") {

//截取時間的年月日的格式

if(Createtime.length()>=10 ){

m_CreatetimeFormat = Createtime.substring(0, 10);

}else{

m_CreatetimeFormat = Createtime;

}

String value4 = null;

String value6 = null;

try {

// 註冊 JDBC 驅動器

System.out.println(this.driverName);

Class.forName(this.driverName);

// 加載MYSQL JDBC驅動程序

System.out.println("Success loading Mysql Driver!");

//打開一個鏈接

connect = DriverManager.getConnection(this.DB_URL, this.username, this.password);


stmtement = connect.createStatement();

// 查詢數據並輸出

selectRes = stmtement.executeQuery(selectSql);

int i = 0;


while (selectRes.next()) {

// 循環輸出結果集

String value1 = selectRes.getString(1);

// response.getWriter().println(value1 + "<BR>");

String value2 = selectRes.getString(2);

// response.getWriter().println(value2+ "<BR>");

String value3 = selectRes.getString(3);

// response.getWriter().println(value3 + "<BR>");

value4 = selectRes.getString(4);

String value5 = selectRes.getString(5);

value6 = selectRes.getString(6);

String value7 = selectRes.getString(7);

String value8 = selectRes.getString(8);


float similarity = levenshtein(m_Title, value2);

// System.out.println(similarity);


// 抓取新聞的去重邏輯

// *m_Source可能存在重複值

if (!m_Url.equals(value1) && m_publisher_location.equals(value5)) {

// 計算標題類似度

if (similarity > 0.9) {

// m_Title==value2

// 存在不一樣url的相同網頁,請不用抓取

if (m_CreatetimeFormat.equals(value3)) {

float similarityurl = levenshtein(m_Url, value1);

if(similarityurl<0.8){

t_Url = m_Url;

t_Url = t_Url + ";" + value8;

}

t_Url = setDis(t_Url);

//m_Source和 t_Url去重

value4 = value4 + ";" + m_Source;

//去掉value裏面相同的值

value4 = setDis(value4);

i = -1;

break;

} else {// 存在不一樣url的更新網頁,請抓取

i = i + 1;

}

} else {

continue;

}


}


else if (!m_Url.equals(value1) && !m_publisher_location.equals(value5)) {

continue;

}


else { // m_Url == value1 && m_publisher_location == value5)

if (m_CreatetimeFormat.equals(value3)) {

i = -1;

break;

} else {

// 存在相同網頁的更新數據,請抓取;

i = i + 1;

continue;

}

}

}


if (i == -1) {


// response.getWriter().println("網頁數據已存在,這條數據不要抓取" +"<BR>");

response.getWriter().println("false");

String updateSql = "UPDATE " + this.mysqltable + " SET m_Source = '" + value4 + "', " + " t_Url ='" + t_Url + "' WHERE id ="

+ value6;

stmtement.executeUpdate(updateSql); 

} else if (i == 0) {


// response.getWriter().println("這條新數據須要抓取" + "<BR>");

response.getWriter().println("true");

String insertSql = "INSERT INTO " + this.mysqltable

+ "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url) VALUES ( '"

+ language + "','" + m_Url + "', '" + m_Title + "','" + m_CreatetimeFormat + "','"

+ m_Source + "','" + m_publisher_location + "','" + t_Url + "')";

stmtement.executeUpdate(insertSql);


} else {


// response.getWriter().println("網頁數據已更新,這條數據須要抓取" +"<BR>");

response.getWriter().println("true");

String insertSql = "INSERT INTO " + this.mysqltable

+ "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url) VALUES ( '"

+ language + "','" + m_Url + "', '" + m_Title + "','" + m_CreatetimeFormat + "','"

+ m_Source + "','" + m_publisher_location + "','" + t_Url + "')";

stmtement.executeUpdate(insertSql);

}

// 清理環境

selectRes.close();

stmtement.close();

connect.close();

} catch (SQLException | ClassNotFoundException e) {

// 處理 Class.forName 錯誤

e.printStackTrace();

//logger.error(e);

} finally {

// 最後是用於關閉資源的塊

try {

if (stmtement != null)

stmtement.close();

} catch (SQLException se2) {

// logger.error("關閉數據庫出現異常",se2);

} // 咱們不能作什麼

try {

if (connect != null)

connect.close();

} catch (SQLException se) {

se.printStackTrace();

//logger.error("鏈接數據庫出現異常",se);

} // end finally try

} // end try

} else {

// response.getWriter().println("輸入數據格式不正確,請從新輸入!" + "<BR>");

response.getWriter().println("null");

System.out.println("Some information was missing. Please see below for details.");

System.out.println("m_Url,m_Title,m_CreatetimeFormat :Value can't be null");

}

//測試程序運行時間

long endTime = System.currentTimeMillis();

System.out.println("程序運行時間:"+(endTime-startTime)+"ms");

}

/**
 * Removes repeated entries from a semicolon-separated list.
 *
 * <p>Entries keep the order of their first occurrence. Empty entries (for example the one
 * produced by a trailing or doubled {@code ';'}) are dropped, and the result carries no
 * leading or trailing separator.
 *
 * @param str semicolon-separated entries; may be {@code null} or empty
 * @return the distinct entries joined by {@code ';'}, or an empty string when there are none
 */
public static String setDis(String str) {
    if (str == null || str.isEmpty()) {
        return "";
    }

    Set<String> seen = new HashSet<>();
    StringBuilder joined = new StringBuilder();
    for (String entry : str.split(";")) {
        // add() returns false for an entry that was already emitted.
        if (entry.isEmpty() || !seen.add(entry)) {
            continue;
        }
        if (joined.length() > 0) {
            joined.append(';');
        }
        joined.append(entry);
    }
    return joined.toString();
}

/**
 * Computes a similarity score from the Levenshtein edit distance of two strings.
 *
 * <p>The score is {@code 1 - distance / max(length1, length2)}: {@code 1.0} for equal
 * strings and {@code 0.0} when every position differs. A {@code null} argument is treated
 * as the empty string, and two empty strings score {@code 1.0}.
 *
 * @param str1 first string; may be {@code null}
 * @param str2 second string; may be {@code null}
 * @return similarity in the range {@code [0, 1]}
 */
public static float levenshtein(String str1, String str2) {
    String left = str1 == null ? "" : str1;
    String right = str2 == null ? "" : str2;

    int len1 = left.length();
    int len2 = right.length();
    int longest = Math.max(len1, len2);
    if (longest == 0) {
        // Both empty: identical. Dividing by the zero length would give NaN.
        return 1.0f;
    }

    // Only the previous and the current row of the distance table are ever read.
    int[] previous = new int[len2 + 1];
    int[] current = new int[len2 + 1];
    for (int j = 0; j <= len2; j++) {
        previous[j] = j;
    }

    for (int i = 1; i <= len1; i++) {
        current[0] = i;
        char leftChar = left.charAt(i - 1);
        for (int j = 1; j <= len2; j++) {
            int substitution = previous[j - 1] + (leftChar == right.charAt(j - 1) ? 0 : 1);
            int insertion = current[j - 1] + 1;
            int deletion = previous[j] + 1;
            current[j] = Math.min(substitution, Math.min(insertion, deletion));
        }
        int[] swap = previous;
        previous = current;
        current = swap;
    }

    // After the final swap the finished row is in 'previous'.
    return 1 - (float) previous[len2] / longest;
}


/**
 * Returns the smallest of the given values.
 *
 * @param is values to compare
 * @return the minimum, or {@code Integer.MAX_VALUE} when no value is passed
 */
private static int min(int... is) {
    int smallest = Integer.MAX_VALUE;
    for (int index = 0; index < is.length; index++) {
        smallest = Math.min(smallest, is[index]);
    }
    return smallest;
}

}




web.xml

<?xml version="1.0" encoding="UTF-8"?>
<web-app version="2.5"
    xmlns="http://java.sun.com/xml/ns/javaee"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
    http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd">

  <!-- Registers DBServlet and passes its database settings as init parameters. -->
  <!-- NOTE(review): the database user name and password are stored here in plain text. -->
  <servlet>
    <servlet-name>DBServlet</servlet-name>
    <servlet-class>com.distinct.servlet.url.DBServlet</servlet-class>
    <!-- JDBC driver class loaded by the servlet -->
    <init-param>
      <param-name>DRIVERNAME</param-name>
      <param-value>com.mysql.jdbc.Driver</param-value>
    </init-param>
    <!-- JDBC connection URL -->
    <init-param>
      <param-name>DB_URL</param-name>
      <param-value>jdbc:mysql://192.168.4.114:3306/ZHCNNEWS</param-value>
    </init-param>
    <init-param>
      <param-name>USERNAME</param-name>
      <param-value>lao_test</param-value>
    </init-param>
    <init-param>
      <param-name>PASSWORD</param-name>
      <param-value>LAOtest_123</param-value>
    </init-param>
    <!-- Table the servlet reads from and writes to -->
    <init-param>
      <param-name>MYTABLE</param-name>
      <param-value>uniformTest</param-value>
    </init-param>
  </servlet>

  <servlet-mapping>
    <servlet-name>DBServlet</servlet-name>
    <url-pattern>/DBServlet</url-pattern>
  </servlet-mapping>

</web-app>

相關文章
相關標籤/搜索