package com.distinct.servlet.url;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;
import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
@SuppressWarnings("serial")
public class DBServlet extends HttpServlet {
// 定義一個ServletConfig對象
private ServletConfig config = null;
// 初始化結果集
ResultSet selectRes = null;
// 定義私有字符串常量並初始化
private String driverName = "";
// 定義的數據庫用戶名
private String username = "";
// 定義的數據庫鏈接密碼
private String password = "";
//數據庫的鏈接路徑
private String DB_URL = "";
//鏈接的數據庫的表
private String mysqltable = "";
// 初始化鏈接
private Connection connect;
// 初始化數據庫操做
private Statement stmtement;
public void contextInitialized(ServletContextEvent sce) {
}
/**
*
* @param path 配置文件的路徑
* @param sc ServletContext對象
*/
public void init(ServletConfig config){
this.driverName = config.getInitParameter("DRIVERNAME");
this.DB_URL = config.getInitParameter("DB_URL");
this.username = config.getInitParameter("USERNAME");
this.password = config.getInitParameter("PASSWORD");
this.mysqltable = config.getInitParameter("MYTABLE");
}
// 處理 GET 方法請求的方法
protected void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
// 設置響應內容類型
PrintWriter out = response.getWriter();
/* String title = "Using GET Method to Read Form Data"; String docType =
* "<!doctype html public \"-//w3c//dtd html 4.0 " +
* "transitional//en\">\n"; out.println(docType + "<html>\n" +
* "<head><title>" + title + "</title></head>\n" +
* "<body bgcolor=\"#f0f0f0\">\n" + "<h1 align=\"center\">" + title +
* "</h1>\n" + "<ul>\n" + " <li><b>url</b>:" +
* request.getParameter("url") + "\n" + " <li><b>title</b>:" +
* request.getParameter("title") + "\n" + " <li><b>time</b>:" +
* request.getParameter("time") + "\n" + " <li><b>source</b>:" +
* request.getParameter("source") + "\n" + "</ul>\n" +
* "</body></html>");
*/
}
// 處理 POST 方法請求的方法
public void doPost(HttpServletRequest request, HttpServletResponse response ) throws ServletException, IOException {
//頁面的顯示格式
response.setContentType("text/html");
request.setCharacterEncoding("UTF-8");
doGet(request, response);
}
protected synchronized void service(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException{
doPost(request, response);
//獲取當前時間
long startTime = System.currentTimeMillis();
// 客戶端輸入的參數;
String m_Url ;
m_Url = request.getParameter("url");
System.out.println(m_Url);
// 抓取新聞的標題
String m_Title ;
m_Title = request.getParameter("title");
System.out.println(m_Title);
// 新聞的發佈時間
String Createtime ;
Createtime = request.getParameter("time");
System.out.println(Createtime);
String m_CreatetimeFormat = "";
// 新聞的抓取來源
String m_Source = request.getParameter("source");
System.out.println(m_Source);
// 新聞的發佈單位
String m_publisher_location = request.getParameter("m_publisher_location");
System.out.println(m_publisher_location);
// 新聞的語言類
String language = request.getParameter("language");
System.out.println(language);
// 默認編碼
// System.out.println(java.nio.charset.Charset.defaultCharset());
//記錄抓取的所有路徑
String t_Url = "";
//數據查詢語句
String selectSql = "SELECT m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,id,language,t_Url FROM "
+ this.mysqltable ;
// 去重邏輯
if (m_Url != null && m_CreatetimeFormat != null && m_Title != null && m_Title != "") {
//截取時間的年月日的格式
if(Createtime.length()>=10 ){
m_CreatetimeFormat = Createtime.substring(0, 10);
}else{
m_CreatetimeFormat = Createtime;
}
String value4 = null;
String value6 = null;
try {
// 註冊 JDBC 驅動器
System.out.println(this.driverName);
Class.forName(this.driverName);
// 加載MYSQL JDBC驅動程序
System.out.println("Success loading Mysql Driver!");
//打開一個鏈接
connect = DriverManager.getConnection(this.DB_URL, this.username, this.password);
stmtement = connect.createStatement();
// 查詢數據並輸出
selectRes = stmtement.executeQuery(selectSql);
int i = 0;
while (selectRes.next()) {
// 循環輸出結果集
String value1 = selectRes.getString(1);
// response.getWriter().println(value1 + "<BR>");
String value2 = selectRes.getString(2);
// response.getWriter().println(value2+ "<BR>");
String value3 = selectRes.getString(3);
// response.getWriter().println(value3 + "<BR>");
value4 = selectRes.getString(4);
String value5 = selectRes.getString(5);
value6 = selectRes.getString(6);
String value7 = selectRes.getString(7);
String value8 = selectRes.getString(8);
float similarity = levenshtein(m_Title, value2);
// System.out.println(similarity);
// 抓取新聞的去重邏輯
// *m_Source可能存在重複值
if (!m_Url.equals(value1) && m_publisher_location.equals(value5)) {
// 計算標題類似度
if (similarity > 0.9) {
// m_Title==value2
// 存在不一樣url的相同網頁,請不用抓取
if (m_CreatetimeFormat.equals(value3)) {
float similarityurl = levenshtein(m_Url, value1);
if(similarityurl<0.8){
t_Url = m_Url;
t_Url = t_Url + ";" + value8;
}
t_Url = setDis(t_Url);
//m_Source和 t_Url去重
value4 = value4 + ";" + m_Source;
//去掉value裏面相同的值
value4 = setDis(value4);
i = -1;
break;
} else {// 存在不一樣url的更新網頁,請抓取
i = i + 1;
}
} else {
continue;
}
}
else if (!m_Url.equals(value1) && !m_publisher_location.equals(value5)) {
continue;
}
else { // m_Url == value1 && m_publisher_location == value5)
if (m_CreatetimeFormat.equals(value3)) {
i = -1;
break;
} else {
// 存在相同網頁的更新數據,請抓取;
i = i + 1;
continue;
}
}
}
if (i == -1) {
// response.getWriter().println("網頁數據已存在,這條數據不要抓取" +"<BR>");
response.getWriter().println("false");
String updateSql = "UPDATE " + this.mysqltable + " SET m_Source = '" + value4 + "', " + " t_Url ='" + t_Url + "' WHERE id ="
+ value6;
stmtement.executeUpdate(updateSql);
} else if (i == 0) {
// response.getWriter().println("這條新數據須要抓取" + "<BR>");
response.getWriter().println("true");
String insertSql = "INSERT INTO " + this.mysqltable
+ "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url) VALUES ( '"
+ language + "','" + m_Url + "', '" + m_Title + "','" + m_CreatetimeFormat + "','"
+ m_Source + "','" + m_publisher_location + "','" + t_Url + "')";
stmtement.executeUpdate(insertSql);
} else {
// response.getWriter().println("網頁數據已更新,這條數據須要抓取" +"<BR>");
response.getWriter().println("true");
String insertSql = "INSERT INTO " + this.mysqltable
+ "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url) VALUES ( '"
+ language + "','" + m_Url + "', '" + m_Title + "','" + m_CreatetimeFormat + "','"
+ m_Source + "','" + m_publisher_location + "','" + t_Url + "')";
stmtement.executeUpdate(insertSql);
}
// 清理環境
selectRes.close();
stmtement.close();
connect.close();
} catch (SQLException | ClassNotFoundException e) {
// 處理 Class.forName 錯誤
e.printStackTrace();
//logger.error(e);
} finally {
// 最後是用於關閉資源的塊
try {
if (stmtement != null)
stmtement.close();
} catch (SQLException se2) {
// logger.error("關閉數據庫出現異常",se2);
} // 咱們不能作什麼
try {
if (connect != null)
connect.close();
} catch (SQLException se) {
se.printStackTrace();
//logger.error("鏈接數據庫出現異常",se);
} // end finally try
} // end try
} else {
// response.getWriter().println("輸入數據格式不正確,請從新輸入!" + "<BR>");
response.getWriter().println("null");
System.out.println("Some information was missing. Please see below for details.");
System.out.println("m_Url,m_Title,m_CreatetimeFormat :Value can't be null");
}
//測試程序運行時間
long endTime = System.currentTimeMillis();
System.out.println("程序運行時間:"+(endTime-startTime)+"ms");
}
//str爲多個子串之間以分號隔開
//對子串之間進行去重
//最後返回的字符串也是多個子串之間以分號隔開的字符串;
public static String setDis(String str) {
String[] aa = str.split(";");
Set set = new HashSet();
for (int il = 0; il < aa.length; il++) {
set.add(aa[il]);
}
str = "";
for (Iterator it = set.iterator(); it.hasNext();) {
String aaa = it.next().toString();
str = aaa + ";" + str;
}
return str;
}
//正向,反向字符串比較
public static float levenshtein(String str1, String str2) {
int len1 = str1.length();
int len2 = str2.length();
int[][] dif = new int[len1 + 1][len2 + 1];
for (int a = 0; a <= len1; a++) {
dif[a][0] = a;
}
for (int a = 0; a <= len2; a++) {
dif[0][a] = a;
}
int temp;
for (int i = 1; i <= len1; i++) {
for (int j = 1; j <= len2; j++) {
if (str1.charAt(i - 1) == str2.charAt(j - 1)) {
temp = 0;
} else {
temp = 1;
}
// 取三個值中最小的
dif[i][j] = min(dif[i - 1][j - 1] + temp, dif[i][j - 1] + 1, dif[i - 1][j] + 1);
}
}
// System.out.println("字符串\"" + str1 + "\"與\"" + str2 + "\"的比較");
// System.out.println("差別步驟:" + dif[len1][len2]);
// 計算類似度
float similarity = 1 - (float) dif[len1][len2] / Math.max(str1.length(), str2.length());
// System.out.println("類似度:" + similarity);
return similarity;
}
private static int min(int... is) {
int min = Integer.MAX_VALUE;
for (int i : is) {
if (min > i) {
min = i;
}
}
return min;
}
}
web.xml
<?xml version="1.0" encoding="UTF-8"?>
<web-app version="2.5"
xmlns="http://java.sun.com/xml/ns/javaee"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd">
<!-- Registers DBServlet and supplies its JDBC settings as init parameters.
     DBServlet.init(ServletConfig) reads each value by the param-name used here. -->
<servlet>
<servlet-name>DBServlet</servlet-name>
<servlet-class>com.distinct.servlet.url.DBServlet</servlet-class>
<!-- JDBC driver class that the servlet loads with Class.forName before connecting.
     NOTE(review): this looks like the legacy Connector/J class name; newer Connector/J
     releases use com.mysql.cj.jdbc.Driver - confirm against the deployed driver jar. -->
<init-param>
<param-name>DRIVERNAME</param-name>
<param-value>com.mysql.jdbc.Driver</param-value>
</init-param>
<!-- JDBC URL passed to DriverManager.getConnection. -->
<init-param>
<param-name>DB_URL</param-name>
<param-value>jdbc:mysql://192.168.4.114:3306/ZHCNNEWS</param-value>
</init-param>
<!-- Database user name passed to DriverManager.getConnection. -->
<init-param>
<param-name>USERNAME</param-name>
<param-value>lao_test</param-value>
</init-param>
<!-- Database password passed to DriverManager.getConnection.
     NOTE(review): the credential is stored in plain text in this descriptor. -->
<init-param>
<param-name>PASSWORD</param-name>
<param-value>LAOtest_123</param-value>
</init-param>
<!-- Name of the table the servlet queries and writes. The servlet concatenates this
     value into its SQL text, so it must be a trusted identifier. -->
<init-param>
<param-name>MYTABLE</param-name>
<param-value>uniformTest</param-value>
</init-param>
</servlet>
<!-- Maps the URL pattern /DBServlet to the servlet declared under the name DBServlet. -->
<servlet-mapping>
<servlet-name>DBServlet</servlet-name>
<url-pattern>/DBServlet</url-pattern>
</servlet-mapping>