該系列主要總結了使用Java處理數據過程中用到的工具,以及一些具有啓發性的代碼。通過本章節你能夠學習到:
import java.io.File; import java.util.HashSet; import java.util.Set; public class ListFilesWithSimple { public static void main(String[] args) { // 1.使用java從分層目錄中提取全部文件名 Set<File> files = ListFilesWithSimple.listFiles(new File("D:/hadoop")); for (File file : files) { System.out.println(file.getPath()); } } // 1. 從分層目錄中提取全部文件名(注意不包含文件夾) public static Set<File> listFiles(File rootDir){ Set<File> fileSet = new HashSet<File>(); // when file is null or file is not a directory. if(rootDir == null || rootDir.listFiles() == null){ return fileSet; } for (File file : rootDir.listFiles()) { if(file.isFile()){ fileSet.add(file); }else{ fileSet.addAll(listFiles(file)); } } return fileSet; } }
引入依賴
<!-- https://mvnrepository.com/artifact/commons-io/commons-io --> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.6</version> </dependency>
import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.TrueFileFilter; import java.io.File; import java.util.List; public class ListFilesWithCommonIo { public static void main(String[] args) { List<File> files = ListFilesWithCommonIo.listFiles(new File("D:/hadoop")); for (File file : files) { System.out.println(file.getAbsolutePath()); } } public static List<File> listFiles(File rootDir){ return (List<File>)FileUtils.listFiles(rootDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); // 若須要返回目錄 //return (List<File>)FileUtils.listFilesAndDirs(rootDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); } }
package read;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.stream.Stream;

/**
 * Prints a text file line by line using the Java 8 {@code Files.lines} stream API.
 */
public class Java8Read {

    public static void main(String[] args) {
        // File to read.
        String file = "錦瑟.poetry";
        // Files.lines keeps the file open until the stream is closed, so the stream is
        // managed by try-with-resources.
        try (Stream<String> lines = Files.lines(Paths.get(file))) {
            // Print every line.
            lines.forEach(System.out::println);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package read;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;

/**
 * Reads a whole text file into a string with Apache Commons IO and prints it.
 */
public class CommonIORead {

    public static void main(String[] args) {
        final File poem = new File("天淨沙-秋思.poetry");
        try {
            // Decode the file as UTF-8 and print its content in one go.
            System.out.println(FileUtils.readFileToString(poem, "UTF-8"));
        } catch (IOException readFailure) {
            readFailure.printStackTrace();
        }
    }
}
導入依賴
<dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-parsers</artifactId> <version>1.20</version> </dependency>
package read; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import java.io.FileInputStream; import java.io.InputStream; public class TiKaRead { public static void main(String[] args) { String content = readPDF("他不懂.pdf"); System.out.println(content); } public static String readPDF(String fileName){ InputStream inputStream = null; String content = ""; try { inputStream = new FileInputStream(fileName); // 建立一個自動解析器 AutoDetectParser parser = new AutoDetectParser(); // 使用-1表示不對文件內容的大小進行限制 BodyContentHandler handler = new BodyContentHandler(-1); Metadata metaData = new Metadata(); parser.parse(inputStream, handler, metaData, new ParseContext()); System.out.println(metaData); // 調用Handler對象的toString獲取正文內容 content = handler.toString(); } catch (Exception e) { e.printStackTrace(); }finally { if(inputStream != null){ try{ inputStream.close(); }catch (Exception e){ System.out.println("Error Closing input stream."); } } } return content; } }
package filter; import com.sun.media.jfxmedia.track.Track; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; public class FilterASCII { public static void main(String[] args) { try { System.out.println(cleanText("this s a 趙義 text.")); } catch (Exception e) { e.printStackTrace(); } } public static String cleanText(String text){ // 1.去除全部非ASCII字符 text = text.replaceAll("[^\\p{ASCII}]",""); // 2.去除多餘的空格 text = text.replaceAll("\\s+"," "); // 3.清除ASCII控制字符 text = text.replaceAll("[\\p{Cntrl}]",""); // 4.清除非打印字符 text = text.replaceAll("[^\\p{Print}]",""); return text; } }
引入依賴
<dependency> <groupId>com.univocity</groupId> <artifactId>univocity-parsers</artifactId> <version>2.8.1</version> </dependency>
有不少採用Java編寫的CSV文件解析器,其中Univocity是速度相對較快的一種。
package parser; import com.univocity.parsers.common.processor.RowListProcessor; import com.univocity.parsers.csv.CsvParser; import com.univocity.parsers.csv.CsvParserSettings; import java.io.File; import java.util.Arrays; import java.util.List; public class UnivocityTest { public static void main(String[] args) throws Exception{ String fileName = "模塊表.csv"; parseCSV(fileName); } public static void parseCSV(String fileName) throws Exception { // 建立一個配置對象,並配置 CsvParserSettings csvParserSettings = new CsvParserSettings(); // 自動檢測輸入中的分隔符序列 csvParserSettings.setLineSeparatorDetectionEnabled(true); // 指定把每一個解析的行存儲在列表中,寫入配置:使用rowListProcessor配置解析器,用來對每一個解析行的值進行處理 RowListProcessor processor = new RowListProcessor(); csvParserSettings.setProcessor(processor); // 若CSV文件包含標題頭,則能夠把第一行看作文件中每一個列的標題;不然無需設置。 //csvParserSettings.setHeaderExtractionEnabled(true); // 使用給定的配置建立一個parser實例 CsvParser csvParser = new CsvParser(csvParserSettings); csvParser.parse(new File(fileName)); // 默認將第一行看作頭 String[] headers = processor.getHeaders(); System.out.println(Arrays.asList(headers)); List<String[]> rows = processor.getRows(); for (String[] row : rows) { System.out.println(Arrays.asList(row)); } } }
TSV和CSV幾乎是同一類文件,只不過TSV使用TAB來分隔不同的列。
package parser; import com.univocity.parsers.tsv.TsvParser; import com.univocity.parsers.tsv.TsvParserSettings; import java.io.File; import java.util.Arrays; import java.util.List; public class UnivocityTest2 { public static void main(String[] args) { String fileName = "模塊表.tsv"; parseTSV(fileName); } public static void parseTSV(String fileName){ // 建立一個配置對象,並配置 TsvParserSettings settings = new TsvParserSettings(); // 設置行分隔符 settings.getFormat().setLineSeparator("\n"); // 使用配置建立一個parser TsvParser parser = new TsvParser(settings); // 將文件內容一次性解析出來 List<String[]> rows = parser.parseAll(new File(fileName)); for (String[] row : rows) { System.out.println(Arrays.asList(row)); } } }
導入依賴
<!-- https://mvnrepository.com/artifact/org.jdom/jdom2 --> <dependency> <groupId>org.jdom</groupId> <artifactId>jdom2</artifactId> <version>2.0.6</version> </dependency>
package parser; import org.jdom2.Document; import org.jdom2.Element; import org.jdom2.JDOMException; import org.jdom2.input.SAXBuilder; import java.io.File; import java.io.IOException; import java.util.List; public class JDomTester { public static void main(String[] args) { parserXML("類型.xml"); } public static void parserXML(String fileName){ File file = new File(fileName); SAXBuilder builder = new SAXBuilder(); try { // 建立一個Document對象,表明訪問的XML文件 Document doc = builder.build(file); // 獲取根元素 Element rootEle = doc.getRootElement(); // 獲取根元素下的全部數據 List<Element> records = rootEle.getChildren("RECORD"); // 遍歷節點 for (Element record : records) { System.out.println("====== record ======"); System.out.println("id:" + record.getChildText("type_id")); System.out.println("name:" + record.getChildText("type_name")); System.out.println("state:" + record.getChildText("state")); System.out.println("create time:" + record.getChildText("create_time")); } } catch (Exception e) { e.printStackTrace(); } } }
引入依賴
<!-- https://mvnrepository.com/artifact/com.googlecode.json-simple/json-simple --> <dependency> <groupId>com.googlecode.json-simple</groupId> <artifactId>json-simple</artifactId> <version>1.1.1</version> </dependency>
package parser; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import java.io.FileReader; import java.io.FileWriter; public class JsonTester { public static void main(String[] args) throws Exception{ String fileName = "書籍.json"; writeJSONFile(fileName); readJSONFile(fileName); } public static void writeJSONFile(String fileName){ // 建立JSON對象 JSONObject obj = new JSONObject(); obj.put("name","來自新世界"); obj.put("author","貴志佑介"); obj.put("introduce", "一部暢想將來的架空世界觀的小說,不一樣於大多數科幻做品的風格,做者對將來展示的不是高科技時代,而是看起來甚至是比如今還要倒退一千年的時代..."); // 添加3個書籍評論 JSONArray comments = new JSONArray(); comments.add("這是一本能夠打滿分的書。"); comments.add("這是一本能夠打90分的書。"); comments.add("這是一本通常般的書。"); obj.put("comments", comments); // 寫入json文件 FileWriter writer = null; try { writer = new FileWriter(fileName); System.out.println(obj.toJSONString()); writer.write(obj.toJSONString()); writer.flush(); writer.close(); } catch (Exception e) { e.printStackTrace(); } } public static void readJSONFile(String fileName) throws Exception { // 建立一個JSON解析器 JSONParser parser = new JSONParser(); // 解析 JSONObject object = (JSONObject)parser.parse(new FileReader(fileName)); System.out.println("====== book info ======="); System.out.println("name:" + object.get("name")); System.out.println("author:" + object.get("author")); System.out.println("introduce:" + object.get("introduce")); System.out.println("comments:" + object.get("comments")); } }
還有一個工具包叫作fastjson,由阿里巴巴開源;Google出品的同類工具則是Gson。
jsoup是一個不錯的解析工具,其解析速度非常快。究其原因,主要是不需要對網頁進行動態解析,也不支持對網頁進行任何動態操作,而是直接解析第一次訪問獲得的內容。因此,在使用jsoup的時候如果有其他需求,例如想要調用按鈕點擊事件、觸發遠程調用等,應該考慮使用其他的工具,例如常與Python爬蟲搭配使用的瀏覽器自動化工具Selenium,該工具在Java中也有對應的工具包。
引入依賴
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency>
package parser; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.io.IOException; public class JSoupTester { public static void main(String[] args) { extractDataByJSoup("https://www.baidu.com/"); } public static void extractDataByJSoup(String url){ try { Document doc = Jsoup.connect(url).timeout(10 * 1000).ignoreHttpErrors(true).get(); if(doc == null){ System.out.println("This url is null."); return; } // 提取網址標題 System.out.println("====== The Web Info ======"); System.out.println("title:" + doc.title()); // 提取正文內容 System.out.println("body content:" + doc.body().text()); // 獲取全部超連接:css選擇器的寫法 System.out.println("all href:" + doc.select("a[href]")); } catch (Exception e) { e.printStackTrace(); } } }
使用這些工具的時候還須要在所在的服務器上安裝對應的瀏覽器包,這裏就不一一介紹了。
讀取mysql數據的方式相信只要是一個開發者,都掌握了各類五花八門的方式了,甚至編寫了本身的訪問框架。這裏咱們就學習一下比較原始的訪問方式,不去深究一些主流框架的使用。
引入依賴
<dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>8.0.13</version> </dependency>
package visit; import com.mysql.cj.jdbc.MysqlDataSource; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; public class MySQLTester { public static void main(String[] args) throws Exception { visitMysqlData(); } public static void visitMysqlData() throws Exception { MysqlDataSource dataSource = new MysqlDataSource(); dataSource.setUser("root"); dataSource.setPassword("your_database_password"); // 這裏輸入您的數據庫地址,例如本機localhost dataSource.setServerName("10.21.1.242"); // 創建鏈接 Connection connection = dataSource.getConnection(); Statement statement = connection.createStatement(); // 獲取查詢數據:sunrun_sdfs是庫名,sdfs_user是代表 ResultSet resultSet = statement.executeQuery("select * from sunrun_sdfs.sdfs_user limit 5"); while (resultSet.next()){ System.out.println("====== user info ======"); System.out.println("id: " + resultSet.getString("user_id")); System.out.println("name: " + resultSet.getString("user_name")); System.out.println("login name: " + resultSet.getString("login_name")); } resultSet.close(); statement.close(); connection.close(); } }