使用Java基於數據流直接抽取word文本

時間 2019-12-19

原文鏈接

以下代碼直接基於數據流進行文本抽取，支持 Word 97 至 Word 2003 版本。之後的版本實際上都是 XML 格式，抽取文本十分簡單，因此此處不再說明。代碼僅供研究學習使用，禁止用於商業用途。

/**
 * Extracts the plain text of a legacy binary Word document by reading fixed
 * offsets directly from the raw file bytes.
 *
 * <p>The parser is deliberately simplistic. It treats the file as 512-byte
 * sectors preceded by a 512-byte header and reads the piece table at
 * hard-coded positions.
 *
 * <p>NOTE(review): the layout assumptions (document stream starting right
 * after the header, table stream stored contiguously, directory entry among
 * the first eight scanned) are inherited from the original code and are not
 * validated here; confirm against real files before relying on them.
 */
public class WordExtractor {

    /** Hex dump built by the most recent {@link #printLogBytes(List)} call. */
    public static StringBuilder logBytes = new StringBuilder();

    /** Multiplier used for every sector-number-to-file-offset conversion. */
    private static final int SECTOR_SIZE = 512;

    /** Distance in bytes between two consecutive directory entries. */
    private static final int DIR_ENTRY_SIZE = 128;

    /** Number of bytes at the start of a directory entry that hold its name. */
    private static final int DIR_ENTRY_NAME_BYTES = 64;

    /** How many directory entries {@link #getOneTable} inspects. */
    private static final int DIR_ENTRIES_SCANNED = 8;

    /** File offset of the integer holding the first directory sector number. */
    private static final int HEADER_FIRST_DIR_SECTOR_OFFSET = 48;

    /** Offset inside a directory entry of the stream's starting sector number. */
    private static final int DIR_ENTRY_START_SECTOR_OFFSET = 116;

    /** Absolute file offset from which the piece-table position (fcClx) is read. */
    private static final int FC_CLX_FILE_OFFSET = 930;

    /** Low 30 bits of a piece descriptor's position field: the offset itself. */
    private static final int FC_MASK = 0x3FFFFFFF;

    /** The eight signature bytes expected at the very start of the file. */
    private static final int[] OLE_SIGNATURE = {208, 207, 17, 224, 161, 177, 26, 225};

    // NOTE(review): the three default paths below are kept exactly as they
    // appear in the original source. They look as if backslash escapes were
    // lost when the code was published - confirm the intended locations, or
    // pass explicit paths to main().
    private static final String DEFAULT_INPUT_PATH = "D: oolsoletest est-old.doc";
    private static final String DEFAULT_OUTPUT_PATH = "E:output.txt";
    private static final String LOG_PATH = "E:ytes.txt";

    /** Separator written before and after the hex dump. */
    private static final String LOG_RULE =
            " ========================================================";

    /**
     * Decodes a slice of {@code ogiBytes} into text.
     *
     * @param ogiBytes source buffer
     * @param start    index of the first byte to decode
     * @param length   number of bytes to decode
     * @param fc       {@code 0} to read two bytes per character (low byte
     *                 first); any other value to widen each single byte to one
     *                 character
     * @return the decoded text; in two-byte mode a trailing unpaired byte is
     *         ignored
     * @throws IndexOutOfBoundsException  if the slice does not lie inside
     *                                    {@code ogiBytes}
     * @throws NegativeArraySizeException if {@code length} is negative
     */
    public static String bytesToString(byte[] ogiBytes, int start, int length, int fc) {
        // Copying first keeps the original fail-fast behaviour: an invalid
        // slice is rejected before any character is produced.
        byte[] bytes = new byte[length];
        System.arraycopy(ogiBytes, start, bytes, 0, length);

        StringBuilder content = new StringBuilder(fc == 0 ? length / 2 : length);
        if (fc == 0) {
            // Two bytes per character, low byte first.
            for (int i = 0; i + 1 < bytes.length; i += 2) {
                int low = bytes[i] & 0xFF;
                int high = bytes[i + 1] & 0xFF;
                content.append((char) ((high << 8) | low));
            }
        } else {
            // One byte per character, widened without any code-page mapping.
            // NOTE(review): the MS-DOC specification remaps a handful of byte
            // values for compressed text; that mapping is not applied here.
            for (byte value : bytes) {
                content.append((char) (value & 0xFF));
            }
        }
        return content.toString();
    }

    /**
     * Decodes a slice of {@code ogiBytes} and appends the text to {@code content}.
     *
     * @param ogiBytes source buffer
     * @param content  builder that receives the decoded text
     * @param start    index of the first byte to decode
     * @param length   number of bytes to decode
     * @param fc       decoding mode, see {@link #bytesToString(byte[], int, int, int)}
     */
    public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc) {
        content.append(bytesToString(ogiBytes, start, length, fc));
    }

    /**
     * Rebuilds {@link #logBytes} as a hex dump of {@code legaled} and writes it
     * to the log file via {@code FileUtil.writeAscFile}.
     *
     * <p>Each byte is written as unpadded hexadecimal followed by a space; an
     * extra space is inserted before every 16th byte.
     *
     * @param legaled bytes to dump
     * @throws Exception declared for the file write
     */
    public static void printLogBytes(List<Byte> legaled) throws Exception {
        logBytes = new StringBuilder();
        logBytes.append(LOG_RULE);
        for (int a = 0; a < legaled.size(); a++) {
            if (a % 16 == 0) {
                logBytes.append(" ");
            }
            logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF)).append(" ");
        }
        logBytes.append(LOG_RULE);
        FileUtil.writeAscFile(LOG_PATH, logBytes.toString());
    }

    /**
     * Searches the directory entries that follow sector {@code dirSect1} for
     * the one whose name contains {@code "1Table"}.
     *
     * @param ogiBytes whole file content
     * @param stream   unused; kept for signature compatibility
     * @param dirSect1 sector number of the first directory sector
     * @return file offset of the matching directory entry, or {@code 0} when
     *         none of the scanned entries matches
     */
    public static int getOneTable(byte[] ogiBytes, Stream stream, int dirSect1) {
        for (int i = 0; i < DIR_ENTRIES_SCANNED; i++) {
            int offsetEntry = (dirSect1 + 1) * SECTOR_SIZE + i * DIR_ENTRY_SIZE;
            // Skip entries that do not fit in the buffer instead of letting the
            // array copy throw on a truncated or corrupt file.
            if (offsetEntry < 0 || offsetEntry > ogiBytes.length - DIR_ENTRY_NAME_BYTES) {
                continue;
            }
            String entryName = bytesToString(ogiBytes, offsetEntry, DIR_ENTRY_NAME_BYTES, 0);
            if (entryName.contains("1Table")) {
                return offsetEntry;
            }
        }
        return 0;
    }

    /**
     * Reads a document, extracts the text of every piece listed in its piece
     * table and writes the result to a text file.
     *
     * @param args optional; {@code args[0]} overrides the input path and
     *             {@code args[1]} overrides the output path. The built-in
     *             defaults are used for whichever is missing.
     * @throws Exception declared for the file read and write
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        byte[] ogiBytes = FileUtil.readBinFile(inputPath);
        System.out.println("Total bytes: " + ogiBytes.length);
        if (!hasOleSignature(ogiBytes)) {
            System.out.println("Not the doc file!");
            return;
        }

        StringBuilder content = new StringBuilder();
        Stream stream = new Stream(ogiBytes);
        // Single-element array used as an in/out position for Stream.getInteger.
        // NOTE(review): the code relies on getInteger advancing offset[0] past
        // the value it read (see the two consecutive reads below) - confirm.
        int[] offset = new int[1];

        offset[0] = HEADER_FIRST_DIR_SECTOR_OFFSET;
        int dirSect1 = stream.getInteger(offset);

        int oneTable = getOneTable(ogiBytes, stream, dirSect1);
        if (oneTable == 0) {
            // Previously the "not found" result was used as if it were a real
            // entry offset, which led to parsing unrelated bytes.
            System.out.println("This version of doc can not be parsed!");
            return;
        }

        offset[0] = oneTable + DIR_ENTRY_START_SECTOR_OFFSET;
        int startSect = stream.getInteger(offset);
        int tableStream = (startSect + 1) * SECTOR_SIZE;

        offset[0] = FC_CLX_FILE_OFFSET;
        int fcClx = stream.getInteger(offset);
        if (fcClx == -1) {
            System.out.println("This version of doc can not be parsed!");
            return;
        }

        int offsetClx = tableStream + fcClx;
        // One byte is skipped before the 4-byte size of the piece table.
        offset[0] = offsetClx + 1;
        int lcb = stream.getInteger(offset);
        // The table holds (n + 1) 4-byte positions followed by n 8-byte descriptors.
        int countPcd = (lcb - 4) / 12;
        int countCp = (lcb - countPcd * 8) / 4;
        int offsetPlcpcd = offsetClx + 5;

        for (int i = 0; i < countPcd; i++) {
            int offsetPcd = offsetPlcpcd + countCp * 4 + i * 8;
            offset[0] = offsetPcd + 2;
            int rawPosition = stream.getInteger(offset);
            // Bit 30 selects the one-byte encoding; the low 30 bits are the
            // offset. Masking avoids the sign extension that
            // "(x << 2) >> 2" produced whenever bit 29 was set.
            int fc = (rawPosition >>> 30) & 1;
            int start = rawPosition & FC_MASK;

            offset[0] = offsetPlcpcd + i * 4;
            int cpPre = stream.getInteger(offset);
            int cpNext = stream.getInteger(offset);
            int length = cpNext - cpPre - 1;
            if (fc == 0) {
                length *= 2;
            } else {
                start = start / 2;
            }
            start += SECTOR_SIZE;

            long end = (long) start + length;
            if (length < 0 || start < 0 || end > ogiBytes.length) {
                // A malformed piece must not abort the whole extraction.
                System.out.println("Skipping piece " + i + ": " + start + ", " + length);
                continue;
            }
            bytesToString(ogiBytes, content, start, length, fc);
            System.out.println(start + ", " + length);
        }

        FileUtil.writeAscFile(outputPath, content.toString(), false);
        System.out.println("Done!");
    }

    /** Returns whether {@code data} begins with the expected eight signature bytes. */
    private static boolean hasOleSignature(byte[] data) {
        if (data.length < OLE_SIGNATURE.length) {
            return false;
        }
        for (int i = 0; i < OLE_SIGNATURE.length; i++) {
            if ((data[i] & 0xFF) != OLE_SIGNATURE[i]) {
                return false;
            }
        }
        return true;
    }
}
參考資料：
http://blog.csdn.net/hu0406/article/details/3157192
http://msdn.microsoft.com/zh-cn/library/gg615596.aspx