以下代碼直接基於數據流進行文本抽取,支持 Word 97 至 Word 2003 版本;之後的版本實際上都是 XML,抽取文本很簡單,所以在此不再說明。代碼僅供研究學習使用,禁止用於商業用途。
/**
 * Extracts the text of a legacy binary Word document (Word 97-2003) by walking
 * the raw bytes of the file instead of using a document library.
 *
 * <p>Simplifications this code relies on (all visible in the fixed offsets below):
 * <ul>
 *   <li>sectors are 512 bytes and sector {@code n} lives at {@code (n + 1) * 512};</li>
 *   <li>the main document stream starts in the first sector (absolute offset 512) and
 *       is contiguous, so a stream offset {@code x} is at absolute offset {@code 512 + x};</li>
 *   <li>the table stream is the one named "1Table" and is contiguous as well;</li>
 *   <li>only the first eight directory entries are searched.</li>
 * </ul>
 * Files that violate these assumptions are not handled.
 */
public class WordExtractor {

    /** Holds the text of the most recent hex dump built by {@link #printLogBytes}. */
    public static StringBuilder logBytes = new StringBuilder();

    /** Sector size assumed throughout; also the size of the file header. */
    private static final int SECTOR_SIZE = 512;

    /** Size in bytes of one directory entry. */
    private static final int DIR_ENTRY_SIZE = 128;

    /** Size in bytes of the name field at the start of a directory entry. */
    private static final int DIR_ENTRY_NAME_SIZE = 64;

    /** Signature expected in the first eight bytes of the file. */
    private static final int[] OLE_SIGNATURE = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1};

    /** Input used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "D:\\tools\\oletest\\test-old.doc";

    /** Output used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "E:\\output.txt";

    /** Line break used in the hex dump. */
    private static final String LINE_BREAK = "\r\n";

    /**
     * Decodes a byte range into text.
     *
     * @param ogiBytes source bytes
     * @param start    index of the first byte to decode
     * @param length   number of bytes to decode
     * @param fc       {@code 0} to decode pairs of bytes as little-endian 16-bit characters
     *                 (a trailing odd byte is ignored); any other value to map every byte
     *                 to the character with the same unsigned value
     * @return the decoded text
     * @throws NegativeArraySizeException if {@code length} is negative
     * @throws IndexOutOfBoundsException  if the range does not fit inside {@code ogiBytes}
     */
    public static String bytesToString(byte[] ogiBytes, int start, int length, int fc)
    {
        // Copy first so that an invalid range fails before anything is decoded.
        byte[] bytes = new byte[length];
        System.arraycopy(ogiBytes, start, bytes, 0, length);

        StringBuilder content = new StringBuilder(length);
        if (fc == 0)
        {
            for (int i = 0; i + 1 < bytes.length; i += 2)
            {
                int low = bytes[i] & 0xFF;
                int high = bytes[i + 1] & 0xFF;
                content.append((char) ((high << 8) | low));
            }
        }
        else
        {
            for (int i = 0; i < bytes.length; i++)
            {
                content.append((char) (bytes[i] & 0xFF));
            }
        }
        return content.toString();
    }

    /**
     * Decodes a byte range with {@link #bytesToString(byte[], int, int, int)} and appends
     * the result to {@code content}.
     *
     * @param ogiBytes source bytes
     * @param content  builder that receives the decoded text
     * @param start    index of the first byte to decode
     * @param length   number of bytes to decode
     * @param fc       decoding mode, see {@link #bytesToString(byte[], int, int, int)}
     */
    public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)
    {
        content.append(bytesToString(ogiBytes, start, length, fc));
    }

    /**
     * Builds a hex dump of the given bytes (16 per row, framed by separator lines),
     * stores it in {@link #logBytes} and writes it to {@code E:\bytes.txt}.
     *
     * @param legaled bytes to dump
     * @throws Exception if writing the file fails
     */
    public static void printLogBytes(java.util.List<Byte> legaled) throws Exception
    {
        logBytes = new StringBuilder();
        logBytes.append(LINE_BREAK).append("========================================================");
        for (int a = 0; a < legaled.size(); a++)
        {
            // Start a new row every 16 bytes.
            if (a % 16 == 0)
            {
                logBytes.append(LINE_BREAK);
            }
            logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF)).append(' ');
        }
        logBytes.append(LINE_BREAK).append("========================================================");
        FileUtil.writeAscFile("E:\\bytes.txt", logBytes.toString());
    }

    /**
     * Searches the first eight directory entries, starting at sector {@code dirSect1},
     * for an entry whose name contains "1Table".
     *
     * @param ogiBytes the whole file
     * @param stream   not used; kept so existing callers continue to compile
     * @param dirSect1 sector number of the first directory sector
     * @return absolute offset of the matching directory entry, or {@code 0} if none was found
     */
    public static int getOneTable(byte[] ogiBytes, Stream stream, int dirSect1)
    {
        for (int i = 0; i < 8; i++)
        {
            int offsetEntry = (dirSect1 + 1) * SECTOR_SIZE + i * DIR_ENTRY_SIZE;
            // Skip entries that would lie outside the file instead of failing on them.
            if (offsetEntry < 0 || offsetEntry > ogiBytes.length - DIR_ENTRY_NAME_SIZE)
            {
                continue;
            }
            String entryName = bytesToString(ogiBytes, offsetEntry, DIR_ENTRY_NAME_SIZE, 0);
            if (entryName.contains("1Table"))
            {
                return offsetEntry;
            }
        }
        return 0;
    }

    /**
     * Reads a document, extracts its text piece by piece and writes the result to a file.
     *
     * @param args optional: {@code args[0]} is the input document, {@code args[1]} the
     *             output text file; the built-in default paths are used for missing values
     * @throws Exception if reading or writing a file fails
     */
    public static void main(String[] args) throws Exception
    {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        byte[] ogiBytes = FileUtil.readBinFile(inputPath);
        System.out.println("Total bytes: "+ ogiBytes.length);
        if (!hasOleSignature(ogiBytes))
        {
            System.out.println("Not the doc file!");
            return;
        }

        StringBuilder content = new StringBuilder();
        Stream stream = new Stream(ogiBytes);
        int[] offset = new int[1];

        // Header offset 48: sector number of the first directory sector.
        offset[0] = 48;
        int dirSect1 = stream.getInteger(offset);

        int oneTable = getOneTable(ogiBytes, stream, dirSect1);
        if (oneTable == 0)
        {
            // getOneTable reports "not found" as 0; continuing would parse the file header as a directory entry.
            System.out.println("This version of doc can not be parsed!");
            return;
        }

        // Directory entry offset 116: starting sector of the stream.
        offset[0] = oneTable + 116;
        int startSect = stream.getInteger(offset);
        int tableStream = (startSect + 1) * SECTOR_SIZE;

        // 930 = 512 + 418: the fcClx field of the FIB, given the main stream starts at offset 512.
        offset[0] = 930;
        int fcClx = stream.getInteger(offset);
        if (fcClx == -1)
        {
            System.out.println("This version of doc can not be parsed!");
            return;
        }

        // The piece table (Pcdt) may be preceded by property blocks (Prc) that have to be skipped.
        int offsetPcdt = findPcdt(ogiBytes, tableStream + fcClx);
        if (offsetPcdt < 0)
        {
            System.out.println("This version of doc can not be parsed!");
            return;
        }

        // Pcdt layout: 1 marker byte, 4-byte size (lcb), then (n + 1) 4-byte CPs followed by n 8-byte Pcds.
        offset[0] = offsetPcdt + 1;
        int lcb = stream.getInteger(offset);
        int countPcd = (lcb - 4) / 12;
        int countCp = (lcb - countPcd * 8) / 4;
        int offsetPlcpcd = offsetPcdt + 5;

        for (int i = 0; i < countPcd; i++)
        {
            // The packed offset/flag field sits 2 bytes into each 8-byte Pcd.
            int offsetPcd = offsetPlcpcd + countCp * 4 + i * 8;
            offset[0] = offsetPcd + 2;
            int packed = stream.getInteger(offset);
            // Bit 30 is the "compressed" flag, bits 0-29 the offset; bit 31 is ignored.
            boolean compressed = (packed & 0x40000000) != 0;
            int start = packed & 0x3FFFFFFF;

            // Two consecutive reads: this relies on getInteger advancing offset[0] past the value it read.
            offset[0] = offsetPlcpcd + i * 4;
            int cpPre = stream.getInteger(offset);
            int cpNext = stream.getInteger(offset);

            // A piece covers every character position from cpPre up to, but excluding, cpNext.
            int length = cpNext - cpPre;
            if (compressed)
            {
                // One byte per character; the stored offset is twice the real one.
                start = start / 2;
            }
            else
            {
                // Two bytes per character.
                length *= 2;
            }
            start += SECTOR_SIZE;

            bytesToString(ogiBytes, content, start, length, compressed ? 1 : 0);
            System.out.println(start +", "+ length);
        }

        FileUtil.writeAscFile(outputPath, content.toString(), false);
        System.out.println("Done!");
    }

    /** Returns whether the data starts with the eight signature bytes of the container format. */
    private static boolean hasOleSignature(byte[] data)
    {
        if (data.length < OLE_SIGNATURE.length)
        {
            return false;
        }
        for (int i = 0; i < OLE_SIGNATURE.length; i++)
        {
            if ((data[i] & 0xFF) != OLE_SIGNATURE[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Skips leading blocks marked 0x01 (1 marker byte, 2-byte little-endian size, payload)
     * and returns the offset of the block marked 0x02, or -1 if there is none.
     */
    private static int findPcdt(byte[] data, int offsetClx)
    {
        int pos = offsetClx;
        while (pos >= 0 && pos <= data.length - 3 && data[pos] == 0x01)
        {
            int cbGrpprl = (data[pos + 1] & 0xFF) | ((data[pos + 2] & 0xFF) << 8);
            pos += 3 + cbGrpprl;
        }
        if (pos >= 0 && pos < data.length && data[pos] == 0x02)
        {
            return pos;
        }
        return -1;
    }
}
參考資料:
http://blog.csdn.net/hu0406/article/details/3157192
http://msdn.microsoft.com/zh-cn/library/gg615596.aspx