doc、pdf、ppt 與 txt 之間的轉換
組件的作用通常是將文件內容讀出為字符格式,並非單純地轉換文件名後綴,因此需要將讀出的內容寫入 txt 文件。
添加 office 引用
在 .net 中對 office 中的 word 及 ppt 進行編程時,確保安裝 office 時已經安裝了 word、ppt 可編程組件(自定義安裝時可查看),或者安裝「Microsoft Office 2003 Primary Interop Assemblies」。
安裝後,在編程頁面添加引用:
添加引用 → COM → Microsoft PowerPoint 11.0 Object Library / Microsoft Word 11.0 Object Library;
還得添加 office 組件的 using 聲明:
using System.IO;
using System.Text;
using Microsoft.Office.Interop.PowerPoint;
using Microsoft.Office.Interop.Word;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
/// <summary>
/// Extracts the text of a PDF file with PDFBox and writes it to a text file
/// encoded as GB2312.
/// </summary>
/// <param name="file">The PDF file to read.</param>
/// <param name="txtfile">The text file to create or overwrite.</param>
public void pdf2txt(FileInfo file, FileInfo txtfile)
{
    string text;

    PDDocument doc = PDDocument.load(file.FullName);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(doc);
    }
    finally
    {
        // The loaded document must be closed explicitly, even when extraction fails.
        doc.close();
    }

    // The output file is only touched once extraction has succeeded; the using
    // block flushes and closes the writer even if Write throws.
    using (StreamWriter swPdfChange = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
    {
        swPdfChange.Write(text);
    }
}
對於doc文件中的表格,讀出的結果是去除掉了網格線,內容按行讀取。
/// <summary>
/// Opens a Word document read-only through Word automation, reads the text of
/// its whole content range and writes it to a text file encoded as GB2312.
/// </summary>
/// <param name="file">The Word document to read.</param>
/// <param name="txtfile">The text file to create or overwrite.</param>
public void word2text(FileInfo file, FileInfo txtfile)
{
    object readOnly = true;
    object missing = System.Reflection.Missing.Value;
    // Explicitly discard changes so Close/Quit never stop on a "save changes?" prompt.
    object doNotSave = Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges;
    object fileName = file.FullName;
    string text;

    // Typed as _Application/_Document so that Quit and Close bind to the methods
    // rather than clashing with the identically named events.
    Microsoft.Office.Interop.Word._Application wordapp = new Microsoft.Office.Interop.Word.ApplicationClass();
    try
    {
        Microsoft.Office.Interop.Word._Document doc = wordapp.Documents.Open(ref fileName,
            ref missing, ref readOnly, ref missing, ref missing, ref missing,
            ref missing, ref missing, ref missing, ref missing, ref missing,
            ref missing, ref missing, ref missing, ref missing, ref missing);
        try
        {
            text = doc.Content.Text;
        }
        finally
        {
            doc.Close(ref doNotSave, ref missing, ref missing);
        }
    }
    finally
    {
        // Always shut Word down, otherwise a failed conversion leaves the
        // automation instance running.
        wordapp.Quit(ref doNotSave, ref missing, ref missing);
    }

    using (StreamWriter swWordChange = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
    {
        swWordChange.Write(text);
    }
}
/// <summary>
/// Opens a PowerPoint presentation read-only and without a window, concatenates
/// the text of every shape on every slide, and writes the result to a text file
/// encoded as GB2312.
/// </summary>
/// <param name="file">The presentation to read.</param>
/// <param name="txtfile">The text file to create or overwrite.</param>
public void ppt2txt(FileInfo file, FileInfo txtfile)
{
    StringBuilder pps = new StringBuilder();

    Microsoft.Office.Interop.PowerPoint.Application pa = new Microsoft.Office.Interop.PowerPoint.ApplicationClass();
    try
    {
        Microsoft.Office.Interop.PowerPoint.Presentation pp = pa.Presentations.Open(file.FullName,
            Microsoft.Office.Core.MsoTriState.msoTrue,
            Microsoft.Office.Core.MsoTriState.msoFalse,
            Microsoft.Office.Core.MsoTriState.msoFalse);
        try
        {
            foreach (Microsoft.Office.Interop.PowerPoint.Slide slide in pp.Slides)
            {
                foreach (Microsoft.Office.Interop.PowerPoint.Shape shape in slide.Shapes)
                {
                    // Shapes such as pictures have no text frame; skip them instead
                    // of dereferencing TextFrame unconditionally.
                    if (shape.HasTextFrame == Microsoft.Office.Core.MsoTriState.msoTrue
                        && shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue)
                    {
                        pps.Append(shape.TextFrame.TextRange.Text);
                    }
                }
            }
        }
        finally
        {
            pp.Close();
        }
    }
    finally
    {
        // NOTE(review): Quit ends the PowerPoint instance this automation object is
        // bound to; confirm no interactive PowerPoint session shares it on the host.
        pa.Quit();
    }

    // The output file is only created or truncated after extraction succeeded.
    using (StreamWriter swPPtChange = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
    {
        swPPtChange.Write(pps.ToString());
    }
}
讀取不同類型的文件
/// <summary>
/// Opens a GB2312 text reader over the content of a file, chosen by extension.
/// A .txt file is read directly; .doc, .pdf and .ppt files are first converted
/// to a temporary text file by word2text, pdf2txt or ppt2txt respectively.
/// </summary>
/// <param name="file">The file to read.</param>
/// <returns>
/// A reader over the file's text, or null when the extension is not one of
/// .txt, .doc, .pdf or .ppt. The caller is responsible for closing the reader.
/// </returns>
public StreamReader text2reader(FileInfo file)
{
    Encoding encoding = Encoding.GetEncoding("gb2312");
    // Invariant lower-casing keeps the extension match independent of the current culture.
    string extension = file.Extension.ToLowerInvariant();

    if (extension == ".txt")
    {
        return new StreamReader(file.FullName, encoding);
    }

    if (extension != ".doc" && extension != ".pdf" && extension != ".ppt")
    {
        return null;
    }

    // A unique file in the system temp directory replaces the hard-coded absolute
    // paths, so the code no longer depends on one machine's layout and concurrent
    // calls do not overwrite each other's intermediate file.
    FileInfo converted = new FileInfo(Path.GetTempFileName());
    try
    {
        switch (extension)
        {
            case ".doc":
                word2text(file, converted);
                break;
            case ".pdf":
                pdf2txt(file, converted);
                break;
            default:
                ppt2txt(file, converted);
                break;
        }

        // DeleteOnClose removes the intermediate file once the caller closes the reader.
        FileStream stream = new FileStream(converted.FullName, FileMode.Open, FileAccess.Read,
            FileShare.Read, 4096, FileOptions.DeleteOnClose);
        return new StreamReader(stream, encoding);
    }
    catch
    {
        // Conversion or opening failed: do not leave the temporary file behind.
        converted.Delete();
        throw;
    }
}