C#讀取PDF文檔文字內容
通過 iTextSharp 讀取 PDF 文件內容:從下載地址下載後,解壓 itextsharp-dll-core.zip。
以下方法只能讀取英文和數字,文檔中包含的漢字無法正常讀取:
/// <summary>
/// Reads the raw content of every page of a PDF file and decodes the bytes as UTF-8.
/// </summary>
/// <remarks>
/// The bytes returned by <c>PdfReader.GetPageContent</c> are decoded as-is; no text
/// extraction strategy is applied. Per the accompanying article, this only yields
/// readable results for English letters and digits, not Chinese characters.
/// </remarks>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <returns>
/// The decoded content of all pages concatenated in page order, or <c>null</c> when any
/// exception occurs (the failure is appended to <c>log.log</c> under the application base).
/// </returns>
private string ReadPdfContent(string filepath)
{
    try
    {
        PdfReader pdfReader = new PdfReader(filepath);
        try
        {
            int numberOfPages = pdfReader.NumberOfPages;

            // StringBuilder avoids re-copying the accumulated text on every page.
            System.Text.StringBuilder text = new System.Text.StringBuilder();

            // The loop addresses pages from 1 through NumberOfPages inclusive.
            for (int i = 1; i <= numberOfPages; ++i)
            {
                byte[] bufferOfPageContent = pdfReader.GetPageContent(i);
                text.Append(System.Text.Encoding.UTF8.GetString(bufferOfPageContent));
            }

            return text.ToString();
        }
        finally
        {
            // Close the reader even when a page fails to read, so the file is released.
            pdfReader.Close();
        }
    }
    catch (Exception ex)
    {
        // Best-effort logging: record the failing file and the exception, then signal
        // failure to the caller with null instead of propagating.
        using (StreamWriter log = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\log.log"))
        {
            // The original referenced an undeclared `e.FullPath`; the path being
            // processed by this method is `filepath`.
            log.WriteLine("出錯文件:" + filepath + "緣由:" + ex.ToString());
        }

        return null;
    }
}
以下方法能夠讀取中英文:
/// <summary>
/// Extracts the text of every page of a PDF file using iTextSharp's
/// <c>PdfTextExtractor</c> with a <c>SimpleTextExtractionStrategy</c>.
/// </summary>
/// <remarks>
/// Per the accompanying article, this approach reads both Chinese and English text.
/// </remarks>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <returns>
/// The extracted text of all pages concatenated in page order, or <c>null</c> when any
/// exception occurs (the failure is appended to <c>mylog.log</c> under the application base).
/// </returns>
private string OnCreated(string filepath)
{
    try
    {
        PdfReader pdfReader = new PdfReader(filepath);
        try
        {
            int numberOfPages = pdfReader.NumberOfPages;

            // StringBuilder avoids re-copying the accumulated text on every page.
            System.Text.StringBuilder text = new System.Text.StringBuilder();

            // The loop addresses pages from 1 through NumberOfPages inclusive.
            for (int i = 1; i <= numberOfPages; ++i)
            {
                // A new strategy instance is created for each page, as in the original.
                iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                    new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy));
            }

            return text.ToString();
        }
        finally
        {
            // Close the reader even when extraction fails, so the file is released.
            pdfReader.Close();
        }
    }
    catch (Exception ex)
    {
        // Best-effort logging: record the failing file and the exception, then signal
        // failure to the caller with null instead of propagating.
        using (StreamWriter wlog = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\mylog.log"))
        {
            // The original referenced an undeclared `e.FullPath`; the path being
            // processed by this method is `filepath`.
            wlog.WriteLine("出錯文件:" + filepath + "緣由:" + ex.ToString());
        }

        return null;
    }
}