C#讀取PDF文檔文字內容

C#讀取PDF文檔文字內容

通過 iTextSharp 讀取 PDF 文件內容。從下載地址下載後,解壓 itextsharp-dll-core.zip。

只能讀取英文和數字,文檔中包含的漢字無法正常讀取:

/// <summary>
/// Reads every page of a PDF file and returns the pages' content bytes decoded as UTF-8 text.
/// </summary>
/// <remarks>
/// The bytes returned by <c>GetPageContent</c> are decoded as-is. According to the notes that
/// accompany this snippet, this approach only yields readable English letters and digits;
/// Chinese characters in the document are not read correctly.
/// </remarks>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <returns>
/// The concatenated decoded content of all pages, or <c>null</c> if an exception occurred
/// (the failure is appended to <c>log.log</c> under the application base directory).
/// </returns>
private string ReadPdfContent(string filepath)
{
    try
    {
        PdfReader pdfReader = new PdfReader(filepath);
        try
        {
            int numberOfPages = pdfReader.NumberOfPages;
            // StringBuilder avoids re-copying the accumulated text on every page.
            System.Text.StringBuilder text = new System.Text.StringBuilder();

            // Pages are addressed from 1 through NumberOfPages inclusive.
            for (int i = 1; i <= numberOfPages; ++i)
            {
                byte[] bufferOfPageContent = pdfReader.GetPageContent(i);
                text.Append(System.Text.Encoding.UTF8.GetString(bufferOfPageContent));
            }

            return text.ToString();
        }
        finally
        {
            // Close the reader even when a page fails to load, so the file is not left open.
            pdfReader.Close();
        }
    }
    catch (Exception ex)
    {
        // Log the file that failed (the method's own argument) together with the full exception.
        // The using block guarantees the log file is flushed and closed even if the write throws.
        using (StreamWriter log = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\log.log"))
        {
            log.WriteLine("出錯文件:" + filepath + "緣由:" + ex.ToString());
        }
        return null;
    }
}

 

能夠讀取中英文:

/// <summary>
/// Extracts the text of every page of a PDF file using iTextSharp's
/// <c>PdfTextExtractor</c> with a <c>SimpleTextExtractionStrategy</c>.
/// </summary>
/// <remarks>
/// According to the notes that accompany this snippet, this approach reads both
/// Chinese and English text.
/// </remarks>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <returns>
/// The concatenated text of all pages, or <c>null</c> if an exception occurred
/// (the failure is appended to <c>mylog.log</c> under the application base directory).
/// </returns>
private string OnCreated(string filepath)
{
    try
    {
        PdfReader pdfReader = new PdfReader(filepath);
        try
        {
            int numberOfPages = pdfReader.NumberOfPages;
            // StringBuilder avoids re-copying the accumulated text on every page.
            System.Text.StringBuilder text = new System.Text.StringBuilder();

            // Pages are addressed from 1 through NumberOfPages inclusive.
            for (int i = 1; i <= numberOfPages; ++i)
            {
                // A new strategy instance is created for each page, as in the original code.
                iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy));
            }

            return text.ToString();
        }
        finally
        {
            // Close the reader even when extraction fails, so the file is not left open.
            pdfReader.Close();
        }
    }
    catch (Exception ex)
    {
        // Log the file that failed (the method's own argument) together with the full exception.
        // The using block guarantees the log file is flushed and closed even if the write throws.
        using (StreamWriter wlog = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\mylog.log"))
        {
            wlog.WriteLine("出錯文件:" + filepath + "緣由:" + ex.ToString());
        }
        return null;
    }
}
相關文章
相關標籤/搜索