VBA使用 COM API 使用 OneNote 2013/2016 的圖像識別功能

最近由大量的掃描單據須要摘錄,就但願可以經過VBA程序輔助完成這項工做。通過一番檢索,在能獲取到的主要的OCR產品中,微軟產品的識別率相對較高。但目前經常使用的Office 2013和Office 2016  Microsoft OFFICE 2013之後,Microsoft Office Document Imaging就不在支持了,網上可以下載到繁體中文的ODI,但在Windows 10下沒法安裝。只能在OneNote的圖像識別功能了。html

根據網上的文章作了基於VBA的OCR,在編寫XML的過程當中頗費了一些周折。根據錯誤代碼判斷錯誤的問題點仍是頗有幫助的。node

https://msdn.microsoft.com/zh-cn/magazine/ff796230.aspxapp

http://www.cnblogs.com/BenAndWang/p/5826634.htmlide

https://msdn.microsoft.com/zh-cn/library/jj680117函數

如下爲代碼部分:編碼

Function GetTextFromSinglePicture(inPicPath As String) As String

    '圖片的信息編碼,並輸出到xml文本中

    Dim xmlDoc As New MSXML2.DOMDocument60
    Dim xmlNode As MSXML2.IXMLDOMNode
    Dim xmlEle As MSXML2.IXMLDOMElement

    Dim picBase64 As imageBase64

    '建立臨時的筆記本
    Dim onenoteFullName As String
    With New Scripting.FileSystemObject
        onenoteFullName = .GetSpecialFolder(TemporaryFolder) & "\" & .GetBaseName(.GetTempName) & ".one"
        '判斷函數值是否正常
        If .FileExists(inPicPath) = False Then
            GetTextFromPicture = "! Error File Path !"
            Exit Function
        End If
    End With

    Dim onenoteApp As New OneNote.Application
    If onenoteApp Is Nothing Then
        GetTextFromPicture = "! Error in Openning OneNote !"
        GoTo clear_variable_before_exit
    End If
    
    Dim sectionID As String
    Dim pageID As String
    
    Set xmlEle = CreateNotePageContentElement(2, inPicPath)
    
    Set xmlEle = AddNodeInfo(xmlEle)
    
    '建立臨時的筆記本,獲取sectionID
    onenoteApp.OpenHierarchy onenoteFullName, "", sectionID, cftSection
    
    '建立新的頁面,獲取pageID
    onenoteApp.CreateNewPage sectionID, pageID, npsBlankPageNoTitle

    '獲取頁面的XML格式
    Dim pageXmlText As String
    onenoteApp.GetPageContent pageID, pageXmlText, , xs2013
    
    '導入到XML中進行處理,將圖片形式導入到XML中
    If xmlDoc.LoadXML(pageXmlText) = False Then
        GetTextFromPicture = "! Error in Loading Xml !"
        GoTo clear_variable_before_exit
    End If
    With xmlDoc.getElementsByTagName("one:Page").Item(0)
        .appendChild xmlEle
    End With
    
    '更新Page內容
    onenoteApp.UpdatePageContent xmlDoc.DocumentElement.xml, , xs2013

    'OneNote識別圖片須要時間,如下開始輪詢結果,1秒*10次
    Sleep 1000

    Dim iCNT As Integer
    iCNT = 10
re_getPageContent:
    onenoteApp.GetPageContent pageID, pageXmlText, , xs2013
    xmlDoc.LoadXML pageXmlText
    Set xmlEle = xmlDoc.DocumentElement.getElementsByTagName("one:OCRText").Item(0)
    If xmlEle Is Nothing Then
        If iCNT > 0 Then
            Sleep 1000
            iCNT = iCNT - 1
            GoTo re_getPageContent
        Else
            GetTextFromPicture = "! Waiting OneNote Time Expired !"
        End If
    Else
        GetTextFromPicture = xmlEle.Text
    End If
        
clear_variable_before_exit:
    If Not onenoteApp Is Nothing Then
        If Len(pageID) > 0 Then
            onenoteApp.DeleteHierarchy pageID, , True
        End If
        Set onenoteApp = Nothing
    End If
    Kill onenoteFullName
End Function
OneNote識別的VBA主要函數

其中定義了圖片Base64類型:spa

Type imageBase64
    base64Text As String
    imageWidth As Long
    imageHeight As Long
End Type

引用了API函數,輪詢的時候不會致使程序無響應code

Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)

 

Function CreateNotePageContentElement(contentType As Integer, paraContent As String) As MSXML2.IXMLDOMElement
    Dim xmlEle As MSXML2.IXMLDOMElement
    Dim xmlNode As MSXML2.IXMLDOMElement
                
    Dim ns As String
    ns = "one:"
    
    With New MSXML2.DOMDocument60
        Select Case contentType
            Case 1 '文本
                Set xmlNode = .createElement(ns & "T")
                xmlNode.Text = paraContent
            Case 2 '圖片
                Dim picBase64 As imageBase64
                picBase64 = getBase64(paraContent)
                
                '建立一個圖片XML信息
                
                Set xmlNode = .createElement(ns & "Image")
                xmlNode.setAttribute "format", "jpg"
                xmlNode.setAttribute "originalPageNumber", 0
                
                Set xmlEle = .createElement(ns & "Position")
                xmlEle.setAttribute "x", 0
                xmlEle.setAttribute "y", 0
                xmlEle.setAttribute "z", 0
                xmlNode.appendChild xmlEle
                
                Set xmlEle = .createElement(ns & "Size")
                xmlEle.setAttribute "width", picBase64.imageWidth
                xmlEle.setAttribute "height", picBase64.imageHeight
                xmlNode.appendChild xmlEle
                
                Set xmlEle = .createElement(ns & "Data")
                xmlEle.Text = picBase64.base64Text
                xmlNode.appendChild xmlEle
        End Select
    End With
    Set CreateNotePageContentElement = xmlNode
End Function


Function AddNodeInfo(ContentElement As MSXML2.IXMLDOMElement) As MSXML2.IXMLDOMElement
    Dim xmlEle As MSXML2.IXMLDOMElement
    Dim xmlNode As MSXML2.IXMLDOMElement
    Dim ns As String
    ns = "one:"
    Set xmlNode = ContentElement
    With New MSXML2.DOMDocument60
        Set xmlEle = .createElement(ns & "OE")
        xmlEle.appendChild xmlNode
        Set xmlNode = xmlEle
        
        Set xmlEle = .createElement(ns & "OEChildren")
        xmlEle.appendChild xmlNode
        Set xmlNode = xmlEle
        
        Set xmlEle = .createElement(ns & "Outline")
        xmlEle.appendChild xmlNode
        Set xmlNode = xmlEle
    End With

    Set AddNodeInfo = xmlNode

End Function
XML處理的函數

 

Function getBase64(inBmpFile As String) As imageBase64
    
    Dim xmlEle As MSXML2.IXMLDOMElement
    With New MSXML2.DOMDocument60
        Set xmlEle = .createElement("Base64Data")
    End With
    xmlEle.DataType = "bin.base64"
    
    With New ADODB.Stream
        .Type = adTypeBinary
        .Open
        .LoadFromFile inBmpFile
        xmlEle.nodeTypedValue = .Read()
        .Close
    End With
    
    getBase64.base64Text = xmlEle.Text
    
    With CreateObject("WIA.ImageFile")
        .loadfile inBmpFile
        getBase64.imageHeight = .Height
        getBase64.imageWidth = .Width
    End With

End Function
圖片處理爲Base64編碼的函數

 

造成VBA模塊之後,OCR_Pictures_To_Text函數能夠直接在單元格引用,也能夠在主程序中引用orm

Sub OCR_Pictures_To_Text()
    Dim vFNi As Variant
    Dim sFNi As Variant
    
    Dim sFNo As String
    
    Dim oTS As TextStream
    
    vFNi = Application.GetOpenFilename("*.jpg,*.jpg", , , , True)
    
    If VarType(vFNi) = vbBoolean Then Exit Sub
    
    sFNo = Application.GetSaveAsFilename(, "*.txt,*.txt")
    
    If sFNo = "False" Then Exit Sub
    
    Dim sTmp As String
    
    With New Scripting.FileSystemObject
        Set oTS = .CreateTextFile(sFNo)
    End With
    For Each sFNi In vFNi
        sTmp = GetTextFromPicture(CStr(sFNi))
        While InStr(1, sTmp, " ") > 0
            sTmp = Replace(sTmp, " ", "")
        Wend
        oTS.Write sTmp
    Next
    oTS.Close
    MsgBox "OK"
End Sub
OCR主程序
相關文章
相關標籤/搜索