VBA 透過 COM API 調用 OneNote 2013/2016 的圖像識別(OCR)功能

最近有大量的掃描單據需要摘錄,就希望可以透過VBA程序輔助完成這項工作。經過一番檢索,在能獲取到的主要OCR產品中,微軟產品的識別率相對較高。但在目前常用的Office 2013和Office 2016中(自Microsoft Office 2013起),Microsoft Office Document Imaging(MODI)就不再支持了;網上可以下載到繁體中文的MODI,但在Windows 10下無法安裝。因此只能使用OneNote的圖像識別功能了。






' Runs OneNote's image recognition (OCR) on one picture file and returns the text.
'
' Steps:
'   1. Create a temporary .one section and a blank page in it.
'   2. Append the picture (wrapped in Outline/OEChildren/OE) to the page XML.
'   3. Poll the page content until a <one:OCRText> element shows up.
'   4. Delete the page and the temporary section file.
'
' Parameters:
'   inPicPath - full path of the picture file to recognise.
'
' Returns:
'   The text of the first <one:OCRText> element, or one of these markers:
'     "! Error File Path !"              - inPicPath does not exist
'     "! Error in Openning OneNote !"    - OneNote could not be started
'     "! Error in Loading Xml !"         - the page XML could not be parsed
'     "! Waiting OneNote Time Expired !" - no OCR result after the retries
'
' Uses early-bound MSXML2, Scripting and OneNote types, plus the Sleep API
' declaration and the CreateNotePageContentElement / AddNodeInfo helpers.
Function GetTextFromSinglePicture(inPicPath As String) As String

    Dim xmlDoc As New MSXML2.DOMDocument60
    Dim xmlEle As MSXML2.IXMLDOMElement
    ' Not "As New": an auto-instantiated variable can never test as Nothing,
    ' which would make the start-up check below meaningless.
    Dim onenoteApp As OneNote.Application
    Dim onenoteFullName As String
    Dim sectionID As String
    Dim pageID As String
    Dim pageXmlText As String
    Dim iCNT As Integer

    With New Scripting.FileSystemObject
        If .FileExists(inPicPath) = False Then
            GetTextFromSinglePicture = "! Error File Path !"
            Exit Function
        End If
        ' Unique temporary section file: <temp folder>\<random base name>.one
        onenoteFullName = .GetSpecialFolder(TemporaryFolder) & "\" & .GetBaseName(.GetTempName) & ".one"
    End With

    ' Creating the COM object raises if OneNote is unavailable; turn that into
    ' the documented error marker instead of an unhandled runtime error.
    On Error Resume Next
    Set onenoteApp = New OneNote.Application
    On Error GoTo 0
    If onenoteApp Is Nothing Then
        GetTextFromSinglePicture = "! Error in Openning OneNote !"
        GoTo clear_variable_before_exit
    End If

    ' Build <one:Outline><one:OEChildren><one:OE><one:Image>...</one:Image>...
    Set xmlEle = CreateNotePageContentElement(2, inPicPath)
    Set xmlEle = AddNodeInfo(xmlEle)

    ' cftSection makes OneNote create the section file if it does not exist.
    onenoteApp.OpenHierarchy onenoteFullName, "", sectionID, cftSection
    onenoteApp.CreateNewPage sectionID, pageID, npsBlankPageNoTitle

    onenoteApp.GetPageContent pageID, pageXmlText, , xs2013
    If xmlDoc.LoadXML(pageXmlText) = False Then
        GetTextFromSinglePicture = "! Error in Loading Xml !"
        GoTo clear_variable_before_exit
    End If
    With xmlDoc.getElementsByTagName("one:Page").Item(0)
        .appendChild xmlEle
    End With
    onenoteApp.UpdatePageContent xmlDoc.DocumentElement.xml, , xs2013

    ' Give OneNote a head start, then poll: one read plus up to iCNT retries,
    ' one second apart.
    Sleep 1000
    iCNT = 10

re_getPageContent:
    Set xmlEle = Nothing
    onenoteApp.GetPageContent pageID, pageXmlText, , xs2013
    If xmlDoc.LoadXML(pageXmlText) Then
        Set xmlEle = xmlDoc.DocumentElement.getElementsByTagName("one:OCRText").Item(0)
    End If
    If xmlEle Is Nothing Then
        If iCNT > 0 Then
            Sleep 1000
            iCNT = iCNT - 1
            GoTo re_getPageContent
        Else
            GetTextFromSinglePicture = "! Waiting OneNote Time Expired !"
        End If
    Else
        GetTextFromSinglePicture = xmlEle.Text
    End If

clear_variable_before_exit:
    If Not onenoteApp Is Nothing Then
        If Len(pageID) > 0 Then
            onenoteApp.DeleteHierarchy pageID, , True
        End If
        Set onenoteApp = Nothing
    End If

    ' Best-effort removal of the temporary section file: it may not exist yet
    ' (early error paths) or may be locked, and neither should discard the
    ' result computed above.
    On Error Resume Next
    Kill onenoteFullName
    On Error GoTo 0
End Function


' Carrier for an image file loaded by getBase64: the Base64-encoded file
' contents together with the image dimensions.
' NOTE(review): VBA expects Type definitions in a module's declarations section
' (before the first procedure) - confirm placement when pasting this into a module.
Type imageBase64
    base64Text As String    ' Base64 text of the raw file bytes
    imageWidth As Long      ' width as reported by WIA.ImageFile.Width
    imageHeight As Long     ' height as reported by WIA.ImageFile.Height
End Type


' Win32 Sleep from kernel32: pauses the calling thread for dwMilliseconds
' milliseconds. Used while waiting for OneNote to produce the OCR result.
' NOTE(review): Declare statements must sit in the module's declarations section
' (before the first procedure) - confirm placement when pasting this into a module.
Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)


' Builds the OneNote page-content element for a piece of content.
'
' Parameters:
'   contentType - 1 for text, 2 for a picture.
'   paraContent - the text itself (type 1) or the picture file path (type 2).
'
' Returns:
'   A <one:T> element holding the text, or a <one:Image> element with
'   Position, Size and Data children. Any other contentType yields Nothing.
Function CreateNotePageContentElement(contentType As Integer, paraContent As String) As MSXML2.IXMLDOMElement
    Const NS_PREFIX As String = "one:"

    Dim factory As MSXML2.DOMDocument60
    Dim contentNode As MSXML2.IXMLDOMElement
    Dim childNode As MSXML2.IXMLDOMElement
    Dim picture As imageBase64

    ' The document is only used as an element factory.
    Set factory = New MSXML2.DOMDocument60

    If contentType = 1 Then
        ' Plain text
        Set contentNode = factory.createElement(NS_PREFIX & "T")
        contentNode.Text = paraContent

    ElseIf contentType = 2 Then
        ' Picture: load the file first, then describe it
        picture = getBase64(paraContent)

        Set contentNode = factory.createElement(NS_PREFIX & "Image")
        contentNode.setAttribute "format", "jpg"
        contentNode.setAttribute "originalPageNumber", 0

        ' Placed at the page origin
        Set childNode = factory.createElement(NS_PREFIX & "Position")
        childNode.setAttribute "x", 0
        childNode.setAttribute "y", 0
        childNode.setAttribute "z", 0
        contentNode.appendChild childNode

        Set childNode = factory.createElement(NS_PREFIX & "Size")
        childNode.setAttribute "width", picture.imageWidth
        childNode.setAttribute "height", picture.imageHeight
        contentNode.appendChild childNode

        Set childNode = factory.createElement(NS_PREFIX & "Data")
        childNode.Text = picture.base64Text
        contentNode.appendChild childNode
    End If

    Set CreateNotePageContentElement = contentNode
End Function

' Wraps a content element in the nesting a OneNote page expects:
'   <one:Outline><one:OEChildren><one:OE>content</one:OE></one:OEChildren></one:Outline>
'
' Parameters:
'   ContentElement - the element to wrap (e.g. a <one:T> or <one:Image>).
'
' Returns:
'   The outermost <one:Outline> element.
Function AddNodeInfo(ContentElement As MSXML2.IXMLDOMElement) As MSXML2.IXMLDOMElement
    Const NS_PREFIX As String = "one:"

    Dim factory As MSXML2.DOMDocument60
    Dim innerNode As MSXML2.IXMLDOMElement
    Dim wrapperNode As MSXML2.IXMLDOMElement
    Dim wrapperTag As Variant

    Set factory = New MSXML2.DOMDocument60
    Set innerNode = ContentElement

    ' Wrap from the inside out; each pass adds one enclosing layer.
    For Each wrapperTag In Array("OE", "OEChildren", "Outline")
        Set wrapperNode = factory.createElement(NS_PREFIX & wrapperTag)
        wrapperNode.appendChild innerNode
        Set innerNode = wrapperNode
    Next wrapperTag

    Set AddNodeInfo = innerNode
End Function


' Loads an image file and returns its Base64 text together with its dimensions.
'
' Parameters:
'   inBmpFile - full path of the image file.
'
' Returns:
'   An imageBase64 value: base64Text holds the Base64 encoding of the raw file
'   bytes; imageWidth / imageHeight are the values reported by WIA.ImageFile.
'
' Errors raised by ADODB.Stream or WIA (e.g. unreadable file) propagate to the
' caller.
Function getBase64(inBmpFile As String) As imageBase64
    Dim xmlEle As MSXML2.IXMLDOMElement

    ' A bin.base64-typed element converts a byte array to Base64 text for us.
    With New MSXML2.DOMDocument60
        Set xmlEle = .createElement("Base64Data")
    End With
    xmlEle.DataType = "bin.base64"

    With New ADODB.Stream
        .Type = adTypeBinary
        ' The stream must be open before LoadFromFile; otherwise ADO raises
        ' "Operation is not allowed when the object is closed".
        .Open
        .LoadFromFile inBmpFile
        xmlEle.nodeTypedValue = .Read()
        .Close
    End With
    getBase64.base64Text = xmlEle.Text

    ' Late-bound WIA is used only to read the image dimensions.
    With CreateObject("WIA.ImageFile")
        .loadfile inBmpFile
        getBase64.imageHeight = .Height
        getBase64.imageWidth = .Width
    End With

End Function



' Lets the user pick one or more .jpg files and a target .txt file, runs each
' picture through GetTextFromSinglePicture, strips spaces from the recognised
' text and writes the results, in selection order, to the text file.
' Cancelling either dialog exits without doing anything.
Sub OCR_Pictures_To_Text()
    Dim vFNi As Variant     ' array of selected picture paths, or False if cancelled
    Dim vFNo As Variant     ' output file path, or False if cancelled
    Dim sFNi As Variant
    Dim oTS As TextStream
    Dim sTmp As String

    vFNi = Application.GetOpenFilename("*.jpg,*.jpg", , , , True)
    If VarType(vFNi) = vbBoolean Then Exit Sub

    ' Check the Variant's type instead of comparing against the string "False",
    ' matching the check used for the open dialog above.
    vFNo = Application.GetSaveAsFilename(, "*.txt,*.txt")
    If VarType(vFNo) = vbBoolean Then Exit Sub

    ' NOTE(review): the file is created with the FileSystemObject defaults;
    ' confirm that encoding suits the recognised text.
    With New Scripting.FileSystemObject
        Set oTS = .CreateTextFile(CStr(vFNo))
    End With

    For Each sFNi In vFNi
        sTmp = GetTextFromSinglePicture(CStr(sFNi))
        ' Replace already removes every occurrence, so one call is enough.
        sTmp = Replace(sTmp, " ", "")
        oTS.Write sTmp
    Next sFNi

    ' Flush and release the output file.
    oTS.Close
    MsgBox "OK"
End Sub