關於大XML文件與大節點處理(System.Xml.XmlTextReader)

近期有個任務要求處理大XML文件,其中有個存了Base64的大節點(>90M,路徑已知)。

這種任務只能上XmlReader,即便如此,大節點的處理仍是頭疼了一陣……

最初查MSDN的時候,找到了ReadChars(),可以拿來對付大節點。

方法說明:https://msdn.microsoft.com/zh-cn/library/system.xml.xmltextreader.readchars(v=vs.110).aspx

示例中提到的使用方法是:

// Sample loop from the linked ReadChars documentation: each call asks for a single
// character (count = 1) and the loop stops once ReadChars reports 0 characters read.
while(0 != reader.ReadChars(buffer, 0, 1))
{
    // Do something.
    // Attribute values are not available at this point.
}

這個方法處理格式規範的XML沒有問題,比如這樣的:

<Root>
  <LeafNode>Value</LeafNode>
  <ParentNode>
    <LeafNode>Value</LeafNode>
  </ParentNode>
</Root>

可是(沒人喜歡這個詞,然並卵……),遇到些格式詭異的XML就……

<Root><LeafNode>Value</LeafNode><ParentNode>
<LeafNode>Value</LeafNode></ParentNode>
</Root>

比如這個畫風的,用示例代碼去讀第一個LeafNode的內容,估計會讀出「ValueValue」來……

偏偏輸入的XML就是這風格的……(*sigh*)

單步執行了一陣,發現這種狀況下,XmlTextReader.Name會變成下個節點的名稱(XmlTextReader.LocalName亦如此),可以根據這個判斷是否已經到達節點結尾。

改進版爲:

// Remember the element the reader starts on; per the observation above, LocalName
// switches to the following node's name once the reader has run past this element.
string startName = reader.LocalName;
while (true)
{
    // Stop as soon as the reader no longer reports the starting element.
    if (reader.LocalName != startName)
    {
        break;
    }

    // Pull one character; 0 means there is nothing left to read.
    if (reader.ReadChars(buffer, 0, 1) == 0)
    {
        break;
    }

    // Do something.
    // Attribute values are not available at this point.
}

順便貼上一個轉寫並對特定節點進行處理的代碼:

// Example absolute, '/'-separated element paths; presumably these are the values handed to
// ProcessBigXmlFile as its processNodePathList argument (nodes whose content gets replaced).
List<string> processNodePathList = new List<string> {"/Root/Path/to/Target"};
// Example paths of the oversized nodes; presumably passed as bigNodePathList so their
// content is streamed in chunks instead of being loaded as one string.
List<string> bigNodePathList = new List<string> { "/Root/Path/to/Big/Node" }; 

/// <summary>
/// Copies the XML document at <paramref name="sourcePath"/> to <paramref name="targetPath"/>
/// node by node, without loading the whole document into memory.
/// Elements whose path is listed in <paramref name="processNodePathList"/> are written out
/// empty (their original subtree is skipped, leaving room for replacement content), and
/// elements whose path is listed in <paramref name="bigNodePathList"/> have their content
/// streamed through a small buffer via <c>XmlTextReader.ReadChars</c>.
/// </summary>
/// <param name="sourcePath">Path of the XML file to read.</param>
/// <param name="targetPath">Path of the XML file to create (overwritten if it exists).</param>
/// <param name="processNodePathList">Absolute '/'-separated element paths whose content is replaced.</param>
/// <param name="bigNodePathList">Absolute '/'-separated element paths whose content is streamed in chunks.</param>
/// <remarks>
/// Elements and attributes are re-written using their local names only, so namespace
/// prefixes are not carried over to the target file.
/// </remarks>
private static void ProcessBigXmlFile(string sourcePath, string targetPath, IList<string> processNodePathList, IList<string> bigNodePathList)
{
    // Leaf names are a cheap pre-filter so the full path comparison only runs for
    // elements that could possibly match.
    var processNodeNameList = GetLeafNodeNames(processNodePathList);
    var bigNodeNameList = GetLeafNodeNames(bigNodePathList);

    // Nested using statements guarantee every resource is released even when a later
    // constructor or an earlier Close() throws. Disposal order: writer, target stream,
    // reader, source stream.
    using (var sourceStream = new FileStream(sourcePath, FileMode.Open, FileAccess.Read))
    using (var reader = new XmlTextReader(sourceStream))
    using (var targetStream = new FileStream(targetPath, FileMode.Create, FileAccess.Write))
    using (var writer = new XmlTextWriter(targetStream, Encoding.UTF8))
    {
        var pathStack = new Stack<string>();
        var buffer = new char[1024];
        var readResult = reader.Read();
        while (readResult)
        {
            // 0: advance normally with Read().
            // 1: element was closed by us; Skip() its original subtree.
            // 2: content was streamed; the reader already sits on the following node.
            int skipMode = 0;
            switch (reader.NodeType)
            {
                case XmlNodeType.Element:
                {
                    pathStack.Push(reader.Name);
                    writer.WriteStartElement(reader.LocalName);
                    if (reader.HasAttributes)
                    {
                        while (reader.MoveToNextAttribute())
                        {
                            writer.WriteAttributeString(reader.LocalName,
                                reader.Value);
                        }
                        reader.MoveToElement();
                    }

                    // Capture both values now: after ReadChars/Skip the reader is on a
                    // different node and these properties would describe that node instead.
                    string elementName = reader.LocalName;
                    bool isEmptyElement = reader.IsEmptyElement;

                    if (MatchesConfiguredNodePath(pathStack, elementName, processNodeNameList, processNodePathList))
                    {

                        // Replace node content

                        writer.WriteFullEndElement();
                        skipMode = 1;
                    }
                    else if (!isEmptyElement &&
                             MatchesConfiguredNodePath(pathStack, elementName, bigNodeNameList, bigNodePathList))
                    {
                        // Stream the node's raw content chunk by chunk. The name guard stops
                        // the copy if the reader has already moved on to the next node; once
                        // ReadChars returns 0 the reader is positioned after the end tag.
                        int len;
                        while (reader.LocalName == elementName &&
                               (len = reader.ReadChars(buffer, 0, buffer.Length)) > 0)
                        {
                            writer.WriteRaw(buffer, 0, len);
                        }
                        writer.WriteFullEndElement();
                        skipMode = 2;
                    }
                    else if (isEmptyElement)
                    {
                        // <Node/> produces no EndElement event, so close it right away.
                        pathStack.Pop();
                        writer.WriteEndElement();
                    }
                    break;
                }
                case XmlNodeType.Text:
                {
                    writer.WriteValue(reader.Value);
                    break;
                }
                case XmlNodeType.CDATA:
                {
                    writer.WriteCData(reader.Value);
                    break;
                }
                case XmlNodeType.ProcessingInstruction:
                {
                    writer.WriteProcessingInstruction(reader.Name, reader.Value);
                    break;
                }
                case XmlNodeType.Comment:
                {
                    writer.WriteComment(reader.Value);
                    break;
                }
                case XmlNodeType.DocumentType:
                {
                    // On a DocumentType node the reader exposes the external identifiers as
                    // the pseudo-attributes PUBLIC and SYSTEM, and the internal subset as Value.
                    var internalSubset = reader.Value;
                    writer.WriteDocType(
                        reader.Name,
                        reader.GetAttribute("PUBLIC"),
                        reader.GetAttribute("SYSTEM"),
                        string.IsNullOrEmpty(internalSubset) ? null : internalSubset);
                    break;
                }
                case XmlNodeType.Whitespace:
                {
                    writer.WriteWhitespace(reader.Value);
                    break;
                }
                case XmlNodeType.EndElement:
                {
                    pathStack.Pop();
                    writer.WriteFullEndElement();
                    break;
                }
                case XmlNodeType.XmlDeclaration:
                {
                    writer.WriteStartDocument();
                    break;
                }
            }

            switch (skipMode)
            {
                case 1:
                {
                    // Jump over the original subtree; Skip() leaves the reader on the
                    // node that follows it, so no extra Read() is needed.
                    reader.Skip();
                    pathStack.Pop();
                    readResult = !reader.EOF;
                    break;
                }
                case 2:
                {
                    // ReadChars already consumed the element including its end tag.
                    pathStack.Pop();
                    readResult = !reader.EOF;
                    break;
                }
                default:
                {
                    readResult = reader.Read();
                    break;
                }
            }
        }
    }
}

// Returns the last path segment of every entry (null for a path without segments), index-aligned with the input list.
private static List<string> GetLeafNodeNames(IList<string> nodePathList)
{
    return nodePathList
        .Select(nodePath => nodePath.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries))
        .Select(nodePathParts => nodePathParts.Length > 0 ? nodePathParts[nodePathParts.Length - 1] : null)
        .ToList();
}

// True when any configured path whose leaf name equals elementName also matches the current element path.
private static bool MatchesConfiguredNodePath(Stack<string> pathStack, string elementName, IList<string> nodeNameList, IList<string> nodePathList)
{
    for (int i = 0; i < nodeNameList.Count; i++)
    {
        if (nodeNameList[i] == elementName && CompareNodePath(pathStack, nodePathList[i]))
        {
            return true;
        }
    }
    return false;
}

/// <summary>
/// Checks whether the element path held in <paramref name="currentNodePathStack"/>
/// is exactly the path described by <paramref name="compareNodePathString"/>.
/// </summary>
/// <param name="currentNodePathStack">Element names from the root (pushed first) down to the current element (pushed last).</param>
/// <param name="compareNodePathString">'/'-separated path; empty segments are ignored.</param>
/// <returns><c>true</c> when both paths have the same segments in the same order; otherwise <c>false</c>.</returns>
private static bool CompareNodePath(Stack<string> currentNodePathStack, string compareNodePathString)
{
    // A stack enumerates newest-first, so reverse it to obtain root-to-leaf order.
    string[] actualSegments = currentNodePathStack.Reverse().ToArray();
    string[] expectedSegments = compareNodePathString.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);

    return expectedSegments.Length == actualSegments.Length
        && Enumerable.SequenceEqual(expectedSegments, actualSegments);
}
相關文章
相關標籤/搜索