近期有個任務要求處理大XML文件,其中有個存了Base64的大節點(>90M,路徑已知)。
這種任務只能上XmlReader,即便如此大節點的處理仍是頭疼了一陣……
最初查MSDN的時候,找到了ReadChars(),能夠拿來對付大節點。
方法說明:https://msdn.microsoft.com/zh-cn/library/system.xml.xmltextreader.readchars(v=vs.110).aspx
示例中提到使用方法是:
// MSDN sample: pull the current element's text content one character at a time.
// ReadChars returns 0 once there is no more text content to read.
while (0 != reader.ReadChars(buffer, 0, 1))
{
    // Do something.
    // Attribute values are not available at this point.
}
這個處理規範格式的XML沒有問題,好比這樣的:
<Root> <LeafNode>Value</LeafNode> <ParentNode> <LeafNode>Value</LeafNode> </ParentNode> </Root>
可是(沒人喜歡這個詞,然並卵……),遇到些格式詭異的XML就……
<Root><LeafNode>Value</LeafNode><ParentNode> <LeafNode>Value</LeafNode></ParentNode> </Root>
好比這個畫風的,用示例代碼去讀第一個LeafNode的內容,估計會讀出「ValueValue」來……
恰恰輸入的XML就是這風格的……(*sigh*)
單步執行了一陣,發現這種狀況下,XmlTextReader.Name會變化成下個節點的名稱(XmlTextReader.LocalName亦如此),能夠根據這個判斷是否已經達到節點結尾。
改進版爲:
// Remember which element we started on. Once ReadChars consumes the end tag the
// reader moves on to the following node, so a changed LocalName means the element's
// content is exhausted and we must stop before reading into the next element.
string currentName = reader.LocalName;
while (currentName == reader.LocalName && 0 != reader.ReadChars(buffer, 0, 1))
{
    // Do something.
    // Attribute values are not available at this point.
}
順便貼上一個轉寫並對特定節點進行處理的代碼:
// Example arguments: absolute, '/'-separated element paths starting at the document root.
List<string> processNodePathList = new List<string> { "/Root/Path/to/Target" };
List<string> bigNodePathList = new List<string> { "/Root/Path/to/Big/Node" };

/// <summary>
/// Streams the XML file at <paramref name="sourcePath"/> into <paramref name="targetPath"/>
/// node by node, so the whole document never has to be held in memory.
/// </summary>
/// <remarks>
/// Elements whose path matches an entry of <paramref name="processNodePathList"/> are written
/// as an empty start/end tag pair and their original content is skipped. Elements whose path
/// matches an entry of <paramref name="bigNodePathList"/> have their content copied in
/// fixed-size chunks via <c>XmlTextReader.ReadChars</c>. Everything else is copied as-is.
/// NOTE(review): elements and attributes are written by local name only, so namespace
/// prefixes are not preserved in the output; confirm this is acceptable for the input files.
/// </remarks>
/// <param name="sourcePath">Path of the XML file to read.</param>
/// <param name="targetPath">Path of the XML file to create (overwritten if it exists).</param>
/// <param name="processNodePathList">Paths of elements whose content is replaced with nothing.</param>
/// <param name="bigNodePathList">Paths of elements whose content is copied chunk by chunk.</param>
/// <exception cref="ArgumentNullException">A path list is <c>null</c>.</exception>
private static void ProcessBigXmlFile(string sourcePath, string targetPath, IList<string> processNodePathList, IList<string> bigNodePathList)
{
    if (processNodePathList == null)
    {
        throw new ArgumentNullException("processNodePathList");
    }

    if (bigNodePathList == null)
    {
        throw new ArgumentNullException("bigNodePathList");
    }

    // Split every configured path once instead of on every element visited.
    var processPaths = processNodePathList.Select(path => SplitNodePath(path)).ToList();
    var bigPaths = bigNodePathList.Select(path => SplitNodePath(path)).ToList();

    using (var sourceStream = new FileStream(sourcePath, FileMode.Open, FileAccess.Read))
    using (var reader = new XmlTextReader(sourceStream))
    using (var targetStream = new FileStream(targetPath, FileMode.Create, FileAccess.Write))
    using (var writer = new XmlTextWriter(targetStream, Encoding.UTF8))
    {
        var pathStack = new Stack<string>();
        var buffer = new char[1024];

        var hasNode = reader.Read();
        while (hasNode)
        {
            // Set when a branch has already moved the reader onto the next node, in which
            // case the loop must not call Read() again or that node would be lost.
            var readerAlreadyAdvanced = false;

            switch (reader.NodeType)
            {
                case XmlNodeType.Element:
                {
                    // Capture element facts now: they are unavailable or wrong once the
                    // reader sits on an attribute or has moved past this element.
                    var elementName = reader.LocalName;
                    var isEmptyElement = reader.IsEmptyElement;

                    pathStack.Push(reader.Name);
                    writer.WriteStartElement(elementName);

                    if (reader.HasAttributes)
                    {
                        while (reader.MoveToNextAttribute())
                        {
                            writer.WriteAttributeString(reader.LocalName, reader.Value);
                        }

                        reader.MoveToElement();
                    }

                    if (MatchesAnyPath(pathStack, processPaths))
                    {
                        // Replace node content: emit an empty element and drop the original subtree.
                        writer.WriteFullEndElement();
                        reader.Skip();
                        pathStack.Pop();
                        readerAlreadyAdvanced = true;
                    }
                    else if (MatchesAnyPath(pathStack, bigPaths))
                    {
                        // Copy the content in chunks. After ReadChars consumes the end tag the
                        // reader is positioned on the following node, so stop as soon as the
                        // current node is no longer the element we started on.
                        // NOTE(review): an immediately following sibling with the same name
                        // cannot be told apart by this check; verify against real input.
                        int len;
                        while (reader.NodeType == XmlNodeType.Element
                               && reader.LocalName == elementName
                               && (len = reader.ReadChars(buffer, 0, buffer.Length)) > 0)
                        {
                            writer.WriteRaw(buffer, 0, len);
                        }

                        writer.WriteFullEndElement();
                        pathStack.Pop();
                        readerAlreadyAdvanced = true;
                    }
                    else if (isEmptyElement)
                    {
                        // <Element/> never produces an EndElement node, so close it here.
                        pathStack.Pop();
                        writer.WriteEndElement();
                    }

                    break;
                }

                case XmlNodeType.Text:
                {
                    writer.WriteString(reader.Value);
                    break;
                }

                case XmlNodeType.CDATA:
                {
                    writer.WriteCData(reader.Value);
                    break;
                }

                case XmlNodeType.EntityReference:
                {
                    // Keep unexpanded general entity references instead of dropping them.
                    writer.WriteEntityRef(reader.Name);
                    break;
                }

                case XmlNodeType.ProcessingInstruction:
                {
                    writer.WriteProcessingInstruction(reader.Name, reader.Value);
                    break;
                }

                case XmlNodeType.Comment:
                {
                    writer.WriteComment(reader.Value);
                    break;
                }

                case XmlNodeType.DocumentType:
                {
                    // The reader exposes the external identifiers as pseudo-attributes and
                    // the internal subset as the node value.
                    var internalSubset = reader.Value;
                    writer.WriteDocType(
                        reader.Name,
                        reader.GetAttribute("PUBLIC"),
                        reader.GetAttribute("SYSTEM"),
                        string.IsNullOrEmpty(internalSubset) ? null : internalSubset);
                    break;
                }

                case XmlNodeType.Whitespace:
                case XmlNodeType.SignificantWhitespace:
                {
                    writer.WriteWhitespace(reader.Value);
                    break;
                }

                case XmlNodeType.EndElement:
                {
                    pathStack.Pop();
                    writer.WriteFullEndElement();
                    break;
                }

                case XmlNodeType.XmlDeclaration:
                {
                    writer.WriteStartDocument();
                    break;
                }
            }

            hasNode = readerAlreadyAdvanced ? !reader.EOF : reader.Read();
        }
    }
}

/// <summary>
/// Checks whether the element path held in <paramref name="currentNodePathStack"/>
/// (innermost element on top) equals the '/'-separated path in
/// <paramref name="compareNodePathString"/>.
/// </summary>
/// <param name="currentNodePathStack">Names of the open elements, root pushed first.</param>
/// <param name="compareNodePathString">Path such as <c>/Root/Child/Leaf</c>; empty segments are ignored.</param>
/// <returns><c>true</c> when both paths have the same segments in the same order.</returns>
private static bool CompareNodePath(Stack<string> currentNodePathStack, string compareNodePathString)
{
    return PathEquals(currentNodePathStack, SplitNodePath(compareNodePathString));
}

/// <summary>Splits a '/'-separated element path into its non-empty segments.</summary>
private static string[] SplitNodePath(string nodePath)
{
    return nodePath.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
}

/// <summary>Returns <c>true</c> when the current element path equals any of the pre-split candidate paths.</summary>
private static bool MatchesAnyPath(Stack<string> currentNodePathStack, IList<string[]> candidatePaths)
{
    foreach (var candidate in candidatePaths)
    {
        if (PathEquals(currentNodePathStack, candidate))
        {
            return true;
        }
    }

    return false;
}

/// <summary>Compares the stack (read root-first) with a pre-split path, segment by segment, ordinally.</summary>
private static bool PathEquals(Stack<string> currentNodePathStack, string[] pathSegments)
{
    // Cheap length check first so the stack is only reversed for plausible candidates.
    if (pathSegments.Length != currentNodePathStack.Count)
    {
        return false;
    }

    // A Stack enumerates top-first; reverse it to get root-first order.
    return Enumerable.SequenceEqual(Enumerable.Reverse(currentNodePathStack), pathSegments, StringComparer.Ordinal);
}