public static Document transferByNeko(InputStream stream, String charset) { if (stream == null) return null; if(StringUtils.isEmpty(charset)){ charset = DEFAULT_CHARSET; } //NEKOHTML的DOMParser會將html標籤轉化成大寫,是否設置下面的配置都沒有意義,解決辦法是須要使用xerces的DOMParser // DOMParser domParser = new DOMParser(); // Document doc = null; // ByteArrayOutputStream byteOs = null; // Writer writer = null; // InputSource inputSource = null; // DocumentType documentType = null; // org.w3c.dom.Document document = null; // DOMReader domReader = null; // try { // domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); // domParser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); // domParser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8"); // // domParser.setFeature("http://xml.org/sax/features/namespaces", false); // domParser.setFeature("http://cyberneko.org/html/features/balance-tags", true); // domParser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-comment-delims", false); // // byteOs = new ByteArrayOutputStream(); // writer = new Writer(byteOs, charset); // XMLDocumentFilter domFilter[] = { // writer // }; // domParser.setProperty("http://cyberneko.org/html/properties/filters", domFilter); // inputSource = new InputSource(new InputStreamReader(stream, Charset.forName(charset))); // domParser.parse(inputSource); // document = domParser.getDocument(); // documentType = document.getDoctype(); // if (documentType != null) // document.removeChild(documentType); // domReader = new DOMReader(); // doc = domReader.read(document); // } catch (SAXNotRecognizedException e) { // e.printStackTrace(); // } catch (SAXNotSupportedException e) { // e.printStackTrace(); // } catch (UnsupportedEncodingException e) { // e.printStackTrace(); // } catch (SAXException e) { // e.printStackTrace(); // } catch (IOException e) { // e.printStackTrace(); // }finally{ // IOUtils.closeQuietly(byteOs); // IOUtils.closeQuietly(stream); // } //採用xerces的DOMParser Document doc = null; DocumentType documentType = null; org.w3c.dom.Document document = null; DOMReader domReader = null; ByteArrayOutputStream byteOs = null; Writer writer = null; InputSource inputSource = null; try { HTMLConfiguration htmlConfiguration = new HTMLConfiguration(); htmlConfiguration.setProperty("http://cyberneko.org/html/properties/names/elems","lower"); org.apache.xerces.parsers.DOMParser parser = new org.apache.xerces.parsers.DOMParser(htmlConfiguration); inputSource = new InputSource(new InputStreamReader(stream, Charset.forName(charset))); parser.parse(inputSource); document = parser.getDocument(); documentType = document.getDoctype(); if (documentType != null) document.removeChild(documentType); domReader = new DOMReader(); doc = domReader.read(document); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return doc; }