java poi sax方式處理大數據量excel文件

系統需要一個導入excel文件的功能。使用poi組件以常規方式讀取大文件時,會內存耗盡(OutOfMemoryError),或者讀取非常慢。
因此寫了一個工具類,使用poi sax方式讀取excel,速度快很多,內存消耗可以接受。

測試結果如下:
.xlsx文件,35M大小,共4個sheet,
只讀取第一個,37434行,54列

總行數:37434
讀取耗時:39秒
打印耗時:17秒

主要代碼如下:
ExcelUtils.class 主入口

package com.xxx.bi.utils.excel;

import java.util.List;
import java.util.Objects;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.Lists;

/**
 * Entry point for reading large .xlsx files through the SAX based reader
 * {@link LargeExcelFileReadUtil}.
 *
 * <p>Both read methods are best-effort: a blank path is rejected with an
 * {@link IllegalArgumentException}, but any failure while reading is reported on
 * {@code System.err} and an empty list is returned instead of propagating.
 */
public class ExcelUtils {

    public ExcelUtils() {
    }

    /**
     * Reads the header row of the sheet handled by
     * {@link LargeExcelFileReadUtil#getRowFromSheetOne(String, Integer)}.
     *
     * @param filePath
     *            path of the Excel file; must not be blank
     * @param headerNum
     *            1-based row number of the header; values below 1 are treated as 1
     * @return the cell values of the header row, or an empty list when reading fails
     * @throws IllegalArgumentException
     *             if {@code filePath} is blank
     */
    public static List<String> getHeader(String filePath, int headerNum) {
        if (StringUtils.isBlank(filePath)) {
            throw new IllegalArgumentException("傳入文件路徑不能爲空");
        }
        // headerNum is a primitive and can never be null, so only the range is checked.
        int effectiveHeaderNum = headerNum < 1 ? 1 : headerNum;
        try {
            return LargeExcelFileReadUtil.getRowFromSheetOne(filePath, effectiveHeaderNum);
        } catch (Exception e) {
            // Best-effort API: report the failure with context and fall through to the empty result.
            System.err.println("Failed to read header row " + effectiveHeaderNum + " of excel [" + filePath + "]");
            e.printStackTrace();
        }
        return Lists.newArrayList();
    }

    /**
     * Reads all rows of the sheet handled by
     * {@link LargeExcelFileReadUtil#getRowsFromSheetOne(String)}.<br/>
     * Every value is returned as a String.
     *
     * @param filePath
     *            path of the Excel file; must not be blank
     * @return one list of cell values per row, or an empty list when reading fails
     * @throws IllegalArgumentException
     *             if {@code filePath} is blank
     */
    public static List<List<String>> getAllData(String filePath) {
        if (StringUtils.isBlank(filePath)) {
            throw new IllegalArgumentException("傳入文件路徑不能爲空");
        }
        try {
            return LargeExcelFileReadUtil.getRowsFromSheetOne(filePath);
        } catch (Exception e) {
            // Best-effort API: report the failure with context and fall through to the empty result.
            System.err.println("Failed to read data of excel [" + filePath + "]");
            e.printStackTrace();
        }
        return Lists.newArrayList();
    }

    /**
     * Manual timing run: reads every row of a local file, prints the rows and then
     * the read/print durations in whole seconds.
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        String filepath = "C:/Users/Administrator/Desktop/05-做業調配表 -快遞.xlsx";

        List<List<String>> result = ExcelUtils.getAllData(filepath);
        long end = System.currentTimeMillis();
        for (List<String> list : result) {
            System.out.println(list.toString());
        }
        long end1 = System.currentTimeMillis();
        try {
            // Short pause before the summary is written to System.err.
            Thread.sleep(1000L);
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of discarding the interruption.
            Thread.currentThread().interrupt();
        }
        System.err.println("總行數:" + result.size());
        System.err.println(("讀取耗時:" + (end - start) / 1000) + "秒");
        System.err.println(("打印耗時:" + (end1 - end) / 1000) + "秒");
    }
}

LargeExcelFileReadUtil.class 真正的工具類

package com.xxx.bi.utils.excel;

import java.io.InputStream;
import java.util.List;
import java.util.Objects;

import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Reads .xlsx sheet data with the POI event (SAX) API so that the sheet is streamed
 * through a content handler instead of being loaded as a workbook model.
 */
public class LargeExcelFileReadUtil {
    /** Logger. */
    public static final Logger LOGGER = Logger.getLogger(LargeExcelFileReadUtil.class);

    /** SAX parser implementation used to parse the sheet XML. */
    private static final String SAX_PARSER_CLASS = "com.sun.org.apache.xerces.internal.parsers.SAXParser";

    /**
     * Relationship id of the sheet part that is read.
     * NOTE(review): this assumes "rId1" refers to the first worksheet of the workbook - confirm
     * for files produced by tools other than Excel.
     */
    private static final String SHEET_ONE_REL_ID = "rId1";

    /**
     * Reads a single row of the sheet.
     *
     * @param filename
     *            path of the .xlsx file
     * @param rowNum
     *            1-based number of the row to read; when {@code null} the handler collects
     *            the values of every row into one list
     * @return the values collected by {@link SingleRowHandler}
     * @throws Exception
     *             if the file cannot be opened or parsed
     */
    public static List<String> getRowFromSheetOne(String filename, Integer rowNum) throws Exception {
        SingleRowHandler singleRowHandler = null;
        // Read-only access: closing a package opened with the default READ_WRITE access may
        // save it back to disk, which a pure reader must never do.
        try (OPCPackage pkg = OPCPackage.open(filename, PackageAccess.READ)) {
            XSSFReader reader = new XSSFReader(pkg);
            SharedStringsTable sst = reader.getSharedStringsTable();
            singleRowHandler = new SingleRowHandler(sst, rowNum);
            parseSheetOne(reader, singleRowHandler);
            return singleRowHandler.getRow();
        } catch (Exception e) {
            if (Objects.nonNull(rowNum) && Objects.nonNull(singleRowHandler)
                    && SingleRowHandler.FINISH_ROW_MESSAGE.equalsIgnoreCase(e.getMessage())) {
                // The handler aborts parsing by throwing once it has passed the requested row;
                // that exception signals success, not failure.
                return singleRowHandler.getRow();
            }
            throw e;
        }
    }

    /**
     * Reads all rows of the sheet.
     *
     * @param filename
     *            path of the .xlsx file
     * @return the rows collected by {@link MultiRowHandler}
     * @throws Exception
     *             if the file cannot be opened or parsed
     */
    public static List<List<String>> getRowsFromSheetOne(String filename) throws Exception {
        // Read-only access, see getRowFromSheetOne.
        try (OPCPackage pkg = OPCPackage.open(filename, PackageAccess.READ)) {
            XSSFReader reader = new XSSFReader(pkg);
            SharedStringsTable sst = reader.getSharedStringsTable();
            MultiRowHandler multiRowHandler = new MultiRowHandler(sst);
            parseSheetOne(reader, multiRowHandler);
            return multiRowHandler.getRows();
        }
    }

    /**
     * Streams the XML of the sheet through {@code handler}. The sheet stream is always closed,
     * and it is closed before the caller closes the package it belongs to.
     */
    private static void parseSheetOne(XSSFReader reader, ContentHandler handler) throws Exception {
        XMLReader parser = XMLReaderFactory.createXMLReader(SAX_PARSER_CLASS);
        parser.setContentHandler(handler);
        try (InputStream sheetStream = reader.getSheet(SHEET_ONE_REL_ID)) {
            parser.parse(new InputSource(sheetStream));
        }
    }

}

SingleRowHandler.class 單行處理類,可以只獲取表頭或表格中的某一行數據

package com.xxx.bi.utils.excel;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that collects the cell values of one row of a sheet.
 *
 * <p>Only cells that carry a {@code <v>} value are collected, in document order; cells
 * without a value are skipped, so the list index is not necessarily the column index.
 * When a target row is given, parsing is aborted by throwing a {@link SAXException} with
 * the message {@link #FINISH_ROW_MESSAGE} as soon as a later row is reached.
 */
public class SingleRowHandler extends DefaultHandler {
    public final static String FINISH_ROW_MESSAGE = "row data process finish";

    /** Matches everything that is not a digit; used to extract the row number from "B23". */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    /** 1-based row to collect; {@code null} means the values of every row are collected. */
    private final Integer rowNum;
    private final SharedStringsTable sst;
    private final StringBuilder lastContents = new StringBuilder();
    private final List<String> row = new ArrayList<>();
    private int curRowNum = 1;
    private boolean nextIsString;

    public List<String> getRow() {
        return row;
    }

    /**
     * @param sst
     *            shared strings table used to resolve cells of type "s"
     * @param rowNum
     *            1-based row to collect, or {@code null} to collect the values of all rows
     */
    public SingleRowHandler(SharedStringsTable sst, Integer rowNum) {
        this.sst = sst;
        this.rowNum = rowNum;
    }

    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        if ("row".equals(name)) {
            updateCurrentRow(attributes.getValue("r"));
        } else if ("c".equals(name)) {
            // Take the row number from every cell reference, not only from column A:
            // a row whose first cell is B7 would otherwise be attributed to the previous row.
            updateCurrentRow(attributes.getValue("r"));
            nextIsString = "s".equals(attributes.getValue("t"));
        }
        // Reset the text buffer for the element that just started.
        lastContents.setLength(0);
        if (Objects.nonNull(rowNum) && curRowNum > rowNum) {
            // The requested row is complete; SAX offers no other way to stop parsing early,
            // so the caller recognises this message as "finished successfully".
            throw new SAXException(FINISH_ROW_MESSAGE);
        }
    }

    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("v".equals(name)) {
            if (Objects.isNull(rowNum) || rowNum == curRowNum) {
                String value = lastContents.toString();
                if (nextIsString) {
                    // For shared-string cells the <v> text is an index into the shared strings table.
                    int idx = Integer.parseInt(value.trim());
                    value = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                }
                row.add(value);
            }
            nextIsString = false;
        } else if ("c".equals(name)) {
            // A shared-string cell without <v> has nothing to resolve; just reset the flag.
            nextIsString = false;
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        lastContents.append(ch, start, length);
    }

    /**
     * Updates the current row number from a reference such as "7" (row) or "B7" (cell);
     * a missing reference or one without digits leaves the current row unchanged.
     */
    private void updateCurrentRow(String reference) {
        if (reference == null) {
            return;
        }
        String digits = NON_DIGITS.matcher(reference).replaceAll("");
        if (!digits.isEmpty()) {
            curRowNum = Integer.parseInt(digits);
        }
    }
}

MultiRowHandler.class 獲取excel全部行的數據

package com.xxx.bi.utils.excel;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * 獲取完整excel數據的handler<br/>
 * 
 * @author Administrator
 *
 */
/**
 * SAX handler that collects every non-blank row of a sheet as a list of Strings.
 *
 * <p>Values are placed at the index given by the column letters of their cell reference
 * (A -> 0, B -> 1, ... AA -> 26); cells without a value are left as "". The size of the
 * first collected row becomes the minimum width of all later rows; a later row that has a
 * value in a column beyond that width is extended rather than rejected. Rows that contain
 * no value at all are skipped, and every row is added to the result exactly once.
 */
public class MultiRowHandler extends DefaultHandler {
    private final SharedStringsTable sst;
    private final List<List<String>> rows = new ArrayList<>();
    private final StringBuilder lastContents = new StringBuilder();
    private int colCnt = 0;// minimum row width, taken from the first collected row
    private int curColIndex = -1;// 0-based column index of the current cell
    private boolean nextIsString;
    private List<String> curRowData = null;
    private boolean curRowIsBlank = true;// true until the current row receives a value

    public List<List<String>> getRows() {
        return rows;
    }

    /**
     * @param sst
     *            shared strings table used to resolve cells of type "s"
     */
    public MultiRowHandler(SharedStringsTable sst) {
        this.sst = sst;
    }

    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        if ("row".equals(name)) {
            curRowData = getBlankRow(colCnt);
            curColIndex = -1;
        } else if ("c".equals(name)) {
            String cellPosition = attributes.getValue("r");
            int referencedCol = cellPosition == null ? -1 : getColIndex(cellPosition);
            // Without a usable reference the cell is taken to follow the previous one.
            curColIndex = referencedCol >= 0 ? referencedCol : curColIndex + 1;
            nextIsString = "s".equals(attributes.getValue("t"));
        }
        // Reset the text buffer for the element that just started.
        lastContents.setLength(0);
    }

    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("v".equals(name)) {
            String value = lastContents.toString();
            if (nextIsString) {
                // For shared-string cells the <v> text is an index into the shared strings table.
                int idx = Integer.parseInt(value.trim());
                value = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                nextIsString = false;
            }
            storeValue(value);
        } else if ("c".equals(name)) {
            // A shared-string cell without <v> has nothing to resolve; just reset the flag.
            nextIsString = false;
        } else if ("row".equals(name)) {
            flushRow();
        }
    }

    @Override
    public void endDocument() throws SAXException {
        // Only relevant when values were seen outside of a closed <row>; otherwise a no-op.
        flushRow();
        super.endDocument();
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        lastContents.append(ch, start, length);
    }

    /**
     * Writes a value into the current row at the current column, padding the row with ""
     * when the column lies beyond its present size.
     */
    private void storeValue(String value) {
        if (curRowData == null) {
            curRowData = getBlankRow(colCnt);
        }
        int target = curColIndex < 0 ? curRowData.size() : curColIndex;
        while (curRowData.size() <= target) {
            curRowData.add("");
        }
        curRowData.set(target, value);
        curRowIsBlank = false;
    }

    /**
     * Adds the current row to the result if it received at least one value, then clears it
     * so that the same row can never be added twice.
     */
    private void flushRow() {
        if (curRowData != null && !curRowIsBlank) {
            rows.add(curRowData);
            if (colCnt == 0) {
                colCnt = curRowData.size();
            }
        }
        curRowData = null;
        curRowIsBlank = true;
    }

    /**
     * Converts the column letters of a cell reference into a 0-based column index.<br/>
     * A1 -> 0; B1 -> 1 ... AA1 -> 26
     *
     * @param cellPosition
     *            cell reference such as A1 or B23
     * @return 0-based column index, or -1 when the reference does not start with a letter
     */
    private static int getColIndex(String cellPosition) {
        int index = -1;
        int length = cellPosition.length();
        for (int i = 0; i < length; i++) {
            char c = Character.toUpperCase(cellPosition.charAt(i));
            if (c < 'A' || c > 'Z') {
                break;// the column letters end here
            }
            index = (index + 1) * 26 + (c - 'A');
        }
        return index;
    }

    /**
     * Creates a row of {@code cnt} empty strings and marks the current row as blank.
     *
     * @param cnt
     *            number of empty cells to create
     * @return a new mutable list holding {@code cnt} empty strings
     */
    private List<String> getBlankRow(int cnt) {
        List<String> result = new ArrayList<>(cnt);
        for (int i = 0; i < cnt; i++) {
            result.add("");
        }
        curRowIsBlank = true;
        return result;
    }

    /** Prints the column index computed for the reference "BC2". */
    public static void main(String[] args) {
        System.out.println(getColIndex("BC2"));
    }
}
相關文章
相關標籤/搜索