Java 透過 XSSF 和 SAX(事件 API)方式讀取 Excel 文件中的大量數據、避免內存溢出的方法

使用普通的 POI 或其他技術方式處理大型 Excel 文件時,在 IDEA 中運行容易出現內存溢出問題。本方法可以輕鬆處理 50 萬條數據,處理時間大約 300 多秒,並可對手機號碼或其他格式的數據進行讀取處理。如果需要把結果寫出到文件,可以另行編寫一個寫出方法;本方法只實現讀取 Excel 文件!

1、添加所需依賴:

<dependency>
    <groupId>com.monitorjbl</groupId>
    <artifactId>xlsx-streamer</artifactId>
    <version>1.2.0</version>
</dependency>
<dependency>
    <groupId>org.eclipse.birt.runtime.3_7_1</groupId>
    <artifactId>org.apache.xerces</artifactId>
    <version>2.9.0</version>
</dependency>
<!-- 原文此處還有一個未寫完的 <dependency> 聲明(內容已被截斷) -->

2、新建工具類
package com.cy.exceldata.util;

import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.junit.jupiter.api.Test;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**apache

  • XSSF and SAX (Event API)
  • 能避免內存溢出問題的大數據Excel文件數據處理
    */

public abstract class BigDataParseExcel extends DefaultHandler {api

private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private int sheetIndex = -1;
private List<String> rowlist = new ArrayList<String>();
private int curRow = 0; //當前行
private int curCol = 0; //當前列索引
private int preCol = 0; //上一列列索引
private int titleRow = 0; //標題行,通常狀況下爲0
private int rowsize = 0; //列數

//excel記錄行操做方法,以sheet索引,行索引和行元素列表爲參數,對sheet的一行元素進行操做,元素爲String類型
public abstract void optRows(int sheetIndex, int curRow, List<String> rowlist) throws SQLException;
//只遍歷一個sheet,其中sheetId爲要遍歷的sheet索引,從1開始,1-3

/**
 * @param filename
 * @param sheetId  sheetId爲要遍歷的sheet索引,從1開始,1-3
 * @throws Exception
 */
public void processOneSheet(String filename, int sheetId) throws Exception {
    OPCPackage pkg = OPCPackage.open(filename);
    XSSFReader r = new XSSFReader(pkg);
    SharedStringsTable sst = r.getSharedStringsTable();
    XMLReader parser = fetchSheetParser(sst);
    // rId2 found by processing the Workbook
    // 根據 rId# 或 rSheet# 查找sheet
    InputStream sheet2 = r.getSheet("rId" + sheetId);
    sheetIndex++;
    InputSource sheetSource = new InputSource(sheet2);
    parser.parse(sheetSource);
    sheet2.close();
}

/**
 * 遍歷 excel 文件
 */
public void process(String filename) throws Exception {
    OPCPackage pkg = OPCPackage.open(filename);
    XSSFReader r = new XSSFReader(pkg);
    SharedStringsTable sst = r.getSharedStringsTable();
    XMLReader parser = fetchSheetParser(sst);
    Iterator<InputStream> sheets = r.getSheetsData();
    while (sheets.hasNext()) {
        curRow = 0;
        sheetIndex++;
        InputStream sheet = sheets.next();
        InputSource sheetSource = new InputSource(sheet);
        parser.parse(sheetSource);
        sheet.close();
    }
}

public XMLReader fetchSheetParser(SharedStringsTable sst)
        throws SAXException {
    XMLReader parser = XMLReaderFactory.createXMLReader();
    //.createXMLReader("org.apache.xerces.parsers.SAXParser");
    this.sst = sst;
    parser.setContentHandler(this);
    return parser;
}

public void startElement(String uri, String localName, String name,
                         Attributes attributes) throws SAXException {
    // c => 單元格
    if (name.equals("c")) {
        // 若是下一個元素是 SST 的索引,則將nextIsString標記爲true
        String cellType = attributes.getValue("t");
        String rowStr = attributes.getValue("r");
        curCol = this.getRowIndex(rowStr);
        if (cellType != null && cellType.equals("s")) {
            nextIsString = true;
        } else {
            nextIsString = false;
        }
    }
    // 置空
    lastContents = "";
}

public void endElement(String uri, String localName, String name)
        throws SAXException {
    // 根據SST的索引值的到單元格的真正要存儲的字符串
    // 這時characters()方法可能會被調用屢次
    if (nextIsString) {
        try {
            int idx = Integer.parseInt(lastContents);
            lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
                    .toString();
        } catch (Exception e) {
        }
    }
    // v => 單元格的值,若是單元格是字符串則v標籤的值爲該字符串在SST中的索引
    // 將單元格內容加入rowlist中,在這以前先去掉字符串先後的空白符
    if (name.equals("v")) {
        String value = lastContents.trim();
        value = value.equals("") ? " " : value;
        int cols = curCol - preCol;
        if (cols > 1) {
            for (int i = 0; i < cols - 1; i++) {
                rowlist.add(preCol, "");
            }
        }
        preCol = curCol;
        rowlist.add(curCol - 1, value);
    } else {
        //若是標籤名稱爲 row ,這說明已到行尾,調用 optRows() 方法
        if (name.equals("row")) {
            int tmpCols = rowlist.size();
            if (curRow > this.titleRow && tmpCols < this.rowsize) {
                for (int i = 0; i < this.rowsize - tmpCols; i++) {
                    rowlist.add(rowlist.size(), "");
                }
            }
            try {
                optRows(sheetIndex, curRow, rowlist);
            } catch (SQLException e) {
                e.printStackTrace();
            }
            if (curRow == this.titleRow) {
                this.rowsize = rowlist.size();
            }
            rowlist.clear();
            curRow++;
            curCol = 0;
            preCol = 0;
        }
    }
}

public void characters(char[] ch, int start, int length)
        throws SAXException {
//獲得單元格內容的值
    lastContents += new String(ch, start, length);
}

//獲得列索引,每一列c元素的r屬性構成爲字母加數字的形式,字母組合爲列索引,數字組合爲行索引,
//如AB45,表示爲第(A-A+1)*26+(B-A+1)*26列,45行
public int getRowIndex(String rowStr) {
    rowStr = rowStr.replaceAll("[^A-Z]", "");
    byte[] rowAbc = rowStr.getBytes();
    int len = rowAbc.length;
    float num = 0;
    for (int i = 0; i < len; i++) {
        num += (rowAbc[i] - 'A' + 1) * Math.pow(26, len - i - 1);
    }
    return (int) num;
}

public int getTitleRow() {
    return titleRow;
}

public void setTitleRow(int titleRow) {
    this.titleRow = titleRow;
}

}eclipse

3、測試(這裏實現的是提取列表中符合條件的手機號信息,可根據需求改變)
public class POITest {xss

@SneakyThrows
public static void main(String[] args) {

    long start = System.currentTimeMillis();
    BigDataParseExcel xlx = new BigDataParseExcel(){

        @Override
        public void optRows(int sheetIndex, int curRow, List<String> rowlist) throws SQLException {
            for (int i = 0; i < rowlist.size(); i++) {
                if(isMobile( rowlist.get(i))){
                    System.out.println(rowlist.get(i));
                }

            }

        }

    };

    xlx.process("C:/Users/Administrator/Desktop/data/test/DATA552410.xlsx");
    long end = System.currentTimeMillis();
    System.out.println((end-start)/1000);

}

public static boolean isMobile(final String str) {
    Pattern p = null;
    Matcher m = null;
    boolean b = false;
    p = Pattern.compile("^[1][3,4,5,7,8][0-9]{9}$"); // 驗證手機號
    m = p.matcher(str);
    b = m.matches();
    return b;
}

}ide

相關文章
相關標籤/搜索