假設有以下 HTML 代碼需要解析:
<table border="1"> <tr> <td rowspan="3">1</td> <td>1</td> <td>1</td> <td>1</td> </tr> <tr> <td>2</td> <td>2</td> <td>2</td> </tr> <tr> <td>3</td> <td>3</td> <td>3</td> </tr> </table>
我們需要把每一行解析成一個對象
理想的輸出結果是
1111
1222
1333
再假設有下面的 HTML:
<table border="1"> <tr> <td rowspan="3">1</td> <td>1</td> <td>1</td> <td>1</td> </tr> <tr> <td>2</td> <td rowspan="2">2</td> <td>2</td> </tr> <tr> <td>3</td> <td>3</td> </tr> </table>
我們希望得到的理想結果是
1111
1222
1323
如果每次都能按照這樣的結果輸出,就符合我們的預期。把被 rowspan 合併掉的單元格恢復出來,還原成沒有 rowspan 的 table,解析起來就比較方便了。
下面說解決思路。首先我們用一個二維的 String 數組來表示無 rowspan 的 table 的行和列。
對於上面的例子,我們可以聲明 String[][] table=new String[3][4];
表示三行四列的 table。
下面開始遍歷 HTML 代碼的每一行,行下標用 i 表示;在行的遍歷中再遍歷該行的每一列,同時判斷當前單元格是否有 rowspan 屬性。如果有,就在二維數組中把 i+1 行(以及 rowspan 覆蓋到的後續各行)的當前列(通過當前列的下標可以訪問到)賦值爲 i 行當前單元格的值,這樣 rowspan 就被展開了,最後的二維數組就是我們想要的沒有 rowspan 的 table。注意在給每一個單元格賦值前,需要判斷當前行、當前列的位置是否爲空;如果不爲空,則改向列+1 的單元格賦值,依此往復,直到賦值成功。
下面貼上我的代碼,用到了 jsoup(解析 HTML 的利器)。
package bank.html; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class Table { private List<Tr> trs; private String html; private String[][] table; private boolean isHtml; public Table(String tableHtml){ html=tableHtml; } public void parse(){ Document document=Jsoup.parse(html); Elements trsE = document.select("tr"); Elements firstTds=trsE.get(0).select("td"); table = new String[trsE.size()][firstTds.size()]; for (int row = 0; row < table.length; row++) { Element tr = trsE.get(row); Elements tds = tr.select("td"); int column = 0; for (Element td : tds) { if (td.hasAttr("rowspan")) { int rowspan = Integer.parseInt(td.attr("rowspan")); td.removeAttr("rowspan"); int endRow = rowspan + row - 1; for (int startColumn=column; endRow > row; endRow--) { boolean flag = true; int columnA=startColumn; while (flag) { if (table[endRow][columnA] == null) { table[endRow][columnA] = isHtml?td.toString():td.html(); flag=false; startColumn=columnA; } else { columnA++; } } } } boolean runing = true; do { if (table[row][column] == null) { runing = false; table[row][column] = isHtml?td.toString():td.html(); } else { runing = true; } column++; } while (runing); } } trs=new ArrayList<Tr>(); for (String[] strings : table) { Tr tr=new Tr(); List<Td> tds=new ArrayList<Td>(); for (String string : strings) { Td td=new Td(); td.setContent(string); tds.add(td); } tr.setTds(tds); trs.add(tr); } } public List<Tr> getTrs(){ return trs; } public String getHtml(){ return html; } public int getTrCount(){ return trs.size(); } public void isHtml(boolean v){ isHtml=v; } public static void main(String[] args) throws IOException { Document document=Jsoup.parse(new File("table.html"), "utf-8"); Elements trs=document.select("tr"); System.out.println(trs.size()); Table table=new Table("<table>"+trs.toString()+"</table>"); 
table.parse(); for(Tr trss:table.getTrs()){ System.out.println("行開始"); for(Td td:trss.getTds()){ System.out.println(td.getContent()); } System.out.println("行結束"); } } }
package bank.html;

import java.util.List;

/**
 * A table row: a holder for the list of cells that belong to it.
 */
public class Tr {

    /** Cells of this row; stays null until a list is supplied via the setter. */
    private List<Td> cells;

    /**
     * @return the cell list last passed to {@link #setTds(List)}, or null if none was set
     */
    public List<Td> getTds() {
        return cells;
    }

    /**
     * @param tds the cell list to hold; the reference is stored as given, not copied
     */
    public void setTds(List<Td> tds) {
        cells = tds;
    }
}
package bank.html; public class Td { private String content; public String getContent() { return content; } public void setContent(String content) { this.content = content; } }
可以用上面的 HTML 代碼來測試。可能我描述得不夠清楚,大家把代碼運行一下就會明白的。這是我在 OSC 的處女作,其中可能會存在問題,歡迎各位批評指正。