在JAVA下實現數據庫對word文件的讀取與字段的提取

在JAVA下實現數據庫對word文件的讀取與字段的提取

  實現excel中的數據導入數據庫,在java或是C#下是不難實現的,即便想實現在txt中的數據導入也不是什麼難事,可是,最近接了個任務,學校要建一個英文版的教學平臺,而後各個學院的課程資料與簡介什麼的都是英文的,學校有20個學院,每一個學院多的有兩百來個科目,少的也有百八十種,可他偏偏就是個word,並且,做的格式很不規範。這可愁着我了,我首先想到的是POI,因而google了一下,原來真的很容易實現,這個後面的代碼能夠發上去,能夠實現03版,和07版的。差異主要是jar包的問題。03的要3個jar包,07的需要7個jar包。

/**
* POI 讀取 word 2003 和 word 2007 中文字內容的測試類<br />
* @createDate 2009-07-25
* @author Carl He
*/
    public class Test {
        public static void main(String[] args) {
            try {
            ////word 2003: 圖片不會被讀取
            InputStream is = new FileInputStream(new File("files\\2003.doc"));
            WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream 
            String text2003 = ex.getText();
    
            System.out.println(text2003);
            //對字符串進行分解
    
            //word 2007 圖片不會被讀取, 表格中的數據會被放在字符串的最後
            OPCPackage opcPackage = POIXMLDocument.openPackage("files\\2007.docx");
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            String text2007 = extractor.getText();
            System.out.println(text2007);

            } catch (Exception e) {
                e.printStackTrace();
            }
      }
}

  而後,重要的問題是,如何從word的字段中抓取文件才是關鍵,由於他們提供的word文件並非excel,並不能直接導入,我還是果斷上一個word文件吧,這樣好理解:

Course Description of Biochemistry

Course Name: Biochemistry Nature of CourseCompulsory course

Course Code: B1700025 Total Credits: 5.0

Total Credit Hours80 Lecture Hours80

Experimental Hours: 0 Oriented Majors: Bioscience, Biotechnology

Prerequisite Courses:

Penner: Validator(s):

Briefing of Course Content:

Biochemistry is a science exploring the chemical compositions and chemical reactions during life activities of living organisms. It is an important compulsive fundamental course for undergraduates majoring in bioscience and biotechnology. The main content of this course includes 1. The structure, function and the relationship between the structure and function of biological macromolecule such as protein and nucleic acid; 2. The metabolisms and regulation of biological macromolecules including carbohydrate, lipid, protein, nucleic acid etc.; 3. The transfer and expression of genetic information.

import java.io.File;
import java.util.ArrayList;

    public class Directory {
        private ArrayList nameList = new ArrayList(); 
        private static String dirName = "d:\\EclipseWorkSpace\\Test\\files"; 
    
        public void getSubFile(String FileName) { 
            File parentF = new File(FileName); 
            if (!parentF.exists()) { 
            System.out.println("文件或目錄不存在"); 
            return; 
        } 
        if (parentF.isFile()) { 
            nameList.add(parentF.getAbsoluteFile()); 
            return; 
        } 
        String[] subFiles = parentF.list(); 
            for (int i = 0; i < subFiles.length; i++) { 
                getSubFile(dirName + "/" + subFiles[i]); 
            } 
        } 
    
        public ArrayList getNameList() { 
            return nameList; 
        }
        public static void main(String[] args){
            Directory d=new Directory();
            d.getSubFile("d:\\EclipseWorkSpace\\Test\\files");
        }
    }


import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;

import org.apache.poi.hwpf.extractor.WordExtractor;

public class FileTool {

    /**
     * 列出目錄
     * @throws Exception
     */
    public static void listDirs()throws Exception{
        
        File f=new File("d:\\EclipseWorkSpace\\Test\\files\\英文課程簡介");
        String[] dirs=f.list();
        for(int i=0;i<dirs.length;i++){
        //    System.out.println(dirs[i]);
 listFiles(getDeptID(dirs[i].trim()),dirs[i]);
        }
    }
    public static void main(String[] args)throws Exception{
        listDirs();
        
    }
    /**
     * 列出文件名
     * @throws Exception
     */
public static void listFiles(int deptID,String dirName)throws Exception{
        
        File f=new File("d:\\EclipseWorkSpace\\Test\\files\\英文課程簡介\\"+dirName);
        String[] files=f.list();
        for(int i=0;i<files.length;i++){
            //System.out.println("d:\\EclipseWorkSpace\\Test\\files\\英文課程簡介\\"+dirName+"\\"+files[i]);
            //列出文件名
 String docName=files[i];
            if(docName.length()>30)
                docName=docName.substring(30, docName.length()-4);
            //System.out.println(docName+"開始寫入數據庫");
            //從word中讀取信息
 
            boolean res=insertCoursesIntoDB(deptID,docName,getDetailFromWord("d:\\EclipseWorkSpace\\Test\\files\\英文課程簡介\\"+dirName+"\\"+files[i]));
            if(res){
            File file=new File("d:\\EclipseWorkSpace\\Test\\files\\英文課程簡介\\"+dirName+"\\"+files[i]);
            file.delete();}else{
                
                System.out.println("導入錯誤的文件爲:"+"d:\\EclipseWorkSpace\\Test\\files\\英文課程簡介\\"+dirName+"\\"+files[i]);
                System.out.println("----------------以上-------------------");
            }
        }
    }
    //將提出的數據寫入數據庫
public static boolean insertCoursesIntoDB(int deptID,String courseName,String courseDetail)throws Exception{
    //顯示內容
 try{
        //System.out.println(courseDetail);
        //分析字符串
 courseDetail=courseDetail.replace(":", ":");
        String courseCode=courseDetail.substring(courseDetail.indexOf("Course Code:"), courseDetail.indexOf("Total Credits:")).trim();
        if(courseCode.length()>12)
        courseCode=courseCode.substring(12, courseCode.length()).trim();
        
        String nature=courseDetail.substring(courseDetail.indexOf("Nature of Course:"), courseDetail.indexOf("Course Code:")).trim();
        if(nature.length()>17)
        nature=nature.substring(17, nature.length()).trim();
        
        String totalCredit=courseDetail.substring(courseDetail.indexOf("Total Credits:"), courseDetail.indexOf("Total Credit Hours :")).trim();
        if(totalCredit.length()>14)
        totalCredit=totalCredit.substring(14, totalCredit.length()).trim();

        String totalCreditHours=courseDetail.substring(courseDetail.indexOf("Total Credit Hours :"), courseDetail.indexOf("Lecture Hours:")).trim();
        if(totalCreditHours.length()>19)
        totalCreditHours=totalCreditHours.substring(19, totalCreditHours.length()).trim();
        
        String lectureHours=courseDetail.substring(courseDetail.indexOf("Lecture Hours:"), courseDetail.indexOf("Experiment Hours:")).trim();
        if(lectureHours.length()>14)
        lectureHours=lectureHours.substring(14, lectureHours.length()).trim();
        
        String experimentalHours=courseDetail.substring(courseDetail.indexOf("Experiment Hours:"), courseDetail.indexOf("Oriented Majors:")).trim();
        if(experimentalHours.length()>19)
        experimentalHours=experimentalHours.substring(19, experimentalHours.length()).trim();
        
        String orientedMajors=courseDetail.substring(courseDetail.indexOf("Oriented Majors:"), courseDetail.indexOf("Prerequisite Course:")).trim();
        if(orientedMajors.length()>16)
        orientedMajors=orientedMajors.substring(16, orientedMajors.length()).trim();
        
        String prerequisiteCourse=courseDetail.substring(courseDetail.indexOf("Prerequisite Course:"), courseDetail.indexOf("Penner:")).trim();
        if(prerequisiteCourse.length()>23)
        prerequisiteCourse=prerequisiteCourse.substring(23, prerequisiteCourse.length()).trim();
        
        String penner=courseDetail.substring(courseDetail.indexOf("Penner:"), courseDetail.indexOf("Validators :")).trim();
        if(penner.length()>7)
        penner=penner.substring(7, penner.length()).trim();
        
        String validator=courseDetail.substring(courseDetail.indexOf("Validators :"), courseDetail.indexOf("Briefing of Course Content:")).trim();
        if(validator.length()>13)
        validator=validator.substring(13, validator.length()).trim();
        
        String content=courseDetail.substring(courseDetail.indexOf("Briefing of Course Content:"), courseDetail.length()).trim();
        if(content.length()>27)
        content=content.substring(27, content.length()).trim();
        
        
        
        //寫入數據庫
        
        // 從mysql數據庫中讀取正確信息
 Connection conn = null;
                Statement stmt = null;
                ResultSet rs = null;
                try {
                    conn = getConn();
                    stmt = conn.createStatement();
                    String sql="insert into course(deptId,name,courseCode,nature,totalCredits,totalCreditHours,lectureHours,experimentalHours,orientedMajors,prerequisiteCourse,penner,validator,content) values(" +
                            deptID+",'"+courseName+"','"+courseCode+"','"+nature+"','"+totalCredit+"','"+totalCreditHours+"','"+lectureHours+"','"+experimentalHours+"','"+orientedMajors+"','"+prerequisiteCourse+"','"+penner+"','"+validator+"','"+content+"')";
                    stmt.executeUpdate(sql);
                } catch (Exception e) {
                    
                    System.out.println("數據庫錯誤"+e.getMessage());
                    return false;
                } finally {
                    if (rs != null)
                        try {
                            rs.close();
                        } catch (Exception e) {
                            System.out.println(e.getMessage());
                        }
                    if (stmt != null)
                        try {
                            stmt.close();
                        } catch (Exception e) {
                            System.out.println(e.getMessage());
                        }
                    if (conn != null)
                        try {
                            conn.close();
                        } catch (Exception e) {
                            System.out.println(e.getMessage());
                        }
                }
                return true;
        }catch(Exception e){
            System.out.println("字符轉換截取錯誤:"+e.getMessage());
            return false;
        }
    
}


/**
 * 讀取word中的內容
 * @param fileName
 * @return
 * @throws Exception
 */
public static String getDetailFromWord(String fileName){
    //獲取word中的內容
 String text="";
    try{
    InputStream is = new FileInputStream(new File(fileName));
    WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream 
 text = ex.getText();
    }catch(Exception e){
        System.out.println("錯誤文件爲:"+fileName);
        return null;
    }
    return text;
}
    /**
     * 得到單位ID
     * 
     * @param deptName
     * @return
     * @throws Exception
     */
    public static int getDeptID(String deptName)throws Exception{
        int deptID=0;
        // 從mysql數據庫中讀取正確信息
 Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        try {
            conn = getConn();
            stmt = conn.createStatement();
            rs=stmt.executeQuery("select id from dept where name='"+deptName+"'");
            if(rs.next())
                deptID=rs.getInt(1);
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            if (rs != null)
                try {
                    rs.close();
                } catch (Exception e) {
                    System.out.println(e.getMessage());
                }
            if (stmt != null)
                try {
                    stmt.close();
                } catch (Exception e) {
                    System.out.println(e.getMessage());
                }
            if (conn != null)
                try {
                    conn.close();
                } catch (Exception e) {
                    System.out.println(e.getMessage());
                }
        }
        return deptID;
    }
    
    
    
    public static Connection getConn() throws Exception{
        Class.forName("com.mysql.jdbc.Driver");
        DriverManager.registerDriver(new com.mysql.jdbc.Driver());
        String dbUrl = "jdbc:mysql://210.44.**.**:3306/qauenDB?useUnicode=true&CharacterEncoding=utf8";
        String dbUser = "***";
        String dbPassword = "******";
        return java.sql.DriverManager.getConnection(dbUrl, dbUser,dbPassword);
    }
    
}



反正此次任務很麻煩,看起來,各個科目弄的很專業,很標準,可是,不同老師做的,一些標點符號還是整得很亂,有的是英文的標點,有的是中文的,有的是半角的,有的是全角的,形成了數據庫裏面全是亂碼,到如今沒法改了,全辦公室集體上陣,一人分幾個學院,挨條記錄看,挨條記錄改,糾結ing。。。。。。

www.cnblogs.com/ytliyang 陽倌_℡__向各位大俠學習。

相關文章
相關標籤/搜索