結對第二次作業
031602414 柯葉祥
031602435 肖逸清
PSP2.1 | Personal Software Process Stages | 預估耗時(分鐘) | 實際耗時(分鐘) |
---|---|---|---|
Planning | 計劃 | 10 | 10 |
• Estimate | • 估計這個任務需要多少時間 | 10 | 10 |
Development | 開發 | 1210 | 1670 |
• Analysis | • 需求分析 (包括學習新技術) | 400 | 500 |
• Design Spec | • 生成設計文檔 | 10 | 10 |
• Design Review | • 設計複審 | 10 | 10 |
• Coding Standard | • 代碼規範 (爲目前的開發制定合適的規範) | 10 | 10 |
• Design | • 具體設計 | 20 | 20 |
• Coding | • 具體編碼 | 500 | 600 |
• Code Review | • 代碼複審 | 60 | 120 |
• Test | • 測試(自我測試,修改代碼,提交修改) | 200 | 400 |
Reporting | 報告 | 30 | 40 |
• Test Report | • 測試報告 | 10 | 10 |
• Size Measurement | • 計算工作量 | 10 | 10 |
• Postmortem & Process Improvement Plan | • 事後總結, 並提出過程改進計劃 | 10 | 20 |
合計 | | 1250 | 1710 |
<!-- jsoup HTML parser library @ http://jsoup.org/ -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
接下來是具體實現部分,思路是先爬取每一篇文章的鏈接,再逐個訪問每一個鏈接,獲取標題和摘要。
package cvpr;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler for the CVPR 2018 open-access listing.
 *
 * <p>Collects the link of every paper from the index page, then fetches each
 * paper page and extracts its title and abstract.
 *
 * @author xyq
 * @since 2018 10 10
 * @version 1.0
 */
public class clawler {

    /** First half of every paper link; hrefs from the index page are appended to it. */
    private static final String SITE_ROOT = "http://openaccess.thecvf.com/";

    /** Index page that lists the papers. */
    private static final String INDEX_URL = "http://openaccess.thecvf.com/CVPR2018.py#";

    /**
     * Crawls the CVPR site.
     *
     * <p>If the index page cannot be fetched, the stack trace is printed and an
     * empty map is returned. If a single paper page cannot be fetched, that page
     * is reported on {@code System.err} and skipped; the remaining pages are
     * still crawled.
     *
     * @return map from paper title to paper abstract, in the order the links
     *         were selected from the index page; two pages that yield the same
     *         title text overwrite each other
     */
    public Map<String, String> getPaperInfo() {
        // LinkedHashMap keeps crawl order so the numbering written by
        // writeResult() is reproducible between runs.
        Map<String, String> paperInfo = new LinkedHashMap<String, String>();
        List<String> htmlLinks = new ArrayList<String>();

        try {
            Document index = Jsoup.connect(INDEX_URL).get();
            Elements links = index.select("dt a[href]");
            for (Element link : links) {
                htmlLinks.add(SITE_ROOT + link.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
            return paperInfo;
        }

        for (String htmlLink : htmlLinks) {
            // One try per page: a single failed download must not discard the
            // pages that have not been visited yet.
            try {
                Document page = Jsoup.connect(htmlLink).get();
                Elements titleInfo = page.select("div#papertitle");
                Elements abstractInfo = page.select("div#abstract");
                paperInfo.put(titleInfo.text(), abstractInfo.text());
            } catch (IOException e) {
                System.err.println("Skipping " + htmlLink + ": " + e);
            }
        }
        return paperInfo;
    }

    /**
     * Crawls the site and writes every title/abstract pair to
     * {@code src/main/java/cvpr/result.txt} under the current working directory
     * ({@code user.dir}).
     *
     * <p>Each record is: a running index starting at 0, a {@code Title:} line,
     * an {@code Abstract:} line and two empty lines, all terminated by CRLF.
     *
     * @throws Exception if the result file cannot be opened or written
     */
    public void writeResult() throws Exception {
        Map<String, String> paperInfo = getPaperInfo();

        // File.separator yields the same path as the former hard-coded "\\" on
        // Windows and a valid path on other platforms.
        String path = System.getProperty("user.dir")
                + File.separator + "src"
                + File.separator + "main"
                + File.separator + "java"
                + File.separator + "cvpr"
                + File.separator + "result.txt";
        File file = new File(path);

        StringBuilder result = new StringBuilder();
        int count = 0;
        for (Entry<String, String> entry : paperInfo.entrySet()) {
            result.append(count).append("\r\n");
            result.append("Title:").append(entry.getKey()).append("\r\n");
            result.append("Abstract:").append(entry.getValue()).append("\r\n");
            result.append("\r\n");
            result.append("\r\n");
            count++;
        }

        // NOTE(review): FileWriter encodes with the platform default charset;
        // left unchanged because the reader of result.txt is not visible here.
        // try-with-resources closes the writer even when write() throws.
        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(file.getAbsoluteFile()))) {
            bufferedWriter.write(result.toString());
        }
    }
}
UML類圖
app
流程圖
yii
關鍵實現部分流程圖
這次的關鍵在於新增的詞組統計功能,以下是實現的流程圖。
maven
爬取每篇文章的作者信息,統計各個作者的文章數量。
先爬取每篇文章的鏈接,再逐個訪問這些鏈接獲取作者的名字,將結果排序輸出至文件。
/**
 * Sorts the entries in place: larger counts first, and entries with the same
 * count in ascending key order.
 *
 * <p>Two entries with the same key and the same count compare as equal (0),
 * as the {@link Comparator} contract requires.
 *
 * @param wordList entries to sort; reordered in place
 */
private static void sort(ArrayList<HashMap.Entry<String, Integer>> wordList) {
    Collections.sort(wordList, new Comparator<HashMap.Entry<String, Integer>>() {
        @Override
        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
            // Operands are swapped on purpose: a higher count must sort first.
            int byCount = o2.getValue().compareTo(o1.getValue());
            if (byCount != 0) {
                return byCount;
            }
            return o1.getKey().compareTo(o2.getKey());
        }
    });
}
獲取命令行參數
/*
 * Start from default values, because the user does not have to supply
 * every option.
 */
String in = " ";
String o = " ";
int m = 1;
int n = 10;
int w = 1;
long charNum = 0;
long lineNum = 0;
long wordNum = 0;
/*
 * optSetting is the cursor into args used to locate each option and the
 * value that follows it.
 */
int optSetting = 0;
for (; optSetting < args.length; optSetting++) {
    String optName = args[optSetting];
    boolean takesValue = "-i".equals(optName)
            || "-o".equals(optName)
            || "-m".equals(optName)
            || "-n".equals(optName)
            || "-w".equals(optName);
    if (!takesValue) {
        // Unrecognised tokens are ignored, exactly as before.
        continue;
    }
    // A recognised option must be followed by its value; report that clearly
    // instead of failing with a bare ArrayIndexOutOfBoundsException.
    if (optSetting + 1 >= args.length) {
        throw new IllegalArgumentException("Missing value after option " + optName);
    }
    String optValue = args[++optSetting];
    if ("-i".equals(optName)) {
        in = optValue;
    } else if ("-o".equals(optName)) {
        o = optValue;
    } else if ("-m".equals(optName)) {
        m = Integer.parseInt(optValue);
    } else if ("-n".equals(optName)) {
        n = Integer.parseInt(optValue);
    } else {
        // Only "-w" can reach this branch.
        w = Integer.parseInt(optValue);
    }
}
// Scans str from position i and collects words into wordlist.
// state counts the leading letters matched so far (0..4): a word needs four
// letters in a row to start; after that, letters and digits extend it.
for (; i < str.length(); i++) {
    temp = str.charAt(i);
    // Fold upper-case ASCII letters to lower case.
    if ((temp >= 'A') && (temp <= 'Z')) {
        temp += 32;
    }
    boolean lowerLetter = (temp >= 'a') && (temp <= 'z');

    if (state == 0) {
        // Waiting for the first letter of a candidate word.
        if (lowerLetter) {
            word.append(temp);
            state = 1;
        }
    } else if ((state >= 1) && (state <= 3)) {
        // Still inside the mandatory letter prefix.
        if (lowerLetter) {
            word.append(temp);
            state++;
        } else {
            word.setLength(0);
            state = 0;
        }
    } else if (state == 4) {
        // Prefix complete: letters and digits keep extending the word,
        // anything else terminates and records it.
        boolean digit = (temp >= '0') && (temp <= '9');
        if (lowerLetter || digit) {
            word.append(temp);
        } else {
            wordlist.add(word.toString());
            word.setLength(0);
            state = 0;
        }
    }
}
// A word that runs to the end of the input has not been recorded yet.
if (state == 4) {
    wordlist.add(word.toString());
}
/** The most frequent phrase in result.txt (arguments 1, 3) is expected to occur 97 times. */
@Test
public void testPhraseFrequency2_1() {
    phraseFrequencyCounter counter = new phraseFrequencyCounter();
    HashMap<String, Integer> frequencies = counter.countPhraseFrequency(
            "D:\\java_project\\031602435&031602414\\src\\test\\java\\wordCount2_031602435\\" + "result.txt", 1, 3);
    ArrayList<HashMap.Entry<String, Integer>> ranked = counter.topFrequentPhrases(frequencies);
    int topCount = ranked.get(0).getValue();
    assertEquals(97, topCount);
}

/** text1.txt is expected to contain 9 words. */
@Test
public void testWordNum1() {
    wordNumberCounter counter = new wordNumberCounter();
    long wordnum = counter.countWord(
            "D:\\java_project\\031602435&031602414\\src\\test\\java\\wordCount2_031602435\\" + "text1.txt");
    System.out.println(wordnum);
    assertEquals(9, wordnum);
}

/** The most frequent phrase in text1.txt (arguments 1, 3) is expected to occur 11 times. */
@Test
public void testPhraseFrequency1_1() {
    phraseFrequencyCounter counter = new phraseFrequencyCounter();
    HashMap<String, Integer> frequencies = counter.countPhraseFrequency(
            "D:\\java_project\\031602435&031602414\\src\\test\\java\\wordCount2_031602435\\" + "text1.txt", 1, 3);
    ArrayList<HashMap.Entry<String, Integer>> ranked = counter.topFrequentPhrases(frequencies);
    int topCount = ranked.get(0).getValue();
    assertEquals(11, topCount);
}
這次的用時雖然有20多小時,但是完成總時間是在兩天半(不包括學習技術),因此只有一條記錄。
認真負責
java:2->2.4 自主學習能力:2->2.6