PSP2.1 | Personal Software Process Stages | 預估耗時(分鐘) | 實際耗時(分鐘) |
---|---|---|---|
Planning | 計劃 | ||
• Estimate | • 估計這個任務須要多少時間 | 610 | 630 |
Development | 開發 | ||
• Analysis | • 需求分析 (包括學習新技術) | 70 | 90 |
• Design Spec | • 生成設計文檔 | 60 | 50 |
• Design Review | • 設計複審 | 30 | 40 |
• Coding Standard | • 代碼規範 (爲目前的開發制定合適的規範) | 40 | 30 |
• Design | • 具體設計 | 70 | 90 |
• Coding | • 具體編碼 | 310 | 300 |
• Code Review | • 代碼複審 | 30 | 30 |
• Test | • 測試(自我測試,修改代碼,提交修改) | 40 | 70 |
Reporting | 報告 | ||
• Test Report | • 測試報告 | 60 | 80 |
• Size Measurement | • 計算工作量 | 30 | 40 |
• Postmortem & Process Improvement Plan | • 過後總結, 並提出過程改進計劃 | 40 | 55 |
合計 | 740 | 820 |
剛開始拿到題目時,是比較迷茫的,畢竟題目看起來很繁瑣,但仔細思考一下,發現不是很難。咱們經過百度以及CSDN論壇等渠道找到了需要的資料,這些資料對咱們起到了很大的幫助。
類圖以下:
關鍵函數是getWords。基本思路是每一行讀取後根據正則表達式匹配,每找到一個符合的單詞就加一。
流程圖以下:
這些代碼的關鍵在於細節的處理。node
使用了Jsoup工具進行網站頁面的爬取。經過對CVPR2018官網首頁的分析,咱們發現文章標題都屬於一個class,即ptitle。因而咱們先用 Jsoup.connect(url).get() 獲得整個頁面,用 getElementsByClass("ptitle") 獲得標題,接着用 attr("href") 獲得該文章的連接,並用它獲得該文章的頁面,接着用 getElementById("abstract") 獲得文章的摘要,最後將它們輸出到result.txt中。
進階需求在基本需求上增長了自定義輸入輸出文件、加入權重的詞頻統計(詞組未實現)、自定義詞頻統計輸出(詞組未實現)、多參數的混合使用等功能。github
花費的時間:55分鐘
改進思路:優化了算法。
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.*;
import java.util.regex.Matcher;

/**
 * Orders word/frequency entries by descending frequency; entries with the
 * same frequency are ordered by ascending word.
 */
class EntryComparator implements Comparator<Entry<String, Integer>> {
    @Override
    public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
        // Compare the numeric values, not the Integer references: "==" on boxed
        // Integers is an identity check and breaks the tie-break for large counts.
        // Integer.compare also avoids the overflow risk of subtracting.
        int byFrequency = Integer.compare(o2.getValue(), o1.getValue());
        if (byFrequency != 0) {
            return byFrequency;
        }
        return o1.getKey().compareTo(o2.getKey());
    }
}

/**
 * Basic word-count tool: reports the number of characters, words and
 * non-blank lines of a text file, followed by its ten most frequent words.
 *
 * <p>A word is a run of ASCII letters/digits that starts with at least four
 * letters. Words are compared case-insensitively for the frequency table.
 */
public class Main {
    /** Input file used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "input3.txt";
    /** Output file used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT = "result.txt";
    /** Maximum number of entries in the frequency table. */
    private static final int TOP_WORD_LIMIT = 10;
    /**
     * Matches a whole alphanumeric token that begins with four letters. The
     * look-behind anchors the match at the start of the token, so tokens such
     * as "123file" or "abc1234abcd" contribute no word.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("(?<![A-Za-z0-9])[A-Za-z]{4}[A-Za-z0-9]*");

    private static ArrayList<String> wordStrings = new ArrayList<String>();
    private static int count = 0;
    private static int lines = 0;
    private static int words = 0;

    /** Opens {@code filename} as UTF-8 text so every pass decodes the file identically. */
    private static BufferedReader openUtf8(String filename) throws IOException {
        return new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8));
    }

    /**
     * Counts characters and non-blank lines.
     *
     * <p>Every character except '\n' is added to {@code count} (so "\r\n"
     * counts as one character). A line is counted in {@code lines} when it
     * holds at least one character other than space, tab, '\r' or '\n'.
     */
    private static void getCharacter(String filename) throws IOException {
        boolean lineHasContent = false;
        try (BufferedReader in = openUtf8(filename)) {
            int ch;
            while ((ch = in.read()) != -1) {
                if (ch == '\n') {
                    if (lineHasContent) {
                        lines++;
                        lineHasContent = false;
                    }
                    continue;
                }
                count++;
                if (ch != ' ' && ch != '\t' && ch != '\r') {
                    lineHasContent = true;
                }
            }
        }
        // The last line may not be terminated by '\n'.
        if (lineHasContent) {
            lines++;
        }
    }

    /** Adds the number of words found in the file to {@code words}. */
    private static void getWords(String filename) throws IOException {
        try (BufferedReader reader = openUtf8(filename)) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = WORD_PATTERN.matcher(line);
                while (matcher.find()) {
                    words++;
                }
            }
        }
    }

    /**
     * Builds the frequency table of lower-cased words and appends the top
     * {@link #TOP_WORD_LIMIT} entries to {@code wordStrings}, one formatted
     * line ({@code "<word>: n\r\n"}) per entry.
     */
    private static void getMostWord(String filename) throws IOException {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();
        try (BufferedReader reader = openUtf8(filename)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the case mapping independent of the user's locale.
                Matcher matcher = WORD_PATTERN.matcher(line.toLowerCase(Locale.ROOT));
                while (matcher.find()) {
                    String word = matcher.group();
                    Integer seen = frequencies.get(word);
                    frequencies.put(word, seen == null ? 1 : seen + 1);
                }
            }
        }

        List<Entry<String, Integer>> ranked =
                new ArrayList<Entry<String, Integer>>(frequencies.entrySet());
        Collections.sort(ranked, new EntryComparator());

        int limit = Math.min(TOP_WORD_LIMIT, ranked.size());
        for (int i = 0; i < limit; i++) {
            Entry<String, Integer> entry = ranked.get(i);
            wordStrings.add("<" + entry.getKey() + ">: " + entry.getValue() + "\r\n");
        }
    }

    /**
     * Writes the three summary lines followed by the frequency lines to {@code path}.
     *
     * @throws IOException if the file cannot be created or written
     */
    private static void writers(String c, String w, String l, ArrayList<String> ws, String path)
            throws IOException {
        try (Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8))) {
            out.write(c);
            out.write(w);
            out.write(l);
            for (String entry : ws) {
                out.write(entry);
            }
        }
    }

    /**
     * Entry point.
     *
     * @param args optional: {@code args[0]} is the input file (default
     *             {@code input3.txt}), {@code args[1]} the output file
     *             (default {@code result.txt})
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // Reset the accumulators so a repeated call starts from zero.
        wordStrings.clear();
        count = 0;
        lines = 0;
        words = 0;

        getCharacter(input);
        getWords(input);
        getMostWord(input);

        String c = "characters: " + count + "\r\n";
        String w = "words: " + words + "\r\n";
        String l = "lines: " + lines + "\r\n";
        writers(c, w, l, wordStrings, output);
    }
}
package 爬蟲; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class 爬蟲 { public static void main(String []args) { String url1="http://openaccess.thecvf.com/CVPR2018.py"; Document document1 = null,document2 = null; try { File file1 =new File("result.txt"); Writer out =new FileWriter(file1); Connection connection = Jsoup.connect(url1); connection.maxBodySize(0); document1 = connection.get(); Elements x = document1.getElementsByClass("ptitle"); //System.out.print(x.size()); for(int i=0;i<x.size();i++) { //System.out.print(i+1+" "); //System.out.print("Title: "+x.get(i).text()+" "); String n = i+"\r\n"; String t="Title: "+x.get(i).text()+"\r\n"; Elements links = document1.select("dt a"); String url2=links.get(i).attr("href"); url2="http://openaccess.thecvf.com/"+url2; document2 = Jsoup.connect(url2).get(); Element y= document2.getElementById("abstract"); //System.out.println("Abstract:"+y.text()+"\n\n"); String a="Abstract: "+y.text()+"\r\n\r\n\r\n"; out.write(n); out.write(t); out.write(a); } out.close(); } catch (IOException e) { System.out.println("爬取失敗"); } } }
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Map.Entry;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Orders word/frequency entries by descending frequency; entries with the
 * same frequency are ordered by ascending word.
 */
class EntryComparator implements Comparator<Entry<String, Integer>> {
    @Override
    public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
        // Compare the numeric values, not the Integer references: "==" on boxed
        // Integers is an identity check and breaks the tie-break for large counts
        // (which weighted titles reach quickly). Integer.compare also cannot overflow.
        int byFrequency = Integer.compare(o2.getValue(), o1.getValue());
        if (byFrequency != 0) {
            return byFrequency;
        }
        return o1.getKey().compareTo(o2.getKey());
    }
}

/**
 * Word-count tool for crawler output made of "Title: ..." / "Abstract: ..." records.
 *
 * <p>Command line: {@code -i <input> -o <output> [-w 0|1] [-n <top count>]}.
 * {@code -w 1} counts every title word ten times in the frequency table;
 * {@code -n} sets how many of the most frequent words are written (default 10).
 */
public class WordCount {
    /** Length of the "Title: " label, which is excluded from the character count. */
    private static final int TITLE_LABEL_LENGTH = 7;
    /** Length of the "Abstract: " label, which is excluded from the character count. */
    private static final int ABSTRACT_LABEL_LENGTH = 10;
    /** Weight applied to title words when weighting is enabled. */
    private static final int WEIGHTED_TITLE_FACTOR = 10;
    /** Everything that is not a lower-case ASCII letter or digit separates tokens. */
    private static final Pattern DELIMITER = Pattern.compile("[^a-z0-9]");

    private static ArrayList<String> wordStrings = new ArrayList<String>();
    private static int count = 0;
    private static int lines = 0;
    private static int words = 0;

    /** Opens {@code filename} as UTF-8 text so every pass decodes the file identically. */
    private static BufferedReader openUtf8(String filename) throws IOException {
        return new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8));
    }

    /**
     * Counts characters and content lines.
     *
     * <p>Only lines starting with {@code Title:} or {@code Abstract:} are
     * considered. Each contributes its length after removing non-ASCII
     * characters, minus the label, plus one for the line terminator, and
     * increments {@code lines}. For well-formed five-line records this equals
     * the previous whole-file formula (add 2 per record, subtract 17 per
     * Title/Abstract pair) but no longer depends on the record shape.
     */
    private static void getCharacter(String filename) throws IOException {
        try (BufferedReader in = openUtf8(filename)) {
            String line;
            while ((line = in.readLine()) != null) {
                int labelLength;
                if (line.startsWith("Title:")) {
                    labelLength = TITLE_LABEL_LENGTH;
                } else if (line.startsWith("Abstract:")) {
                    labelLength = ABSTRACT_LABEL_LENGTH;
                } else {
                    continue;
                }
                String ascii = line.replaceAll("[^\\u0000-\\u007f]", "");
                count += ascii.length() - labelLength + 1;
                lines++;
            }
        }
    }

    /** Returns true when {@code s} starts with at least four lower-case ASCII letters. */
    private static boolean isWord(String s) {
        if (s.length() < 4) {
            return false;
        }
        for (int i = 0; i < 4; i++) {
            char c = s.charAt(i);
            if (c < 'a' || c > 'z') {
                return false;
            }
        }
        return true;
    }

    /**
     * Adds the number of words in the file to {@code words}.
     *
     * <p>Every line is lower-cased and split on non-alphanumeric characters.
     * The first token of each line is skipped: on record lines it is the
     * "title"/"abstract" label or the record index.
     */
    private static void getWords(String filename) throws IOException {
        try (BufferedReader reader = openUtf8(filename)) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = DELIMITER.split(line.toLowerCase(Locale.ROOT));
                for (int i = 1; i < tokens.length; i++) {
                    if (isWord(tokens[i])) {
                        words++;
                    }
                }
            }
        }
    }

    /**
     * Builds the word-frequency table from title and abstract lines and
     * appends the {@code times} most frequent entries to {@code wordStrings}.
     *
     * @param filename input file
     * @param w        when true, each title word counts ten times
     * @param times    number of entries to output; values below 1 output nothing
     */
    private static void getMostWord(String filename, boolean w, int times) throws IOException {
        int titleWeight = w ? WEIGHTED_TITLE_FACTOR : 1;
        Map<String, Integer> frequencies = new HashMap<String, Integer>();

        try (BufferedReader reader = openUtf8(filename)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the case mapping independent of the user's locale.
                String lower = line.toLowerCase(Locale.ROOT);
                int weight;
                if (lower.startsWith("title:")) {
                    weight = titleWeight;
                } else if (lower.startsWith("abstract:")) {
                    weight = 1;
                } else {
                    continue;
                }
                String[] tokens = DELIMITER.split(lower);
                // Token 0 is the label itself.
                for (int i = 1; i < tokens.length; i++) {
                    if (isWord(tokens[i])) {
                        Integer seen = frequencies.get(tokens[i]);
                        frequencies.put(tokens[i], seen == null ? weight : seen + weight);
                    }
                }
            }
        }

        List<Entry<String, Integer>> ranked =
                new ArrayList<Entry<String, Integer>>(frequencies.entrySet());
        Collections.sort(ranked, new EntryComparator());

        int limit = Math.min(times, ranked.size());
        for (int i = 0; i < limit; i++) {
            Entry<String, Integer> entry = ranked.get(i);
            wordStrings.add("<" + entry.getKey() + ">: " + entry.getValue() + "\r\n");
        }
    }

    /**
     * Writes the three summary lines followed by the frequency lines to {@code path}.
     *
     * @throws IOException if the file cannot be created or written
     */
    private static void writers(String c, String w, String l, ArrayList<String> ws, String path)
            throws IOException {
        try (Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8))) {
            out.write(c);
            out.write(w);
            out.write(l);
            for (String entry : ws) {
                out.write(entry);
            }
        }
    }

    /** Returns the value following the option at {@code index}, or fails with a clear message. */
    private static String optionValue(String[] args, int index) {
        if (index + 1 >= args.length) {
            throw new IllegalArgumentException("Missing value for option " + args[index]);
        }
        return args[index + 1];
    }

    /**
     * Entry point; see the class documentation for the accepted options.
     *
     * @throws IllegalArgumentException if {@code -i} or {@code -o} is missing,
     *                                  or a recognised option has no value
     * @throws IOException              if the input cannot be read or the
     *                                  output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String ifile = "";
        String ofile = "";
        boolean weighted = false;
        int times = 10;

        // Arguments are consumed as option/value pairs; unrecognised pairs are ignored.
        for (int i = 0; i < args.length; i += 2) {
            String option = args[i];
            if ("-i".equals(option)) {
                ifile = optionValue(args, i);
            } else if ("-o".equals(option)) {
                ofile = optionValue(args, i);
            } else if ("-w".equals(option)) {
                String value = optionValue(args, i);
                if ("1".equals(value)) {
                    weighted = true;
                } else if ("0".equals(value)) {
                    weighted = false;
                }
            } else if ("-n".equals(option)) {
                times = Integer.parseInt(optionValue(args, i));
            }
        }
        if (ifile.isEmpty() || ofile.isEmpty()) {
            throw new IllegalArgumentException(
                    "Usage: WordCount -i <input> -o <output> [-w 0|1] [-n <count>]");
        }

        // Reset the accumulators so a repeated call starts from zero.
        wordStrings.clear();
        count = 0;
        lines = 0;
        words = 0;

        getCharacter(ifile);
        getWords(ifile);
        getMostWord(ifile, weighted, times);

        String c = "characters: " + count + "\r\n";
        String ws = "words: " + words + "\r\n";
        String l = "lines: " + lines + "\r\n";
        writers(c, ws, l, wordStrings, ofile);
    }
}
需求比較模糊,理解用了較長的時間,有些理解錯誤導致走了彎路;還有對爬蟲jsoup不熟悉;細節上出現了錯誤。
與同學討論;上網查教程;細心處理。