





進度規劃 計劃用時 實際用時 備註
實現文件迭代遍歷 2hour 2hour 之前沒有寫過這種,搜索和學習相關的數據結構的用法花了較長時間,
統計字符總數、總行數 30min 大概也就20min 這一步比較簡單
統計單詞總數目 0.5hour 1.5hour 晚上寫的,熬夜效率不高,判斷是否爲單詞的步驟花了時間較多,並且第一次還寫的不太對,後來改了
統計各個單詞的數量 0.5hour 45min 這裏就是學習unordered_map怎麼用佔時間,這裏沒用幾行代碼
統計各個詞組的數量 20min 20min 和統計各個單詞數量差很少,很快完成
排序輸出前十個單詞和詞組 120min 45min 選擇一個好的算法很重要,剛開始想直接把全部的單詞和詞組排序,後來換成了用十一個緩衝區來選擇
代碼優化 220min 200min 已經遠超過120min了。。。固然到目前爲止優化效果很顯著,沒有明顯的可優化的地方了
Linux的問題 120min 80min 已經花了50min寫了一個能在Linux上跑的程序,就是把遍歷文件夾的函數重新寫一個就行了,接口改一下,可見低耦合的好處!
輸入文件名以命令行參數傳入。須要遍歷整個文件夾時,則要輸入文件夾的路徑。 15min  10min  
根據命令行參數判斷是否爲目錄 15min  5min  
收尾工作,測試細節 60min 30min 現在雖然已經優化了很長時間,可是前三個的總數和詞組的大小寫一直和答案不一樣,規則的細節還得細細品味




我採用了文件的深度優先遍歷,剛開始是在 Windows 10 下的 Visual Studio 2017 裏寫的,查了一下發現 Windows 下的文件遍歷可以用 _finddata_t 這個數據結構。這個數據結構的詳細內容如下:

// Layout of the CRT structure filled in by _findfirst/_findnext, quoted here for reference.
struct _finddata_t
{
    unsigned attrib;        // file attributes
    time_t time_create;     // file creation time
    time_t time_access;     // time of the last access
    time_t time_write;      // time of the last modification
    _fsize_t size;          // file size in bytes
    char name[_MAX_FNAME];  // file name
};



 1 //深度優先遞歸遍歷當前目錄下文件夾和文件及子文件夾和文件  
 2 void DfsFolder(string path, int layer)
 3 { 4  _finddata_t file_info; 5 string current_path = path + "/*.*"; //也能夠用/*來匹配全部 6 intptr_t handle = _findfirst(current_path.c_str(), &file_info); 7 //返回值爲-1則查找失敗 8 if (-1 == handle) 9  { 10 cout << "cannot match the path" << endl; 11 return; 12  } 13 14 do 15  { 16 //判斷是否子目錄 17 if (file_info.attrib == _A_SUBDIR) 18  { 19 //遞歸遍歷子目錄 20 21 int layer_tmp = layer; 22 if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0) //.是當前目錄,..是上層目錄,必須排除掉這兩種狀況 23 DfsFolder(path + '/' + file_info.name, layer_tmp + 1); //再windows下能夠用\\轉義分隔符,不推薦 24  } 25 else 26  { 27 //打印記號反映出深度層次 28 //for (int i = 0; i<layer; i++) 29 // cout << "--"; 30 //cout << file_info.name << endl; 31 //這幾行用來測試這個函數 32 string filename = file_info.name; 33 string suffixStr = filename.substr(filename.find_last_of('.') + 1);//獲取文件後綴 34 NumOfCharsLinesInFile(path + '/' + file_info.name); 35  } 36 } while (!_findnext(handle, &file_info)); //返回0則遍歷完 37 //關閉文件句柄 38  _findclose(handle); 39 }



string suffixStr = filename.substr(filename.find_last_of('.') + 1);//獲取文件後綴


其中 NumOfCharsLinesInFile 這個函數是處理文件的函數,當時想這個函數只用來統計文件的字數、行數和詞數,後來五項統計都寫在了這個函數裏,現在看來名字不太好,可以考慮後面改一下。





// Program-wide running totals, accumulated file by file.
long long TotalNum_chars = 0LL; // characters counted so far
long long TotalNum_lines = 0LL; // lines counted so far
long long TotalNum_words = 0LL; // words counted so far



 1 for(int i = 0;i<len;i++)
 2  { 3 current_char = buf[i]; 4 if (current_char == '\n') { 5 NumberLines++; 6  } 7 if (current_char < 32 || current_char>126) 8  { 9 current_char = ' '; 10 TotalNum_chars--; 11  } 12 //判斷是否爲單詞 13 if ((!isalpha(last_char)) && (!isdigit(last_char)) && (isalpha(current_char))) 14  { 15 wordbegin = true; 16 current_word = current_char; 17  } 18 else if (wordbegin) 19  { 20 if ((isalpha(current_char)) || (isdigit(current_char))) 21  { 22 //current_word.push_back(current_char); 23  current_word.push_back(current_char); 24 if (i == len-1) { 25 goto panduan; 26  } 27  } 28 else 29  { 30 panduan: wordbegin = false; 31 //Determines whether the current current word meets the word requirement: the first four characters are all letters 32 if (isalpha(current_word[1]) && isalpha(current_word[2]) && isalpha(current_word[3])) 33  { 34 35 //that current_word meets the requirements 36 NumberWords++; 37  EnterMap(last_word, current_word); 38 last_word = current_word; //NumberWords++,word,last_word=current_word 39  current_word.clear(); 40 41  } 42  } 43  } 44 last_char = current_char; 45 }


我是用到了上一個字符和目前的字符,而且設置了一個 wordbegin 變量來表示是否在讀單詞,只要是間隔符和字母在一塊兒,就讀入,讀入的單詞再經過前四個 char 是否是都是字母來判斷是否是單詞。



 1 void NumOfCharsLinesInFile(string FileLocation)
 2 {//Read the file, count the number of characters, lines, and words, and add it to the global variable. The word is processed and added to the map dictionary.
 3     //int NumberChars = 0;
 4     int NumberLines = 1; 5 int NumberWords = 0; 6 char last_char = ' '; 7 char current_char; 8 bool wordbegin = false; 9 string current_word; 10 string last_word; 11 12  size_t sz; 13 FILE * fp = fopen(FileLocation.c_str(), "rb"); 14 fseek(fp, 0L, SEEK_END); 15 sz = ftell(fp); // 16 17  rewind(fp); 18 char*buf; 19 buf = (char*)malloc(sz * sizeof(char)); 20 int len = fread(buf, sizeof(char), sz, fp);//用來讀文件,通過測試,fread是最快的讀文件方式 21 //if (len) { 22 // NumberLines++; 23 //} 24 25 for(int i = 0;i<len;i++) 26  { 27 current_char = buf[i]; 28 if (current_char == '\n') { 29 NumberLines++; 30  } 31 if (current_char < 32 || current_char>126) 32  { 33 current_char = ' '; 34 TotalNum_chars--; 35  } 36 //判斷是否爲單詞 37 if ((!isalpha(last_char)) && (!isdigit(last_char)) && (isalpha(current_char))) 38  { 39 wordbegin = true; 40 current_word = current_char; 41  } 42 else if (wordbegin) 43  { 44 if ((isalpha(current_char)) || (isdigit(current_char))) 45  { 46 //current_word.push_back(current_char); 47  current_word.push_back(current_char); 48 if (i == len-1) { 49 goto panduan; 50  } 51  } 52 else 53  { 54 panduan: wordbegin = false; 55 //Determines whether the current current word meets the word requirement: the first four characters are all letters 56 if (isalpha(current_word[1]) && isalpha(current_word[2]) && isalpha(current_word[3])) 57  { 58 59 //that current_word meets the requirements 60 NumberWords++; 61  EnterMap(last_word, current_word); 62 last_word = current_word; //NumberWords++,word,last_word=current_word 63  current_word.clear(); 64 65  } 66  } 67  } 68 last_char = current_char; 69  } 70 71 free(buf); 72 73 74 TotalNum_chars += sz; 75 TotalNum_lines += NumberLines; 76 TotalNum_words += NumberWords; 77  fclose(fp); 78 fp = NULL; 79 }



這裏用到了 unordered_map 這個關聯容器,用法很簡單,詳見C++primer 。

還有就是這兩個 map 的定義:

// Per-word record kept in word_count.
struct my_word
{
    // Spelling to display for the word; starts as a run of 'z' so that any
    // spelling actually seen compares smaller and replaces it.
    string sort_word = "zzzzzzzzzzzzzzzzzz";
    // How many times the word has been seen.
    size_t appear_count = 0;
};

// Normalised word -> record for that word.
unordered_map<string, my_word> word_count;
// "<previous word>_<current word>" (both normalised) -> number of occurrences.
unordered_map<string, size_t> phrase_count;

其中 my_word 的 sort_word 是用來存字典序排最前面的格式, appear_count 用來存單詞出現的數量。


void EnterMap(string last_word, string current_word)
    string simple_last_word; string simple_current_word; size_t len = last_word.length(); string temp_word = last_word; transform(temp_word.begin(), temp_word.end(), temp_word.begin(), ::tolower); bool is_start = false; for (size_t i = len - 1; i >= 0; i--) { if (isalpha(temp_word[i])) { is_start = true; simple_last_word = temp_word.substr(0, i + 1); break; } } len = current_word.length(); temp_word = current_word; transform(temp_word.begin(), temp_word.end(), temp_word.begin(), ::tolower); is_start = false; for (size_t i = len - 1; i >= 0; i--) { if (isalpha(temp_word[i])) { is_start = true; simple_current_word = temp_word.substr(0, i + 1); break; } } unordered_map<string, my_word> ::iterator got = word_count.find(simple_current_word); if (got == word_count.end()) { word_count.insert({ simple_current_word,{current_word,1} }); } else { got->second.appear_count++; if (current_word<got->second.sort_word) { got->second.sort_word = current_word; } } string simple_phrase = simple_last_word + '_' + simple_current_word; phrase_count[simple_phrase]++; }




開一個11個元素的 my_word 數組做爲全局變量

// Buffer for the top-ten selection: slots 0-9 hold the current leaders,
// slot 10 receives each new candidate before it is bubbled into place.
my_word ten_word[11];

然後遍歷 word_count,把每個元素放入數組的第十一個位置,再對這個數組進行一遍冒泡,使得第 11 個是最小的,這樣最後就能篩選出出現次數最多的十個 my_word。下面是代碼。

 1 void Getten_word() {
 3  my_word temporary_word; 4 for (const auto &w : word_count) 5  { 6 ten_word[10] = w.second; 7 for (int i = 0; i <= 9; i++) 8  { 9 if (ten_word[i].appear_count < ten_word[i + 1].appear_count) 10  { 11 temporary_word = ten_word[i]; 12 ten_word[i] = ten_word[i + 1]; 13 ten_word[i + 1] = temporary_word; 14  } 15  } 16  } 17 sort(ten_word, ten_word + 10, compare); 18 } 19 20 void Getten_phrase() 21 { 22  my_phrase temporary_phrase; 23 for (const auto &w : phrase_count) 24  { 25 ten_phrase[10].appear_count = w.second; 26 ten_phrase[10].sort_phrase = w.first; 27 for (int i = 0; i <= 9; i++) 28  { 29 if (ten_phrase[i].appear_count < ten_phrase[i + 1].appear_count) 30  { 31 temporary_phrase = ten_phrase[i]; 32 ten_phrase[i] = ten_phrase[i + 1]; 33 ten_phrase[i + 1] = temporary_phrase; 34  } 35  } 36  } 37 sort(ten_phrase, ten_phrase + 10, phrase_compare); 38 } 39 40 bool compare(my_word a, my_word b) 41 { 42 return a.appear_count>b.appear_count; //升序排列 43 } 44 45 bool phrase_compare(my_phrase a, my_phrase b) 46 { 47 return a.appear_count>b.appear_count; //升序排列 48 }



寫 main 函數:

 1 int main(int argc, char *argv[])
 2 { 3 clock_t tStart = clock(); 4 int state = DfsFolder("C:/newsample", 0); 5 if (state) 6  { 7 return 0; 8  } 9 cout << "char_number :" << TotalNum_chars << endl; 10 cout << "line_number :" << TotalNum_lines << endl; 11 cout << "word_number :" << TotalNum_words << endl; 12  Getten_word(); 13 cout <<endl<< "the top ten frequency of word : " << endl; 14 for (int i = 0; i < 10; i++) 15  { 16 cout << ten_word[i].sort_word << " " << ten_word[i].appear_count << endl; 17 18  } 19  Getten_phrase(); 20 cout <<"\n\n"<< "the top ten frequency of phrase :" << endl; 21 for (int i = 0; i < 10; i++) 22  { 23 string phrase_now = ten_phrase[i].sort_phrase; 24 string temp1, temp2; 25 int x = phrase_now.length(); 26 int k = phrase_now.find("_"); 27 28 //temp1 = phrase_now.substr(0, k); 29 //temp2 = phrase_now.substr(k + 1, x - k - 1); 30 string xx = phrase_now.substr(0, k); 31 cout << word_count[phrase_now.substr(0, k)].sort_word << ' ' << word_count[phrase_now.substr(k + 1, x - k - 1)].sort_word <<" "<< ten_phrase[i].appear_count << endl; 32  } 33 printf("Time taken: %.2fs\n", (double)(clock() - tStart) / CLOCKS_PER_SEC); 34 return 0; 35 }



#include <iostream>  
#include <string> #include <fstream> #include <io.h> #include<ctype.h> #include <algorithm> #include <unordered_map> #include <time.h> using namespace std; long long TotalNum_chars = 0; long long TotalNum_lines = 0; long long TotalNum_words = 0; struct my_word { string sort_word = "zzzzzzzzzzzzzzzzzz"; size_t appear_count = 0; }; my_word ten_word[11]; struct my_phrase { string sort_phrase = "zzzzzzzzzzzzzzzzzz"; size_t appear_count = 0; }; my_phrase ten_phrase[11]; unordered_map<string, my_word>word_count; unordered_map<string, my_phrase>phrase_count; string transform_word(string raw_word) { size_t len = raw_word.length(); string simple_word; string temp_word = raw_word; transform(temp_word.begin(), temp_word.end(), temp_word.begin(), ::tolower); bool is_start = false; for (int i = len - 1; i >= 0; i--) { if (isalpha(temp_word[i])) { is_start = true; simple_word = temp_word.substr(0, i + 1); break; } } return simple_word; } void EnterMap(string last_word, string current_word) { string simple_last_word; string simple_current_word; size_t len = last_word.length(); string temp_word = last_word; transform(temp_word.begin(), temp_word.end(), temp_word.begin(), ::tolower); bool is_start = false; for (size_t i = len - 1; i >= 0; i--) { if (isalpha(temp_word[i])) { is_start = true; simple_last_word = temp_word.substr(0, i + 1); break; } } len = current_word.length(); temp_word = current_word; transform(temp_word.begin(), temp_word.end(), temp_word.begin(), ::tolower); is_start = false; for (size_t i = len - 1; i >= 0; i--) { if (isalpha(temp_word[i])) { is_start = true; simple_current_word = temp_word.substr(0, i + 1); break; } } unordered_map<string, my_word> ::iterator got = word_count.find(simple_current_word); if (got == word_count.end()) { word_count.insert({ simple_current_word,{current_word,1} }); } else { got->second.appear_count++; if (current_word<got->second.sort_word) { got->second.sort_word = current_word; } } string simple_phrase = simple_last_word + '_' + 
simple_current_word; string raw_phrase = last_word + '_' + current_word; unordered_map<string, my_phrase> ::iterator got_phrase = phrase_count.find(simple_phrase); if (got_phrase == phrase_count.end()) { phrase_count.insert({ simple_phrase,{raw_phrase,1} }); } else { got_phrase->second.appear_count++; if (raw_phrase < got_phrase->second.sort_phrase) { got_phrase->second.sort_phrase = raw_phrase; } } } void NumOfCharsLinesInFile(string FileLocation) {//讀入文件,統計字符數、行數、單詞數,並加入到全局變量中。並對單詞進行處理,加入map字典中。 //int NumberChars = 0; int NumberLines = 1; int NumberWords = 0; char last_char = ' '; char current_char; bool wordbegin = false; string current_word; string last_word; size_t sz; FILE * fp = fopen(FileLocation.c_str(), "rb"); fseek(fp, 0L, SEEK_END); sz = ftell(fp); rewind(fp); char*buf; buf = (char*)malloc(sz * sizeof(char)); int len = fread(buf, sizeof(char), sz, fp); //if (len) { // NumberLines++; //} for(int i = 0;i<len;i++) { current_char = buf[i]; if (current_char == '\n') { NumberLines++; } if (current_char < 32 || current_char>126) { current_char = ' '; TotalNum_chars--; } //判斷是否爲單詞 if ((!isalpha(last_char)) && (!isdigit(last_char)) && (isalpha(current_char))) { wordbegin = true; current_word = current_char; } else if (wordbegin) { if ((isalpha(current_char)) || (isdigit(current_char))) { //current_word.push_back(current_char);  current_word.push_back(current_char); if (i == len-1) { goto panduan; } } else { panduan: wordbegin = false; //判斷如今的current_word是否知足word的要求:前四個字符都是字母 if (isalpha(current_word[1]) && isalpha(current_word[2]) && isalpha(current_word[3])) { //說明current_word知足要求 NumberWords++; EnterMap(last_word, current_word); last_word = current_word; //若是知足word要求,則將NumberWords++,並處理該word,並last_word=current_word current_word.clear(); //將current_word清空  } } } //判斷是否爲單詞結束 last_char = current_char; } free(buf); TotalNum_chars += sz; TotalNum_lines += NumberLines; TotalNum_words += NumberWords; fclose(fp); fp = NULL; // } //深度優先遞歸遍歷當前目錄下文件夾和文件及子文件夾和文件 void 
DfsFolder(string path, int layer) { _finddata_t file_info; string current_path = path + "/*.*"; //也能夠用/*來匹配全部 intptr_t handle = _findfirst(current_path.c_str(), &file_info); //返回值爲-1則查找失敗 if (-1 == handle) { cout << "cannot match the path" << endl; return; } do { //判斷是否子目錄 if (file_info.attrib == _A_SUBDIR) { //遞歸遍歷子目錄 int layer_tmp = layer; if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0) //.是當前目錄,..是上層目錄,必須排除掉這兩種狀況 DfsFolder(path + '/' + file_info.name, layer_tmp + 1); //再windows下能夠用\\轉義分隔符,不推薦  } else { //打印記號反映出深度層次 //for (int i = 0; i<layer; i++) // cout << "--"; //cout << file_info.name << endl; string filename = file_info.name; string suffixStr = filename.substr(filename.find_last_of('.') + 1);//獲取文件後綴 NumOfCharsLinesInFile(path + '/' + file_info.name); } } while (!_findnext(handle, &file_info)); //返回0則遍歷完 //關閉文件句柄  _findclose(handle); } bool compare(my_word a, my_word b) { return a.appear_count>b.appear_count; //升序排列 } bool phrase_compare(my_phrase a, my_phrase b) { return a.appear_count>b.appear_count; //升序排列 } void Getten_word() { my_word temporary_word; for (const auto &w : word_count) { ten_word[10] = w.second; for (int i = 0; i <= 9; i++) { if (ten_word[i].appear_count < ten_word[i + 1].appear_count) { temporary_word = ten_word[i]; ten_word[i] = ten_word[i + 1]; ten_word[i + 1] = temporary_word; } } } sort(ten_word, ten_word + 10, compare); } void Getten_phrase() { my_phrase temporary_phrase; for (const auto &w : phrase_count) { ten_phrase[10] = w.second; for (int i = 0; i <= 9; i++) { if (ten_phrase[i].appear_count < ten_phrase[i + 1].appear_count) { temporary_phrase = ten_phrase[i]; ten_phrase[i] = ten_phrase[i + 1]; ten_phrase[i + 1] = temporary_phrase; } } } sort(ten_phrase, ten_phrase + 10, phrase_compare); } int main(int argc, char *argv[]) //int main() { clock_t tStart = clock(); //遞歸遍歷文件夾 DfsFolder("D:/newsample", 0); //遞歸遍歷文件夾結束 cout << "characters: " << TotalNum_chars << endl; cout << "words: " << TotalNum_words << 
endl; cout << "lines: " << TotalNum_lines << endl; Getten_word(); cout << "=====================word=====================" << endl; for (int i = 0; i < 10; i++) { cout << ten_word[i].sort_word << " " << ten_word[i].appear_count << endl; } Getten_phrase(); cout << "====================phrase===================" << endl; for (int i = 0; i < 10; i++) { cout << ten_phrase[i].sort_phrase << " " << ten_phrase[i].appear_count << endl; } printf("Time taken: %.2fs\n", (double)(clock() - tStart) / CLOCKS_PER_SEC); return 0; }


Performance analyses on Windows







Performance analyses on Linux

command lines :

g++ -std=c++11 -Wall -pg test_gprof.cpp -o test_gprof
gprof test_gprof gmon.out >analysis.txt



 Call graph (explanation follows)
granularity: each sample hit covers 2 byte(s) for 0.06% of 16.82 seconds
index % time    self  children    called     name
[1]     99.0    0.00   16.66                 main [1]
                0.00   16.42       1/1           listDir(char*) [3]
                0.14    0.06       1/1           Getten_phrase() [40]
                0.00    0.04       1/1           Getten_word() [85]
                0.00    0.00      20/20          std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, my_word, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, my_word> > >::operator[](std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&&) [166]
                1.76   14.66    1323/1323        listDir(char*) [3]
[2]     97.6    1.76   14.66    1323         NumOfCharsLinesInFile(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) [2]
                0.34   14.32 16641077/16641077     EnterMap(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) [4]
                                 125             listDir(char*) [3]
                0.00   16.42       1/1           main [1]
[3]     97.6    0.00   16.42       1+125     listDir(char*) [3]
                1.76   14.66    1323/1323        NumOfCharsLinesInFile(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) [2]
                                 125             listDir(char*) [3]
                0.34   14.32 16641077/16641077     NumOfCharsLinesInFile(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) [2]
[4]     87.2    0.34   14.32 16641077         EnterMap(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) [4]


這兩條表明 NumOfCharsLinesInFile() 和 EnterMap() 兩個函數佔據了主要時間,所以應該主要優化這兩個函數。但是 EnterMap 這個函數裏面主要用的是 unordered_map 這個自帶的關聯容器,因此在單條語句上優化空間不大,不過用 map 查詞的次數要儘可能少:剛開始我用了好多次重複的 map 查詢,後來改成查詢一次後把指向 value 的迭代器存下來,這一步優化在性能上有很大的進步。再就是在 NumOfCharsLinesInFile 這個函數裏面優化。












至於優化,真的感受到了對這種程序做熱點行優化的重要性,尤其是剛開始的前幾次優化,每次都效果顯著:在 Visual Studio 的 Release 模式下,運行時間從一分鐘降到了 10s。


