結對第二次—文獻摘要熱詞統計及進階需求

時間 2019-11-20

原文鏈接

班級：軟件工程1916|W
作業：結對第一次—原型設計（文獻摘要熱詞統計）
結對學號：221600107 陳家豪、221600110 公孫駿傑
課程目標：實現一個可以對文本文件中的單詞的詞頻進行統計的控制檯程序
GitHub倉庫地址：基礎部分GitHub

1.Github簽入記錄

2.具體分工

我們兩個人各自實現了基礎部分的兩個功能
陳家豪負責編寫了字符統計和單詞統計函數，撰寫博客
公孫駿傑負責編寫了行數統計和詞頻統計函數，簽入GitHub

3.PSP表格

PSP2.1	Personal Software Process Stages	預估耗時（分鐘）	實際耗時（分鐘）
Planning	計劃
• Estimate	• 估計這個任務需要多少時間	1280	1400
Development	開發
• Analysis	• 需求分析 (包括學習新技術)	100	100
• Design Spec	• 生成設計文檔	180	100
• Design Review	• 設計複審	10	10
• Coding Standard	• 代碼規範 (爲目前的開發制定合適的規範)	30	30
• Design	• 具體設計	60	60
• Coding	• 具體編碼	600	800
• Code Review	• 代碼複審	60	60
• Test	• 測試（自我測試，修改代碼，提交修改）	120	120
Reporting	報告
• Test Report	• 測試報告	60	60
• Size Measurement	• 計算工作量	20	20
• Postmortem & Process Improvement Plan	• 事後總結, 並提出過程改進計劃	40	40
	合計	1280	1400

4.解題思路

拿到題目後便開始用C語言實現（後來仔細看題目要求用C++實現，便進行了小改），所以代碼中有不少C語言的痕跡。看到題目的要求就開始做了，之後許多處理字符的函數都上網查了一遍，算是對C語言和C++做了一次複習吧。

5.設計實現過程

代碼主要由四個函數組成：字符統計函數、單詞統計函數、行數統計函數、詞頻統計函數。

單元測試的部分對各個函數都進行了測試：

測試了輸出的字符數是否正確
測試了輸出的單詞數是否正確
測試了輸出的行數是否正確
測試了輸出的詞頻排序是否正確

算法設計過程
部分流程圖：

題目中要求統計文件的單詞總數。單詞：至少以4個英文字母開頭，後面可跟字母或數字，單詞之間以分隔符分割，不區分大小寫。於是就需要記錄單詞開頭的字母數量來進行判斷：如果開頭是數字，則一直讀到下一個分隔符；如果開頭連續讀到四個或以上的字母，說明該字符串爲單詞，則繼續讀取直至分隔符。
部分測試過程：
第一篇文章：

函數

第二篇文章：

單元測試

6.關鍵代碼

字符統計函數

/**
 * Counts the characters in "input.txt" and appends the total to "result.txt".
 *
 * Every character returned by fgetc() is counted, including whitespace and
 * line breaks. The report line has the form "characters：<count>.".
 *
 * Side effect: stdout is re-opened onto "result.txt" (append mode) and is not
 * restored, so later output written to stdout also lands in that file.
 *
 * If "input.txt" cannot be opened, "file read failure." is printed to the
 * current stdout and the function returns without touching "result.txt".
 */
void CharCount() // character count
{
    FILE *fp = fopen("input.txt","r");
    if (fp == NULL)
    {
        printf("file read failure.");
        return; // nothing to read; calling fgetc() on a null stream is undefined
    }

    int c = 0;
    // fgetc() returns an int so that EOF stays distinguishable from every
    // valid byte value; storing the result in a char would make byte 0xFF
    // look like EOF (or make EOF unreachable where char is unsigned).
    int ch;
    while ((ch = fgetc(fp)) != EOF)
    {
        c++;
    }
    fclose(fp);

    if (freopen("result.txt","a",stdout) == NULL)
    {
        return; // the report file cannot be opened, so there is nowhere to write
    }
    printf("characters：%d.\n",c);
}

單詞數統計函數

// Returns true when ch is an ASCII letter (A-Z or a-z).
static bool IsAsciiLetter(int ch)
{
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

// Returns true when ch is an ASCII digit (0-9).
static bool IsAsciiDigit(int ch)
{
    return ch >= '0' && ch <= '9';
}

/**
 * Counts the words in "input.txt" and appends the total to "result.txt".
 *
 * A token is a maximal run of ASCII letters and digits; every other
 * character is a separator. A token counts as a word only when its first
 * four characters are all letters, so "file1" and "abcd1efgh" are one word
 * each, while "abc1defg", "123abcd" and "the" are not words.
 * The report line has the form "words：<count>.".
 *
 * Side effect: stdout is re-opened onto "result.txt" (append mode) and is not
 * restored, so later output written to stdout also lands in that file.
 *
 * If "input.txt" cannot be opened, "file read failure." is printed to the
 * current stdout and the function returns without touching "result.txt".
 */
void WordCount() // word count
{
    FILE *fp = fopen("input.txt","r");
    if (fp == NULL)
    {
        printf("file read failure.");
        return; // nothing to read; calling fgetc() on a null stream is undefined
    }

    int w = 0;               // words found so far
    int leading = 0;         // letters seen at the start of the current token
    bool inToken = false;    // currently inside a letter/digit run
    bool prefixOpen = false; // the token's leading letter run is still unbroken
    int ch;                  // int, not char, so EOF cannot collide with a byte value

    while ((ch = fgetc(fp)) != EOF)
    {
        if (IsAsciiLetter(ch))
        {
            if (!inToken)
            {
                inToken = true;
                prefixOpen = true;
                leading = 0;
            }
            if (prefixOpen && ++leading == 4)
            {
                // The fourth leading letter makes the token a word. Count it
                // once here and ignore the rest of the token.
                w++;
                prefixOpen = false;
            }
        }
        else if (IsAsciiDigit(ch))
        {
            // A digit either starts a token (which then can never be a word)
            // or ends the leading letter run before it reached four letters.
            inToken = true;
            prefixOpen = false;
        }
        else
        {
            // Separator: the next letter or digit starts a fresh token.
            inToken = false;
            prefixOpen = false;
        }
    }
    fclose(fp);

    if (freopen("result.txt","a",stdout) == NULL)
    {
        return; // the report file cannot be opened, so there is nowhere to write
    }
    printf("words：%d.\n",w);
}

//行數統計函數

/**
 * Counts the lines in "input.txt" and appends the total to "result.txt".
 *
 * Every '\n' ends one line. Text after the last '\n' counts as one more line
 * only when it is non-empty, so an empty file has 0 lines and "a\nb\n" has 2
 * (starting the counter at 1 would report 1 and 3 respectively).
 * The report line has the form "lines：<count>.".
 *
 * Side effect: stdout is re-opened onto "result.txt" (append mode) and is not
 * restored, so later output written to stdout also lands in that file.
 *
 * If "input.txt" cannot be opened, "file read failure." is printed to the
 * current stdout and the function returns without touching "result.txt".
 */
void LineCount() // line count
{
    FILE *fp = fopen("input.txt","r");
    if (fp == NULL)
    {
        printf("file read failure.");
        return; // nothing to read; calling fgetc() on a null stream is undefined
    }

    int l = 0;
    bool pending = false; // characters seen since the last line break
    int ch;               // int, not char, so EOF cannot collide with a byte value
    while ((ch = fgetc(fp)) != EOF)
    {
        if (ch == '\n')
        {
            l++;
            pending = false;
        }
        else
        {
            pending = true;
        }
    }
    if (pending)
    {
        l++; // final line that is not terminated by '\n'
    }
    fclose(fp);

    if (freopen("result.txt","a",stdout) == NULL)
    {
        return; // the report file cannot be opened, so there is nowhere to write
    }
    printf("lines：%d.\n",l);
}

詞頻統計函數

// (word, occurrence count) entry, as stored in a std::map<std::string, int>.
// Spelled with std:: so the alias does not depend on a using-directive.
typedef std::pair<std::string, int> PAIR;

/**
 * Ordering for frequency tables: higher count first, and for equal counts
 * the lexicographically smaller word first.
 *
 * The tie-break makes this a total order over entries with distinct words,
 * so the result of sorting no longer depends on how the sort algorithm
 * happens to arrange equal counts.
 *
 * @return true when lhs must be placed before rhs.
 */
bool cmp_by_value(const PAIR& lhs, const PAIR& rhs)
{
    if (lhs.second != rhs.second)
    {
        return lhs.second > rhs.second;
    }
    return lhs.first < rhs.first;
}

/**
 * Function-object form of cmp_by_value, for algorithms that take a
 * comparator type. operator() is const so that const instances are usable.
 */
struct CmpByValue
{
    bool operator()(const PAIR& lhs, const PAIR& rhs) const
    {
        return cmp_by_value(lhs, rhs);
    }
};

map<string,int> words;
void Transform()
{
#ifdef LOCAL
    freopen("input.txt", "r", stdin); 
    freopen("result.txt", "a", stdout);
#endif // LOCAL
    string s;
    words.clear();
    while(cin>>s)
    {
        transform(s.begin(), s.end(), s.begin(), ::tolower);
        if(!words.count(s)) words[s]=0;
        words[s]++;
    }

    //把 map 中元素轉存到 vector 中
    vector<PAIR> words_vec(words.begin(), words.end());
    sort(words_vec.begin(), words_vec.end(), CmpByValue());
// sort(name_score_vec.begin(), name_score_vec.end(), cmp_by_value);
    int top10=0;
    for (int i = 0; i != words_vec.size(); ++i)
    {
        if(top10!=10)
        {
            
                if(words_vec[i].first.length()>=4)
        if(isalpha(words_vec[i].first.at(0)))
        {
             cout <<"<"<< words_vec[i].first<<">"<<" "<<words_vec[i].second << endl;
             top10++;
        }
        }
    
        else break;
       
    }

}