詞項頻率:
tf:term frequency。term 在文檔中出現的頻率。tf 越大,詞項越重要。
文檔頻率:
df:document frequency。有多少文檔包含此 term。df 越大,詞項越不重要。
詞項權重計算公式:
tf-idf(t,d) = tf(t,d) * log(N/df(t)),其中 N 爲文檔總數
import java.util.Arrays;
import java.util.List;

/**
 * TF-IDF term weighting over tokenised documents.
 *
 * <p>A document is a list of tokens and a corpus is a list of documents. Tokens are matched
 * against the term with {@link String#equalsIgnoreCase(String)}. The weight is
 * {@code tf(t, d) * ln(N / df(t))}, where {@code N} is the number of documents in the corpus.
 *
 * <p>Created by bee on 17/3/13.
 *
 * @version 1.0
 * @author blog.csdn.net/napoay
 */
public class TfIdfCal {

    /**
     * Calculates the term frequency of a word within one document.
     *
     * @param doc  word vector of a document
     * @param term the word to count; {@code null} matches nothing
     * @return occurrences of {@code term} divided by the number of tokens in {@code doc},
     *         or {@code 0.0} when {@code doc} is empty or {@code term} is {@code null}
     */
    public double tf(List<String> doc, String term) {
        // An empty document would otherwise evaluate 0 / 0 and return NaN.
        if (doc.isEmpty() || term == null) {
            return 0.0;
        }
        int occurrences = 0;
        for (String word : doc) {
            // Called on term (known non-null) so a null token is skipped instead of throwing.
            if (term.equalsIgnoreCase(word)) {
                occurrences++;
            }
        }
        return (double) occurrences / doc.size();
    }

    /**
     * Calculates the document frequency of a word.
     *
     * <p>A {@code null} or empty term is rejected: a message is printed to standard output and
     * {@code 0} is returned.
     *
     * @param docs the set of all documents
     * @param term the word to look for
     * @return the number of documents that contain the word at least once
     */
    public int df(List<List<String>> docs, String term) {
        // isEmpty() compares content; a reference comparison against "" lets a
        // non-interned empty string through.
        if (term == null || term.isEmpty()) {
            System.out.println("term不能爲null或者空串");
            return 0;
        }
        int containing = 0;
        for (List<String> doc : docs) {
            if (containsIgnoreCase(doc, term)) {
                containing++;
            }
        }
        return containing;
    }

    /** Returns whether any token of {@code doc} equals {@code term}, ignoring case. */
    private static boolean containsIgnoreCase(List<String> doc, String term) {
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Calculates the inverse document frequency {@code ln(N / df)}.
     *
     * @param docs the set of all documents
     * @param term the word
     * @return the natural logarithm of the corpus size divided by the document frequency, or
     *         {@code 0.0} when no document contains the term (including an empty corpus and a
     *         {@code null} or empty term)
     */
    public double idf(List<List<String>> docs, String term) {
        // Computed once: each df call scans the whole corpus.
        int documentFrequency = df(docs, term);
        // Dividing by zero would give Infinity (NaN for an empty corpus), and
        // 0 * Infinity in tfIdf is NaN. A term absent from the corpus gets weight 0.
        if (documentFrequency == 0) {
            return 0.0;
        }
        return Math.log(docs.size() / (double) documentFrequency);
    }

    /**
     * Calculates the tf-idf weight of a word in a document.
     *
     * @param doc  the document the term frequency is measured in
     * @param docs the document set the inverse document frequency is measured in
     * @param term the word
     * @return {@code tf(doc, term) * idf(docs, term)}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /**
     * Demo: builds four small tokenised documents and prints tf, df and tf-idf values.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("人工", "智能", "成爲", "互聯網", "大會", "焦點");
        List<String> doc2 = Arrays.asList("谷歌", "推出", "開源", "人工", "智能", "系統", "工具");
        List<String> doc3 = Arrays.asList("互聯網", "的", "將來", "在", "人工", "智能");
        List<String> doc4 = Arrays.asList("谷歌", "開源", "機器", "學習", "工具");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3, doc4);

        TfIdfCal calculator = new TfIdfCal();
        System.out.println(calculator.tf(doc2, "開源"));
        System.out.println(calculator.df(documents, "開源"));

        // Inputs of the idf computation for "谷歌".
        System.out.println("N:" + documents.size());
        System.out.println("DF:" + calculator.df(documents, "谷歌"));
        double tfidf = calculator.tfIdf(doc2, documents, "谷歌");
        System.out.println("TF-IDF (谷歌) = " + tfidf);

        // Manual cross-check: idf = ln(4 / 2), tf = 1 / 7.
        System.out.println(Math.log(4 / 2.0) / 7);
    }
}
運行結果:
0.14285714285714285
2
N:4
DF:2
TF-IDF (谷歌) = 0.09902102579427789