統計文本文件中全部單詞出現的頻率。
文本文件:
foo foo quux labs foo bar quux abc bar see you by test welcome test abc labs foo me python hadoop ab ac bc bec python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Hadoop Streaming mapper for the word-count example.

Reads text lines from standard input and writes one ``word<TAB>1`` record
per word to standard output, so the records can be sorted and fed to the
reducer.
"""
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    Args:
        lines: Any iterable of text lines (for example ``sys.stdin``).

    Yields:
        ``(word, 1)`` tuples in input order. Blank lines yield nothing.
    """
    for line in lines:
        # split() with no arguments already discards leading/trailing
        # whitespace and collapses runs of blanks, so no strip() is needed.
        for word in line.split():
            yield word, 1


def main():
    """Stream stdin through ``map_words`` and emit tab-separated records."""
    write = sys.stdout.write
    for word, count in map_words(sys.stdin):
        # sys.stdout.write works under both Python 2 and Python 3,
        # unlike the Python 2-only print statement.
        write('%s\t%s\n' % (word, count))


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Hadoop Streaming reducer for the word-count example.

Reads ``word<TAB>count`` records (the mapper's output) from standard input
and writes one ``word<TAB>total`` record per distinct word to standard
output.

The input must already be sorted by word (Hadoop's shuffle phase, or
``sort -k1,1`` when testing locally), because only *consecutive* records
with the same word are summed together.
"""
from itertools import groupby
from operator import itemgetter
import sys


def parse_pairs(lines):
    """Yield ``(word, count)`` for every well-formed input record.

    Malformed records are skipped instead of aborting the job: lines
    without a tab separator (including blank lines) and lines whose count
    is not an integer.

    Args:
        lines: Any iterable of text lines (for example ``sys.stdin``).

    Yields:
        ``(word, count)`` tuples with ``count`` converted to ``int``.
    """
    for line in lines:
        # partition() never raises, unlike unpacking the result of split().
        word, separator, raw_count = line.strip().partition('\t')
        if not separator:
            continue
        try:
            count = int(raw_count)
        except ValueError:
            # Count is not a number: ignore this record.
            continue
        yield word, count


def reduce_counts(lines):
    """Sum the counts of each run of consecutive identical words.

    Args:
        lines: Iterable of ``word<TAB>count`` lines, sorted by word.

    Yields:
        ``(word, total)`` tuples, one per run, in input order. Empty input
        yields nothing.
    """
    # Skipped (malformed) records never reach groupby, so they can neither
    # split a run nor cause the final total to be lost.
    for word, group in groupby(parse_pairs(lines), key=itemgetter(0)):
        yield word, sum(count for _, count in group)


def main():
    """Stream stdin through ``reduce_counts`` and emit the totals."""
    write = sys.stdout.write
    for word, total in reduce_counts(sys.stdin):
        # sys.stdout.write works under both Python 2 and Python 3,
        # unlike the Python 2-only print statement.
        write('%s\t%s\n' % (word, total))


if __name__ == '__main__':
    main()
[root@wx ~]# cd /root/hadooptest/
[root@wx hadooptest]# cat input.txt | ./mapper.py
foo 1
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
abc 1
bar 1
see 1
you 1
by 1
test 1
welcome 1
test 1
abc 1
labs 1
foo 1
me 1
python 1
hadoop 1
ab 1
ac 1
bc 1
bec 1
python 1
[root@wx hadooptest]# cat input.txt | ./mapper.py | sort -k1,1 | ./reducer.py
ab 1
abc 2
ac 1
bar 2
bc 1
bec 1
by 1
foo 4
hadoop 1
labs 2
me 1
python 2
quux 2
see 1
test 2
welcome 1
you 1
/usr/local/hadoop-2.6.4/bin/hadoop fs -mkdir -p /user/root/word
/usr/local/hadoop-2.6.4/bin/hadoop fs -put /root/hadooptest/input.txt /user/root/word
/usr/local/hadoop-2.6.4/bin/hadoop fs -ls /user/root/word
#結果:
Found 1 items
-rw-r--r-- 2 root supergroup 118 2016-03-22 13:36 /user/root/word/input.txt
/usr/local/hadoop-2.6.4/bin/hadoop jar /usr/local/hadoop-2.6.4/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar -files 'mapper.py,reducer.py' -input /user/root/word -output /output/word -mapper ./mapper.py -reducer ./reducer.py
[root@wx hadooptest]# /usr/local/hadoop-2.6.4/bin/hadoop fs -ls /output/word
Found 2 items
-rw-r--r-- 2 root supergroup 0 2016-03-22 13:47 /output/word/_SUCCESS
-rw-r--r-- 2 root supergroup 110 2016-03-22 13:47 /output/word/part-00000
[root@wx hadooptest]# /usr/local/hadoop-2.6.4/bin/hadoop fs -cat /output/word/part-00000
ab 1
abc 2
ac 1
bar 2
bc 1
bec 1
by 1
foo 4
hadoop 1
labs 2
me 1
python 2
quux 2
see 1
test 2
welcome 1
you 1
活動使用開源框架:MRJob