1. Use Hive to run a word-frequency count on the text file produced by the crawler assignment (or on the full-length English novel downloaded for the English word-frequency exercise).
Start Hadoop
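A minimal sketch of this step, assuming a pseudo-distributed Hadoop installation whose sbin scripts are on the PATH (script names vary slightly between Hadoop versions):

start-dfs.sh    # start NameNode, DataNode and SecondaryNameNode
start-yarn.sh   # start ResourceManager and NodeManager
jps             # check that the daemons are running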
Create a directory on HDFS and check it
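For example, using /user/hadoop/fileinput, the same path referenced by the load statement further down:

hdfs dfs -mkdir -p /user/hadoop/fileinput
hdfs dfs -ls /user/hadoop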
Upload the novel.txt file to HDFS
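A sketch of the upload, assuming novel.txt sits in the current local directory:

hdfs dfs -put novel.txt /user/hadoop/fileinput
hdfs dfs -ls /user/hadoop/fileinput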
Start Hive
hive
Create the raw document table
create table novels(line string);
Load the file contents into the novels table and view them
load data inpath '/user/hadoop/fileinput/novel.txt' overwrite into table novels;
select * from novels;
Use HQL to run the word-frequency count, storing the result in the word_count table
create table word_count as select word, count(1) as count from (select explode(split(line, ' ')) as word from novels) word group by word order by word;
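In this statement split(line, ' ') turns each line into an array of words, explode() expands that array into one row per word, and the outer query groups those rows and counts them. The inner subquery can be sanity-checked on its own before building word_count (an optional check, not part of the original steps), for example from the shell:

hive -e "select explode(split(line, ' ')) as word from novels limit 10;"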
View the word-count results
show tables;
select * from word_count;
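Because word_count was ordered by word, this select returns the words alphabetically. To look at the most frequent words first, an extra ad-hoc query (not part of the original write-up; the column name count may need backticks depending on the Hive version) can be run:

hive -e 'select * from word_count order by `count` desc limit 20;'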
2. Use Hive to analyze the csv file produced by the crawler assignment, and write a blog post describing the analysis process and results.
# Crawl news from the Huanqiu Tech channel
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba
import pandas

newsurl = 'http://tech.huanqiu.com/internet/'

def sort(text):
    # strip punctuation, segment the text with jieba and print the top-5 keywords
    str = '''一!「」,。?;’"',.、:\n'''
    for s in str:
        text = text.replace(s, ' ')
    wordlist = list(jieba.cut(text))
    exclude = {'這', '\u3000', '\r', '\xa0', '的', '_', ' ', '將', '在', '是', '了', '一', '還', '也', '《', '》', '(', ')'}
    set2 = set(wordlist) - exclude
    dict = {}
    for key in set2:
        dict[key] = wordlist.count(key)
    dictlist = list(dict.items())
    dictlist.sort(key=lambda x: x[1], reverse=True)
    print("Top-5 keywords:")
    for i in range(5):
        print(dictlist[i])

def getContent(url):
    # fetch one article page, print author and body text, then extract keywords
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("Author:", author)
    if len(soup2.select('.la_con')) > 0:
        content = soup2.select('.la_con')[0].text
        print("Body:", content)
        sort(content)

def getNewDetails(newsurl):
    # collect title, brief and time for every item on a list page, then save to csv
    pagelist = []
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        # print(news)
        newsDict = {}
        newsDict['title'] = news.select('a')[0].attrs['title']
        href = news.select('a')[0].attrs['href']
        newsDict['brief'] = news.select('h5')[0].text.rstrip('[詳細]')
        newsDict['time'] = news.select('h6')[0].text
        dt = datetime.strptime(newsDict['time'], '%Y-%m-%d %H:%M')
        # print("Title:", title)
        # print("Link:", a)
        # print("Brief:", brief)
        # print("Time:", dt)
        pagelist.append(newsDict)
        getContent(href)
        print('\n')
    pan = pandas.DataFrame(pagelist)
    pan.to_csv('data.csv')
    # break

res = requests.get(newsurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
getNewDetails(newsurl)
# for total in soup.select('#pages'):
#     all = int(total.select('a')[0].text.rstrip('條'))  # total item count, used to compute the page count
#     # print(all)
#     if (all % 60 == 0):
#         totalpages = all // 60
#     else:
#         totalpages = all // 60 + 1
#     print(totalpages)
#     for i in range(1, totalpages + 1):  # crawl every list page
#         PageUrl = newsurl + '{}.html'.format(i)
#         getNewDetails(PageUrl)
The csv file produced by the crawler
Copy the file to the virtual machine and view it
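One way to do the transfer, assuming the csv was produced on the host machine and the virtual machine is reachable over ssh (the user, host name and paths below are placeholders):

scp data.csv hadoop@vm-host:/home/hadoop/data.csv   # run on the host; user/host are placeholders
head -5 /home/hadoop/data.csv                       # run on the VM to inspect the first rows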
Upload it to HDFS and start Hive
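A sketch, assuming the shell is in the directory on the virtual machine that now holds data.csv and reusing the /user/hadoop/fileinput directory from part 1:

hdfs dfs -put data.csv /user/hadoop/fileinput
hdfs dfs -ls /user/hadoop/fileinput
hive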
Load the data into the docs table and view it
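The original post does not show the HQL for this step, so the following is only a sketch. It assumes the csv keeps the pandas default layout written by to_csv (an unnamed index column followed by the brief, time and title fields, with a header row) and that the fields contain no embedded commas; the column names below are placeholders, and if the brief or title text does contain commas, a CSV SerDe such as org.apache.hadoop.hive.serde2.OpenCSVSerde would be needed instead of a plain comma delimiter.

hive -e "
create table if not exists docs (id string, brief string, pub_time string, title string)
row format delimited fields terminated by ','
stored as textfile
tblproperties ('skip.header.line.count'='1');
load data inpath '/user/hadoop/fileinput/data.csv' overwrite into table docs;
select * from docs limit 10;
"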