1. Start Hadoop
2. Create a directory on HDFS and view it
3. Upload the English text for the word count to HDFS
4. Start Hive
5. Import the file contents into table docs and view the table
6. Run the word count, storing the result in table word_count
7. View the result of the word count
Complete WordCount with Hive on Hadoop
Start Hadoop
ssh localhost
cd /usr/local/hadoop
./sbin/start-dfs.sh
cd /usr/local/hive/lib
service mysql start
start-all.sh
Create a directory on HDFS
hdfs dfs -mkdir test1
hdfs dfs -ls /user/hadoop
Upload the file to HDFS
hdfs dfs -put ./try.txt test1
hdfs dfs -ls /user/hadoop/test1
Start Hive
hive
Create the raw document table
create table docs(line string);
Run the word count with HiveQL, storing the result in table word_count
create table word_count as
select word, count(1) as count
from (select explode(split(line, " ")) as word from docs) word
group by word
order by word;
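For readers unfamiliar with explode(split(...)), the query above does the same thing as the following plain-Python sketch. It is an illustration only, not part of the Hive workflow, and the sample lines are made up:

# Plain-Python mirror of the HiveQL word count above.
docs = ["hello world", "hello hive"]  # stand-in for the rows of table docs
counts = {}
for line in docs:
    for word in line.split(" "):                  # split(line, " ") + explode
        counts[word] = counts.get(word, 0) + 1    # group by word, count(1)
for word in sorted(counts):                       # order by word
    print(word, counts[word])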
Import the file contents into table docs and view them
load data inpath '/user/hadoop/test1/try.txt' overwrite into table docs;
select * from docs;
import jieba
import requests
from bs4 import BeautifulSoup

# Collect the lyric page links from a song-list page and crawl each one.
def songlist(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    songname = soup.select('.song')
    for i in songname[1:]:
        url = i.select('a')[0].attrs['href']
        songread(url)

# Fetch one lyric page and append its text to the collected-lyrics file.
def songread(url):
    f = open("C:/Users/ZD/PycharmProjects/test/test.txt", 'a', encoding='utf8')
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    song = soup.select('.lrcItem')
    for i in song:
        f.write(i.text)
    f.close()

# Empty the output file, then crawl the whole song list.
open("C:/Users/ZD/PycharmProjects/test/test.txt", 'w', encoding='utf8').close()
songlist('http://www.kuwo.cn/geci/a_336/?')

# Read the collected lyrics back and count word frequencies with jieba.
f = open("C:/Users/ZD/PycharmProjects/test/test.txt", 'r', encoding='utf8')
text = f.read()
f.close()
wordList = list(jieba.cut(text))
wordDic = {}
for w in set(wordList):
    wordDic[w] = wordList.count(w)
sort_word = sorted(wordDic.items(), key=lambda d: d[1], reverse=True)

# Print the 60 most frequent words and save them to a separate file.
for i in range(60):
    print(sort_word[i])
fo = open("C:/Users/ZD/PycharmProjects/test/test1.txt", 'w', encoding='utf8')
for i in range(60):
    fo.write(sort_word[i][0] + '\n')
fo.close()
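The manual counting loop above (building wordDic with list.count) works but rescans the word list for every distinct word; collections.Counter from the standard library does the same thing in one pass and also replaces the explicit sorted(...) call. A minimal sketch, assuming the lyrics have already been written to test.txt as above:

import jieba
from collections import Counter

with open("C:/Users/ZD/PycharmProjects/test/test.txt", 'r', encoding='utf8') as f:
    words = jieba.lcut(f.read())

# Counter tallies every word in a single pass; most_common sorts by frequency.
for word, freq in Counter(words).most_common(60):
    print(word, freq)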
cd /usr/local/hadoop
touch text.txt
ls -al
cat text.txt
./sbin/start-dfs.sh
./bin/hdfs dfs -mkdir -p /user/hadoop
./bin/hdfs dfs -mkdir input
./bin/hdfs dfs -put ./test.txt input
./bin/hdfs dfs -ls input
./bin/hdfs dfs -cat input/test.txt
rm test.txt
./bin/hdfs dfs -get input/test.txt ~/test.txt
./bin/hdfs dfs -rm news.txt
./bin/hdfs dfs -ls input
if $(hdfs dfs -test -e text.txt); then $(hdfs dfs -appendToFile local.txt text.txt); else $(hdfs dfs -copyFromLocal -f local.txt text.txt); fi
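The same check-then-act logic from the one-liner above can also be scripted. Here is a minimal Python sketch that shells out to the same hdfs CLI; it assumes hdfs is on the PATH and reuses the local.txt / text.txt names from the command above:

import subprocess

def upload_or_append(local_path="local.txt", hdfs_path="text.txt"):
    # "hdfs dfs -test -e" exits with 0 when the target already exists on HDFS
    exists = subprocess.run(["hdfs", "dfs", "-test", "-e", hdfs_path]).returncode == 0
    if exists:
        subprocess.run(["hdfs", "dfs", "-appendToFile", local_path, hdfs_path], check=True)
    else:
        subprocess.run(["hdfs", "dfs", "-copyFromLocal", "-f", local_path, hdfs_path], check=True)

upload_or_append()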
if $(hdfs dfs -test -e file:///usr/hadoop/text.txt); then $(hdfs dfs -copyToLocal text.txt ./text2.txt); else $(hdfs dfs -copyToLocal text.txt ./text.txt); fi
hdfs dfs -cat text.txt
hdfs dfs -ls -h text.txt
hdfs dfs -ls -R -h /user/hadoop
if $(hdfs dfs -test -d dir1/dir2); then $(hdfs dfs -touchz dir1/dir2/filename); else $(hdfs dfs -mkdir -p dir1/dir2 && hdfs dfs -touchz dir1/dir2/filename); fi
hdfs dfs -rm dir1/dir2/filename
Create a directory: hdfs dfs -mkdir -p dir1/dir2
Delete an empty directory: hdfs dfs -rmdir dir1/dir2
Force-delete a non-empty directory: hdfs dfs -rm -R dir1/dir2
Append to the end of the file: hdfs dfs -appendToFile local.txt text.txt
Append to the beginning of the file (no direct command; download it, concatenate locally, then overwrite):
hdfs dfs -get text.txt
cat text.txt >> local.txt
hdfs dfs -copyFromLocal -f local.txt text.txt
hdfs dfs -rm text.txt
Delete an empty directory: hdfs dfs -rmdir dir1/dir2
Force-delete a non-empty directory: hdfs dfs -rm -R dir1/dir2
hdfs dfs -mv text.txt text2.txt
1. Save the body text of the news articles to a text file.
2. Structure the news data as a list of dictionaries.
3. Install pandas and build a DataFrame object df with pandas.DataFrame(newsTotal).
4. Use df to save the extracted data to a csv or Excel file.
5. Use the functions and methods provided by pandas for data analysis (a short sketch follows the crawler code below).
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas

# Get the click count of a news article from the counter API
def getClickCount(newsUrl):
    newId = re.search(r'_(.*).html', newsUrl).group(1).split('/')[1]
    clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
    clickStr = requests.get(clickUrl).text
    count = re.search(r"hits'\).html\('(.*)'\);", clickStr).group(1)
    return count

# Get the details of a single news article
def getNewsDetail(newsurl):
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    news = {}
    news['title'] = soupd.select('.show-title')[0].text
    # news['newsurl'] = newsurl
    info = soupd.select('.show-info')[0].text
    news['dt'] = datetime.strptime(info.lstrip('发布时间:')[0:19], '%Y-%m-%d %H:%M:%S')
    news['click'] = int(getClickCount(newsurl))
    if info.find('来源') > 0:
        news['source'] = info[info.find('来源:'):].split()[0].lstrip('来源:')
    else:
        news['source'] = 'none'
    if info.find('作者:') > 0:
        news['author'] = info[info.find('作者:'):].split()[0].lstrip('作者:')
    else:
        news['author'] = 'none'
    # news['content'] = soupd.select('.show-content')[0].text.strip()
    # Get the article body and write it to a file
    content = soupd.select('.show-content')[0].text.strip()
    writeNewsContent(content)
    return news

# Get all news articles on one list page
def getListPage(listPageUrl):
    res = requests.get(listPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            a = news.select('a')[0].attrs['href']
            newsList.append(getNewsDetail(a))
    return newsList

# Append the article body to a text file
def writeNewsContent(content):
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()

# Work out how many list pages there are from the total article count
def getPageNumber():
    listPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(listPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.a1')[0].text.rstrip('条')) // 10 + 1
    return n

newsTotal = []
firstPage = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
newsTotal.extend(getListPage(firstPage))
n = getPageNumber()
# Only the last list page is crawled here; the first page was crawled above
for i in range(n, n + 1):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listUrl))

# Structure the news data as a DataFrame
df = pandas.DataFrame(newsTotal)
# df.to_excel("news.xlsx")
# print(df.head(6))
# print(df[['author', 'click', 'source']])
# print(df[df['click'] > 3000])
sou = ['国际学院', '学生工作处']
print(df[df['source'].isin(sou)])
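To round out items 4 and 5 of the list above (saving and analysing the data with pandas), here is a minimal sketch that continues from the df built above. The column names click/source/dt come from the dictionaries assembled in getNewsDetail; the output file name news.csv is a placeholder:

# Save the extracted data (item 4); writing Excel additionally needs openpyxl installed
df.to_csv("news.csv", encoding='utf-8-sig')
# df.to_excel("news.xlsx")

# A few simple pandas analyses (item 5)
print(df.describe())                                      # summary statistics of the numeric columns
print(df.sort_values('click', ascending=False).head(10))  # ten most-clicked articles
print(df.groupby('source')['click'].mean())               # average click count per source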