1. Save the body text of the news article to a text file.
def writeNewsDatail(content):
    # Append the article body to a local text file
    f = open('gzccnews1.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()
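A with-block is the more idiomatic way to guarantee the file gets closed even if the write fails; below is a minimal sketch of the same function in that style (writeNewsDatailSafe is a hypothetical name, not from the original):

# Hypothetical variant of writeNewsDatail using a context manager,
# so the file is closed even if write() raises
def writeNewsDatailSafe(content):
    with open('gzccnews1.txt', 'a', encoding='utf-8') as f:
        f.write(content)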
2. Structure the news data into a list of dictionaries:
import pandas
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

def writeNewsDatail(content):
    f = open('gzccnews1.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()

# Get the click count of a news article
def getNewsId(url):
    newsId = re.findall(r'\_(.*).html', url)[0][-4:]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickRes = requests.get(clickUrl)
    # Pull the click count out of the response with a regular expression
    clickCount = int(re.search(r"hits'\).html\('(.*)'\);", clickRes.text).group(1))
    return clickCount

def getNewsDetail(newsUrl):
    # Fetch and parse the detail page
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    news = {}
    news['title'] = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text
    # Fields taken from the info line
    news['dt'] = datetime.strptime(info.lstrip('發佈時間:')[0:19], '%Y-%m-%d %H:%M:%S')
    if info.find('來源:') > 0:
        # The author/reviewer/source/photographer fields can all be handled this way
        news['source'] = info[info.find('來源:'):].split()[0].lstrip('來源:')
    else:
        news['source'] = 'none'
    news['content'] = soupd.select('.show-content')[0].text.strip()  # body text
    news['count'] = getNewsId(newsUrl)
    news['newsUrl'] = newsUrl
    return news

def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0].attrs['href']  # link to the detail page
            # One article is a dict; the whole page becomes a list of dicts
            newsList.append(getNewsDetail(newsUrl))
    return newsList

def getPageN():
    # Total number of news list pages
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.a1')[0].text.rstrip('條'))
    return n // 10 + 1

newsTotal = []
n = getPageN()
p = [2, n]
for i in p:
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    print(listPageUrl)
    newsTotal.extend(getListPage(listPageUrl))
for news in newsTotal:
    print(news)
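Note that p = [2, n] only fetches the second and the last list page. To crawl every page, iterate over the full range instead; a minimal sketch reusing getPageN and getListPage from above (the time.sleep delay is an addition, not part of the original):

import time

newsTotal = []
for i in range(2, getPageN() + 1):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))
    time.sleep(1)  # arbitrary pause between requests to go easy on the server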
3. Install pandas and create a DataFrame object df with pandas.DataFrame(newsTotal).
import pandas
df = pandas.DataFrame(newsTotal)
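To confirm that each dict key became a column, the standard df.head() and df.dtypes checks can be used; a quick sketch:

print(df.head())   # first rows: title, dt, source, content, count, newsUrl
print(df.dtypes)   # 'dt' should come out as datetime64[ns], 'count' as int64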
4. Save the extracted data to a csv or excel file via df.
df.to_excel('0416.xlsx')
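df.to_csv covers the csv half of this step. The utf_8_sig encoding and index=False below are conventional choices rather than part of the original; the byte-order mark written by utf_8_sig lets Excel detect UTF-8 so the Chinese text displays correctly. Note also that to_excel requires an Excel engine such as openpyxl to be installed.

# CSV alternative; utf_8_sig writes a BOM so Excel recognises the UTF-8 text
df.to_csv('0416.csv', encoding='utf_8_sig', index=False)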
5. Use the functions and methods provided by pandas to analyse the data:
# First six rows of the count, title and source columns
print(df[['count', 'title', 'source']][0:6])
# Articles with more than 3000 clicks published by 學校綜合辦
print(df[(df['count'] > 3000) & (df['source'] == '學校綜合辦')])
# Articles whose source is 國際學院 or 學生處
print(df[(df['source'] == '國際學院') | (df['source'] == '學生處')])
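Sorting and grouping are natural next steps after boolean filtering; a short sketch using standard pandas methods on the same df (the specific queries are illustrative, not from the original):

# Ten most-clicked articles
print(df.sort_values(by='count', ascending=False)[['title', 'count']].head(10))
# Article count and average clicks per source
print(df.groupby('source')['count'].agg(['count', 'mean']))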