System Environment
[root@m01 scripts]# uname -r
2.6.32-696.el6.x86_64
[root@m01 scripts]# uname -m
x86_64
[root@m01 scripts]# cat /etc/redhat-release
CentOS release 6.9 (Final)
Shell Exercise 1
#!/bin/bash
# date: 2018-03-xx
# author: yk
# description: crawl a 51cto blogger's post list
# version: 0.1
source /etc/profile
. /etc/init.d/functions

# Temporary working file
TmpFile="/tmp/.$(date +%Y%m%d_%H%M%S).log.tmp"
touch $TmpFile
# File that stores the extracted blog information
BlogFile="/tmp/$(date +%Y%m%d_%H%M%S)_blog.html"
touch $BlogFile

# Ask the user for the 51cto blogger's homepage URL
read -p 'please input website: ' Website

# Fetch the blogger's homepage
wget -q -O $TmpFile $Website &>/dev/null
[ $? -ne 0 ] && echo "the website you input does not exist" && exit 1

# The "last page" link on the homepage contains the total number of pages
MainURL=$(sed -n '/class="last".*末頁.*/p' $TmpFile | egrep -o 'http:.*p[0-9]{1,}')
# Number of pages, e.g. 28
Pages=$(echo $MainURL | sed -n 's#^.*p##gp')

# If the URL is not a blog homepage, the extracted value will not be a number
if [ "$Pages" -gt 0 ] &>/dev/null
then
    echo "please wait ......"
else
    echo "the URL you input is not a blog homepage"
    rm -f $TmpFile
    rm -f $BlogFile
    exit 1
fi

# The paging URL without its trailing page number
UR=$(echo $MainURL | sed -rn 's#[0-9]{1,}$##gp')

# Walk through every page
for ((i=1;i<=$Pages;i++))
do
    # Append the page number to form the complete page URL
    wget -q -O $TmpFile ${UR}$i &>/dev/null
    # Grab the time, title and link of every post
    egrep -A 1 '<a class="tit" | class="time' $TmpFile | sed '/^\-\-/d' | sed -r 's#[ ]+# #g' >>$BlogFile
    # Pause for 0.05 seconds so we do not hit the site too fast
    sleep 0.05
done

# Empty the temporary file
>$TmpFile

# ===============================================================
action "The blogger's blog information has been downloaded locally" /bin/true
echo "Extracting required data from downloaded data ......"
echo "please wait ....."
# ===============================================================

i=0
# Extract the desired fields from each line of the file
while read line
do
    # Each blog entry spans 4 lines, so process the file in groups of 4
    ((++i))
    case "$i" in
        1)
            # Posting time
            Time=$(echo $line | sed -r 's#^.*>發佈於:(.*)</a>#\1#g')
            ;;
        3)
            # Post link (href)
            Href=$(echo $line | sed -r 's#^.*href=\"(.*)\">#\1#g')
            ;;
        4)
            # Post title
            Title=$(echo $line | sed -r 's#^(.*)<.*$#\1#g')
            ;;
        *)
            ;;
    esac
    # After the 4th line of an entry, append the gathered fields to the temporary file
    if [ "$i" -eq "4" ]
    then
        i=0
        echo "<a href=\"$Href\">$Time---$Title</a><br/>" >> $TmpFile
    fi
done < $BlogFile

# Empty the result file
>$BlogFile
# Sort by time (field 2 when split on '>'), newest first, and append to $BlogFile
cat $TmpFile | sort -rt '>' -k2 >>$BlogFile
rm -f $TmpFile
action "success" /bin/true
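For reference, a minimal sketch of what an interactive run might look like on the environment above. The script name and the blogger URL are hypothetical placeholders; the [  OK  ] markers come from the action helper loaded from /etc/init.d/functions:

[root@m01 scripts]# sh get_blog.sh
please input website: http://blog.51cto.com/xxxxxxxx
please wait ......
The blogger's blog information has been downloaded locally        [  OK  ]
Extracting required data from downloaded data ......
please wait .....
success                                                           [  OK  ]

When it finishes, the sorted post list (one <a href> line per post, newest first) is in the /tmp/*_blog.html file created at the start of the run.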
Note: for reference only.