應女友要求,爲了能及時掌握技術動向,特地寫了這個爬蟲,天天定時爬取博客園首頁併發送至微信。
Python 3.4
# -*- coding: utf-8 -*-
"""Scrape the cnblogs.com front page daily and push the post list to WeChat.

Fetches the first two pages of the cnblogs aggregated post list, extracts
each post's title and link, and sends the digest to the WeChat
"file transfer helper" via wxpy. Scheduled to run every day at 06:00.
"""
import requests
from requests import exceptions  # NOTE(review): unused in this file; kept for compatibility
from bs4 import BeautifulSoup as bs
import re  # NOTE(review): unused in this file; kept for compatibility
from wxpy import *
import schedule
import time

# Logging into WeChat happens here (QR-code scan on the first run);
# cache_path=True reuses the cached session on later runs.
bot = Bot(cache_path=True)


def getHtml(pageIndex):
    """Fetch one page of the cnblogs front-page post list.

    pageIndex: 1-based page number of the aggregated list.
    Returns the HTML text on success, or None when the request fails.
    """
    # Pretend to be a regular browser so the endpoint accepts the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/56.0.2924.87 Safari/537.36'
    }
    # pageIndex selects which page of the list is returned.
    payload = {
        'CategoryType': 'SiteHome',
        'ParentCategoryId': '0',
        'CategoryId': '808',
        'PageIndex': pageIndex,
        'TotalPostCount': '4000',
    }
    try:
        r = requests.post(
            'https://www.cnblogs.com/mvc/AggSite/PostList.aspx',
            data=payload,
            headers=headers,
            timeout=30,  # avoid hanging forever on a stalled connection
        )
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # BUG FIX: the original returned e.strerror, which is usually None
        # on RequestException, and the caller then fed that junk value to
        # BeautifulSoup. Returning None lets the caller skip the page.
        return None


def sendblogmsg(content):
    """Send *content* to the WeChat file-transfer helper."""
    # To message a real contact instead, use e.g.:
    # my_friend = bot.friends().search('name')[0]
    bot.file_helper.send(content)


def job():
    """Scrape pages 1-2 of the front page and send one digest message."""
    parts = []
    # page is the current page number (pages 1 and 2).
    for page in range(1, 3):
        html = getHtml(page)
        if not html:
            # Download failed for this page; skip it rather than crash.
            continue
        soup = bs(html, "html.parser")
        for blog in soup.findAll("div", {'class': 'post_item_body'}):
            title = blog.find('h3').get_text()
            link = blog.find('a', {'class': 'titlelnk'})['href']
            parts.append('標題:' + title + '\n連接:' + link + '\n-----------\n')
    if parts:
        # BUG FIX: send exactly once after all pages are collected; the
        # original re-sent the growing buffer per page, duplicating entries.
        sendblogmsg(''.join(parts))


# Run the scrape every day at 06:00 and keep the scheduler loop alive.
# (The original ended with bot.join(), which either never ran — placed
# after an infinite loop — or would block the scheduler; removed.)
schedule.every().day.at("06:00").do(job)
while True:
    schedule.run_pending()
    time.sleep(1)