@python
以前寫的博客都在csdn和博客園中
要將博客同步到自己的博客網站中
由於都是使用markdown格式書寫的,因此直接爬取上傳就完事
分析了下博客園,發現可行。先登陸進入自己的博客主頁,可以看到有下一頁的標誌,每一頁包含若干個博客詳情,博客詳情中包含edit頁(編輯頁面)和正常頁面(其他用戶訪問的),要獲取的就是edit頁面的博客名字、博客內容。博客分類在edit頁面中不好獲取,轉而去正常頁面獲取,發現是一個ajax請求,傳入用戶id和博客id就可以獲取到分類。信息獲取到了就保存在本地,按分類保存。保存後就使用post請求發送到自己的博客網站中,或者直接寫入數據庫
從文件讀取請求頭字典,構造函數傳入的是文件名
class getHeaders(object):
    """Load HTTP request headers from a text file into a dict.

    The file is expected to hold one ``Name: value`` pair per line, as copied
    from a browser's developer tools. HTTP/2 pseudo-headers written with a
    leading colon (``:authority: example.com``) keep that colon in their name.

    After construction the parsed headers are available as ``self.dict_``
    (header name -> header value, both stripped of surrounding whitespace).
    """

    def __init__(self, path):
        """Parse the header file at *path* (read as UTF-8).

        Lines that contain no colon at all (blank lines included) carry no
        header and are skipped. If a name occurs twice, the last one wins.
        """
        self.dict_ = {}
        with open(path, "r", encoding="utf8") as f:
            for line in f:
                parts = line.split(":")
                self.clean_(parts)
                if len(parts) >= 2:
                    self.dict_[parts[0]] = parts[1]

    def clean_(self, list_):
        """Normalise, in place, the pieces of one header line split on ":".

        On return ``list_[0]`` is the header name and ``list_[1]`` (when the
        line had a colon) is the complete header value.

        When the line contained more than one colon:
        * an empty first piece means the line started with ":" (a
          pseudo-header), so the colon is put back in front of the name;
        * every remaining piece belongs to the value and is re-joined with
          ":" so values such as URLs with a port or HTTP dates stay intact.
        """
        if len(list_) > 2:
            if list_[0].strip() == "":
                name = ":" + list_[1].strip()
                value_pieces = list_[2:]
            else:
                name = list_[0].strip()
                value_pieces = list_[1:]
            # Join the raw pieces before stripping so whitespace inside the
            # value is preserved exactly as written in the file.
            list_[:] = [name, ":".join(value_pieces).strip()]
        else:
            list_[:] = [piece.strip() for piece in list_]
爬博客園的代碼
# Crawl the author's cnblogs posts and save each one locally as a Markdown
# file, grouped into one directory per category.
#
# Flow: walk the blog's listing pages, keep the links that point to the
# "Edit" page of a post, read title and Markdown body from each edit page,
# ask the category AJAX endpoint for the post's category, then write
# 博客/<category>/<title>.md.
import os
import re

import requests
from lxml import etree

from GetHeaders import getHeaders

# AJAX endpoint that returns a post's categories; %s is the post ID.
catagory_url = "https://www.cnblogs.com/simon-idea/ajax/CategoriesTags.aspx?blogId=xxxxxx&postId=%s"
# Blog listing page; %s is the 1-based page number.
page_url = "https://www.cnblogs.com/simon-idea/default.html?page=%s"
# Seconds to wait for any single HTTP response; without a timeout
# requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT = 10
# Directory name used when the category endpoint returns no category link.
DEFAULT_CATEGORY = "未分類"

# Request headers (cookies included) loaded from the local file "博客園".
heads = getHeaders("博客園").dict_


def _get(url):
    """Fetch *url* with the shared request headers and a timeout."""
    return requests.get(url, headers=heads, timeout=REQUEST_TIMEOUT)


def _decoded_tree(resp):
    """Decode a response body to text and parse it as HTML.

    Falls back to UTF-8 when the response declares no charset
    (``resp.encoding`` is None in that case and cannot be passed to decode).
    """
    text = resp.content.decode(resp.encoding or "utf-8", errors="replace")
    return etree.HTML(text)


def _first(tree, expr):
    """Return the first XPath match of *expr* in *tree*, or None if none."""
    if tree is None:  # etree.HTML yields None for an empty document
        return None
    matches = tree.xpath(expr)
    return matches[0] if matches else None


def _post_id(edit_url):
    """Extract the numeric post ID from the end of an edit-page URL.

    Uses the trailing run of digits instead of a fixed 8-character slice so
    IDs of other lengths work too; falls back to the last 8 characters when
    the URL does not end in digits.
    """
    match = re.search(r"(\d+)\s*$", edit_url)
    return match.group(1) if match else edit_url[-8:]


def _safe_name(name):
    """Replace characters that are not allowed in file names with "_"."""
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip() or "_"


# Every link found on the listing pages.
link_list = []
for page in range(1, 9):
    listing = etree.HTML(_get(page_url % page).content)
    if listing is not None:
        link_list.extend(
            listing.xpath('//*[@id="mainContent"]/div/div/div/a/@href')
        )

# Only the links to the edit page of a post.
detail_list = [link for link in link_list if "Edit" in link]

for edit_url in detail_list:
    edit_page = _decoded_tree(_get(edit_url))
    title = _first(edit_page, '//*[@id="Editor_Edit_txbTitle"]/@value')
    if title is None:
        # Presumably not a real edit page (e.g. login expired); skip the
        # post instead of aborting the whole run.
        print("skip %s: title not found" % edit_url)
        continue
    # An empty editor has no text node; save such a post as an empty file.
    body = _first(edit_page, '//*[@id="Editor_Edit_EditorBody"]/text()') or ""

    category_page = _decoded_tree(_get(catagory_url % _post_id(edit_url)))
    catagory = (
        _first(category_page, '//*[@id="BlogPostCategory"]/a/text()')
        or DEFAULT_CATEGORY
    )

    catagory = _safe_name(catagory)
    dirs = '博客/%s' % catagory
    os.makedirs(dirs, exist_ok=True)
    with open("博客/%s/%s.md" % (catagory, _safe_name(title)), "w", encoding="utf-8") as f:
        f.write(body)
上傳的代碼有不少坑,不完善
由於原博客作者的博客路徑定義有問題
我的博客網站
我的GitHub地址
我的公衆號:
app