好久沒有寫博客了,這段時間一直在搞風控的東西,過段時間我把風控的內容整理整理髮出來你們一塊兒研究研究。
這兩天抽空寫了兩個python爬蟲腳本,一個使用re,一個使用xpath。
直接上代碼——基於re:
spider.py
# -*- coding:utf-8 -*-
"""Scraper for MOKO_MM focus listings.

Walks the listing pages of www.moko.cc, and for every entry found creates
a directory named after the entry, then downloads the avatar and all photos
from the personal detail page into it.  A running name list is appended to
url.txt so `saveAll` can re-process saved pages later.
"""
import urllib.request
import re
import os
import http.cookiejar

import tool


class Spider:
    """Crawls MOKO_MM listing pages and saves photos/avatars to disk."""

    def __init__(self):
        # Listing endpoint; the page number is appended in getPage().
        self.siteURL = 'http://www.moko.cc/focus|list.action'
        self.tool = tool.Tool()

    def getPage(self, pageIndex):
        """Fetch listing page *pageIndex* and return its decoded HTML."""
        url = self.siteURL + "?type=4&curPage=" + str(pageIndex)
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8')

    def getContents(self, pageIndex):
        """Return [[detail_url, thumbnail_url, safe_title], ...] for one listing page.

        The title is later used as a directory name, so characters that are
        unsafe in file paths are replaced with '-'.
        """
        page = self.getPage(pageIndex)
        pattern = re.compile(
            '<div class="subMainContent".*?<a href="(.*?)".*?subFocus-07.*?'
            '<img src="(.*?)".*?subFocus-08.*?<h1>(.*?)</h1>',
            re.S)
        contents = []
        for link, thumb, title in re.findall(pattern, page):
            safe = (title.replace(" ", "-").replace("|", "-")
                         .replace(".", "-").replace(":", "-"))
            contents.append([link, thumb, safe])
        return contents

    def getDetailPage(self, infoURL):
        """Fetch a personal detail page through a cookie-aware opener."""
        def makeMyOpener(head=None):
            # Default headers are built inside the function body to avoid
            # the shared mutable-default-argument pitfall.
            if head is None:
                head = {
                    'accept-encoding': 'deflate, sdch',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'zh-CN,zh;q=0.8',
                    'Cookie': 'JSESSIONID=58C82905AD36B5DFA8D4F1C98A2559DC; Hm_lvt_8d82e75c6168ba4bc0135a08edae2a2e=1488505496; Hm_lpvt_8d82e75c6168ba4bc0135a08edae2a2e',
                    'Referer': 'https://mm.taobao.com/687471686.htm',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
                }
            cookie = http.cookiejar.CookieJar()
            opener = urllib.request.build_opener(
                urllib.request.HTTPCookieProcessor(cookie))
            opener.addheaders = list(head.items())
            return opener

        oper = makeMyOpener()
        uop = oper.open(infoURL)
        return uop.read().decode('utf-8')

    def getBrief(self, page):
        """Extract the <strong> profile fields from a detail page.

        Returns a list of [field0, field1, field2] triples (the fourth
        captured field is discarded, matching the original behaviour).
        """
        pattern = re.compile(
            '<div class="infoShow-12".*?<p align="center".*?<strong>(.*?)</strong>'
            '.*?<strong>(.*?)</strong>.*?<strong>(.*?)</strong>.*?<strong>(.*?)</strong>',
            re.S)
        items = re.findall(pattern, page)
        return [[item[0], item[1], item[2]] for item in items]

    def getAllImg(self, page):
        """Return every image URL inside the infoShow-12 section of *page*."""
        section = re.compile(
            '<div class="infoShow-12">(.*?)<div class="infoShow-13">', re.S)
        content = re.search(section, page)
        if content is None:
            # Section missing from the page: nothing to download
            # (the original raised AttributeError here).
            return []
        patternImg = re.compile('<img.*?src="(.*?)"', re.S)
        return re.findall(patternImg, content.group(1))

    def saveImgs(self, images, name):
        """Download every URL in *images* into directory *name*, numbered 1..n."""
        print(u"發現", name, u"共有", len(images), u"張照片")
        for number, imageURL in enumerate(images, start=1):
            # Use the URL suffix as the file extension; fall back to jpg
            # when the suffix does not look like an extension.
            fTail = imageURL.split('.').pop()
            if len(fTail) > 3:
                fTail = "jpg"
            fileName = name + "/" + str(number) + "." + fTail
            self.saveImg(imageURL, fileName)

    def saveIcon(self, iconURL, name):
        """Save the avatar image as <name>/icon.<ext>."""
        fTail = iconURL.split('.').pop()
        fileName = name + "/icon." + fTail
        self.saveImg(iconURL, fileName)

    def saveBrief(self, content, name):
        """Write the profile text *content* to <name>/<name>.txt."""
        fileName = name + "/" + name + ".txt"
        print(u"正在保存信息爲", fileName)
        # with-block guarantees the handle is closed (the original leaked it).
        with open(fileName, "w+") as f:
            f.write(content)

    def saveToLocal(self, Li, name):
        """Save raw detail-page HTML to <name>/urlPage.txt and append *name* to url.txt."""
        fileName = name + "/" + "urlPage.txt"
        print(u"正在保存圖片地址頁:", fileName)
        with open(fileName, "w") as f:
            f.write(Li)
        # Append the crawled name (space-separated, single line) so that
        # openNameList()/saveAll() can pick it up later.
        with open('url.txt', 'a') as url:
            url.write(name + " ")
        print(name + u"追加完成!\n")

    def saveImg(self, imageURL, fileName):
        """Download one image to *fileName*; URL errors are printed, not raised."""
        try:
            u = urllib.request.urlopen(imageURL)
            data = u.read()
            with open(fileName, 'wb') as f:
                f.write(data)
            print(u"正在保存的一張圖片爲", fileName)
        except urllib.request.URLError as e:
            print(e.reason)

    def mkdir(self, path):
        """Create directory *path* if missing; return True when created."""
        path = path.strip()
        if not os.path.exists(path):
            print(u"新建了名字叫作", path, u'的文件夾')
            os.makedirs(path)
            return True
        print(u"名爲", path, '的文件夾已經建立成功')
        return False

    def savePageInfo(self, pageIndex):
        """Save avatar and all photos for every entry on listing page *pageIndex*."""
        contents = self.getContents(pageIndex)
        for item in contents:
            # item[0] detail URL, item[1] avatar URL, item[2] name
            print(u"發現一位名叫", item[2], u"的信息")
            print(u"正在保存", item[2], "的信息")
            print(u"我的詳情地址是", "http://www.moko.cc" + str(item[0]))
            detailURL = "http://www.moko.cc" + str(item[0])
            detailPage = self.getDetailPage(detailURL)
            self.mkdir(item[2])
            images = self.getAllImg(detailPage)
            # was: spider.saveImgs(...) — relied on the module-level global
            self.saveImgs(images, item[2])
            self.saveIcon("https:" + str(item[1]), item[2])

    def deleteOldTxt(self):
        """Delete a stale url.txt name list, if any, before a new crawl."""
        filename = 'url.txt'
        if os.path.exists(filename):
            os.remove(filename)
            print("\n發現舊名單,已刪除\n採集開始\n")

    def savePagesInfo(self, start, end):
        """Crawl listing pages *start*..*end* inclusive."""
        for i in range(start, end + 1):
            print(u"正在尋找第", i, u"個地方")
            self.savePageInfo(i)

    def openNameList(self):
        """Return the crawled names recorded in url.txt (one space-separated line)."""
        result = []
        with open("url.txt", "r") as f:
            for line in f:
                # str.split() drops the trailing empty token that the original
                # re.split(r'\s+') produced from saveToLocal()'s trailing space.
                result = line.strip().split()
        return result

    def saveAll(self):
        """Re-download images for every name in url.txt from saved urlPage.txt files."""
        for name in self.openNameList():
            print("當前正在保存的是" + name + "的圖片")
            filepath = name + "/urlPage.txt"
            with open(filepath, "r") as fh:
                pageHtml = fh.read()
            # was: spider.getAllImg/spider.saveImgs — relied on the global
            images = self.getAllImg(pageHtml)
            self.saveImgs(images, name)


if __name__ == "__main__":
    # Crawl listing pages 1 to 10 (inclusive).
    spider = Spider()
    spider.deleteOldTxt()
    spider.savePagesInfo(1, 10)
tool.py
# -*- coding:utf-8 -*-
import re


class Tool:
    """Strips HTML markup from scraped page text and normalises whitespace."""

    # <img ...> tags, runs of 1-7 spaces, and single spaces.
    removeImg = re.compile(r'<img.*?>| {1,7}| ')
    # Anchor open/close tags.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that act as line breaks.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become tabs.
    replaceTD = re.compile(r'<td>')
    # Single or doubled <br> becomes one newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left is dropped.
    removeExtraTag = re.compile(r'<.*?>')
    # Collapse runs of newlines into one.
    removeNoneLine = re.compile(r'\n+')
    # Remove remaining spaces.
    removeSpace = re.compile(r' ')

    def replace(self, x):
        """Return *x* with HTML markup removed and whitespace normalised.

        The substitutions run as a fixed pipeline; order matters (e.g. image
        tags are removed before the generic catch-all tag pattern).
        """
        pipeline = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeNoneLine, "\n"),
            (self.removeSpace, ""),
        )
        for pattern, replacement in pipeline:
            x = pattern.sub(replacement, x)
        # strip() trims any leading/trailing whitespace left over.
        return x.strip()