After crawling Twitter earlier, the company asked me to crawl Facebook as well, and my first reaction was a silent stream of ×××. After all, these social networking sites have pretty solid anti-crawling mechanisms. But since the task came down from above, I had to bite the bullet. Packet capture showed that the data POSTed when logging in through m.facebook.com is much simpler than on facebook.com, so I wrote a set of Scrapy spiders against the mobile site to crawl Facebook.
from scrapy import Spider
from scrapy.http import Request, FormRequest


class FacebookLogin(Spider):
    download_delay = 0.5

    usr = "××××"  # your username/email/phone number
    pwd = "××××"  # account password

    def start_requests(self):
        return [Request("https://m.facebook.com/", callback=self.parse)]

    def parse(self, response):
        return FormRequest.from_response(response, formdata={
            'email': self.usr,
            'pass': self.pwd
        }, callback=self.remember_browser)

    def remember_browser(self, response):
        # if re.search(r'(checkpoint)', response.url):
        # Use 'save_device' instead of 'dont_save' to save device
        return FormRequest.from_response(response,
                                         formdata={'name_action_selected': 'dont_save'},
                                         callback=self.after_login)

    def after_login(self, response):
        pass
Note: to be on the safe side, you can add a mobile USER_AGENT to the project's settings file.
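A minimal sketch of what that might look like in settings.py (the user-agent string below is only an example, not necessarily the one used in the original project):

# settings.py -- sketch only; the mobile UA string is just an example
USER_AGENT = ('Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) '
              'AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E277 Safari/602.1')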
# -*- coding: UTF-8 -*-
import re
from urlparse import urljoin

from scrapy import Item, Field
from scrapy.http import Request
from scrapy.selector import Selector

from facebook_login import FacebookLogin


class FacebookItems(Item):
    id = Field()
    url = Field()
    name = Field()
    work = Field()
    education = Field()
    family = Field()
    skills = Field()
    address = Field()
    contact_info = Field()
    basic_info = Field()
    bio = Field()
    quote = Field()
    nicknames = Field()
    relationship = Field()
    image_urls = Field()


class FacebookProfile(FacebookLogin):
    download_delay = 2
    name = "fb"
    links = None
    start_ids = [
        "plok74122", "bear.black.12", "tabaco.wang", "chaolin.chang.q", "ahsien.liu",
        "kaiwen.cheng.100", "liang.kevin.92", "bingheng.tsai.9", "psppupu",
        'cscgbakery', "hc.shiao.l", "asusisbad", "benjamin", "franklin",
        # 'RobertScoble'
    ]
    # "https://m.facebook.com/tabaco.wang?v=info", 'https://m.facebook.com/RobertScoble?v=info'

    def after_login(self, response):
        for id in self.start_ids:
            url = "https://m.facebook.com/%s?v=info" % id
            yield Request(url, callback=self.parse_profile, meta={"id": id})

    def parse_profile(self, response):
        item = FacebookItems()
        item['id'] = response.meta['id']
        item['url'] = response.url
        item["name"] = "".join(response.css('#root strong *::text').extract())
        item["work"] = self.parse_info_has_image(response, response.css('#work'))
        item["education"] = self.parse_info_has_image(response, response.css('#education'))
        item["family"] = self.parse_info_has_image(response, response.css('#family'))
        item["address"] = self.parse_info_has_table(response.css('#living'))
        item["contact_info"] = self.parse_info_has_table(response.css('#contact-info'))
        item["basic_info"] = self.parse_info_has_table(response.css('#basic-info'))
        item["nicknames"] = self.parse_info_has_table(response.css('#nicknames'))
        item["skills"] = self.parse_info_text_only(response.css('#skills'))
        item["bio"] = self.parse_info_text_only(response.css('#bio'))
        item["quote"] = self.parse_info_text_only(response.css('#quote'))
        item["relationship"] = self.parse_info_text_only(response.css('#relationship'))
        yield item

    def parse_info_has_image(self, response, css_path):
        info_list = []
        for div in css_path.xpath('div/div[2]/div'):
            url = urljoin(response.url, "".join(div.css('div > a::attr(href)').extract()))
            title = "".join(div.css('div').xpath('span | h3').xpath('a/text()').extract())
            info = "\n".join(div.css('div').xpath('span | h3').xpath('text()').extract())
            if url and title and info:
                info_list.append({"url": url, "title": title, "info": info})
        return info_list

    def parse_info_has_table(self, css_path):
        info_dict = {}
        for div in css_path.xpath('div/div[2]/div'):
            key = "".join(div.css('td:first-child div').xpath('span | span/span[1]').xpath('text()').extract())
            value = "".join(div.css('td:last-child').xpath('div//text()').extract()).strip()
            if key and value:
                if key in info_dict:
                    info_dict[key] += ", %s" % value
                else:
                    info_dict[key] = value
        return info_dict

    def parse_info_text_only(self, css_path):
        text = css_path.xpath('div/div[2]//text()').extract()
        text = [t.strip() for t in text]
        text = [t for t in text if re.search(r'\w+', t) and t != "Edit"]
        return "\n".join(text)
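Assuming these files live inside a normal Scrapy project, the profile spider can then be run and its items exported straight from the command line (the output filename here is just an example):

scrapy crawl fb -o profiles.json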
Although the pictures are shown on https://m.facebook.com/%s?v=info, the real image links only become available after several more requests. To keep each spider doing as little as possible, I wrote the image crawling as a separate spider, as follows:
# -*- coding: UTF-8 -*-
import re
import hashlib
import sys

from scrapy import Item, Field
from scrapy.http import Request
from scrapy.selector import Selector

from facebook_login import FacebookLogin

reload(sys)
sys.setdefaultencoding('utf-8')


class FacebookPhotoItems(Item):
    url = Field()
    id = Field()
    photo_links = Field()
    md5 = Field()


class CrawlPhoto(FacebookLogin):
    name = 'fbphoto'
    timeline_photo = None
    id = None
    links = []
    start_ids = [
        "plok74122", "bear.black.12", "tabaco.wang", "chaolin.chang.q",
        # "ashien.liu",
        "liang.kevin.92", "qia.chen", "bingheng.tsai.9", "psppupu",
        'cscgbakery', "hc.shiao.l", "asusisbad", "benjamin", "franklin",
        # 'RobertScoble'
    ]

    def after_login(self, response):
        for url in self.start_ids:
            yield Request('https://m.facebook.com/%s/photos' % url,
                          callback=self.parse_item, meta={"id": url})
            # yield Request('https://m.facebook.com/%s/photos' % self.id, callback=self.parse_item)

    def parse_item(self, response):
        # print response.body
        urls = response.xpath('//span').extract()
        next_page = None
        try:
            next_page = response.xpath('//div[@class=\'co\']/a/@href').extract()[0].strip()
        except:
            pass
        # urls = response.xpath('//div[@data-sigil=\'marea\']').extract()
        for i in urls:
            # if i.find(u'時間線照片') != -1:  # "timeline photos"
            try:
                self.timeline_photo = Selector(text=i).xpath('//span/a/@href').extract()[0]
                if self.timeline_photo is not None:
                    yield Request('https://m.facebook.com/%s' % self.timeline_photo,
                                  callback=self.parse_photos, meta=response.meta)
            except:
                continue
        if next_page:
            print '-----------------------next image page -----------------------------------------'
            yield Request('https://m.facebook.com/%s' % next_page,
                          callback=self.parse_item, meta=response.meta)

    def parse_photos(self, response):
        urls = response.xpath("//a[@class=\'bw bx\']/@href").extract()
        # urls = response.xpath("//a[@class=\'_39pi _4i6j\']/@href").extract()
        for i in urls:
            yield Request('https://m.facebook.com/%s' % i,
                          callback=self.process_photo_url, meta=response.meta)
        # a full page of 12 thumbnails suggests there is another page to follow
        if len(urls) == 12:
            next_page = response.xpath('//div[@id=\'m_more_item\']/a/@href').extract()[0]
            yield Request('https://m.facebook.com/%s' % next_page,
                          callback=self.parse_photos, meta=response.meta)

    def process_photo_url(self, response):
        # photo_url = response.xpath('//i[@class=\'img img\']').extract()
        item = FacebookPhotoItems()
        item['url'] = response.url
        item['id'] = response.meta['id']
        photo_url = response.xpath('//div[@style=\'text-align:center;\']/img/@src').extract()[0]
        item['photo_links'] = photo_url
        item['md5'] = self.getstr_md5(item['photo_links']) + ".jpg"
        yield item

    def wirtefile(self, str):
        with open('temp2.html', 'w') as file:
            file.write(str)
            file.write('\n')

    def getstr_md5(self, input):
        if input is None:
            input = ''
        md = hashlib.md5()
        md.update(input)
        return md.hexdigest()
Since my Python is rather self-taught, I still haven't found a good way to merge the image-link crawling into the spider that grabs the basic profile info; if anyone knows a better approach, please point me in the right direction.
For downloading the images I didn't use Scrapy's ImagesPipeline but the wget command instead, for the reason mentioned above: my Python is just not good enough yet...
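For reference, Scrapy's built-in ImagesPipeline could take over the downloading. A minimal sketch of the settings, assuming the photo item were changed to expose its link through an image_urls list field (the item above currently stores it in photo_links instead):

# settings.py -- sketch only; this module path is for Scrapy 1.x, older releases use scrapy.contrib.pipeline.images
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'image'  # directory where the downloaded files end up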
Below is the image-downloading pipeline I wrote myself:
import os


class MyOwenImageDownload(object):
    def process_item(self, item, spider):
        # profile items carry many fields; only the photo items (4 fields) have anything to download
        if len(item) > 6:
            pass
        else:
            file = "image/" + item['id']
            if os.path.exists(file):
                pass
            else:
                os.makedirs(file)
            cmd = 'wget \'%s\' -O %s -P %s --timeout=10 -q' % (
                item['photo_links'], file + "/" + item['md5'], file)
            os.system(cmd)
        return item
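To actually run, the pipeline still has to be registered in the project settings; the module path below is only a placeholder for wherever the class really lives:

# settings.py -- 'facebook.pipelines' is a placeholder module path
ITEM_PIPELINES = {
    'facebook.pipelines.MyOwenImageDownload': 300,
}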
With that, the basic structure of the whole crawler is done... The source code is on GitHub.
In the end, we will remember not the words of our enemies but the silence of our friends.