After crawling Twitter earlier, the company asked me to crawl Facebook as well, and my first reaction was a silent stream of ×××. After all, these social networking sites have pretty solid anti-crawling mechanisms. But since the task came down from above, I had to bite the bullet. Packet capture showed that the data POSTed when logging in through m.facebook.com is much simpler than on facebook.com, so I wrote a set of Scrapy spiders against the mobile site to crawl Facebook.
from scrapy import Spider
from scrapy.http import Request, FormRequest


class FacebookLogin(Spider):
    download_delay = 0.5

    usr = "××××"  # your username/email/phone number
    pwd = "××××"  # account password

    def start_requests(self):
        return [Request("https://m.facebook.com/", callback=self.parse)]

    def parse(self, response):
        return FormRequest.from_response(response, formdata={
            'email': self.usr,
            'pass': self.pwd
        }, callback=self.remember_browser)

    def remember_browser(self, response):
        # if re.search(r'(checkpoint)', response.url):
        # Use 'save_device' instead of 'dont_save' to save device
        return FormRequest.from_response(response,
                                         formdata={'name_action_selected': 'dont_save'},
                                         callback=self.after_login)

    def after_login(self, response):
        pass
Note: to be on the safe side, you can add a mobile USER_AGENT to the project's settings file.
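A minimal sketch of what that might look like in settings.py (the user-agent string below is only an example, not necessarily the one used in the original project):

# settings.py -- sketch only; the mobile UA string is just an example
USER_AGENT = ('Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) '
              'AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E277 Safari/602.1')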
# -*- coding: UTF-8 -*-
import re
from urlparse import urljoin

from scrapy import Item, Field
from scrapy.http import Request
from scrapy.selector import Selector

from facebook_login import FacebookLogin


class FacebookItems(Item):
    id = Field()
    url = Field()
    name = Field()
    work = Field()
    education = Field()
    family = Field()
    skills = Field()
    address = Field()
    contact_info = Field()
    basic_info = Field()
    bio = Field()
    quote = Field()
    nicknames = Field()
    relationship = Field()
    image_urls = Field()


class FacebookProfile(FacebookLogin):
    download_delay = 2
    name = "fb"
    links = None
    start_ids = [
        "plok74122", "bear.black.12", "tabaco.wang", "chaolin.chang.q", "ahsien.liu",
        "kaiwen.cheng.100", "liang.kevin.92", "bingheng.tsai.9", "psppupu",
        'cscgbakery', "hc.shiao.l", "asusisbad", "benjamin", "franklin",
        # 'RobertScoble'
    ]
    # "https://m.facebook.com/tabaco.wang?v=info", 'https://m.facebook.com/RobertScoble?v=info'

    def after_login(self, response):
        for id in self.start_ids:
            url = "https://m.facebook.com/%s?v=info" % id
            yield Request(url, callback=self.parse_profile, meta={"id": id})

    def parse_profile(self, response):
        item = FacebookItems()
        item['id'] = response.meta['id']
        item['url'] = response.url
        item["name"] = "".join(response.css('#root strong *::text').extract())
        item["work"] = self.parse_info_has_image(response, response.css('#work'))
        item["education"] = self.parse_info_has_image(response, response.css('#education'))
        item["family"] = self.parse_info_has_image(response, response.css('#family'))
        item["address"] = self.parse_info_has_table(response.css('#living'))
        item["contact_info"] = self.parse_info_has_table(response.css('#contact-info'))
        item["basic_info"] = self.parse_info_has_table(response.css('#basic-info'))
        item["nicknames"] = self.parse_info_has_table(response.css('#nicknames'))
        item["skills"] = self.parse_info_text_only(response.css('#skills'))
        item["bio"] = self.parse_info_text_only(response.css('#bio'))
        item["quote"] = self.parse_info_text_only(response.css('#quote'))
        item["relationship"] = self.parse_info_text_only(response.css('#relationship'))
        yield item

    def parse_info_has_image(self, response, css_path):
        info_list = []
        for div in css_path.xpath('div/div[2]/div'):
            url = urljoin(response.url, "".join(div.css('div > a::attr(href)').extract()))
            title = "".join(div.css('div').xpath('span | h3').xpath('a/text()').extract())
            info = "\n".join(div.css('div').xpath('span | h3').xpath('text()').extract())
            if url and title and info:
                info_list.append({"url": url, "title": title, "info": info})
        return info_list

    def parse_info_has_table(self, css_path):
        info_dict = {}
        for div in css_path.xpath('div/div[2]/div'):
            key = "".join(div.css('td:first-child div').xpath('span | span/span[1]').xpath('text()').extract())
            value = "".join(div.css('td:last-child').xpath('div//text()').extract()).strip()
            if key and value:
                if key in info_dict:
                    info_dict[key] += ", %s" % value
                else:
                    info_dict[key] = value
        return info_dict

    def parse_info_text_only(self, css_path):
        text = css_path.xpath('div/div[2]//text()').extract()
        text = [t.strip() for t in text]
        text = [t for t in text if re.search(r'\w+', t) and t != "Edit"]
        return "\n".join(text)
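Assuming these files live inside a normal Scrapy project, the profile spider can then be run and its items exported straight from the command line (the output filename here is just an example):

scrapy crawl fb -o profiles.json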
Although the pictures are shown on https://m.facebook.com/%s?v=info, the real image links only become available after several more requests. To keep each spider doing as little as possible, I wrote the image crawling as a separate spider, as follows:
# -*- coding: UTF-8 -*-
import re
import hashlib
import sys

from scrapy import Item, Field
from scrapy.http import Request
from scrapy.selector import Selector

from facebook_login import FacebookLogin

reload(sys)
sys.setdefaultencoding('utf-8')


class FacebookPhotoItems(Item):
    url = Field()
    id = Field()
    photo_links = Field()
    md5 = Field()


class CrawlPhoto(FacebookLogin):
    name = 'fbphoto'
    timeline_photo = None
    id = None
    links = []
    start_ids = [
        "plok74122", "bear.black.12", "tabaco.wang", "chaolin.chang.q",
        # "ashien.liu",
        "liang.kevin.92", "qia.chen", "bingheng.tsai.9", "psppupu",
        'cscgbakery', "hc.shiao.l", "asusisbad", "benjamin", "franklin",
        # 'RobertScoble'
    ]

    def after_login(self, response):
        for url in self.start_ids:
            yield Request('https://m.facebook.com/%s/photos' % url,
                          callback=self.parse_item, meta={"id": url})
            # yield Request('https://m.facebook.com/%s/photos' % self.id, callback=self.parse_item)

    def parse_item(self, response):
        # print response.body
        urls = response.xpath('//span').extract()
        next_page = None
        try:
            next_page = response.xpath('//div[@class=\'co\']/a/@href').extract()[0].strip()
        except:
            pass
        # urls = response.xpath('//div[@data-sigil=\'marea\']').extract()
        for i in urls:
            # if i.find(u'時間線照片') != -1:  # "timeline photos"
            try:
                self.timeline_photo = Selector(text=i).xpath('//span/a/@href').extract()[0]
                if self.timeline_photo is not None:
                    yield Request('https://m.facebook.com/%s' % self.timeline_photo,
                                  callback=self.parse_photos, meta=response.meta)
            except:
                continue
        if next_page:
            print '-----------------------next image page -----------------------------------------'
            yield Request('https://m.facebook.com/%s' % next_page,
                          callback=self.parse_item, meta=response.meta)

    def parse_photos(self, response):
        urls = response.xpath("//a[@class=\'bw bx\']/@href").extract()
        # urls = response.xpath("//a[@class=\'_39pi _4i6j\']/@href").extract()
        for i in urls:
            yield Request('https://m.facebook.com/%s' % i,
                          callback=self.process_photo_url, meta=response.meta)
        # a full page of 12 thumbnails suggests there is another page to follow
        if len(urls) == 12:
            next_page = response.xpath('//div[@id=\'m_more_item\']/a/@href').extract()[0]
            yield Request('https://m.facebook.com/%s' % next_page,
                          callback=self.parse_photos, meta=response.meta)

    def process_photo_url(self, response):
        # photo_url = response.xpath('//i[@class=\'img img\']').extract()
        item = FacebookPhotoItems()
        item['url'] = response.url
        item['id'] = response.meta['id']
        photo_url = response.xpath('//div[@style=\'text-align:center;\']/img/@src').extract()[0]
        item['photo_links'] = photo_url
        item['md5'] = self.getstr_md5(item['photo_links']) + ".jpg"
        yield item

    def wirtefile(self, str):
        with open('temp2.html', 'w') as file:
            file.write(str)
            file.write('\n')

    def getstr_md5(self, input):
        if input is None:
            input = ''
        md = hashlib.md5()
        md.update(input)
        return md.hexdigest()
Since my Python is rather self-taught, I still haven't found a good way to merge the image-link crawling into the spider that grabs the basic profile info; if anyone knows a better approach, please point me in the right direction.
For downloading the images I didn't use Scrapy's ImagesPipeline but the wget command instead, for the reason mentioned above: my Python is just not good enough yet...
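For reference, Scrapy's built-in ImagesPipeline could take over the downloading. A minimal sketch of the settings, assuming the photo item were changed to expose its link through an image_urls list field (the item above currently stores it in photo_links instead):

# settings.py -- sketch only; this module path is for Scrapy 1.x, older releases use scrapy.contrib.pipeline.images
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'image'  # directory where the downloaded files end up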
Below is the image-downloading pipeline I wrote myself:
import os


class MyOwenImageDownload(object):
    def process_item(self, item, spider):
        # profile items carry many fields; only the photo items (4 fields) have anything to download
        if len(item) > 6:
            pass
        else:
            file = "image/" + item['id']
            if os.path.exists(file):
                pass
            else:
                os.makedirs(file)
            cmd = 'wget \'%s\' -O %s -P %s --timeout=10 -q' % (
                item['photo_links'], file + "/" + item['md5'], file)
            os.system(cmd)
        return item
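To actually run, the pipeline still has to be registered in the project settings; the module path below is only a placeholder for wherever the class really lives:

# settings.py -- 'facebook.pipelines' is a placeholder module path
ITEM_PIPELINES = {
    'facebook.pipelines.MyOwenImageDownload': 300,
}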
With that, the basic structure of the whole crawler is done... The source code is on GitHub.
In the end, we will remember not the words of our enemies but the silence of our friends.