python：狂抓「某逼乎」精彩話題，撞破「某榴」新手帳號

時間 2019-12-13

原文原文鏈接

上一篇寫了個有意思的文章：python 腳本撞庫國內「某榴」帳號 https://www.52pojie.cn/thread...
不少朋友反映，該榴帳號有google驗證，即時撞破帳號也無卵用，其實新手號仍是可使用的，至於撞庫破解「某榴」帳號的問題請移到上篇帖子查看。php

華麗分割線

此次再來研究下如何搞定「某逼乎」的話題問題。
逼乎如今在整個社區類網站中能夠說火的不要不要的，逼乎上的內容質量在全部社區中仍是相對較高的，不少時候咱們都須要爬取逼乎精彩的話題，固然這不是爲了裝，搞很差你的設計剛好
就須要這麼一個需求。html

程序猿之間上代碼，一塊兒研究下：node

"""
@author:haoning
[url=home.php?mod=space&uid=365491]@create[/url] time:2015.8.5
"""
from future import division # 精確除法
from Queue import Queue
from builtin import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSouppython

reload(sys)
sys.setdefaultencoding( "utf-8" )數據庫

headers = {
'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With':'XMLHttpRequest',
'Referer':'https://www.zhihu.com/topics',
'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}json

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'app

queue= Queue() #接收隊列
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base=""fetch

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()網站

def get_html(url):ui

try:
    req = urllib2.Request(url)
    response = urllib2.urlopen(req,None,3) #在這裏應該加入代{過}{濾}理
    html = response.read()
    return html
except:
    pass
return None

def getTopics():

url = 'https://www.zhihu.com/topics'
print url
try:
    req = urllib2.Request(url)
    response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞&#65533;
    html = response.read().decode('utf-8')
    print html
    soup = BeautifulSoup(html)
    lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})
     
    for li in lis:
        data_id=li.get('data-id')
        name=li.text
        curr.execute('select id from classify_new where name=%s',(name))
        y= curr.fetchone()
        if not y:
            curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
    conn.commit()
except Exception as e:
    print "get topic error",e

def get_extension(name):

where=name.rfind('.')
if where!=-1:
    return name[where:len(name)]
return None

def which_platform():

sys_str = platform.system()
return sys_str

def GetDateString():

when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
foldername = str(when)
return foldername

def makeDateFolder(par,classify):

try:
    if os.path.isdir(par):
        newFolderName=par + '//' + GetDateString() + '//'  +str(classify)
        if which_platform()=="Linux":
            newFolderName=par + '/' + GetDateString() + "/" +str(classify)
        if not os.path.isdir( newFolderName ):
            os.makedirs( newFolderName )
        return newFolderName
    else:
        return None
except Exception,e:
    print "kk",e
return None

def download_img(url,classify):

try:
    extention=get_extension(url)
    if(extention is None):
        return None
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req,None,3)
    dataimg=resp.read()
    name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
    top="E://topic_pic"
    folder=makeDateFolder(top, classify)
    filename=None
    if folder is not None:
        filename  =folder+"//"+name
    try:
        if "e82bab09c_m" in str(url):
            return True
        if not os.path.exists(filename):
            file_object = open(filename,'w+b')
            file_object.write(dataimg)
            file_object.close()
            return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
        else:
            print "file exist"
            return None
    except IOError,e1:
        print "e1=",e1
        pass
except Exception as e:
    print "eee",e
    pass
return None #若是沒有下載下來就利用原來網站的連接

def getChildren(node,name):

global queue,nodeSet
try:
    url="https://www.zhihu.com/topic/"+str(node)+"/hot"
    html=get_html(url)
    if html is None:
        return
    soup = BeautifulSoup(html)
    p_ch='父話題'
    node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
    topic_cla=soup.find('div', {'class' : 'child-topic'})
    if topic_cla is not None:
        try:
            p_ch=str(topic_cla.text)
            aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #獲取全部子節點
            if u'子話題' in p_ch:
                for a in aList:
                    token=a.get('data-token')
                    a=str(a).replace('\n','').replace('\t','').replace('\r','')
                    start=str(a).find('>')
                    end=str(a).rfind('</a>')
                    new_node=str(str(a)[start+1:end])
                    curr.execute('select id from rooms where name=%s',(new_node)) #先保證名字毫不相同
                    y= curr.fetchone()
                    if not y:
                        print "y=",y,"new_node=",new_node,"token=",token
                        queue.put((token,new_node,node_name))
        except Exception as e:
            print "add queue error",e
except Exception as e:
    print "get html error",e

def getContent(n,name,p,top_id):

try:
    global counter
    curr.execute('select id from rooms where name=%s',(name)) #先保證名字毫不相同
    y= curr.fetchone()
    print "exist?? ",y,"n=",n
    if not y:
        url="https://www.zhihu.com/topic/"+str(n)+"/hot"
        html=get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
        pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
        description=soup.find('div',{'class':'zm-editable-content'})
        if description is not None:
            description=description.text
             
        if (u"未歸類" in title or u"根話題" in title): #容許入庫，避免死循環
            description=None
             
        tag_path=download_img(pic_path,top_id)
        print "tag_path=",tag_path
        if (tag_path is not None) or tag_path==True:
            if tag_path==True:
                tag_path=None
            father_id=2 #默認爲雜談
            curr.execute('select id from rooms where name=%s',(p))
            results = curr.fetchall()
            for r in results:
                father_id=r[0]
            name=title
            curr.execute('select id from rooms where name=%s',(name)) #先保證名字毫不相同
            y= curr.fetchone()
            print "store see..",y
            if not y:
                friends_num=0
                temp = time.time()
                x = time.localtime(float(temp))
                create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now
                create_time
                creater_id=None
                room_avatar=tag_path
                is_pass=1
                has_index=0
                reason_id=None 
                #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                ######################有資格入庫的內容
                counter=counter+1
                curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
                conn.commit() #必須時時進入數據庫，否則找不到父節點
                if counter % 200==0:
                    print "current node",name,"num",counter
except Exception as e:
    print "get content error",e

def work():

global queue
curr.execute('select id,node,parent,name from classify where status=1')
results = curr.fetchall()
for r in results:
    top_id=r[0]
    node=r[1]
    parent=r[2]
    name=r[3]
    try:
        queue.put((node,name,parent)) #首先放入隊列
        while queue.qsize() >0:
            n,p=queue.get() #頂節點出隊
            getContent(n,p,top_id)
            getChildren(n,name) #出隊內容的子節點
        conn.commit()
    except Exception as e:
        print "what's wrong",e

def new_work():

global queue
curr.execute('select id,data_id,name from classify_new_copy where status=1')
results = curr.fetchall()
for r in results:
    top_id=r[0]
    data_id=r[1]
    name=r[2]
    try:
        get_topis(data_id,name,top_id)
    except:
        pass

def get_topis(data_id,name,top_id):

global queue
url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
isGet = True;
offset = -20;
data_id=str(data_id)
while isGet:
    offset = offset + 20
    values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
    try:
        msg=None
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url,data,headers)
            response = urllib2.urlopen(request,None,5)
            html=response.read().decode('utf-8')
            json_str = json.loads(html)
            ms=json_str['msg']
            if len(ms) <5:
                break
            msg=ms[0]
        except Exception as e:
            print "eeeee",e
        #print msg
        if msg is not None:
            soup = BeautifulSoup(str(msg))
            blks = soup.find_all('div', {'class' : 'blk'})
            for blk in blks:
                page=blk.find('a').get('href')
                if page is not None:
                    node=page.replace("/topic/","") #將更多的種子入庫
                    parent=name
                    ne=blk.find('strong').text
                    try:
                        queue.put((node,ne,parent)) #首先放入隊列
                        while queue.qsize() >0:
                            n,name,p=queue.get() #頂節點出隊
                            size=queue.qsize()
                            if size > 0:
                                print size
                            getContent(n,name,p,top_id)
                            getChildren(n,name) #出隊內容的子節點
                        conn.commit()
                    except Exception as e:
                        print "what's wrong",e  
    except urllib2.URLError, e:
        print "error is",e
        pass

if name == '__main__':

i=0
while i<400:
    new_work()
    i=i+1

固然代碼是十分簡單的，稍微有python基礎均可以搞定，註釋清楚明白，你們安靜討論研究下，獻醜了。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。