Since I'm building Guandian (觀點), whose "rooms" are similar to Zhihu topics, I needed a way to crawl them. After quite a bit of fiddling I finally got it working. The code is written in Python; if you don't know Python, please go learn it on your own! If you do, just read the code below. It definitely works.
#coding:utf-8
"""
@author: haoning
@create time: 2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
import json
import os
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zhihu.com/topics',
    # replace this with cookies from your own logged-in session
    'Cookie': '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue = Queue()  # work queue of (node, name, parent) seeds
nodeSet = set()  # leftover state from earlier iterations, unused below
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
counter = 0
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()


def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        return response.read()
    except:
        pass
    return None


def getTopics():
    # call this once to seed classify_new from the topic plaza page
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html, 'html.parser')
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})
        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
                conn.commit()
    except Exception as e:
        print "get topic error", e


def get_extension(name):
    where = name.rfind('.')
    if where != -1:
        return name[where:]
    return None


def which_platform():
    return platform.system()


def GetDateString():
    return time.strftime('%Y-%m-%d', time.localtime(time.time()))


def makeDateFolder(par, classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + '/' + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        return None
    except Exception as e:
        print "kk", e
        return None


def download_img(url, classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):  # Zhihu's default avatar, no need to save it
                return True
            if not os.path.exists(filename):
                file_object = open(filename, 'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/' + GetDateString() + '/' + str(classify) + "/" + name
            else:
                print "file exist"
            return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the image wasn't downloaded, fall back to the original site's link


def getChildren(node, name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser')
        p_ch = '父话题'  # marker text as rendered on the page (Simplified Chinese)
        node_name = soup.find('div', {'id': 'zh-topic-title'}).find('h1').text
        topic_cla = soup.find('div', {'class': 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all('a', {'class': 'zm-item-tag'})  # all child topics
                if u'子话题' in p_ch:
                    for a in aList:
                        token = a.get('data-token')
                        a = str(a).replace('\n', '').replace('\t', '').replace('\r', '')
                        start = str(a).find('>')
                        end = str(a).rfind('</a>')
                        new_node = str(str(a)[start + 1:end])
                        curr.execute('select id from rooms where name=%s', (new_node,))  # names must stay unique
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e


def getContent(n, name, p, top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s', (name,))  # names must stay unique
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.find('div', {'id': 'zh-topic-title'}).find('h1').text
            pic_path = soup.find('a', {'id': 'zh-avartar-edit-form'}).find('img').get('src')
            description = soup.find('div', {'class': 'zm-editable-content'})
            if description is not None:
                description = description.text
            if u"未归类" in title or u"根话题" in title:  # still allow these rows in, to avoid an endless loop
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if tag_path is not None:
                if tag_path == True:  # default avatar: store NULL and keep the original link
                    tag_path = None
                father_id = 2  # falls back to the "misc" room
                curr.execute('select id from rooms where name=%s', (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute('select id from rooms where name=%s', (name,))  # names must stay unique
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))  # time now
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # the row is eligible for the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                 (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit immediately, otherwise the parent node can't be found
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e


def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                n, node_name, p = queue.get()  # dequeue the top node
                getContent(n, node_name, p, top_id)
                getChildren(n, node_name)  # enqueue the dequeued node's children
                conn.commit()
        except Exception as e:
            print "what's wrong", e


def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except:
            pass


def get_topis(data_id, name, top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next',
                  'params': '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode('utf-8')
                json_str = json.loads(html)
                ms = json_str['msg']
                if len(ms) < 5:
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            if msg is not None:
                soup = BeautifulSoup(str(msg), 'html.parser')
                blks = soup.find_all('div', {'class': 'blk'})
                for blk in blks:
                    page = blk.find('a').get('href')
                    if page is not None:
                        node = page.replace("/topic/", "")  # put more seeds into the database
                        parent = name
                        ne = blk.find('strong').text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                # don't clobber `name`: it is the parent for later seeds
                                n, node_name, p = queue.get()  # dequeue the top node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, node_name, p, top_id)
                                getChildren(n, node_name)  # enqueue the dequeued node's children
                                conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e


if __name__ == '__main__':
    i = 0
    while i < 400:
        new_work()
        i = i + 1
A quick note on the database: I'm not attaching a dump here. The tables are simple enough to build yourself from the fields used in the code. I used MySQL; adapt the schema to your own needs.
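For reference, here is a sketch of the four tables the crawler touches, reconstructed from its SELECT/INSERT statements. Only the column names come from the code above; every type, length, and default is my assumption, so adjust them to taste.

#coding:utf-8
# Create the tables the crawler expects (types are assumptions).
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()

curr.execute("""CREATE TABLE IF NOT EXISTS classify_new (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255))""")

# working copy read by new_work(); status=1 marks rows still to be crawled
curr.execute("""CREATE TABLE IF NOT EXISTS classify_new_copy (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255),
    status TINYINT DEFAULT 1)""")

# older seed table read by work()
curr.execute("""CREATE TABLE IF NOT EXISTS classify (
    id INT AUTO_INCREMENT PRIMARY KEY,
    node VARCHAR(32),
    parent VARCHAR(255),
    name VARCHAR(255),
    status TINYINT DEFAULT 1)""")

curr.execute("""CREATE TABLE IF NOT EXISTS rooms (
    id INT AUTO_INCREMENT PRIMARY KEY,
    father_id INT,
    name VARCHAR(255),
    friends_num INT,
    description TEXT,
    create_time DATETIME,
    creater_id INT,
    room_avatar VARCHAR(255),
    is_pass TINYINT,
    has_index TINYINT,
    reason_id INT)""")

conn.commit()
conn.close()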
If anything is still unclear, come find me on 去轉盤網 (a site I also built); the QQ group number is kept up to date there. I'm not leaving a QQ number here, to avoid getting banned by the system.