Having some time to kill, I wrote a scraper for NetEase (163.com) news. The real work is analyzing the page: use a packet-capture tool to inspect every request the page makes. The data is stored in SQLite. This script only parses the text of each news page; it does not handle images.
For reference only; corrections are welcome.
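What the packet capture turns up: the headline list is not rendered in the page HTML but fetched from a JSONP endpoint (cm_yaowen*.js), whose response wraps a JS array in a data_callback(...) call. The field names below are the ones the script actually reads; the values are made up for illustration:

data_callback([
    {
        "title": "some headline",
        "tlink": "http://news.163.com/...",
        "commenturl": "http://comment.news.163.com/...",
        "tienum": 128,
        "time": "01/01/17 08:00:00"
    },
    ...
])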
# coding:utf-8
import re
import sqlite3
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: avoid UnicodeDecodeError when mixing str/unicode
import requests

session = requests.session()


def md5(s):
    # Hex md5 digest of s; used below as a dedup key for article URLs.
    import hashlib
    m = hashlib.md5()
    m.update(s)
    return m.hexdigest()


def wangyi():
    # The headline list is paginated: cm_yaowen.js, cm_yaowen_02.js, ...
    for page in range(1, 3):
        if page == 1:
            k = ""
        else:
            k = "_0" + str(page)
        url = "http://temp.163.com/special/00804KVA/cm_yaowen" + k + ".js?callback=data_callback"
        print url
        headers = {
            "Host": "temp.163.com",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER",
            "Referer": "http://news.163.com/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        result = session.get(url=url, headers=headers).text
        # The response is JSONP: data_callback([{...}, {...}]). Strip the wrapper
        # and eval the JS array as a Python literal (this works because the
        # payload uses double-quoted strings).
        try:
            body = result.strip()
            body = body[len("data_callback("):body.rfind(")")]
            items = eval(body)
        except Exception as e:
            print e
            continue
        for item in items:
            tlink = item['tlink']
            headers2 = {
                "Host": "news.163.com",
                "Connection": "keep-alive",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "zh-CN,zh;q=0.8",
            }
            print "fetching article:", tlink
            return_data = session.get(url=tlink, headers=headers2).text
            # The article body lives in <div id="endText">; collect the text of
            # its plain <p> tags and its centered <p class="f_center"> tags
            # (image captions); images themselves are not handled.
            try:
                soup = BeautifulSoup(return_data, 'html.parser')
                returnSoup = soup.find_all("div", attrs={"id": "endText"})[0]
                content1 = '<-->'.join(re.findall('<p>(.*?)</p>', str(returnSoup)))
                content2 = '<-->'.join(re.findall('<p class="f_center">(.*?)</p>', str(returnSoup)))
                content = content1 + content2
            except Exception:
                content = ""
            cx = sqlite3.connect("C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3",
                                 check_same_thread=False)
            cx.text_factory = str
            try:
                print "inserting data for %s" % tlink
                title = item['title'].decode('unicode_escape')  # un-escape \uXXXX sequences
                commenturl = item['commenturl']
                tienum = item['tienum']
                opentime = item['time']
                url2 = md5(str(tlink))  # md5 of the article URL doubles as a dedup key
                cx.execute(
                    "INSERT INTO wangyi (title,tlink,commenturl,tienum,opentime,content,url) "
                    "VALUES (?,?,?,?,?,?,?)",
                    (str(title), str(tlink), str(commenturl), str(tienum),
                     str(opentime), str(content), str(url2)))
                cx.commit()
            except Exception as e:
                print e
                print "insert failed"
            cx.close()


wangyi()
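Note that the script INSERTs into a wangyi table but never creates it, so the table has to exist in db.sqlite3 beforehand. A minimal one-off setup sketch: the column names come from the INSERT statement above, while the all-TEXT column types are my assumption (the code stringifies every field before inserting).

# coding:utf-8
# One-off setup: create the wangyi table the scraper inserts into.
# Column names match the INSERT above; TEXT types are assumed.
import sqlite3

cx = sqlite3.connect("C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3")
cx.execute("""
    CREATE TABLE IF NOT EXISTS wangyi (
        title      TEXT,  -- article title
        tlink      TEXT,  -- article URL
        commenturl TEXT,  -- comment-page URL
        tienum     TEXT,  -- comment count
        opentime   TEXT,  -- publish time
        content    TEXT,  -- paragraph text joined with '<-->'
        url        TEXT   -- md5(tlink), handy as a dedup key
    )
""")
cx.commit()
cx.close()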