Python網絡爬蟲（6）爬取靜態數據

時間 2019-12-10

原文：原文鏈接

爬取靜態數據並存儲為 JSON

import requests
import chardet
from bs4 import BeautifulSoup
import json
# Fetch the seputu.com index page, collect every link found under each headed
# 'mulu' section, print the links as they are found, and save the grouped
# result to 1.json.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
headers = {'User-Agent': user_agent}

# Bound the request so a stalled server cannot hang the script, and fail
# loudly on an HTTP error status instead of silently parsing an error page.
r = requests.get('http://seputu.com/', headers=headers, timeout=10)
r.raise_for_status()

# Decode the body with the encoding detected from the raw bytes.
r.encoding = chardet.detect(r.content)['encoding']
soup = BeautifulSoup(r.text, features='html.parser')

# One entry per headed section: {'title': <h2 text>, 'content': [<links>]}.
content = []
for mulu in soup.find_all(class_='mulu'):
    h2 = mulu.find('h2')
    if h2 is None:
        # Sections without a heading are skipped.
        continue
    h2_title = h2.string

    # A headed section may lack the 'box' container; treat that as "no
    # links" rather than crashing on None.find_all(...).
    box = mulu.find(class_='box')
    anchors = box.find_all('a') if box is not None else []

    links = []
    for a in anchors:
        href = a.get('href')
        box_title = a.get('title')
        print(href, '|||', box_title)
        links.append({'href': href, 'box_title': box_title})
    content.append({'title': h2_title, 'content': links})

# json.dump escapes non-ASCII characters by default, so the output is plain
# ASCII; the explicit encoding just removes the dependence on the platform
# default.
with open('1.json', 'w', encoding='utf-8') as fp:
    json.dump(content, fp=fp, indent=4)