For example, a public-opinion (sentiment) monitoring system:
Fetch Autohome news into your own database, build your own app, publish the content with the source credited, and run it as your own venture.
Fetch the content of a specified URL: send an HTTP request to http://www.autohome.com.cn/news/, then extract the content (e.g. with regular expressions).
Python implementation:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.text
obj = BeautifulSoup(response.text, ...)
tag = obj.find('a')                  # the first tag that matches
tag.find(...)
[tag, tag, ] = obj.find_all('a')     # all tags that match
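The outline above also mentions extracting content with regular expressions. The rest of these notes use BeautifulSoup, but as a rough, illustrative sketch (the pattern below simply assumes the news titles sit inside <h3> tags; the real page may differ, and regexes are brittle against HTML):

import re
import requests

response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
# grab the text inside every <h3>...</h3>; an HTML parser is usually the safer choice
titles = re.findall(r'<h3[^>]*>(.*?)</h3>', response.text, re.S)
for t in titles:
    print(t.strip())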
Example 1: scraping Autohome news
requests obj = requests.get("url") obj.content obj.encoding = "gbk" obj.text soup = beautifulsoup(obj.text,'html.parser') 標籤對象 = soup.find(name='xx') [標籤對象,標籤對象,] = soup.find_all(...) 標籤對象.text 標籤對象.attrs 標籤對象.get(...)
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')  # sockets send and receive bytes
# print(response.text)     # str; garbled if the encoding is set incorrectly
# print(response.content)  # response.content is the raw bytes
response.encoding = 'gbk'
# print(response.text)     # response.text is the decoded text

# Python ships with a built-in parser, html.parser; it parses the page's <html lang='en'...></html> into an object
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')
# h3 = tag.find(name='h3', class_='c1')  # name is the tag name; class='c1' raises an error because class is a keyword, so write class_='c1' or attrs={'class': 'c1'}
# h3 = tag.find(name='h3', attrs={'class': 'c1'})
h3 = tag.find(name='h3')
print(h3)
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')  # find_all('li') is shorthand for find_all(name='li')
for li in li_list:
    # print(li.find('h3'))  # li.find('h3') is sometimes None
    title = li.find('h3')
    if not title:
        continue
    # print(title, type(title))  # <h3>將於第四季度上市 雲度π1正式下線</h3> <class 'bs4.element.Tag'>
    summary = li.find('p').text
    # url = li.find('a').attrs['href']  # li.find('a').attrs returns all of the tag's attributes as a dict; get() works as well
    url = li.find('a').get('href')
    img = li.find('img').get('src')
    # # download the image
    # res = requests.get(img)
    # file_name = '%s.jpg' % (title,)  # the title is not a valid file name as-is and needs to be cleaned up first
    # with open(file_name, 'wb') as f:
    #     f.write(res.content)
    print(title.text, summary, url, img)  # title: title.text, summary: summary
    print('=============')
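The commented-out download code above notes that the raw title is not a valid file name. One way to handle it (a small sketch of my own, not part of the original notes: hash the image URL into a safe name, and prefix protocol-relative src values with http: first):

import hashlib
import requests

def download_image(img_url):
    if img_url.startswith('//'):
        img_url = 'http:' + img_url    # handle protocol-relative src values
    # an MD5 of the URL gives a safe, unique file name regardless of the title's punctuation
    file_name = hashlib.md5(img_url.encode('utf-8')).hexdigest() + '.jpg'
    res = requests.get(img_url)
    with open(file_name, 'wb') as f:
        f.write(res.content)
    return file_name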
Example 2: logging in to GitHub with Python code
1. Send a GET request to the login page and grab the csrf token
2. Send a POST request carrying the username, password, and csrf token; the response sets a cookie, and once you have it you no longer need to log in on later requests
requests
obj = requests.get("url")
obj.content
obj.encoding = "gbk"
obj.text
obj.cookies.get_dict()
requests.get("url", cookies={'k1': "v1"})

soup = BeautifulSoup(obj.text, 'html.parser')
tag = soup.find(name='xx')
[tag, ] = soup.find_all(...)
tag.text
tag.attrs
tag.get(...)
import requests
from bs4 import BeautifulSoup

# Get the token
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text, 'html.parser')
# The credential carried by GitHub's login page is not called csrf_token; it is authenticity_token
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
print(token)  # 4WLM4c+ilLUmmhsM8TEFiYXMX5evoTQaIxmhTc5FmUYetTseKP6Upx5jJkGOzjm3kCAg9sMv3ShMnz0UGzuGvA==
r1_token_dict = r1.cookies.get_dict()

# Send the username, password and token to the server in a POST request.
# To see what has to be sent, inspect the login request in the browser's Network panel:
"""
utf8:?
authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
login:asdf
password:asdf
commit:Sign in
"""
r2 = requests.post(
    'https://github.com/session',  # the POST url is taken from the request shown in the browser's Network panel
    data={
        'utf8': '?',
        'authenticity_token': token,
        # 'login': 'username',
        'login': '317828332@qq.com',
        'password': 'alex3714',
        # 'password': 'password',
        'commit': 'Sign in'
    },
    cookies=r1_token_dict
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()
print(r1_token_dict)  # some pages set cookies on the GET request, some do not
# ---> {'logged_in': 'no', '_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjM5MjE5MSwiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--a5df8578d625ae99c39b34c4163f684a1d8ad568'}
print(r2_cookie_dict)  # cookies set by the POST request
# ---> {'_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjQwNzQwNywiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--db506f001c00ee91aefb55fad7c6cf9965ce3132'}

# Merge the two cookie dicts
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# On the next request, carry the merged cookies
r3 = requests.get(
    # url='xxxxxx',  # any GitHub page that requires being logged in
    url='https://github.com/settings/emails',
    cookies=cookie_dict
)
print(r3.text)
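Merging cookie dicts by hand works, but requests also offers Session, which stores cookies from every response and sends them on later requests automatically. A minimal sketch of the same GitHub flow with a session (credentials are placeholders):

import requests
from bs4 import BeautifulSoup

session = requests.Session()                     # cookies are kept on the session object
r1 = session.get('https://github.com/login')
token = BeautifulSoup(r1.text, 'html.parser').find(
    name='input', attrs={'name': 'authenticity_token'}).get('value')
session.post(
    'https://github.com/session',
    data={
        'utf8': '?',
        'authenticity_token': token,
        'login': 'your-username',                # placeholder
        'password': 'your-password',             # placeholder
        'commit': 'Sign in'
    }
)
r3 = session.get('https://github.com/settings/emails')  # session cookies are sent automatically
print(r3.text)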
Example 3: upvoting a Chouti news item
# 1. Log in and get the cookie
# 2. Find the target URL: watch which URL the Chouti page sends the upvote request to.
#    Logging in is a POST request to http://dig.chouti.com/login; instead of redirecting the browser, it returns a dict
import requests
from bs4 import BeautifulSoup

# 1. Get the initial cookie
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. Send the username, password and cookie
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone': '8615131255089',
        'password': 'woshiniba',
        'oneMonth': 1  # stay logged in for a month
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
# ---> {"result":{"code":"8887", "message":"手機號格式不對", "data":""}}  # "wrong phone number format": printed when the phone number is invalid
print(r1.cookies.get_dict())
# ---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}

cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)
# cookie_dict = {'gpsd': r0_cookie_dict['gpsd']}  # same effect as the merged dict above, but not recommended

# Upvote: the upvote is a POST request; linksId=13911006 is the article id
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006', cookies=cookie_dict)
print(r2.text)
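The article id (linksId=13911006) is hard-coded above. To upvote several articles, the same POST can simply be repeated in a loop, reusing the merged cookie_dict from the code above (the ids below are placeholders you would first collect from the page):

import requests

article_ids = [13911006, 13911007, 13911008]     # placeholder ids
for link_id in article_ids:
    r = requests.post(
        'http://dig.chouti.com/link/vote?linksId=%s' % link_id,
        cookies=cookie_dict
    )
    print(r.text)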
Methods provided by the requests module
# requests.get()
# requests.post()
# requests.put()
# requests.request('post')

# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)

# All of the methods above are built on top of this one:
# requests.request(method, url, **kwargs)
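Because every helper is built on requests.request, the two calls below do the same thing; this small check illustrates it:

import requests

r1 = requests.get('http://httpbin.org/get', params={'k1': 'v1'})
r2 = requests.request('GET', 'http://httpbin.org/get', params={'k1': 'v1'})
print(r1.url)             # ---> http://httpbin.org/get?k1=v1
print(r1.url == r2.url)   # ---> True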
# url='xxx',
# params={'k1': 'v1', 'nid': 888},   # GET query-string parameters
# cookies={},
# headers={},
# data={},   # send data as form data
# json={}    # send data as a JSON body

# requests.get(
#     url='xxx',
#     params={'k1': 'v1', 'nid': 888},
#     cookies={},
#     headers={}
# )
# # requests the equivalent of http://www.baidu.com?k1=v1&nid=888

requests.post(
    url='xxx',
    params={'k1': 'v1', 'nid': 888},
    cookies={},
    headers={},
    json={}
)

# Note: when posting to the backend, pay attention to the request headers
# requests.post(url='', data={})  # sends Content-Type: application/x-www-form-urlencoded by default
requests.post(url='', data={}, headers={'content-type': 'application/json'})
# with this header Django cannot read the values through request.POST; you have to parse request.body yourself
requests.post(url='', json={})  # sends Content-Type: application/json by default
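On the Django side, this is what the note above means: a JSON body never shows up in request.POST, so the view has to decode request.body itself. A minimal sketch (the view name is made up):

import json
from django.http import JsonResponse

def receive(request):                            # hypothetical view
    if request.method == 'POST':
        # request.POST is only filled for form-encoded bodies;
        # for content-type: application/json, parse the raw bytes yourself
        data = json.loads(request.body.decode('utf-8'))
        return JsonResponse({'received': data})
    return JsonResponse({'received': None})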
# auth
def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    # HTTPBasicAuth: most routers authenticate with HTTP Basic auth,
    # a simple and commonly used verification scheme
    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))   # HTTPBasicAuth scheme
    ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))  # HTTPDigestAuth scheme
    # These two schemes are only the simple cases; sites with anti-scraping measures rarely verify accounts this simply.
    print(ret.text)

    # ret = requests.get('http://192.168.1.1', auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)

# timeout: time limit for the request

# allow_redirects: whether redirects are followed
# suppose http://www.abc.com redirects to http://www.baidu.com
response = requests.get('http://www.abc.com', allow_redirects=False)
print(response.text)  # redirects are not followed, so this is the response from http://www.abc.com itself
response = requests.get('http://www.abc.com', allow_redirects=True)
print(response.text)  # this is the content of http://www.baidu.com

# proxies: go through a proxy so your IP does not get banned while scraping; you can buy proxies or run your own proxy server

# stream: download the response body lazily instead of all at once
# verify/cert: certificates, e.g. 12306's certificate; for Zhihu the certificate is optional
requests.get('http://httpbin.org/get', stream=True, cert='xxxx.pem')  # cert supplies the client certificate; it is independent of stream
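proxies and timeout are only named above; their usage looks roughly like this (the proxy address is a placeholder for a proxy you bought or run yourself):

import requests

proxies = {
    'http': 'http://127.0.0.1:8888',             # placeholder proxy address
    'https': 'http://127.0.0.1:8888',
}
try:
    r = requests.get('http://httpbin.org/get', proxies=proxies, timeout=5)  # give up after 5 seconds
    print(r.status_code)
except requests.exceptions.RequestException as e:
    print('request failed:', e)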
BeautifulSoup: turns HTML into a structured object so you can reach the elements inside the HTML through that object.
# html_doc = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# asdf
#     <div class="title">
#         <b>The Dormouse's story總共</b>
#         <h1>f</h1>
#     </div>
#     <div class="story">Once upon a time there were three little sisters; and their names were
#         <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#         <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
#         <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
#     and they lived at the bottom of a well.</div>
# ad<br/>sf
# <p class="story">...</p>
# </body>
# </html>
# """
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc, features="lxml")  # same as BeautifulSoup(html_doc, 'html.parser') except for the parser; lxml performs better but requires installing the lxml module; recommended
# tag = soup.find(class_='story')
# print(tag)
# print(tag.name)
# # ---> div
# tag.name = 'span'  # set the tag name
# print(tag.attrs)
# # ---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# # ---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# # ---> {'class': ['story']}
# print(tag.children)
# # ---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# # ---> ['Once upon a time there were three little sisters; and their names were\n ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
#     print(type(item), item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
# #      <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
# #      <class 'bs4.element.NavigableString'> ,
# #      <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# #      <class 'bs4.element.NavigableString'> and
# #      <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# #      <class 'bs4.element.NavigableString'> ; and they lived at the bottom of a well.
# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
# #      <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# #      and they lived at the bottom of a well.</div>
# tag.clear()
# print(tag)
# # ---> <div class="story"></div>
# tag.decompose()
# print(tag)
# # ---> <None></None>
# taga = tag.find(name='a')
# taga.extract()
# print(tag)
# print(tag.decode())
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
# #      <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# #      and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(), type(tag.decode_contents()))
# # ---> Once upon a time there were three little sisters; and their names were
# #      <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# #      and they lived at the bottom of a well. <class 'str'>
# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# # ---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n    <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(), type(tag.encode_contents()))
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')  # recursive: search recursively; text: match the text content, rarely used
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
# tags = soup.find_all('a')
# print(tags)
# tags = soup.find_all('a', limit=1)  # limit=1: return only one match
# print(tags)
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
# v = soup.find_all(name=['a', 'div'])             # name=['a', 'div']: match <a> tags and <div> tags
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])  # class_=['sister0', 'sister']: match class='sister0' or class='sister'
# print(v)
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
# v = soup.find_all(id=['link1', 'link2'])
# print(v)
# v = soup.find_all(href=['link1', 'link2'])
# print(v)
# import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)  # name=func: the function is called once per tag; every tag for which it returns True ends up in v
# print(v)
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
# tag = soup.find('a')
# v = tag.get_text()
# print(v)
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)
# soup.next               # the next node, whether it is a tag or a text node
# soup.next_element       # the next element in parse order
# soup.next_elements
# soup.next_sibling       # siblings
# soup.next_siblings
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
# tag.parent
# tag.parents
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
# tag.find_parent(...)
# tag.find_parents(...)
# These all take the same parameters as find_all
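A quick illustration of the sibling/parent lookups above, using the html_doc soup from earlier (an addition of mine, commented out like the other snippets, with the expected output noted inline):

# tag = soup.find(id='link2')
# print(tag.find_next_sibling('a'))      # ---> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# print(tag.find_previous_sibling('a'))  # ---> <a class="sister0" id="link1">Els<span>f</span>ie</a>
# print(tag.find_parent('div'))          # ---> the whole <div class="story"> ... </div> tag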
# soup.select("title") # # soup.select("p nth-of-type(3)") # # soup.select("body a") # # soup.select("html head title") # # tag = soup.select("span,a") # # soup.select("head > title") # # soup.select("p > a") # # soup.select("p > a:nth-of-type(2)") # # soup.select("p > #link1") # # soup.select("body > a") # # soup.select("#link1 ~ .sister") # # soup.select("#link1 + .sister") # # soup.select(".sister") # # soup.select("[class~=sister]") # # soup.select("#link1") # # soup.select("a#link2") # # soup.select('a[href]') # # soup.select('a[href="http://example.com/elsie"]') # # soup.select('a[href^="http://example.com/"]') # # soup.select('a[href$="tillie"]') # # soup.select('a[href*=".com/el"]') # # from bs4.element import Tag # # # def default_candidate_generator(tag): # for child in tag.descendants: # if not isinstance(child, Tag): # continue # if not child.has_attr('href'): # continue # yield child # # # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator) # print(type(tags), tags) # # from bs4.element import Tag # # # def default_candidate_generator(tag): # for child in tag.descendants: # if not isinstance(child, Tag): # continue # if not child.has_attr('href'): # continue # yield child # # # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1) # print(type(tags), tags)
# tag = soup.find('span')
# print(tag.string)           # get the text
# tag.string = 'new content'  # set the text
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'          # tag.text cannot be assigned to; use tag.string to change the content
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings    # recursively collect the text of all tags inside
# print(v)
# tag = soup.find('body')
# tag.append(soup.find('a'))  # appending a tag that already exists in the document moves it
# print(soup)

# If you want to append a brand-new tag instead, construct one:
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = 'I am new here'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = 'I am new here'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = 'I am new here'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = 'I am new here'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = 'I am new here'
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)