Web Scraping Exercises

Small scraper projects

0. Scraping university rankings

import bs4
import requests
from bs4 import BeautifulSoup


# Fetch the page text for a given URL
def getHTMLText(url):
    # Guard the whole fetch so any failure falls through to the except branch
    try:
        # Request the page with a 30-second timeout
        r = requests.get(url, timeout=30)
        # Raise an exception for any non-success status code
        r.raise_for_status()
        # Guess the encoding from the page content itself
        r.encoding = r.apparent_encoding
        # On success, return the page text
        return r.text
    except:
        # On any error, return an empty string
        return ""


# Fill the given list with one [rank, name, region, score] entry per university
def fillUnivList(ulist, html):
    # Parse the page text with the 'html.parser' backend
    soup = BeautifulSoup(html, "html.parser")
    # The page source shows that all the rows we need live inside the <tbody> tag,
    # so iterate over its children
    for tr in soup.find('tbody').children:
        # Skip whitespace/text nodes: only process real <tr> tags
        if isinstance(tr, bs4.element.Tag):
            # Collect the <td> tags inside this row
            tds = tr('td')
            # [<td>1</td>, <td><div align="left">清華大學</div></td>, <td>北京</td>, <td>95.3</td>...
            # Append the text of the first four cells to ulist
            ulist.append([tds[0].string, tds[1].string,
                          tds[2].string, tds[3].string])


# Print the university list, filtered by province
def printUnivList(ulist, province):
    # Print the title (format first, then center, so the padding is computed on the final string)
    print("中國最好大學排名2018({}地區)".format(province).center(45, '-'))
    # A format template: field {1:{4}^10} takes its fill character from argument 4.
    # Chinese characters are wider than the default ASCII space, so padding with
    # ordinary spaces misaligns the columns; chr(12288) is the full-width (CJK)
    # space, which keeps the name column aligned.
    tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
    # Print the header row
    print(tplt.format("排名", "學校名稱", "地區", "總分", chr(12288)))
    # Easter egg: a hard-coded extra row shown only for the Anhui region
    if province == '安徽':
        print(tplt.format(1, '安徽師範大學花津校區', '安徽', 99.9, chr(12288)))
    # Walk the list; each entry u is [rank, name, region, score]
    # (slice ulist to limit how many schools are printed)
    for i in range(len(ulist)):
        u = ulist[i]
        # u[2] is the region; print only schools in the requested province
        # (drop this check to print every school, or change the condition)
        if u[2] == province:
            print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))


# Entry point
def main(province='安徽'):
    # An empty list to be filled with university info
    uinfo = []
    # The page to scrape
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    # Fetch the page text
    html = getHTMLText(url)
    # Fill the list with university info
    fillUnivList(uinfo, html)
    # Print the filtered list
    printUnivList(uinfo, province=province)


main()
# main(province='北京')
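
The chr(12288) alignment trick above is easy to check in isolation. A minimal standalone sketch (nothing scraped) showing why a full-width fill character keeps mixed Chinese/ASCII columns aligned:

# Full-width (U+3000) vs. ASCII-space padding, side by side
tplt = "{0:{2}^10}\t{1:^10}"
for name, score in [("清華大學", 95.3), ("MIT", 91.0)]:
    print(tplt.format(name, score, chr(12288)))  # full-width fill: columns line up
    print(tplt.format(name, score, ' '))         # ASCII fill: CJK rows drift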

1. Scraping Douban Top 250

import requests
import time
from openpyxl import Workbook
from bs4 import BeautifulSoup

# Douban blocks the default python-requests User-Agent, so send a browser one
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

wb = Workbook()
sheet = wb.active
# Write the header row once, outside the page loop
sheet.title = '好評電影'
sheet['A1'].value = '序號'
sheet['B1'].value = '電影名稱'
sheet['C1'].value = '電影評分'
sheet['D1'].value = '電影連接'
sheet['E1'].value = '電影圖片'

count = 1
# 10 pages of 25 entries each cover the full Top 250
for i in range(0, 250, 25):
    ret = requests.get('https://movie.douban.com/top250?start=%s&filter=' % (i), headers=headers)
    bs = BeautifulSoup(ret.text, 'html.parser')
    ol = bs.find(name='ol', attrs={'class': 'grid_view'})
    li_list = ol.find_all(name='li')
    for li in li_list:
        name = li.find(name='span', attrs={'class': 'title'})
        a = li.find(name='a')
        span = li.find(name='span', attrs={'class': 'rating_num'})
        img = a.find(name='img')
        count += 1
        sheet['A%s' % (count)].value = count - 1
        sheet['B%s' % (count)].value = name.text
        sheet['C%s' % (count)].value = span.text
        sheet['D%s' % (count)].value = a['href']
        sheet['E%s' % (count)].value = img['src']
    time.sleep(1)  # be polite: pause between pages
wb.save('好評電影.xlsx')
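
To spot-check the saved workbook, it can be read back with openpyxl's load_workbook (a minimal sketch that only prints the header and the first few rows):

from openpyxl import load_workbook

wb = load_workbook('好評電影.xlsx')
sheet = wb.active
# Print the header row plus the first three movies
for row in sheet.iter_rows(min_row=1, max_row=4, values_only=True):
    print(row)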

2. Scraping Autohome news

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


def run(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # The news pages are GBK-encoded
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Get the <ul> holding the article list
    ul = soup.find(name='ul', attrs={"class": "article"})
    # Get all of its <li> items
    li_list = ul.find_all(name='li')
    infos = []
    for li in li_list:
        # Some <li> items are placeholders, so guard each lookup
        name = li.find(name="h3")
        name1 = ""
        if name:
            name1 = name.text
        href = li.find(name='a')
        href1 = ""
        if href:
            href1 = 'http:' + href['href']
        info = li.find(name='p')
        info1 = ""
        if info:
            info1 = info.text
        infos.append({"title": name1, "href": href1, "info": info1})
    print(infos)
    return infos

if __name__ == '__main__':
    url = 'https://www.autohome.com.cn/news/'
    run(url)
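
Workbook is imported above but never used; presumably the article list was meant to be saved to Excel as in the Douban example. A sketch under that assumption (save_infos and the filename are made up here), using the list that run() returns:

def save_infos(infos, filename='autohome.xlsx'):  # hypothetical helper and filename
    # One row per article: title, link, summary
    wb = Workbook()
    sheet = wb.active
    sheet.append(['title', 'href', 'info'])  # header row
    for item in infos:
        sheet.append([item['title'], item['href'], item['info']])
    wb.save(filename)

# save_infos(run(url))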

3. Scraping Doutula meme images

import os
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.doutula.com/photo/list?page=0')
bs = BeautifulSoup(ret.text, 'html.parser')
div = bs.find(name='div', attrs={'class': 'page-content text-center'})

# Make sure the output directory exists before writing into it
os.makedirs('表情包', exist_ok=True)

a_list = div.find_all(name='a')
for a in a_list:
    img = a.find(name='img')
    if img is None:
        continue
    # The caption is in alt; the real image URL is in data-backup
    img_name = img.get('alt')
    img_url = img.get('data-backup')
    if img_name and img_url:
        ret_img = requests.get(img_url)
        # The with-block closes the file handle automatically
        with open('表情包/%s.jpg' % (img_name), 'wb') as f:
            f.write(ret_img.content)
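
Only page 0 is fetched above; the listing paginates through the page query parameter, so wrapping the same logic in a function covers more pages (a sketch; the page range is arbitrary):

def download_page(page):
    ret = requests.get('https://www.doutula.com/photo/list?page=%s' % page)
    bs = BeautifulSoup(ret.text, 'html.parser')
    div = bs.find(name='div', attrs={'class': 'page-content text-center'})
    for a in div.find_all(name='a'):
        img = a.find(name='img')
        if img is None:
            continue
        img_name, img_url = img.get('alt'), img.get('data-backup')
        if img_name and img_url:
            with open('表情包/%s.jpg' % img_name, 'wb') as f:
                f.write(requests.get(img_url).content)

for page in range(5):  # first five pages; adjust as needed
    download_page(page)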

4. Scraping Pear Video

import requests
import re
from bs4 import BeautifulSoup

ret = requests.get('https://www.pearvideo.com/')

bs = BeautifulSoup(ret.text, 'html.parser')
# Each video card on the home page sits in a div with class 'vervideo-tbd'
div_list = bs.find_all(name='div', attrs={'class': 'vervideo-tbd'})

num = 0
for div in div_list:
    # Follow the card's link to the video's detail page
    a = div.find(name='a')
    video_url = 'https://www.pearvideo.com/' + a.get('href')
    video_ret = requests.get(video_url)

    # The detail page embeds a direct .mp4 URL in its scripts; pull it out with a regex
    match = re.search(r'https://[^\s]+mp4', video_ret.text)
    if match is None:
        continue
    mp4_url = match.group()
    print(mp4_url)
    mp4_ret = requests.get(mp4_url)
    with open('梨視頻%s.mp4' % (num), 'wb') as f:
        f.write(mp4_ret.content)
    num += 1
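
requests.get(mp4_url) above buffers the whole video in memory before writing it out. For large files, requests' streaming mode writes the body in chunks instead (a sketch; download_video is a made-up helper):

def download_video(url, filename, chunk_size=64 * 1024):
    # stream=True defers the body; iter_content yields it chunk by chunk
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)

# download_video(mp4_url, '梨視頻0.mp4')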

Implementing online translation

import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}


def main(keys=''):
    # iciba's translation endpoint; f/t='auto' lets the service detect both languages
    url = 'http://fy.iciba.com/ajax.php?a=fy'
    data = {
        'f': 'auto',
        't': 'auto',
        'w': keys
    }
    response = requests.post(url, headers=headers, data=data)
    result = json.loads(response.text)
    try:
        val = result['content']['word_mean']  # Chinese -> English
    except KeyError:
        val = result['content']['out']  # English -> Chinese
    return val


if __name__ == '__main__':
    keys = input('Enter the Chinese or English text to translate: ')
    if not keys:
        print('Please enter some Chinese or English text to translate.')
    else:
        print(main(keys))
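
The try/except in main() picks between the two response shapes iciba returns. The same choice can be written with dict.get, which also tolerates an unexpected payload (a sketch; extract_translation is a made-up helper):

def extract_translation(result):
    content = result.get('content', {})
    # 'word_mean' appears for Chinese -> English, 'out' for English -> Chinese
    return content.get('word_mean') or content.get('out') or '(no translation returned)'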

Small selenium projects

Appetizer

# Search Baidu for "老男孩"
from selenium import webdriver
# Launch a Chrome browser
b = webdriver.Chrome()
# Open Baidu
b.get('https://www.baidu.com')
# Locate Baidu's search box by its id, "kw"
ele = b.find_element_by_id('kw')
# Clear anything already in the box
ele.clear()
# Type the query
ele.send_keys('老男孩')
# Locate the search button (id "su")
su = b.find_element_by_id('su')
# Click it
su.click()
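
The appetizer never closes the browser and assumes the page is ready immediately. A sketch of the same search with an explicit wait and guaranteed cleanup (selenium 3 style, matching the find_element_by_* calls above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

b = webdriver.Chrome()
try:
    b.get('https://www.baidu.com')
    # Wait up to 10 s for the search box instead of assuming it exists already
    ele = WebDriverWait(b, 10).until(EC.presence_of_element_located((By.ID, 'kw')))
    ele.clear()
    ele.send_keys('老男孩')
    b.find_element_by_id('su').click()
finally:
    b.quit()  # always release the browser process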

Scraping JD.com

from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard key constants (Enter, etc.)
import time

def get_goods(driver):
    try:
        goods = driver.find_elements_by_class_name('gl-item')

        for good in goods:
            detail_url = good.find_element_by_tag_name('a').get_attribute('href')

            p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n','')
            price = good.find_element_by_css_selector('.p-price i').text
            p_commit = good.find_element_by_css_selector('.p-commit a').text
            msg = '''
            Product : %s
            Link    : %s
            Price   : %s
            Reviews : %s
            ''' % (p_name, detail_url, price, p_commit)

            print(msg, end='\n\n')

        # JD's next-page link is labelled 下一页 (Simplified Chinese); the
        # Traditional form 下一頁 would not match the live page
        button = driver.find_element_by_partial_link_text('下一页')
        button.click()
        time.sleep(1)
        get_goods(driver)
    except Exception:
        pass


def spider(url, keyword):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(3)  # implicit wait: element lookups retry for up to 3 s
    try:
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='华为P30')
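
get_goods pages by calling itself, so a long crawl keeps deepening the Python call stack, and the bare except Exception: pass also hides genuine errors. A sketch of the same pagination written as a loop (get_goods_iter is a made-up name; max_pages is an arbitrary cap):

def get_goods_iter(driver, max_pages=5):
    for _ in range(max_pages):
        for good in driver.find_elements_by_class_name('gl-item'):
            p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
            print(p_name)
        try:
            driver.find_element_by_partial_link_text('下一页').click()
        except Exception:
            break  # no next-page link left: stop cleanly
        time.sleep(1)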

Crawler and data analysis: "雨女無瓜" (a Bilibili danmaku word cloud)

import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
import numpy as np
# scipy.misc.imread was removed in SciPy 1.2; imageio's imread is the drop-in replacement
from imageio import imread
from wordcloud import WordCloud, ImageColorGenerator

url = "https://comment.bilibili.com/92542241.xml"
r = requests.get(url)
r.encoding = 'utf8'


soup = BeautifulSoup(r.text,'lxml')
d = soup.find_all('d')

dlst = []
for i in d:
    # One record per <d> (danmaku) element
    danmuku = {}
    danmuku['彈幕'] = i.text
    danmuku['網址'] = url
    danmuku['時間'] = datetime.date.today()
    dlst.append(danmuku)

df = pd.DataFrame(dlst)

# Keep only CJK characters from each danmaku line and write them to a file
pattern = re.compile(r'[一-龥]+')
with open('sign.txt', 'w', encoding='utf8') as f:
    for text in df['彈幕'].values:
        filter_data = re.findall(pattern, text)
        f.write("".join(filter_data))

with open('sign.txt', 'r', encoding='utf8') as f:
    data = f.read()
    # Cut the text into words with jieba
    segment = jieba.lcut(data)
    words_df = pd.DataFrame({"segment": segment})

# Count how many times each word occurs
# (dict-style .agg({'計數': np.size}) was removed in newer pandas versions)
words_stat = words_df.groupby('segment')['segment'].agg(np.size).to_frame('計數')
words_stat = words_stat.reset_index().sort_values(by=['計數'], ascending=False)

color_mask = imread('01.jpg')

wordcloud = WordCloud(
    # font_path="simhei.ttf",  # not available on macOS
    # A font that can render Chinese; the path below is the Windows KaiTi font
    font_path=r"C:\Windows\Fonts\simkai.ttf",
    background_color="white",  # background colour
    max_words=3000,            # maximum number of words shown
    mask=color_mask,           # background image used as a mask
    max_font_size=200,         # largest font size
    random_state=100,
    width=1000, height=860, margin=2,
    # Default canvas size; when a mask image is used, the output is saved at
    # the mask's size instead, and margin is the spacing between words
)

# Generate the cloud: generate() takes raw text, while generate_from_frequencies()
# takes a {word: count} mapping like the one computed above
word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
wordcloud.generate_from_frequencies(word_frequence)
# Derive colour values from the mask image
# image_colors = ImageColorGenerator(color_mask)
# Recolour the cloud to match the mask
# wordcloud.recolor(color_func=image_colors)
# Save the image
wordcloud.to_file('output.png')
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
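
Danmaku text is dominated by single characters and filler words, so the cloud usually improves if the segments are filtered before counting (a sketch to run before the groupby step above; the stopword set is illustrative only):

stopwords = {'的', '了', '是', '我', '你'}  # extend as needed
words_df = words_df[words_df['segment'].str.len() > 1]   # drop single characters
words_df = words_df[~words_df['segment'].isin(stopwords)]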