Video course link: http://edu.51cto.com/course/14870.html
A crawler, also called a web spider or web robot, is used to automatically fetch (crawl) information from the internet; in essence it is just a piece of code.
Any high-level programming language can implement a crawler; it is by no means limited to Python.
Through code we simulate a browser sending HTTP or HTTPS requests to a server, then process the server's response and extract the data we want.
Three steps: get the data, parse the data, store the data.
Use the urllib module to simulate a browser sending requests.
from urllib import request


# Get the data
def get_data():
    url = 'https://search.51job.com/list/070200,000000,0000,00,9,99,java%25E5%25BC%2580%25E5%258F%2591,2,1.html'
    # Create a Request object, specifying the URL and the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    # print(type(response))      # HTTPResponse type
    # print(response.getcode())  # response status code
    # print(response.info())
    if response.getcode() == 200:
        data = response.read()  # read the response body
        # print(type(data))     # bytes
        data = str(data, encoding='gbk')  # convert to str
        # print(data)
        # Write the data to a file
        with open('index.html', mode='w', encoding='gbk') as f:
            f.write(data)
Three ways to parse the data:
String parsing: use ordinary string operations plus regular expressions, for example:
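A minimal sketch of the regex approach, working on the index.html saved by get_data() above; the pattern is a hypothetical one for <span class="t1"> cells, not the real 51job markup:

import re

# Hypothetical pattern: extract the text of <span class="t1"> elements
# from the saved page; adjust it to the real page structure.
with open('index.html', mode='r', encoding='gbk') as f:
    html = f.read()

titles = re.findall(r'<span class="t1">\s*(.*?)\s*</span>', html, re.S)
print(titles)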
Use XPath
XPath is a language for finding information in XML documents; it is used to navigate the elements and attributes of an XML document.
Use the Chrome browser's Developer Tools to obtain an element's XPath.
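A minimal sketch of the XPath approach using the third-party lxml library (an assumption; the course does not show this code), with a hypothetical XPath expression for the saved index.html:

from lxml import etree

with open('index.html', mode='r', encoding='gbk') as f:
    html = f.read()

tree = etree.HTML(html)
# Hypothetical expression; in practice, copy the XPath from Chrome's
# Developer Tools (right-click the element -> Copy -> Copy XPath)
titles = tree.xpath('//div[@id="resultList"]//p[@class="t1"]//a/text()')
print([t.strip() for t in titles])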
Use the third-party module BeautifulSoup
Beautiful Soup is a Python library that can extract data from HTML or XML files.
Install it with: pip install beautifulsoup4
from bs4 import BeautifulSoup


# Parse the data
def parse_data():
    with open('index.html', mode='r', encoding='gbk') as f:
        html = f.read()

    # Create a BeautifulSoup instance to parse the HTML data
    bs = BeautifulSoup(html, 'html.parser')  # use the built-in html.parser

    '''
    Finding data
    '''
    # 1. find(): returns the first matching tag
    # div = bs.find('div')
    # print(div)
    # print(type(div))  # Tag

    # 2. find_all(): returns all matching tags
    # metas = bs.find_all('meta')  # returns a result set
    # print(metas[0])
    # print(bs.find_all(id='hello'))      # look up by id, returns a result set
    # print(bs.find_all(class_='itany'))  # look up by class

    # 3. select(): look up elements with CSS selectors
    # print(bs.select('#hello'))
    # print(bs.select('.itany'))
    # print(bs.select('p#world span'))
    # print(bs.select('[title]'))

    # 4. get_text(): get the text inside a tag
    # value = bs.select('#hello')[0].get_text(strip=True)
    # print(len(value))
    # print(value)

    # Extract the job listings
    divs = bs.select('#resultList .el')
    result = []
    for div in divs[1:]:
        title = div.select('.t1')[0].get_text(strip=True)
        company = div.select('.t2')[0].get_text(strip=True)
        addr = div.select('.t3')[0].get_text(strip=True)
        salary = div.select('.t4')[0].get_text(strip=True)
        pubDate = div.select('.t5')[0].get_text(strip=True)
        # print(title, company, addr, salary, pubDate)
        row = {
            'title': title,
            'company': company,
            'addr': addr,
            'salary': salary,
            'pubDate': pubDate
        }
        result.append(row)
    return result
import pymysql


# Store the data in MySQL
def save_to_mysql(data):
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': '',
        'database': 'python',
        'charset': 'utf8'
    }
    conn = pymysql.connect(**config)
    cursor = conn.cursor()
    sql = '''
        insert into t_job (title, company, addr, salary, pubDate)
        values (%(title)s, %(company)s, %(addr)s, %(salary)s, %(pubDate)s)
    '''
    cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
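save_to_mysql assumes a t_job table already exists in the python database. A minimal one-off sketch for creating it; the column types and lengths are assumptions, not part of the course material:

import pymysql

# Create the assumed t_job table; the column names match the insert above,
# but the types and lengths here are illustrative assumptions.
conn = pymysql.connect(host='localhost', port=3306, user='root', password='',
                       database='python', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        create table if not exists t_job (
            id int primary key auto_increment,
            title varchar(200),
            company varchar(200),
            addr varchar(100),
            salary varchar(50),
            pubDate varchar(20)
        )
    ''')
conn.commit()
conn.close()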
Use the openpyxl module to work with Excel files.
Install openpyxl: pip install openpyxl
工做薄Workbook
工做表Sheet
單元格Cell
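A minimal sketch showing how these three objects relate (the file name and cell values are illustrative):

from openpyxl import Workbook

book = Workbook()               # Workbook: the whole spreadsheet file
sheet = book.active             # Sheet: one worksheet inside the workbook
sheet['A1'] = 'hello'           # Cell: addressed by column letter + row number
sheet.cell(row=2, column=1, value='world')
book.save('demo.xlsx')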
from openpyxl import Workbook


# Store the data in Excel
def save_to_excel(data):
    # Create a Workbook
    book = Workbook()
    # Create a Sheet
    sheet = book.create_sheet('南京Java招聘信息', 0)
    # Append rows to the sheet
    sheet.append(['職位名', '公司名', '工作地點', '薪資', '發佈時間'])
    for item in data:
        row = [item['title'], item['company'], item['addr'], item['salary'], item['pubDate']]
        sheet.append(row)
    # Save the workbook
    book.save('51job.xlsx')
Install the redis library: pip install redis
import redis


# Store the data in Redis
def save_to_redis(data):
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'charset': 'utf8'
    }
    r = redis.Redis(**config)
    # r.set('name', 'tom')
    for item in data:
        # Redis can only store strings/bytes, so serialize the dict first
        r.lpush('jobs', str(item))


# Read the data back from Redis
def read_from_redis():
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'charset': 'utf8',
        'decode_responses': True  # decode the values when reading
    }
    r = redis.Redis(**config)
    print(r.lrange('jobs', 0, -1))
from urllib import request
import json


# Get the data
def get_data():
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=400&page_start=0'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        result = response.read()
        # print(type(result))  # bytes
        return result


# Parse the data
def parse_data(html):
    # Convert the JSON string into a dict
    data = json.loads(html)
    # print(type(data), data)
    movies = data['subjects']
    for movie in movies:
        print(movie['title'], movie['rate'])


if __name__ == '__main__':
    parse_data(get_data())
Steps: get the comment data, parse it, store it in a text file, then analyze and visualize it.
from urllib import request
import json
from datetime import datetime, timedelta
import time


# Get the data
def get_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        return response.read()


# Parse the data
def parse_data(html):
    data = json.loads(html)['cmts']
    comments = []
    for item in data:
        comment = {
            'id': item['id'],
            'nickName': item['nickName'],
            'cityName': item['cityName'] if 'cityName' in item else '',  # handle missing cityName
            'content': item['content'].replace('\n', ' '),  # handle line breaks in the comment text
            'score': item['score'],
            'startTime': item['startTime']
        }
        comments.append(comment)
    return comments


# Store the data in a text file
def save_to_txt():
    start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
    end_time = '2018-08-10 00:00:00'  # end time
    # string comparison works here because the timestamps share a fixed, zero-padded format
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=0&startTime=' + start_time.replace(' ', '%20')
        try:
            html = get_data(url)
        except:
            time.sleep(1)
            html = get_data(url)
        else:
            time.sleep(0.1)
        comments = parse_data(html)
        print(comments)
        start_time = comments[14]['startTime']  # time of the last comment on the page
        start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') - timedelta(seconds=1)  # step back 1 second to avoid duplicates
        start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
        for item in comments:
            with open('comments.txt', mode='a', encoding='utf-8') as f:
                f.write(str(item['id']) + ',' + item['nickName'] + ',' + item['cityName'] + ',' +
                        item['content'] + ',' + str(item['score']) + ',' + item['startTime'] + '\n')


if __name__ == '__main__':
    # url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=15&startTime=2018-09-01%2011%3A10%3A00'
    # comments = parse_data(get_data(url))
    # print(comments)
    save_to_txt()
The pyecharts charting library
from collections import Counter
from pyecharts import Geo
from pyecharts import Bar
import json


def render():
    # Collect all city names
    cities = []
    with open('comments.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for row in rows:
            city = row.split(',')[2]
            if city != '':
                cities.append(city)

    # Reconcile the city names with the names in the coordinate file
    handle(cities)

    # Count how often each city occurs
    # data = []  # [('南京', 25), ('北京', 59)]
    # for city in set(cities):
    #     data.append((city, cities.count(city)))
    data = Counter(cities).most_common()

    # Build a geo chart from the city data
    geo = Geo(
        "《一出好戲》粉絲位置分佈",
        "數據來源:貓眼",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    attr, value = geo.cast(data)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 3500],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render('粉絲位置分佈.html')

    # Build a bar chart from the city data
    cities_top20 = Counter(cities).most_common(20)  # the 20 most frequent cities
    bar = Bar("《一出好戲》粉絲來源排行榜TOP20", '數據來源:貓眼', title_pos='center', width=1200, height=600)
    attr, value = bar.cast(cities_top20)
    bar.add("", attr, value)
    bar.render('粉絲來源排行榜-柱狀圖.html')


# Clean up city names that cannot be found in the coordinate file
def handle(cities):
    with open(
            'C:/Users/User/PycharmProjects/python-spider/venv/Lib/site-packages/pyecharts/datasets/city_coordinates.json',
            mode='r', encoding='utf-8') as f:
        data = json.loads(f.read())  # convert str to dict

    # Check every city and patch the coordinate data
    data_new = data.copy()  # work on a copy of the coordinate data
    for city in set(cities):
        count = 0
        for k in data:
            count += 1
            if k == city:
                break
            if k.startswith(city):  # handle abbreviated names, e.g. 南京市 written as 南京
                data_new[city] = data[k]
                break
            if k.startswith(city[0:-1]) and len(city) >= 3:  # handle renamed districts, e.g. 溧水縣 changed to 溧水區
                data_new[city] = data[k]
                break
        # Drop cities that do not appear in the coordinate file at all
        if count == len(data):
            while city in cities:
                cities.remove(city)
    # print(len(data), len(data_new))

    # Overwrite the coordinate file
    with open(
            'C:/Users/User/PycharmProjects/python-spider/venv/Lib/site-packages/pyecharts/datasets/city_coordinates.json',
            mode='w', encoding='utf-8') as f:
        f.write(json.dumps(data_new, ensure_ascii=False))  # dump dict to str; ensure_ascii=False keeps Chinese readable


if __name__ == '__main__':
    render()
from pyecharts import Pie

# Collect all scores from the comments
rates = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        rates.append(row.split(',')[4])
# print(rates)

# Define the star levels
attr = ['五星', '四星', '三星', '二星', '一星']
value = [
    rates.count('5') + rates.count('4.5'),
    rates.count('4') + rates.count('3.5'),
    rates.count('3') + rates.count('2.5'),
    rates.count('2') + rates.count('1.5'),
    rates.count('1') + rates.count('0.5')
]
# print(value)

pie = Pie("《一出好戲》評分星級", title_pos='center', width=900)
pie.add("", attr, value, is_label_show=True, is_legend_show=False)
pie.render('電影評分-餅圖.html')
jieba ("stutter") is a powerful word-segmentation library with full support for Chinese text.
Matplotlib is a 2D plotting library for Python that can produce plots, histograms, power spectra, bar charts, error charts, scatter plots, and more.
wordcloud is a Python library for generating word clouds; it is easy to use and very capable.
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Collect all comment text
comments = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        comment = row.split(',')[3]
        if comment != '':
            comments.append(comment)

# Tokenize the comments
comment_after_split = jieba.cut(str(comments), cut_all=False)
words = ' '.join(comment_after_split)  # join the tokens with spaces
# print(words)

# Define the stop words
stopwords = STOPWORDS.copy()
stopwords.add('電影')
stopwords.add('一出')
stopwords.add('好戲')
stopwords.add('有點')

# Load the background image
bg_image = plt.imread('love.jpg')

# Configure the word cloud
wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, stopwords=stopwords,
               max_font_size=400, random_state=50, font_path='STKAITI.TTF')

# Feed the tokenized text into the word cloud
wc.generate_from_text(words)

# Draw the image
plt.imshow(wc)
plt.axis('off')  # hide the axes
plt.show()  # display the image

# Save the image to a file
wc.to_file('詞雲圖.jpg')