I wrote a small crawler out of interest, so here are a few brief notes on the basic steps of web crawling.
Whether you are after an API, HTML pages, or some other data, the first step is to use a packet-capture tool to pin down the URL you want to crawl.
The second step: for the target site or API, construct requests that mimic a real user and set the appropriate request headers, so that you get back the data you want.
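As a minimal sketch of that idea (the header values below are illustrative placeholders, not ones captured from a real browser session), sending a request with browser-like headers via requests looks like this:

import requests

# A minimal sketch: attach browser-like headers so the request resembles a normal visit.
# The User-Agent and Referer values are illustrative placeholders, not captured values.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://search.sina.com.cn/',
}
response = requests.get('https://search.sina.com.cn/', headers=headers, timeout=10)
print(response.status_code)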
The third step: once the data is crawled, shape it into structured records and persist it. A file or a database both work.
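As a rough sketch of the file option (the file name and sample fields here are assumptions, not taken from the spider below), each structured item can simply be appended as one JSON line:

import json

# A rough sketch: append each structured item as one line of JSON.
# The file name 'news_items.jsonl' and the sample fields are assumptions for illustration.
def save_item(item: dict, path: str = 'news_items.jsonl'):
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

save_item({'title': 'demo', 'news_url': 'https://example.com', 'release_time': '2020-12-01 00:00:00'})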
Beyond that, there are crawler error handling, distributed crawling, and dealing with anti-crawling measures.
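Error handling mostly comes down to assuming any request can fail; a minimal retry wrapper (the retry count and delay are arbitrary assumptions) might look like:

import time
import requests

# A minimal retry wrapper; the retry count and delay are arbitrary assumptions for illustration.
def fetch_with_retry(url: str, retries: int = 3, delay: float = 2.0):
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as exc:
            print(f'attempt {attempt + 1} failed: {exc}')
            time.sleep(delay)
    return None  # caller decides what to do when all retries fail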
https://search.sina.com.cn/?q=%E6%98%A5%E8%8A%82&range=all&c=news&sort=time
This is the URL to crawl: a Sina News keyword search, sorted by time.
def __send_page_req(self, page_num: int = 1):
    """
    Send the search request
    :return:
    """
    search_url = self.__init_search_url(page_num)
    response = requests.get(search_url)
    response = Selector(text=response.text)
    return response

def __init_search_url(self, page: int = 1):
    """
    Build the request URL
    :param page:
    :return:
    """
    params = {
        'q': self.search_keyword,
        'range': "all",
        'c': 'news',
        'sort': "time",
        'page': page
    }
    str_params = urllib.parse.urlencode(params)
    return self.search_url + '?' + str_params
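For a quick sanity check of what __init_search_url builds, the same logic can be run stand-alone (this snippet only mirrors the method for illustration); urlencode takes care of percent-encoding the Chinese keyword:

import urllib.parse

# Stand-alone mirror of __init_search_url, for illustration only.
params = {'q': '春節', 'range': 'all', 'c': 'news', 'sort': 'time', 'page': 1}
print('https://search.sina.com.cn/?' + urllib.parse.urlencode(params))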
def parse_page_req(self, page_num: int = 1):
    pass
import requests
from scrapy.selector import Selector
import urllib.parse
import datetime
import re


class SinaNewsSpider:
    """
    Sina News search spider
    """
    search_url = 'https://search.sina.com.cn/?{params}'
    spider_source = 'sina'
    title_compile = re.compile(r'<a.*?>([\s\S]*?)</a>')
    article_min_date = '2020-12-01 00:00:00'  # earliest news time; stop once results are older than this

    def __init__(self, search_keyword: str):
        self.search_keyword = search_keyword

    def go(self):
        page_num = 1
        while True:
            news_data, min_date = self.parse_page_req(page_num)
            [self.__save(item) for item in self.parse_data(news_data)]
            if min_date > self.article_min_date:
                page_num += 1
            else:
                break

    def __save(self, data):
        """
        Persist the data
        :param data:
        :return:
        """
        print(data)

    def parse_data(self, news_data):
        """
        Parse the data
        :param news_data:
        :return:
        """
        for news in news_data:
            content = self.__get_content(news['detail_url'])
            if content is None:
                print('error:', news)
            else:
                item = {}
                item['content'] = content
                item['source'] = 'sina'
                item['keyword'] = self.search_keyword
                item['news_url'] = news['detail_url']
                item['insert_time'] = str(datetime.datetime.today())
                item['title'] = news['title']
                item['release_time'] = news['release_time']
                item['author'] = news['author']
                yield item

    def __get_content(self, url):
        response = requests.get(url)
        response = Selector(text=response.content.decode('utf-8'))
        content = response.xpath('//div[@id="article"]').extract_first()
        content_artibody = response.xpath('//div[@id="artibody"]').extract_first()
        content_section = response.xpath('//section[@class="art_pic_card art_content"]').extract_first()
        return content or content_artibody or content_section

    def parse_page_req(self, page_num: int = 1):
        """
        Parse one page of search results
        :param page_num:
        :return:
        """
        response = self.__send_page_req(page_num)
        news_list = response.xpath('//div[@id="result"]/div[@class="box-result clearfix"]')
        news_data = []
        for news in news_list:
            item = {}
            title = news.xpath(".//h2/a").extract_first()
            item['title'] = self.title_compile.findall(title)[0]
            item['detail_url'] = news.xpath(".//h2/a/@href").extract_first()
            source_time_str = news.xpath(".//h2/span/text()").extract_first().strip()
            item['author'], item['release_time'] = source_time_str.split(" ", maxsplit=1)
            news_data.append(item)
        return news_data, min(map(lambda x: x['release_time'], news_data))

    def __send_page_req(self, page_num: int = 1):
        """
        Send the search request
        :return:
        """
        search_url = self.__init_search_url(page_num)
        response = requests.get(search_url)
        response = Selector(text=response.text)
        return response

    def __init_search_url(self, page: int = 1):
        """
        Build the request URL
        :param page:
        :return:
        """
        params = {
            'q': self.search_keyword,
            'range': "all",
            'c': 'news',
            'sort': "time",
            'page': page
        }
        str_params = urllib.parse.urlencode(params)
        return self.search_url.format(params=str_params)


sina = SinaNewsSpider("春節")
sina.go()
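One design note on the stopping condition in go(): release_time and article_min_date are compared as plain strings, which works here because zero-padded 'YYYY-MM-DD HH:MM:SS' timestamps sort lexicographically in the same order as chronologically, for example:

# Lexicographic order of zero-padded timestamps matches chronological order.
print('2020-12-25 08:00:00' > '2020-12-01 00:00:00')  # True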
Python really is a very handy tool for this kind of work.