基於xpath選擇器、PyQuery、正則表達式的格式清理工具

1、使用 xpath 清理不必要的標籤元素,以及無內容標籤

from lxml import etree


def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements from the HTML via xpath.

    :param text: html_content to clean
    :param xpath_dict: xpath expressions of extra nodes to remove
    :return: string type html_content
    '''
    # Copy the incoming dict so the caller's mapping is not mutated
    # by the update() below.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()

    # Tags that are almost always noise in an article body, so they
    # are removed unconditionally.
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Regular removal: drop every node matched by the configured xpaths.
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8',
                                        pretty_print=True).decode()
            logger.debug(f"clean article content : {bad_string}")
            parent = bad.getparent()
            # The root element has no parent; guard against AttributeError.
            if parent is not None:
                parent.remove(bad)

    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Check every remaining element for content; drop the empty ones,
    # but never touch table/img structural tags.
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        # Skip nodes that contain kept descendants or any non-blank text.
        if p.xpath(f".//*[{skip_tip}]") or \
                bool(re.sub(r'\s', '', p.xpath('string(.)'))):
            continue

        bad_p = etree.tostring(p, encoding='utf-8',
                               pretty_print=True).decode()
        logger.debug(f"clean p tag : {bad_p}")
        parent = p.getparent()
        if parent is not None:
            parent.remove(p)

    return etree.tostring(selector, encoding='utf-8',
                          pretty_print=True).decode()

2、使用 pyquery 清理標籤屬性,並返回處理後的源碼和純淨文本

#!/usr/bin/env python
# -*-coding:utf-8-*-

from pyquery import PyQuery as pq


def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    pyquery pass: strip selected nodes, whitelist attributes and
    normalize image links.
    :param text: html_content
    :param url: base url used to complete relative image links
    :param pq_dict: pyquery selectors of extra nodes to remove
    :return: (plain text, html) tuple
    '''
    # Selectors of nodes to delete.
    removal_selectors = pq_dict if pq_dict else dict()
    # Table layout attributes that must survive the cleanup.
    kept_attrs = ['rowspan', 'colspan']
    # Attribute names that may carry the real image link (lazy loading).
    image_src_keys = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery document.
    dom = pq(text)

    # Remove useless nodes, logging each one before deletion.
    for selector_expr in removal_selectors.values():
        for node in dom(selector_expr):
            logger.debug(f"clean article content : {pq(node).html()}")
        dom.remove(selector_expr)

    # Walk every element and decide the fate of each attribute.
    for element in dom('*'):
        for attr_name, attr_value in element.attrib.items():
            # Whitelisted table attributes are left untouched.
            if attr_name in kept_attrs:
                continue
            if attr_name in image_src_keys:
                # Complete the image url and move it onto src.
                full_img_url = self.absolute_url(url, attr_value)
                wrapped = pq(element)
                wrapped.remove_attr(attr_name)
                wrapped.attr('src', full_img_url)
                wrapped.attr('alt', '')
            elif attr_name == 'alt':
                # alt is kept but emptied.
                pq(element).attr(attr_name, '')
            else:
                # Everything else is dropped.
                pq(element).remove_attr(attr_name)

    return dom.text(), dom.html()

3、正則表達式清理空格以及換行符內容

#!/usr/bin/env python
# -*-coding:utf-8-*-

import re    


def regular_clean(self, str1: str, str2: str):
    '''
    Final regex formatting of the cleaned results.

    :param str1: content (plain text)
    :param str2: html_content
    :return: (content, html_content) after formatting
    '''

    def new_line(text):
        # Normalize <br> variants, strip inline wrapper tags and turn
        # headings into paragraphs so every paragraph ends a line.
        # Raw strings keep the regex escapes explicit (no
        # invalid-escape-sequence warnings on Python 3.12+).
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '',
            text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

    # TODO html_content handling: 1) remove leftover unusable tags that
    # break display; 2) settle the line-break normalization.

    str2 = new_line(text=str2)

    return str1, str2

結尾部分,各個方法封裝類代碼展示

#!/usr/bin/env python
# -*-coding:utf-8-*-
'''
author: szhan
date:2020-08-17
summery: 清理html_conent以及獲取純淨數據格式
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin

from loguru import logger


class CleanArticle:
    '''
    Clean article html_content and return both a cleaned HTML string and
    its plain text, using xpath (lxml), pyquery and regular expressions.
    '''

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        # Raw html_content to be cleaned.
        self.text = text
        # Base url used to complete relative image links.
        self.url = url
        # Extra xpath expressions of nodes to remove.
        self.xpath_dict = xpath_dict or dict()
        # Extra pyquery selectors of nodes to remove.
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a possibly-relative url.
        :param baseurl: base (scheme) url of the page
        :param url: target url, possibly relative
        :return: complete url
        '''
        # A url that already carries a scheme is returned unchanged.
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Whitespace handling: drop stray entities and full-width spaces,
        collapse runs of whitespace and blank lines, then trim the edges.
        :param text: string to normalize
        :return: normalized string
        '''
        text = text.replace('&#13;', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
        # Raw strings keep the regex escapes explicit (no warnings on 3.12+).
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        text = text.strip('\n').strip()
        return text

    def run(self):
        '''
        Run the full cleaning pipeline.
        :return: (content, html_content) after cleaning
        :raises ValueError: when text is empty or not a string
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: xpath pass drops comments and iframe/button/form/script/
        # style/video... tags.
        text = self.xpath_clean(self.text, self.xpath_dict)

        # Step 2: pyquery pass handles attributes and image links.
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)

        # Step 3: final regex formatting.
        content, html_content = self.regular_clean(str1, str2)

        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Remove unwanted elements from the HTML via xpath.
        :param text: html_content
        :param xpath_dict: xpath expressions of extra nodes to remove
        :return: string type html_content
        '''
        # Copy so the caller's dict is not mutated by update() below.
        remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()

        # Tags that are almost always noise in an article body, removed
        # unconditionally.
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Regular removal: drop every node matched by the configured xpaths.
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8',
                                            pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                parent = bad.getparent()
                # The root element has no parent; guard against AttributeError.
                if parent is not None:
                    parent.remove(bad)

        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Check every remaining element for content; drop empty ones, but
        # never touch table/img structural tags.
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Skip nodes that contain kept descendants or any non-blank text.
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue

            bad_p = etree.tostring(p, encoding='utf-8',
                                   pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            parent = p.getparent()
            if parent is not None:
                parent.remove(p)

        return etree.tostring(selector, encoding='utf-8',
                              pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        pyquery pass: strip selected nodes, whitelist attributes and
        normalize image links.
        :param text: html_content
        :param url: base url used to complete relative image links
        :param pq_dict: pyquery selectors of extra nodes to remove
        :return: (plain text, html) tuple
        '''
        # Selectors of nodes to delete.
        remove_by_pq = pq_dict if pq_dict else dict()
        # Table layout attributes that must survive the cleanup.
        attr_white_list = ['rowspan', 'colspan']
        # Attribute names that may carry the real image link (lazy loading).
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        # Build the pyquery document.
        dom = pq(text)

        # Remove useless nodes, logging each one before deletion.
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Per-attribute handling for every element.
        for tag in dom('*'):
            # Snapshot the items: attributes are removed while iterating.
            for key, value in list(tag.attrib.items()):
                # Keep the whitelisted table attributes untouched.
                if key in attr_white_list:
                    continue
                # Complete incomplete image urls and move them onto src.
                if key in img_key_list:
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                # alt is kept but emptied.
                elif key == 'alt':
                    pq(tag).attr(key, '')
                # Every other attribute is dropped.
                else:
                    pq(tag).remove_attr(key)

        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Final regex formatting of the cleaned results.
        :param str1: content (plain text)
        :param str2: html_content
        :return: (content, html_content) after formatting
        '''

        def new_line(text):
            # Normalize <br> variants, strip inline wrapper tags and turn
            # headings into paragraphs so every paragraph ends a line.
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '',
                text)
            text = re.sub(r'\n', '', text)
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

        # TODO html_content handling: 1) remove leftover unusable tags that
        # break display; 2) settle the line-break normalization.

        str2 = new_line(text=str2)

        return str1, str2


if __name__ == '__main__':
    # Read the sample page in one call instead of a quadratic
    # readlines() + string-concatenation loop.
    with open('html_content.html', 'r', encoding='utf-8') as f:
        html = f.read()
    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)

如有不同意見可私信交流,歡迎轉發,謝謝大家!

相關文章
相關標籤/搜索