regex(python)

時間 2019-11-20

標籤 regex python 欄目 Python 简体版

原文原文鏈接

正則表達式

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/26 16:39
# @Author  : jackendoff
# @Site    : 
# @File    : 正則學習.py
# @Software: PyCharm

# re模塊的使用過程
import re
# '''
# re模塊的使用過程
# re.match(pattern, string, flags=0)
# 從頭匹配一個符合規則的字符串，從起始位置開始匹配，匹配成功返回一個對象，未匹配成功返回None
#  · pattern: 正則模型
#  · string： 要匹配的字符
#  · flags： 匹配模式
#  這個方法並非完全匹配：當pattern結束時若string還有剩餘字符，仍視爲成功；想要完全匹配，可以在表達式末尾加上邊界匹配符‘$’
# '''
# result = re.match('^\d{3}$', '124')
# print(result.group())  # 返回被re匹配的字符串124
# print(result.start())  # 返回匹配開始的位置0
# print(result.end())  # 返回匹配結束的位置3
# print(result.span())  # 返回一個元組包含匹配（開始，結束）的位置(0, 3)




'''
re.search函數會在字符串內查找模式匹配，找到第一個匹配後立即返回；如果字符串中沒有匹配，則返回None
格式：re.search(pattern, string, flags=0)
match()和search()的區別：
match()函數只檢測re是否是在string的開始位置匹配，search()會掃描
整個string查找匹配；
也就是說match()只有在0位置匹配成功的話才有返回，如果不是在開始位置匹配成功，match()就返回None。

'''
# import re
# ret = re.search(r'\d+', '閱讀次數爲9999')
# print(ret.group())  # 運行結果9999
#
# print(re.match('super', 'superstition').span())  # 結果(0, 5)
# print(re.match('super', 'insuperstition'))  # 結果None
# print(re.search('super', 'superstition').span())  # 結果(0, 5)
# print(re.search('super', 'insuperstition').span())  # 結果(2, 7)




'''
re.findall遍歷匹配，可以獲取字符串中所有匹配的子串，返回一個列表
'''
# import re
#
# ret = re.findall(r'\d+', '閱讀次數：9999次，轉發次數：887次，評論次數：3次')
# print(ret)  # 運行結果['9999', '887', '3']




'''
sub將匹配到的數據進行替換
使用re替換string中每個匹配的字符串後返回替換後的字符串
格式：re.sub(pattern, repl, string,count)
'''

# import re
#
# ret = re.sub(r'\d+', '10000', '閱讀次數：9999次， 轉發次數：883次， 評論次數：3次')
# print(ret)  # 運行結果 閱讀次數：10000次， 轉發次數：10000次， 評論次數：10000次

import re
# def add(temp):
#     strNum = temp.group()
#     print(strNum)  # 運行結果997
#     num = int(strNum) + 1
#     return str(num)  # 返回一個字符串

# ret = re.sub(r'\d+', add, 'python = 997')  # 能夠分解爲result = re.search(r'\d+', 'python=997') --》add(result)
# print(ret)  # 運行結果python = 998
#
# ret = re.sub(r'\d+', add, 'python = 99')
# print(ret)  # 運行結果python = 100

'''
import re
def add(temp):
    strNum = temp.group()
    print(strNum)  # 運行結果997
    num = int(strNum) + 1
    return str(num)
result = re.search(r'\d+', 'python=997')
print(add(result))
'''




'''
split根據匹配進行切割字符串，並返回一個列表
按照能夠匹配的子串將string分割後返回列表。
可以使用re.split來分割字符串
格式：re.split(pattern, string[, maxsplit])

'''

import re

# Split the sample text wherever a colon or a single space occurs.
delimiter_pattern = r":| "
sample_text = 'info:xiaozhang 33 shandong'
ret = re.split(delimiter_pattern, sample_text)
print(ret)  # output: ['info', 'xiaozhang', '33', 'shandong']



'''
Python裏數量詞默認是貪婪的，總是嘗試匹配儘可能多的字符；
非貪婪則相反，總是嘗試匹配儘可能少的字符
在'*'、'+'、'?'、'{m,n}'後面加上'?'，使貪婪變爲非貪婪。
'''

量詞：

　　* 重複0次或更多次

　　+ 重複一次或更多次

　　? 重複0次或一次

　　{n} 重複n次

　　{n,} 重複n次或更多次

　　{n,m} 重複n到m次

　　貪心匹配（默認）

　　惰性匹配：量詞 + '?'

　　.*?x 前面取任意長度字符直到取到x

字符：

　　. 匹配除換行之外的任意字符

　　\w 匹配字母或數字或下劃線

　　\d 匹配數字(digit)

　　\s 匹配任意空白符（space）

　　\n 匹配一個換行符

　　\t 匹配一個製表符（TAB）

　　\b 匹配一個單詞的邊界（單詞的開頭或結尾）

　　^ 匹配字符串的開始

　　$ 匹配字符串的結尾

　　\W 匹配非字母或數字或下劃線

　　\D 匹配非數字

　　\S 匹配非空白符

　　a|b 匹配字符a或者字符b

　　( ) 匹配括號內的表達式，也表示一個組（search）

　　[...] 匹配字符組中的字符

　　[^...] 匹配除字符組之外的全部字符

python re模塊

python正則

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # @Time    : 2018/7/8 9:27
 4 # @Author  : jackendoff
 5 # @Site    : 
 6 # @File    : text.py
 7 # @Software: PyCharm
 8 
 9 import re
10 
11 # find_str = re.findall('a', 'jai jack')  # 匹配全部，並顯示全部
12 # print(find_str)
13 #
14 # search_str = re.search('a', 'jia jack')  # 匹配找到的第一個，並顯示
15 # print(search_str)
16 # search_gp = search_str.group()
17 # print(search_gp)
18 #
19 # match_str = re.match('a', 'a bc')  # 匹配第一個
20 # print(match_str)
21 # match_gp = match_str.group()
22 # print(match_gp)
23 #
24 # find_str = re.findall('[a-z]\d', 'ji58a991')  # 匹配a-z和數字
25 # print(find_str)
26 # find_str = re.findall('([a-z])\d', 'ji58a991')  # 匹配a-z和數字
27 # print(find_str)
28 # find_str = re.findall('(?:[a-z])\d', 'ji58a991')  # ()表明‘組’在findall裏優先顯示  '?:'表明‘取消優先’
29 # print(find_str)
30 #
31 # search_str = re.search('([a-z])(\d)', 'jia123nfdj8684')  # 匹配a-z和數字 分組
32 # search_gpp = search_str.group()  # 顯示匹配的數字和字母
33 # print(search_gpp)
34 # search_gp = search_str.group(1)  # 顯示第一個組
35 # print(search_gp)
36 # search_gp = search_str.group(2)  # 顯示第二個組
37 # print(search_gp)
38 
39 # sp_str = re.split('\d', 'jia4jfa5f78a')  # 依據數字分割
40 # print(sp_str)
41 # split_str = re.split('[a1]', 'jiafd54f1k')  # 先根據a分割，仔根據1分割
42 # print(split_str)
43 #
44 # sub_str = re.sub('[\d]', '0', 'jia123fd6', 2)  # 將數字替換成0，2表示只替換兩個
45 # print(sub_str)
46 # sub_str = re.subn('[\d]', '0', 'jia123fd6')  # 將數字替換成0，返回一個元組（替換的結果，替換了多少次）
47 # print(sub_str)
48 
49 # obj = re.compile('\d{3}')  # 將正則表達式編譯成一個正則對象（當表達式特別複雜的時候此對象能夠屢次使用）匹配三個數字
50 # obj_str = obj.search('jia123kill')  # 直接使用對象調用search進行匹配
51 # o_str = obj_str.group()
52 # print(o_str)
53 
54 # ret = re.finditer('\d', 'jia5fd55fs6f5')  # finditer返回一個存放匹配結果的迭代器（節省內存)
55 # print(ret)
56 # # for i in ret:
57 #     print(i.group())

正則案例，windows命令行超多

1. 初步使用
In [1]: import re
In [11]: re.match(r"abc","abcde").group()
In [12]: re.match(r"bc","abcde").group()
In [14]: re.search(r"bc","abcde").group()

2. 匹配單個字符
-----------------------.匹配任意字符(除\n)-------------------------------
In [5]: re.match(r".","A").group()
In [6]: re.match(r".","a").group()
In [7]: re.match(r".",".").group()
In [8]: re.match(r".","\n").group()
In [9]: re.match(r".","x").group()
In [10]: re.match(r"A.C","ABC").group()

-------------------------[]匹配集合中任何一個字符---------------------------
In [11]: re.match(r"[aA]BC","ABC").group()
In [12]: re.match(r"[aA]BC","aBC").group()
In [13]: re.match(r"[aA]BC","xBC").group()
In [14]: re.match(r"[0123456789]BC","1BC").group()
In [15]: re.match(r"[0123456789]BC","9BC").group()
-------------------------[-]表示範圍-----------------------------
In [16]: re.match(r"[0-9]BC","9BC").group()
In [17]: re.match(r"[0-9]BC","2BC").group()
In [18]: re.match(r"[0-9a-zA-Z]BC","ABC").group()
In [19]: re.match(r"[0-9a-zA-Z]BC","aBC").group()
In [21]: re.match(r"[0-9a-zA-Z]BC","&BC").group()
In [22]: re.match(r"[0-35-9]BC","1BC").group()
In [24]: re.match(r"[0-35-9]BC","4BC").group()
-------------------------[^]對匹配範圍取反-----------------------------
In [25]: re.match(r"[^4]BC","4BC").group()
In [26]: re.match(r"[^4]BC","1BC").group()
In [27]: re.match(r"[^4]BC","9BC").group()
In [28]: re.match(r"[^4a-z]BC","aBC").group()
In [29]: re.match(r"[^4a-z]BC","zBC").group()
In [30]: re.match(r"[^4a-z]BC","ZBC").group()

-------------------------\d匹配數字字符 \D匹配非數字---------------------
In [31]: re.match(r"[0-9]BC","4BC").group()
In [32]: re.match(r"\dBC","4BC").group()
In [33]: re.match(r"\dBC","2BC").group()

In [34]: re.match(r"\DBC","2BC").group()
In [35]: re.match(r"\DBC","aBC").group()
In [36]: re.match(r"\DBC","xBC").group()
--------------------------\w匹配單詞字符 \W匹配非單詞字符------------------------
In [38]: re.match(r"\wBC", "ABC").group()
In [39]: re.match(r"\wBC", "aBC").group()
In [40]: re.match(r"\wBC", "1BC").group()
In [41]: re.match(r"\wBC", "_BC").group()
In [42]: re.match(r"[\da-zA-Z_]BC", "1BC").group()
In [43]: re.match(r"[\da-zA-Z_]BC", "ABC").group()
In [44]: re.match(r"[\da-zA-Z_]BC", "aBC").group()
In [45]: re.match(r"[\da-zA-Z_]BC", "_BC").group()

In [46]: re.match(r"\WBC", "_BC").group()
In [47]: re.match(r"\WBC", "$BC").group()
In [48]: re.match(r"\WBC", " BC").group()



3. 匹配多個字符
In [50]: re.match(r"嫦娥\w號", "嫦娥一號升空了", re.A).group()
In [51]: re.match(r"嫦娥\d號", "嫦娥1號升空了", re.A).group()
In [52]: re.match(r"嫦娥\d\d號", "嫦娥11號升空了", re.A).group()
In [53]: re.match(r"嫦娥\d\d\d號", "嫦娥111號升空了", re.A).group()
In [54]: re.match(r"嫦娥\d\d\d\d\d\d號", "嫦娥111111號升空了", re.A).group()

In [55]: re.match(r"嫦娥\d{6}號", "嫦娥111111號升空了", re.A).group()
In [56]: re.match(r"嫦娥\d{6}號", "嫦娥11號升空了", re.A).group()
In [57]: re.match(r"嫦娥\d{2,6}號", "嫦娥11號升空了", re.A).group()
In [58]: re.match(r"嫦娥\d{2,6}號", "嫦娥11111號升空了", re.A).group()

In [59]: re.match(r"嫦娥\d{1,}號", "嫦娥11111號升空了").group()
In [60]: re.match(r"嫦娥\d{1,}號", "嫦娥11111111111號升空了").group()
In [61]: re.match(r"嫦娥\d+號", "嫦娥11111111111號升空了").group()
In [63]: re.match(r"嫦娥\d+號", "嫦娥號升空了").group()

In [64]: re.match(r"嫦娥\d{0,}號", "嫦娥號升空了").group()
In [65]: re.match(r"嫦娥\d{0,}號", "嫦娥1111號升空了").group()
In [66]: re.match(r"嫦娥\d*號", "嫦娥1111號升空了").group()
In [67]: re.match(r"嫦娥\d*號", "嫦娥號升空了").group()

In [81]: re.match(r"嫦娥\d{0,1}號", "嫦娥號升空了").group()
In [82]: re.match(r"嫦娥\d{0,1}號", "嫦娥1號升空了").group()
In [83]: re.match(r"嫦娥\d?號", "嫦娥1號升空了").group()
In [84]: re.match(r"嫦娥\d?號", "嫦娥11號升空了").group()

4. 匹配開始和結束位置
	.在正則中表示匹配除\n以外的任意字符；如果要在正則中表示.本身的含義，使用\.


In [68]: re.match(r"\w@163.com","123456@163.com").group()
In [69]: re.match(r"\w{4,20}@163.com","123456@163.com").group()
In [70]: re.match(r"\w{4,20}@163.com","123456@163Acom").group()
In [71]: re.match(r"\w{4,20}@163\.com","123456@163Acom").group()
In [72]: re.match(r"\w{4,20}@163\.com","123456@163.com").group()

In [73]: re.match(r"\w{4,20}@163\.com","123456@163.com.cc").group()
In [74]: re.match(r"\w{4,20}@163\.com",".ccc.123456@163.com").group()
In [75]: re.match(r"\w{4,20}@163\.com","123456@163.com.cc").group()
In [76]: re.match(r"\w{4,20}@163\.com$","123456@163.com.cc").group()
In [77]: re.search(r"\w{4,20}@163\.com$","123456@163.com.cc").group()
In [78]: re.search(r"\w{4,20}@163\.com$","ccc.123456@163.com").group()
In [79]: re.match(r"\w{4,20}@163\.com$","ccc.123456@163.com").group()

In [80]: re.search(r"^\w{4,20}@163\.com$","ccc.123456@163.com").group()
In [81]: re.search(r"^\w{4,20}@163\.com$","123456@163.com").group()

5. 分組

----------------------()將感興趣的數據放到分組中-------------------------
In [86]: re.match(r"嫦娥\d+號", "嫦娥9號升空了").group()
In [87]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group()
In [88]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group(0)
In [89]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group(1)
In [90]: re.search(r"(^\w{4,20})@163\.com$","123456@163.com").group(1)
In [91]: re.search(r"(^\w{4,20})@(163)\.com$","123456@163.com").group(1)
In [92]: re.search(r"(^\w{4,20})@(163)\.com$","123456@163.com").group(2)

----------------------|匹配左右任何一個正則表達式--------------------------
In [93]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@163.com").group(1)
In [94]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@163.com").group(0)
In [95]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@qq.com").group(0)

---------------------(|)匹配()中任何一個正則表達式並將匹配結果放到分組中-------
In [96]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@qq.com").group(0)
In [97]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@163.com").group(0)
In [98]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@263.com").group(0)
In [99]: re.search(r"(^\w{4,20})@(163|qq|263)\.com$","123456@263.com").group(0)
In [100]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(0)
In [101]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(1)
In [102]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(2)

---------------------引用分組(匿名 只能通過分組號引用)--------------------------------
In [103]: re.match(r"<\w+>.*", "<html>hh</html>").group()
In [104]: re.match(r"<(\w+)>.*", "<html>hh</html>").group()
In [105]: re.match(r"<(\w+)>.*", "<html>hh</html>").group(1)
In [106]: re.match(r"<(\w+)>.*</\1>", "<html>hh</html>").group(1)
In [107]: re.match(r"<(\w+)>.*</\1>", "<html>hh</htm>").group(1)
In [108]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h1></html>").group(1)
In [109]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h1></html>").group()
In [110]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h2></html>").group()


In [111]: re.match(r"(\d{3,4})-(\d{6,8})", "010-12345678").group(1)
In [112]: re.match(r"(\d{3,4})-(\d{6,8})", "010-12345678").group(2)
In [113]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(2)
In [114]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(1)
In [115]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(3)

------------------建立有名分組 給分組起名  使用有名分組------------------------
In [116]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}))", "010-12345678").group(3)
In [118]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group()

In [119]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(1)

In [120]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2)

In [121]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2)

In [122]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(1)

In [123]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2)

In [124]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(3)

6. 高級函數
----------------------------------------------------------------------------
In [127]: ret = re.match(r"\d+", "閱讀次數爲 9999").group()
In [129]: ret = re.search(r"\d+", "閱讀次數爲 9999").group()
In [131]: re.search(r"\d+", "閱讀次數爲 9999").group()

----------------------------------------------------------------------------

In [132]: re.findall(r"\d+", "閱讀次數爲 9999").group()  # 不能夠用.group()
In [133]: re.findall(r"\d+", "閱讀次數爲 9999")
In [134]: re.findall(r"\d+", "閱讀次數爲")  # 沒匹配到返回空列表


----------------------------------------------------------------------------
In [135]: re.sub(r"\d+","999", "python=666")
In [136]: re.sub(r"\d+","999", "python=666 cpp=688")
In [137]: re.sub(r"\d+","999", "python=666 cpp=688", 1)
In [138]: re.sub(r"\d+","999", "python=666 cpp=688")

In [139]: def add(matchobj):
     ...:     data = matchobj.group()
     ...:     number = int(data) + 1000
     ...:     return str(number)

In [140]: re.sub(r"\d+",add, "python=666 cpp=688")

In [141]: data = """
     ...: <div>
     ...:         <p>崗位職責：</p>
     ...: <p>完成推薦算法、數據統計、接口、後臺等服務器端相關工做</p>
     ...: <p><br></p>
     ...: <p>必備要求：</p>
     ...: <p>良好的自我驅動力和職業素養，工做積極主動、結果導向</p>
     ...: <p> <br></p>
     ...: <p>技術要求：</p>
     ...: <p>一、一年以上 Python 開發經驗，掌握面向對象分析和設計，瞭解設計模式</p>
     ...: <p>二、掌握HTTP協議，熟悉MVC、MVVM等概念以及相關WEB開發框架</p>
     ...: <p>三、掌握關係數據庫開發設計，掌握 SQL，熟練使用 MySQL/PostgreSQL 中的一種<br></p>
     ...: <p>四、掌握NoSQL、MQ，熟練使用對應技術解決方案</p>
     ...: <p>五、熟悉 Javascript/CSS/HTML5，JQuery、React、Vue.js</p>
     ...: <p> <br></p>
     ...: <p>加分項：</p>
     ...: <p>大數據，數理統計，機器學習，sklearn，高性能，大併發。</p>
     ...: 
     ...:         </div>"""

In [142]: re.sub(r"<\w+>","",data)
In [143]: re.sub(r"<\w+>|\n","",data)
In [144]: re.sub(r"<\w+>|\n|</\w+>","",data)
In [145]: re.sub(r"</?\w+>|\n|","",data)
In [146]: re.sub(r"</?\w+>|\n| ","",data)

7. 貪婪模式和非貪婪模式<>
----------------------------------------------------------------------------
In [147]: re.split(r"\s","123456_abcdef ghjkl\t12312312")
In [148]: re.split(r"\s|_","123456_abcdef ghjkl\t12312312")
In [149]: re.search(r"\d+", "嫦娥199999號升空了")
In [150]: re.search(r"\d+", "嫦娥1999999999號升空了").group()
In [151]: re.search(r"\d+\d+", "1999999999").group()
In [152]: re.search(r"(\d+)(\d+)", "1999999999").group()
In [153]: re.search(r"(\d+)(\d+)", "1999999999").group(1)
In [154]: re.search(r"(\d+)(\d+)", "1999999999").group(2)
In [155]: re.search(r"(\d+?)(\d+)", "1999999999").group(2)

In [156]: re.search(r"(\d+?)(\d+)", "1999999999").group(1)
In [157]: url = """<img data-original="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_2016111319
     ...: 17_small.jpg" src="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_201611131917_small.j
     ...: pg" style="display: inline;">
     ...: """


In [158]: re.search(r"https:.*\.jpg",url).group()
In [159]: re.search(r"https:.*?\.jpg",url).group()

8 r標識原生字符串
----------------------------------------------------------------------------
In [160]: path = "c:\a\b\c"
In [161]: path
Out[161]: 'c:\x07\x08\\c'

In [162]: print(path)
c\c
In [163]: path = "c:\\a\\b\\c"

In [164]: path
Out[164]: 'c:\\a\\b\\c'

In [165]: path = "c:\\a\\b\c"

In [166]: path
Out[166]: 'c:\\a\\b\\c'

In [167]: re.match("c:\\a",path).group()
In [168]: re.match(r"c:\\a",path).group()
In [169]: re.match("c:\\\\a",path).group()
In [170]: re.match("c:\\\\a\\\\b",path).group()
In [171]: re.match("c:\\\\a\\\\b\\\\c",path).group()
In [172]: re.match(r"c:\\a\\b\\c",path).group()

相關標籤/搜索

Python

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。