regex(python)

正則表達式

 

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/26 16:39
# @Author  : jackendoff
# @Site    : 
# @File    : 正則學習.py
# @Software: PyCharm

# re模塊的使用過程
import re
# '''
# re模塊的使用過程
# re.match(pattern, string, flags=0)
# 從頭匹配一個符合規則的字符串,從起始位置開始匹配,匹配成功返回一個對象,未匹配成功返回None
#  · pattern: 正則模型
#  · string: 要匹配的字符
#  · flags: 匹配模式
#  這個方法並非完全匹配:當pattern結束時若string還有剩餘字符,仍視爲成功;想要完全匹配,可以在表達式末尾加上邊界匹配符'$'
# '''
# result = re.match('^\d{3}$', '124')
# print(result.group())  # 返回被re匹配的字符串124
# print(result.start())  # 返回匹配開始的位置0
# print(result.end())  # 返回匹配結束的位置3
# print(result.span())  # 返回一個元組包含匹配(開始,結束)的位置(0, 3)




'''
re.search函數會在字符串內查找模式匹配,只要找到第一個匹配而後返回,若是字符串沒有匹配,則返回None
格式:re.search(pattern, string, flags=0)
match()和search()的區別:
match()函數只檢測re是否是在string的開始位置匹配,search()會掃描
整個string查找匹配;
也就是說match()只有在0位置匹配成功的話纔有返回,如果不是在開始位置匹配成功,match()就返回None。

'''
# import re
# ret = re.search(r'\d+', '閱讀次數爲9999')
# print(ret.group())  # 運行結果9999
#
# print(re.match('super', 'superstition').span())  # 結果(0, 5)
# print(re.match('super', 'insuperstition'))  # 結果None
# print(re.search('super', 'superstition').span())  # 結果(0, 5)
# print(re.search('super', 'insuperstition').span())  # 結果(2, 7)




'''
re.findall遍歷匹配,能夠獲取字符串中全部匹配字符串,返回一個列表
'''
# import re
#
# ret = re.findall(r'\d+', '閱讀次數:9999次,轉發次數:887次,評論次數:3次')
# print(ret)  # 運行結果['9999', '887', '3']




'''
sub將匹配到的數據進行替換
使用re替換string中每個匹配的字符串後返回替換後的字符串
格式:re.sub(pattern, repl, string,count)
'''

# import re
#
# ret = re.sub(r'\d+', '10000', '閱讀次數:9999次, 轉發次數:883次, 評論次數:3次')
# print(ret)  # 運行結果 閱讀次數:10000次, 轉發次數:10000次, 評論次數:10000次

import re
# def add(temp):
#     strNum = temp.group()
#     print(strNum)  # 運行結果997
#     num = int(strNum) + 1
#     return str(num)  # 返回一個字符串

# ret = re.sub(r'\d+', add, 'python = 997')  # 能夠分解爲result = re.search(r'\d+', 'python=997') --》add(result)
# print(ret)  # 運行結果python = 998
#
# ret = re.sub(r'\d+', add, 'python = 99')
# print(ret)  # 運行結果python = 100

'''
import re
def add(temp):
    strNum = temp.group()
    print(strNum)  # 運行結果997
    num = int(strNum) + 1
    return str(num)
result = re.search(r'\d+', 'python=997')
print(add(result))
'''




'''
split根據匹配進行切割字符串,並返回一個列表
按照能夠匹配pattern的子串將string分割,並返回列表。
可以使用re.split來分割字符串
格式:re.split(pattern, string[, maxsplit])

'''

import re

# Cut the sample string at every colon or single space; re.split returns
# the pieces between the matches as a list.
ret = re.split(pattern=r":| ", string='info:xiaozhang 33 shandong')
print(ret)  # prints ['info', 'xiaozhang', '33', 'shandong']



'''
python裏數量詞默認是貪婪的,總是儘量多地匹配字符;
非貪婪則相反,總是嘗試匹配儘量少的字符
在'*'、'+'、'?'、'{m,n}'後面加上?,使貪婪變爲非貪婪。
'''

 

  

 

量詞:

  * 重複0次或更多次

  + 重複一次或更多次

  ? 重複0次或一次

  {n} 重複n次

  {n,} 重複n次或更多次

  {n,m} 重複n到m次

 

  貪心匹配(默認)

  惰性匹配: 量詞 + '?'

  .*?x 前面取任意長度字符直到取到x

字符:

  . 匹配除換行之外的任意字符

  \w 匹配字母或數字或下劃線

  \d 匹配數字(digit)

  \s 匹配任意空白符(space)

  \n 匹配一個換行符

  \t 匹配一個製表符(TAB)

  \b 匹配一個單詞的邊界(開頭或結尾)

  ^ 匹配字符串的開始

  $ 匹配字符串的結尾

  \W 匹配非字母或數字或下劃線

  \D 匹配非數字

  \S 匹配非空白符

  a|b 匹配字符a或者字符b

  ( ) 匹配括號內的表達式,也表示一個組(search)

  [...] 匹配字符組中的字符

  [^...] 匹配除字符組之外的全部字符

 

python re模塊

python正則

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # @Time    : 2018/7/8 9:27
 4 # @Author  : jackendoff
 5 # @Site    : 
 6 # @File    : text.py
 7 # @Software: PyCharm
 8 
 9 import re
10 
11 # find_str = re.findall('a', 'jai jack')  # 匹配全部,並顯示全部
12 # print(find_str)
13 #
14 # search_str = re.search('a', 'jia jack')  # 匹配找到的第一個,並顯示
15 # print(search_str)
16 # search_gp = search_str.group()
17 # print(search_gp)
18 #
19 # match_str = re.match('a', 'a bc')  # 匹配第一個
20 # print(match_str)
21 # match_gp = match_str.group()
22 # print(match_gp)
23 #
24 # find_str = re.findall('[a-z]\d', 'ji58a991')  # 匹配a-z和數字
25 # print(find_str)
26 # find_str = re.findall('([a-z])\d', 'ji58a991')  # 匹配a-z和數字
27 # print(find_str)
28 # find_str = re.findall('(?:[a-z])\d', 'ji58a991')  # ()表明‘組’在findall裏優先顯示  '?:'表明‘取消優先’
29 # print(find_str)
30 #
31 # search_str = re.search('([a-z])(\d)', 'jia123nfdj8684')  # 匹配a-z和數字 分組
32 # search_gpp = search_str.group()  # 顯示匹配的數字和字母
33 # print(search_gpp)
34 # search_gp = search_str.group(1)  # 顯示第一個組
35 # print(search_gp)
36 # search_gp = search_str.group(2)  # 顯示第二個組
37 # print(search_gp)
38 
39 # sp_str = re.split('\d', 'jia4jfa5f78a')  # 依據數字分割
40 # print(sp_str)
41 # split_str = re.split('[a1]', 'jiafd54f1k')  # 先根據a分割,再根據1分割
42 # print(split_str)
43 #
44 # sub_str = re.sub('[\d]', '0', 'jia123fd6', 2)  # 將數字替換成0,2表示只替換兩個
45 # print(sub_str)
46 # sub_str = re.subn('[\d]', '0', 'jia123fd6')  # 將數字替換成0,返回一個元組(替換的結果,替換了多少次)
47 # print(sub_str)
48 
49 # obj = re.compile('\d{3}')  # 將正則表達式編譯成一個正則對象(當表達式特別複雜的時候此對象能夠屢次使用)匹配三個數字
50 # obj_str = obj.search('jia123kill')  # 直接使用對象調用search進行匹配
51 # o_str = obj_str.group()
52 # print(o_str)
53 
54 # ret = re.finditer('\d', 'jia5fd55fs6f5')  # finditer返回一個存放匹配結果的迭代器(節省內存)
55 # print(ret)
56 # # for i in ret:
57 #     print(i.group())

 正則案例,windows命令行  超多

1. 初步使用
In [1]: import re
In [11]: re.match(r"abc","abcde").group()
In [12]: re.match(r"bc","abcde").group()
In [14]: re.search(r"bc","abcde").group()

2. 匹配單個字符
-----------------------.匹配任意字符(除\n)-------------------------------
In [5]: re.match(r".","A").group()
In [6]: re.match(r".","a").group()
In [7]: re.match(r".",".").group()
In [8]: re.match(r".","\n").group()
In [9]: re.match(r".","x").group()
In [10]: re.match(r"A.C","ABC").group()

-------------------------[]匹配集合中任何一個字符---------------------------
In [11]: re.match(r"[aA]BC","ABC").group()
In [12]: re.match(r"[aA]BC","aBC").group()
In [13]: re.match(r"[aA]BC","xBC").group()
In [14]: re.match(r"[0123456789]BC","1BC").group()
In [15]: re.match(r"[0123456789]BC","9BC").group()
-------------------------[-]表示範圍-----------------------------
In [16]: re.match(r"[0-9]BC","9BC").group()
In [17]: re.match(r"[0-9]BC","2BC").group()
In [18]: re.match(r"[0-9a-zA-Z]BC","ABC").group()
In [19]: re.match(r"[0-9a-zA-Z]BC","aBC").group()
In [21]: re.match(r"[0-9a-zA-Z]BC","&BC").group()
In [22]: re.match(r"[0-35-9]BC","1BC").group()
In [24]: re.match(r"[0-35-9]BC","4BC").group()
-------------------------[^]對匹配範圍取反-----------------------------
In [25]: re.match(r"[^4]BC","4BC").group()
In [26]: re.match(r"[^4]BC","1BC").group()
In [27]: re.match(r"[^4]BC","9BC").group()
In [28]: re.match(r"[^4a-z]BC","aBC").group()
In [29]: re.match(r"[^4a-z]BC","zBC").group()
In [30]: re.match(r"[^4a-z]BC","ZBC").group()

-------------------------\d匹配數字字符 \D匹配非數字---------------------
In [31]: re.match(r"[0-9]BC","4BC").group()
In [32]: re.match(r"\dBC","4BC").group()
In [33]: re.match(r"\dBC","2BC").group()

In [34]: re.match(r"\DBC","2BC").group()
In [35]: re.match(r"\DBC","aBC").group()
In [36]: re.match(r"\DBC","xBC").group()
--------------------------\w匹配單詞字符 \W匹配非單詞------------------------
In [38]: re.match(r"\wBC", "ABC").group()
In [39]: re.match(r"\wBC", "aBC").group()
In [40]: re.match(r"\wBC", "1BC").group()
In [41]: re.match(r"\wBC", "_BC").group()
In [42]: re.match(r"[\da-zA-Z_]BC", "1BC").group()
In [43]: re.match(r"[\da-zA-Z_]BC", "ABC").group()
In [44]: re.match(r"[\da-zA-Z_]BC", "aBC").group()
In [45]: re.match(r"[\da-zA-Z_]BC", "_BC").group()

In [46]: re.match(r"\WBC", "_BC").group()
In [47]: re.match(r"\WBC", "$BC").group()
In [48]: re.match(r"\WBC", " BC").group()



3. 匹配多個字符
In [50]: re.match(r"嫦娥\w號", "嫦娥一號升空了", re.A).group()
In [51]: re.match(r"嫦娥\d號", "嫦娥1號升空了", re.A).group()
In [52]: re.match(r"嫦娥\d\d號", "嫦娥11號升空了", re.A).group()
In [53]: re.match(r"嫦娥\d\d\d號", "嫦娥111號升空了", re.A).group()
In [54]: re.match(r"嫦娥\d\d\d\d\d\d號", "嫦娥111111號升空了", re.A).group()

In [55]: re.match(r"嫦娥\d{6}號", "嫦娥111111號升空了", re.A).group()
In [56]: re.match(r"嫦娥\d{6}號", "嫦娥11號升空了", re.A).group()
In [57]: re.match(r"嫦娥\d{2,6}號", "嫦娥11號升空了", re.A).group()
In [58]: re.match(r"嫦娥\d{2,6}號", "嫦娥11111號升空了", re.A).group()

In [59]: re.match(r"嫦娥\d{1,}號", "嫦娥11111號升空了").group()
In [60]: re.match(r"嫦娥\d{1,}號", "嫦娥11111111111號升空了").group()
In [61]: re.match(r"嫦娥\d+號", "嫦娥11111111111號升空了").group()
In [63]: re.match(r"嫦娥\d+號", "嫦娥號升空了").group()

In [64]: re.match(r"嫦娥\d{0,}號", "嫦娥號升空了").group()
In [65]: re.match(r"嫦娥\d{0,}號", "嫦娥1111號升空了").group()
In [66]: re.match(r"嫦娥\d*號", "嫦娥1111號升空了").group()
In [67]: re.match(r"嫦娥\d*號", "嫦娥號升空了").group()

In [81]: re.match(r"嫦娥\d{0,1}號", "嫦娥號升空了").group()
In [82]: re.match(r"嫦娥\d{0,1}號", "嫦娥1號升空了").group()
In [83]: re.match(r"嫦娥\d?號", "嫦娥1號升空了").group()
In [84]: re.match(r"嫦娥\d?號", "嫦娥11號升空了").group()

4. 匹配開始和結束位置
	.在正則中表示匹配除\n以外的任意字符  如果要在正則中表示.本身的含義 使用\.


In [68]: re.match(r"\w@163.com","123456@163.com").group()
In [69]: re.match(r"\w{4,20}@163.com","123456@163.com").group()
In [70]: re.match(r"\w{4,20}@163.com","123456@163Acom").group()
In [71]: re.match(r"\w{4,20}@163\.com","123456@163Acom").group()
In [72]: re.match(r"\w{4,20}@163\.com","123456@163.com").group()

In [73]: re.match(r"\w{4,20}@163\.com","123456@163.com.cc").group()
In [74]: re.match(r"\w{4,20}@163\.com",".ccc.123456@163.com").group()
In [75]: re.match(r"\w{4,20}@163\.com","123456@163.com.cc").group()
In [76]: re.match(r"\w{4,20}@163\.com$","123456@163.com.cc").group()
In [77]: re.search(r"\w{4,20}@163\.com$","123456@163.com.cc").group()
In [78]: re.search(r"\w{4,20}@163\.com$","ccc.123456@163.com").group()
In [79]: re.match(r"\w{4,20}@163\.com$","ccc.123456@163.com").group()

In [80]: re.search(r"^\w{4,20}@163\.com$","ccc.123456@163.com").group()
In [81]: re.search(r"^\w{4,20}@163\.com$","123456@163.com").group()

5. 分組

----------------------()將感興趣的數據放到分組中-------------------------
In [86]: re.match(r"嫦娥\d+號", "嫦娥9號升空了").group()
In [87]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group()
In [88]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group(0)
In [89]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group(1)
In [90]: re.search(r"(^\w{4,20})@163\.com$","123456@163.com").group(1)
In [91]: re.search(r"(^\w{4,20})@(163)\.com$","123456@163.com").group(1)
In [92]: re.search(r"(^\w{4,20})@(163)\.com$","123456@163.com").group(2)

----------------------|匹配左右任何一個正則表達式--------------------------
In [93]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@163.com").group(1)
In [94]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@163.com").group(0)
In [95]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@qq.com").group(0)

---------------------(|)匹配()中任何一個正則表達式並將匹配結果放到分組中-------
In [96]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@qq.com").group(0)
In [97]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@163.com").group(0)
In [98]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@263.com").group(0)
In [99]: re.search(r"(^\w{4,20})@(163|qq|263)\.com$","123456@263.com").group(0)
In [100]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(0)
In [101]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(1)
In [102]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(2)

---------------------引用分組(匿名 只能經過分組號引用)--------------------------------
In [103]: re.match(r"<\w+>.*", "<html>hh</html>").group()
In [104]: re.match(r"<(\w+)>.*", "<html>hh</html>").group()
In [105]: re.match(r"<(\w+)>.*", "<html>hh</html>").group(1)
In [106]: re.match(r"<(\w+)>.*</\1>", "<html>hh</html>").group(1)
In [107]: re.match(r"<(\w+)>.*</\1>", "<html>hh</htm>").group(1)
In [108]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h1></html>").group(1)
In [109]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h1></html>").group()
In [110]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h2></html>").group()


In [111]: re.match(r"(\d{3,4})-(\d{6,8})", "010-12345678").group(1)
In [112]: re.match(r"(\d{3,4})-(\d{6,8})", "010-12345678").group(2)
In [113]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(2)
In [114]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(1)
In [115]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(3)

------------------建立有名分組 給分組起名  使用有名分組------------------------
In [116]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}))", "010-12345678").group(3)
In [118]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group()

In [119]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(1)

In [120]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2)

In [121]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2)

In [122]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(1)

In [123]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2)

In [124]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(3)

6. 高級函數
----------------------------------------------------------------------------
In [127]: ret = re.match(r"\d+", "閱讀次數爲 9999").group()
In [129]: ret = re.search(r"\d+", "閱讀次數爲 9999").group()
In [131]: re.search(r"\d+", "閱讀次數爲 9999").group()

----------------------------------------------------------------------------

In [132]: re.findall(r"\d+", "閱讀次數爲 9999").group()  # 不能夠用.group()
In [133]: re.findall(r"\d+", "閱讀次數爲 9999")
In [134]: re.findall(r"\d+", "閱讀次數爲")  # 沒匹配到返回空列表


----------------------------------------------------------------------------
In [135]: re.sub(r"\d+","999", "python=666")
In [136]: re.sub(r"\d+","999", "python=666 cpp=688")
In [137]: re.sub(r"\d+","999", "python=666 cpp=688", 1)
In [138]: re.sub(r"\d+","999", "python=666 cpp=688")

In [139]: def add(matchobj):
     ...:     data = matchobj.group()
     ...:     number = int(data) + 1000
     ...:     return str(number)

In [140]: re.sub(r"\d+",add, "python=666 cpp=688")

In [141]: data = """
     ...: <div>
     ...:         <p>崗位職責:</p>
     ...: <p>完成推薦算法、數據統計、接口、後臺等服務器端相關工做</p>
     ...: <p><br></p>
     ...: <p>必備要求:</p>
     ...: <p>良好的自我驅動力和職業素養,工做積極主動、結果導向</p>
     ...: <p> <br></p>
     ...: <p>技術要求:</p>
     ...: <p>一、一年以上 Python 開發經驗,掌握面向對象分析和設計,瞭解設計模式</p>
     ...: <p>二、掌握HTTP協議,熟悉MVC、MVVM等概念以及相關WEB開發框架</p>
     ...: <p>三、掌握關係數據庫開發設計,掌握 SQL,熟練使用 MySQL/PostgreSQL 中的一種<br></p>
     ...: <p>四、掌握NoSQL、MQ,熟練使用對應技術解決方案</p>
     ...: <p>五、熟悉 Javascript/CSS/HTML5,JQuery、React、Vue.js</p>
     ...: <p> <br></p>
     ...: <p>加分項:</p>
     ...: <p>大數據,數理統計,機器學習,sklearn,高性能,大併發。</p>
     ...: 
     ...:         </div>"""

In [142]: re.sub(r"<\w+>","",data)
In [143]: re.sub(r"<\w+>|\n","",data)
In [144]: re.sub(r"<\w+>|\n|</\w+>","",data)
In [145]: re.sub(r"</?\w+>|\n|","",data)
In [146]: re.sub(r"</?\w+>|\n| ","",data)

7. 貪婪模式和非貪婪模式<>
----------------------------------------------------------------------------
In [147]: re.split(r"\s","123456_abcdef ghjkl\t12312312")
In [148]: re.split(r"\s|_","123456_abcdef ghjkl\t12312312")
In [149]: re.search(r"\d+", "嫦娥199999號升空了")
In [150]: re.search(r"\d+", "嫦娥1999999999號升空了").group()
In [151]: re.search(r"\d+\d+", "1999999999").group()
In [152]: re.search(r"(\d+)(\d+)", "1999999999").group()
In [153]: re.search(r"(\d+)(\d+)", "1999999999").group(1)
In [154]: re.search(r"(\d+)(\d+)", "1999999999").group(2)
In [155]: re.search(r"(\d+?)(\d+)", "1999999999").group(2)

In [156]: re.search(r"(\d+?)(\d+)", "1999999999").group(1)
In [157]: url = """<img data-original="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_2016111319
     ...: 17_small.jpg" src="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_201611131917_small.j
     ...: pg" style="display: inline;">
     ...: """


In [158]: re.search(r"https:.*\.jpg",url).group()
In [159]: re.search(r"https:.*?\.jpg",url).group()

8 r標識原生字符串
----------------------------------------------------------------------------
In [160]: path = "c:\a\b\c"
In [161]: path
Out[161]: 'c:\x07\x08\\c'

In [162]: print(path)
c\c
In [163]: path = "c:\\a\\b\\c"

In [164]: path
Out[164]: 'c:\\a\\b\\c'

In [165]: path = "c:\\a\\b\c"
In [166]: path
Out[166]: 'c:\\a\\b\\c'

In [167]: re.match("c:\\a",path).group()
In [168]: re.match(r"c:\\a",path).group()
In [169]: re.match("c:\\\\a",path).group()
In [170]: re.match("c:\\\\a\\\\b",path).group()
In [171]: re.match("c:\\\\a\\\\b\\\\c",path).group()
In [172]: re.match(r"c:\\a\\b\\c",path).group()
相關文章
相關標籤/搜索