正則表達式
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2018/7/26 16:39 # @Author : jackendoff # @Site : # @File : 正則學習.py # @Software: PyCharm # re模塊的使用過程 import re # ''' # re模塊的使用過程 # re.match(pattern, string, flags=0) # 從頭匹配一個符合規則的字符串,從起始位置開始匹配,匹配成功返回一個對象,未匹配成功返回None # · pattern: 正則模型 # · string: 要匹配的字符 # · flags: 匹配模式 # 這個方法並非徹底匹配,當pattern結束時若string還有剩餘字符,仍視爲成功,想要徹底匹配,能夠在表達式末尾加上邊界匹配符‘$’ # ''' # result = re.match('^\d{3}$', '124') # print(result.group()) # 返回被re匹配的字符串124 # print(result.start()) # 返回匹配開始的位置0 # print(result.end()) # 返回匹配結束的位置3 # print(result.span()) # 返回一個元組包含匹配(開始,結束)的位置(0, 3) ''' re.search函數會在字符串內查找模式匹配,只要找到第一個匹配而後返回,若是字符串沒有匹配,則返回None 格式:re.search(pattern, string, flags=0) match()和search()的區別: match()函數只檢測re是否是在string的開始位置匹配,search()會掃描 整個string查找匹配; 也就是說match()只有在0位置匹配成功的化纔有返回,若是不是開始位置匹配成功的話,match()就返回none。 ''' # import re # ret = re.search(r'\d+', '閱讀次數爲9999') # print(ret.group()) # 運行結果9999 # # print(re.match('super', 'superstition').span()) # 結果(0, 5) # print(re.match('super', 'insuperstition')) # 結果None # print(re.search('super', 'superstition').span()) # 結果(0, 5) # print(re.search('super', 'insuperstition').span()) # 結果(2, 7) ''' re.findall遍歷匹配,能夠獲取字符串中全部匹配字符串,返回一個列表 ''' # import re # # ret = re.findall(r'\d+', '閱讀次數:9999次,轉發次數:887次,評論次數:3次') # print(ret) # 運行結果['9999', '887', '3'] ''' sub將匹配到的數據進行替換 使用re替換string中每個匹配的字符串後返回替換後的字符串 格式:re.sub(pattern, repl, string,count) ''' # import re # # ret = re.sub(r'\d+', '10000', '閱讀次數:9999次, 轉發次數:883次, 評論次數:3次') # print(ret) # 運行結果 閱讀次數:10000次, 轉發次數:10000次, 評論次數:10000次 import re # def add(temp): # strNum = temp.group() # print(strNum) # 運行結果997 # num = int(strNum) + 1 # return str(num) # 返回一個字符串 # ret = re.sub(r'\d+', add, 'python = 997') # 能夠分解爲result = re.search(r'\d+', 'python=997') --》add(result) # print(ret) # 運行結果python = 998 # # ret = re.sub(r'\d+', add, 'python = 99') # print(ret) # 運行結果python = 100 ''' import re def add(temp): strNum = temp.group() print(strNum) # 運行結果997 num = int(strNum) 
+ 1 return str(num) result = re.search(r'\d+', 'python=997') print(add(result)) ''' ''' split根據匹配進行切割字符串,並返回一個列表 按照可以匹配的字符串string分割都返回列表。 可使用re.split來分割字符串 格式:re.split(pattern, string[, maxsplit]) ''' import re ret = re.split(r":| ", 'info:xiaozhang 33 shandong') print(ret) # 運行結果 ['info', 'xiaozhang', '33', 'shandong'] ''' python裏數量詞默認是貪婪的,總是盡可能多地匹配字符; 非貪婪則相反,總是嘗試匹配盡可能少的字符 在'*','+','?','{m,n}'後面加上?,使貪婪變爲非貪婪。 '''
量詞:
* 重複0次或更多次
+ 重複一次或更多次
? 重複0次或一次
{n} 重複n次
{n,} 重複n次或更多次
{n,m} 重複n到m次
貪心匹配(默認)
惰性匹配: 量詞 + '?'
.*?x 前面取任意長度字符直到取到x
字符:
. 匹配除換行之外的任意字符
\w 匹配字母或數字或下劃線
\d 匹配數字(digit)
\s 匹配任意空白符(space)
\n 匹配一個換行符
\t 匹配一個製表符(TAB)
\b 匹配單詞的邊界(開頭或結尾)
^ 匹配字符串的開始
$ 匹配字符串的結尾
\W 匹配非字母、數字、下劃線的字符
\D 匹配非數字
\S 匹配非空白符
a|b 匹配字符a或者字符b
( ) 匹配括號內的表達式,也表示一個組(search)
[...] 匹配字符組中的字符
[^...] 匹配除字符組之外的全部字符
python re模塊
python正則
1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # @Time : 2018/7/8 9:27 4 # @Author : jackendoff 5 # @Site : 6 # @File : text.py 7 # @Software: PyCharm 8 9 import re 10 11 # find_str = re.findall('a', 'jai jack') # 匹配全部,並顯示全部 12 # print(find_str) 13 # 14 # search_str = re.search('a', 'jia jack') # 匹配找到的第一個,並顯示 15 # print(search_str) 16 # search_gp = search_str.group() 17 # print(search_gp) 18 # 19 # match_str = re.match('a', 'a bc') # 匹配第一個 20 # print(match_str) 21 # match_gp = match_str.group() 22 # print(match_gp) 23 # 24 # find_str = re.findall('[a-z]\d', 'ji58a991') # 匹配a-z和數字 25 # print(find_str) 26 # find_str = re.findall('([a-z])\d', 'ji58a991') # 匹配a-z和數字 27 # print(find_str) 28 # find_str = re.findall('(?:[a-z])\d', 'ji58a991') # ()表明‘組’在findall裏優先顯示 '?:'表明‘取消優先’ 29 # print(find_str) 30 # 31 # search_str = re.search('([a-z])(\d)', 'jia123nfdj8684') # 匹配a-z和數字 分組 32 # search_gpp = search_str.group() # 顯示匹配的數字和字母 33 # print(search_gpp) 34 # search_gp = search_str.group(1) # 顯示第一個組 35 # print(search_gp) 36 # search_gp = search_str.group(2) # 顯示第二個組 37 # print(search_gp) 38 39 # sp_str = re.split('\d', 'jia4jfa5f78a') # 依據數字分割 40 # print(sp_str) 41 # split_str = re.split('[a1]', 'jiafd54f1k') # 先根據a分割,仔根據1分割 42 # print(split_str) 43 # 44 # sub_str = re.sub('[\d]', '0', 'jia123fd6', 2) # 將數字替換成0,2表示只替換兩個 45 # print(sub_str) 46 # sub_str = re.subn('[\d]', '0', 'jia123fd6') # 將數字替換成0,返回一個元組(替換的結果,替換了多少次) 47 # print(sub_str) 48 49 # obj = re.compile('\d{3}') # 將正則表達式編譯成一個正則對象(當表達式特別複雜的時候此對象能夠屢次使用)匹配三個數字 50 # obj_str = obj.search('jia123kill') # 直接使用對象調用search進行匹配 51 # o_str = obj_str.group() 52 # print(o_str) 53 54 # ret = re.finditer('\d', 'jia5fd55fs6f5') # finditer返回一個存放匹配結果的迭代器(節省內存) 55 # print(ret) 56 # # for i in ret: 57 # print(i.group())
正則案例,windows命令行 超多
1. 初步使用 In [1]: import re In [11]: re.match(r"abc","abcde").group() In [12]: re.match(r"bc","abcde").group() In [14]: re.search(r"bc","abcde").group() 2. 匹配單個字符 -----------------------.匹配任意字符(除\n)------------------------------- In [5]: re.match(r".","A").group() In [6]: re.match(r".","a").group() In [7]: re.match(r".",".").group() In [8]: re.match(r".","\n").group() In [9]: re.match(r".","x").group() In [10]: re.match(r"A.C","ABC").group() -------------------------[]匹配集合中任何一個字符--------------------------- In [11]: re.match(r"[aA]BC","ABC").group() In [12]: re.match(r"[aA]BC","aBC").group() In [13]: re.match(r"[aA]BC","xBC").group() In [14]: re.match(r"[0123456789]BC","1BC").group() In [15]: re.match(r"[0123456789]BC","9BC").group() -------------------------[-]表示範圍----------------------------- In [16]: re.match(r"[0-9]BC","9BC").group() In [17]: re.match(r"[0-9]BC","2BC").group() In [18]: re.match(r"[0-9a-zA-Z]BC","ABC").group() In [19]: re.match(r"[0-9a-zA-Z]BC","aBC").group() In [21]: re.match(r"[0-9a-zA-Z]BC","&BC").group() In [22]: re.match(r"[0-35-9]BC","1BC").group() In [24]: re.match(r"[0-35-9]BC","4BC").group() -------------------------[^]對匹配範圍取反----------------------------- In [25]: re.match(r"[^4]BC","4BC").group() In [26]: re.match(r"[^4]BC","1BC").group() In [27]: re.match(r"[^4]BC","9BC").group() In [28]: re.match(r"[^4a-z]BC","aBC").group() In [29]: re.match(r"[^4a-z]BC","zBC").group() In [30]: re.match(r"[^4a-z]BC","ZBC").group() -------------------------\d匹配數字字符 \D匹配非數字--------------------- In [31]: re.match(r"[0-9]BC","4BC").group() In [32]: re.match(r"\dBC","4BC").group() In [33]: re.match(r"\dBC","2BC").group() In [34]: re.match(r"\DBC","2BC").group() In [35]: re.match(r"\DBC","aBC").group() In [36]: re.match(r"\DBC","xBC").group() --------------------------\w匹配單詞字符 \w匹配非單詞------------------------ In [38]: re.match(r"\wBC", "ABC").group() In [39]: re.match(r"\wBC", "aBC").group() In [40]: re.match(r"\wBC", "1BC").group() In [41]: re.match(r"\wBC", 
"_BC").group() In [42]: re.match(r"[\da-zA-Z_]BC", "1BC").group() In [43]: re.match(r"[\da-zA-Z_]BC", "ABC").group() In [44]: re.match(r"[\da-zA-Z_]BC", "aBC").group() In [45]: re.match(r"[\da-zA-Z_]BC", "_BC").group() In [46]: re.match(r"\WBC", "_BC").group() In [47]: re.match(r"\WBC", "$BC").group() In [48]: re.match(r"\WBC", " BC").group() 3. 匹配多個字符 In [50]: re.match(r"嫦娥\w號", "嫦娥一號升空了", re.A).group() In [51]: re.match(r"嫦娥\d號", "嫦娥1號升空了", re.A).group() In [52]: re.match(r"嫦娥\d\d號", "嫦娥11號升空了", re.A).group() In [53]: re.match(r"嫦娥\d\d\d號", "嫦娥111號升空了", re.A).group() In [54]: re.match(r"嫦娥\d\d\d\d\d\d號", "嫦娥111111號升空了", re.A).group() In [55]: re.match(r"嫦娥\d{6}號", "嫦娥111111號升空了", re.A).group() In [56]: re.match(r"嫦娥\d{6}號", "嫦娥11號升空了", re.A).group() In [57]: re.match(r"嫦娥\d{2,6}號", "嫦娥11號升空了", re.A).group() In [58]: re.match(r"嫦娥\d{2,6}號", "嫦娥11111號升空了", re.A).group() In [59]: re.match(r"嫦娥\d{1,}號", "嫦娥11111號升空了").group() In [60]: re.match(r"嫦娥\d{1,}號", "嫦娥11111111111號升空了").group() In [61]: re.match(r"嫦娥\d+號", "嫦娥11111111111號升空了").group() In [63]: re.match(r"嫦娥\d+號", "嫦娥號升空了").group() In [64]: re.match(r"嫦娥\d{0,}號", "嫦娥號升空了").group() In [65]: re.match(r"嫦娥\d{0,}號", "嫦娥1111號升空了").group() In [66]: re.match(r"嫦娥\d*號", "嫦娥1111號升空了").group() In [67]: re.match(r"嫦娥\d*號", "嫦娥號升空了").group() In [81]: re.match(r"嫦娥\d{0,1}號", "嫦娥號升空了").group() In [82]: re.match(r"嫦娥\d{0,1}號", "嫦娥1號升空了").group() In [83]: re.match(r"嫦娥\d?號", "嫦娥1號升空了").group() In [84]: re.match(r"嫦娥\d?號", "嫦娥11號升空了").group() 4. 匹配開始和結束位置 .在正則中表示匹配除\n以外的任意字符 若是要再正則中表示.自己的含義 使用\. 
In [68]: re.match(r"\w@163.com","123456@163.com").group() In [69]: re.match(r"\w{4,20}@163.com","123456@163.com").group() In [70]: re.match(r"\w{4,20}@163.com","123456@163Acom").group() In [71]: re.match(r"\w{4,20}@163\.com","123456@163Acom").group() In [72]: re.match(r"\w{4,20}@163\.com","123456@163.com").group() In [73]: re.match(r"\w{4,20}@163\.com","123456@163.com.cc").group() In [74]: re.match(r"\w{4,20}@163\.com",".ccc.123456@163.com").group() In [75]: re.match(r"\w{4,20}@163\.com","123456@163.com.cc").group() In [76]: re.match(r"\w{4,20}@163\.com$","123456@163.com.cc").group() In [77]: re.search(r"\w{4,20}@163\.com$","123456@163.com.cc").group() In [78]: re.search(r"\w{4,20}@163\.com$","ccc.123456@163.com").group() In [79]: re.match(r"\w{4,20}@163\.com$","ccc.123456@163.com").group() In [80]: re.search(r"^\w{4,20}@163\.com$","ccc.123456@163.com").group() In [81]: re.search(r"^\w{4,20}@163\.com$","123456@163.com").group() 5. 分組 ----------------------()將感興趣的數據放到分組中------------------------- In [86]: re.match(r"嫦娥\d+號", "嫦娥9號升空了").group() In [87]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group() In [88]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group(0) In [89]: re.match(r"嫦娥(\d+)號", "嫦娥9號升空了").group(1) In [90]: re.search(r"(^\w{4,20})@163\.com$","123456@163.com").group(1) In [91]: re.search(r"(^\w{4,20})@(163)\.com$","123456@163.com").group(1) In [92]: re.search(r"(^\w{4,20})@(163)\.com$","123456@163.com").group(2) ----------------------|匹配左右任何一個正則表達式-------------------------- In [93]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@163.com").group(1) In [94]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@163.com").group(0) In [95]: re.search(r"(^\w{4,20})@163\.com$|^\w{4,20}@qq\.com$","123456@qq.com").group(0) ---------------------(|)匹配()中任何一個正則表達式並將匹配結果放到分組中------- In [96]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@qq.com").group(0) In [97]: re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@163.com").group(0) In [98]: 
re.search(r"(^\w{4,20})@(163|qq)\.com$","123456@263.com").group(0) In [99]: re.search(r"(^\w{4,20})@(163|qq|263)\.com$","123456@263.com").group(0) In [100]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(0) In [101]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(1) In [102]: re.search(r"(^\w{4,20})@(163|qq|263|126)\.com$","123456@263.com").group(2) ---------------------引用分組(匿名 只能經過分組號引用)-------------------------------- In [103]: re.match(r"<\w+>.*", "<html>hh</html>").group() In [104]: re.match(r"<(\w+)>.*", "<html>hh</html>").group() In [105]: re.match(r"<(\w+)>.*", "<html>hh</html>").group(1) In [106]: re.match(r"<(\w+)>.*</\1>", "<html>hh</html>").group(1) In [107]: re.match(r"<(\w+)>.*</\1>", "<html>hh</htm>").group(1) In [108]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h1></html>").group(1) In [109]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h1></html>").group() In [110]: re.match(r"<(\w+)><(\w+)>(.*)</\2></\1>", "<html><h1>www.itcast.cn</h2></html>").group() In [111]: re.match(r"(\d{3,4})-(\d{6,8})", "010-12345678").group(1) In [112]: re.match(r"(\d{3,4})-(\d{6,8})", "010-12345678").group(2) In [113]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(2) In [114]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(1) In [115]: re.match(r"((\d{3,4})-(\d{6,8}))", "010-12345678").group(3) ------------------建立有名分組 給分組起名 使用有名分組------------------------ In [116]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}))", "010-12345678").group(3) In [118]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group() In [119]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(1) In [120]: re.match(r"(?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8}) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2) In [121]: 
re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2) In [122]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(1) In [123]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(2) In [124]: re.match(r"((?P<quhao>\d{3,4})-(?P<zuoji>\d{6,8})) (?P=quhao)-(?P=zuoji)", "010-12345678 010-12345678").group(3) 6. 高級函數 ---------------------------------------------------------------------------- In [127]: ret = re.match(r"\d+", "閱讀次數爲 9999").group() In [129]: ret = re.search(r"\d+", "閱讀次數爲 9999").group() In [131]: re.search(r"\d+", "閱讀次數爲 9999").group() ---------------------------------------------------------------------------- In [132]: re.findall(r"\d+", "閱讀次數爲 9999").group() # 不能夠用.group() In [133]: re.findall(r"\d+", "閱讀次數爲 9999") In [134]: re.findall(r"\d+", "閱讀次數爲") # 沒匹配到返回空列表 ---------------------------------------------------------------------------- In [135]: re.sub(r"\d+","999", "python=666") In [136]: re.sub(r"\d+","999", "python=666 cpp=688") In [137]: re.sub(r"\d+","999", "python=666 cpp=688", 1) In [138]: re.sub(r"\d+","999", "python=666 cpp=688") In [139]: def add(matchobj): ...: data = matchobj.group() ...: number = int(data) + 1000 ...: return str(number) In [140]: re.sub(r"\d+",add, "python=666 cpp=688") In [141]: data = """ ...: <div> ...: <p>崗位職責:</p> ...: <p>完成推薦算法、數據統計、接口、後臺等服務器端相關工做</p> ...: <p><br></p> ...: <p>必備要求:</p> ...: <p>良好的自我驅動力和職業素養,工做積極主動、結果導向</p> ...: <p> <br></p> ...: <p>技術要求:</p> ...: <p>一、一年以上 Python 開發經驗,掌握面向對象分析和設計,瞭解設計模式</p> ...: <p>二、掌握HTTP協議,熟悉MVC、MVVM等概念以及相關WEB開發框架</p> ...: <p>三、掌握關係數據庫開發設計,掌握 SQL,熟練使用 MySQL/PostgreSQL 中的一種<br></p> ...: <p>四、掌握NoSQL、MQ,熟練使用對應技術解決方案</p> ...: <p>五、熟悉 Javascript/CSS/HTML5,JQuery、React、Vue.js</p> ...: <p> <br></p> ...: <p>加分項:</p> ...: <p>大數據,數理統計,機器學習,sklearn,高性能,大併發。</p> ...: ...: </div>""" In [142]: 
re.sub(r"<\w+>","",data) In [143]: re.sub(r"<\w+>|\n","",data) In [144]: re.sub(r"<\w+>|\n|</\w+>","",data) In [145]: re.sub(r"</?\w+>|\n|","",data) In [146]: re.sub(r"</?\w+>|\n| ","",data) 7. 貪婪模式和非貪婪模式<> ---------------------------------------------------------------------------- In [147]: re.split(r"\s","123456_abcdef ghjkl\t12312312") In [148]: re.split(r"\s|_","123456_abcdef ghjkl\t12312312") In [149]: re.search(r"\d+", "嫦娥199999號升空了") In [150]: re.search(r"\d+", "嫦娥1999999999號升空了").group() In [151]: re.search(r"\d+\d+", "1999999999").group() In [152]: re.search(r"(\d+)(\d+)", "1999999999").group() In [153]: re.search(r"(\d+)(\d+)", "1999999999").group(1) In [154]: re.search(r"(\d+)(\d+)", "1999999999").group(2) In [155]: re.search(r"(\d+?)(\d+)", "1999999999").group(2) In [156]: re.search(r"(\d+?)(\d+)", "1999999999").group(1) In [157]: url = """<img data-original="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_2016111319 ...: 17_small.jpg" src="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_201611131917_small.j ...: pg" style="display: inline;"> ...: """ In [158]: re.search(r"https:.*\.jpg",url).group() In [159]: re.search(r"https:.*?\.jpg",url).group() 8 r標識原生字符串 ---------------------------------------------------------------------------- In [160]: path = "c:\a\b\c" In [161]: path Out[161]: 'c:\x07\x08\\c' In [162]: print(path) c\c In [163]: path = "c:\\a\\b\\c" In [164]: path Out[164]: 'c:\\a\\b\\c' In [165]: path = "c:\\a\\b\c" Out[166]: 'c:\\a\\b\\c' In [167]: re.match("c:\\a",path).group() In [168]: re.match(r"c:\\a",path).group() In [169]: re.match("c:\\\\a",path).group() In [170]: re.match("c:\\\\a\\\\b",path).group() In [171]: re.match("c:\\\\a\\\\b\\\\c",path).group() In [172]: re.match(r"c:\\a\\b\\c",path).group()