python正則表達式

時間 2019-12-11

原文鏈接

正則表達式

正則表達式是一種用來模糊匹配字符串的方法，它的設計思想是用一種描述性的語言來給字符串定義一個規則，凡是符合規則的字符串，我們就認爲它「匹配了」，否則就是「沒有匹配到該字符串」。

在線調試工具：點擊

1、常用正則表達式

單字符：
1. . : 除換行之外全部字符
2. [] ：[aoe] [a-w] 匹配集合中任意一個字符
3. \d ：數字 [0-9]
4. \D : 非數字
5. \w ：數字、字母、下劃線、中文
6. \W : 非\w
7. \s ：所有的空白字符，包括空格、製表符、換頁符等等。等價於 [ \f\n\r\t\v]。
8. \S : 非空白
數量修飾：
1. * : 任意多次 >=0
2. + : 至少1次 >=1
3. ? : 可有可無 0次或者1次
4. {m} ：固定m次 hello{3}
5. {m,} ：至少m次 hello{3,}
6. {m,n} ：m-n次
邊界：
1. $ : 以某某結尾
2. ^ : 以某某開頭
分組：
1. (ab)
2. 貪婪模式： .*
3. 非貪婪（惰性）模式： .*?
re.I : 忽略大小寫
re.M ：多行匹配
re.S ：單行匹配（讓 . 也能匹配換行符，等同於 re.DOTALL）
re.sub(正則表達式, 替換內容, 字符串)

 
         import 
         re 
        
         key 
         = 
         'bobo@hit.edu.com' 
         #想要匹配到hit. 
        
         res 
         = 
         re.findall( 
         'h.*\.' 
         ,key) 
        
         print 
         (res)   
         #['hit.edu.'] 
        
         res 
         = 
         re.findall( 
         'h.*?\.' 
         ,key) 
        
         print 
         (res)   
         #['hit.'] 
        
         #re.M多行模式下^匹配每一行的開頭，'^.*'會匹配出每一行（若只想匹配出i開頭的行，應寫成'^i.*'） 
        
         string  
         = 
         '''fall in love with you 
        
         i love you very much 
        
         i love she 
        
         i love her''' 
        
         res 
         = 
         re.findall( 
         '^.*' 
         ,string,re.M) 
        
         print 
         (res)  
         #['fall in love with you', 'i love you very much', 'i love she', 'i love her'] 
        
         string1  
         = 
         """<div>靜夜思 
        
         窗前明月光 
        
         疑是地上霜 
        
         舉頭望明月 
        
         低頭思故鄉 
        
         </div>""" 
        
         res 
         = 
         re.findall( 
         '<div>(.*)</div>' 
         ,string1,re.S) 
        
         print 
         (res) 
         #['靜夜思\n窗前明月光\n疑是地上霜\n舉頭望明月\n低頭思故鄉\n'] 
        
         string1  
         = 
         """<div>靜夜思 
        
         窗前明月光 
        
         疑是地上霜 
        
         舉頭望明月 
        
         低頭思故鄉 
        
         </div>""" 
        
         res 
         = 
         re.findall( 
         '<div>(.*)</div>' 
         ,string1) 
        
         print 
         (res) 
         #[]

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

 
         # =================================匹配模式================================= 
        
 
         #一對一的匹配 
        
 
         # 'hello'.replace(old,new) 
        
 
         # 'hello'.find('pattern') 
        

            
        
 
         #正則匹配 
        
 
         import 
         re 
        
 
         #\w與\W 
        
 
         print 
         (re.findall( 
         '\w' 
         , 
         'hello egon 123' 
         ))  
         #['h', 'e', 'l', 'l', 'o', 'e', 'g', 'o', 'n', '1', '2', '3'] 
        
 
         print 
         (re.findall( 
         '\W' 
         , 
         'hello egon 123' 
         ))  
         #[' ', ' '] 
        

            
        
 
         #\s與\S 
        
 
         print 
         (re.findall( 
         '\s' 
         , 
         'hello  egon  123' 
         ))  
         #[' ', ' ', ' ', ' '] 
        
 
         print 
         (re.findall( 
         '\S' 
         , 
         'hello  egon  123' 
         ))  
         #['h', 'e', 'l', 'l', 'o', 'e', 'g', 'o', 'n', '1', '2', '3'] 
        

            
        
 
         #\n \t都是空白字符,都可以被\s匹配 
        
 
         print 
         (re.findall( 
         '\s' 
         , 
         'hello \n egon \t 123' 
         ))  
         #[' ', '\n', ' ', ' ', '\t', ' '] 
        

            
        
 
         #\n與\t 
        
 
         print 
         (re.findall(r 
         '\n' 
         , 
         'hello egon \n123' 
         ))  
         #['\n'] 
        
 
         print 
         (re.findall(r 
         '\t' 
         , 
         'hello egon\t123' 
         ))  
         #['\t'] 
        

            
        
 
         #\d與\D 
        
 
         print 
         (re.findall( 
         '\d' 
         , 
         'hello egon 123' 
         ))  
         #['1', '2', '3'] 
        
 
         print 
         (re.findall( 
         '\D' 
         , 
         'hello egon 123' 
         ))  
         #['h', 'e', 'l', 'l', 'o', ' ', 'e', 'g', 'o', 'n', ' '] 
        

            
        
 
         #\A與\Z 
        
 
         print 
         (re.findall( 
         '\Ahe' 
         , 
         'hello egon 123' 
         ))  
         #['he'],\A==>^ 
        
 
         print 
         (re.findall( 
         '123\Z' 
         , 
         'hello egon 123' 
         ))  
         #['123'],\Z==>$ 
        

            
        
 
         #^與$ 
        
 
         print 
         (re.findall( 
         '^h' 
         , 
         'hello egon 123' 
         ))  
         #['h'] 
        
 
         print 
         (re.findall( 
         '3$' 
         , 
         'hello egon 123' 
         ))  
         #['3'] 
        

            
        
 
         # 重複匹配：| . | * | ? | .* | .*? | + | {n,m} | 
        
 
         #. 
        
 
         print 
         (re.findall( 
         'a.b' 
         , 
         'a1b' 
         ))  
         #['a1b'] 
        
 
         print 
         (re.findall( 
         'a.b' 
         , 
         'a1b a*b a b aaab' 
         ))  
         #['a1b', 'a*b', 'a b', 'aab'] 
        
 
         print 
         (re.findall( 
         'a.b' 
         , 
         'a\nb' 
         ))  
         #[] 
        
 
         print 
         (re.findall( 
         'a.b' 
         , 
         'a\nb' 
         ,re.S))  
         #['a\nb'] 
        
 
         print 
         (re.findall( 
         'a.b' 
         , 
         'a\nb' 
         ,re.DOTALL))  
         #['a\nb']同上一條意思同樣 
        

            
        
 
         #* 
        
 
         print 
         (re.findall( 
         'ab*' 
         , 
         'bbbbbbb' 
         ))  
         #[] 
        
 
         print 
         (re.findall( 
         'ab*' 
         , 
         'a' 
         ))  
         #['a'] 
        
 
         print 
         (re.findall( 
         'ab*' 
         , 
         'abbbb' 
         ))  
         #['abbbb'] 
        

            
        
 
         #? 
        
 
         print 
         (re.findall( 
         'ab?' 
         , 
         'a' 
         ))  
         #['a'] 
        
 
         print 
         (re.findall( 
         'ab?' 
         , 
         'abbb' 
         ))  
         #['ab'] 
        
 
         #匹配全部包含小數在內的數字 
        
 
         print 
         (re.findall( 
         '\d+\.?\d*' 
         , 
         "asdfasdf123as1.13dfa12adsf1asdf3" 
         ))  
         #['123', '1.13', '12', '1', '3'] 
        

            
        
 
         #.*默認爲貪婪匹配 
        
 
         print 
         (re.findall( 
         'a.*b' 
         , 
         'a1b22222222b' 
         ))  
         #['a1b22222222b'] 
        

            
        
 
         #.*?爲非貪婪匹配：推薦使用 
        
 
         print 
         (re.findall( 
         'a.*?b' 
         , 
         'a1b22222222b' 
         ))  
         #['a1b'] 
        

            
        
 
         #+ 
        
 
         print 
         (re.findall( 
         'ab+' 
         , 
         'a' 
         ))  
         #[] 
        
 
         print 
         (re.findall( 
         'ab+' 
         , 
         'abbb' 
         ))  
         #['abbb'] 
        

            
        

            
        
 
         #{n,m} 
        
 
         print 
         (re.findall( 
         'ab{2}' 
         , 
         'abbb' 
         ))  
         #['abb'] 
        
 
         print 
         (re.findall( 
         'ab{2,4}' 
         , 
         'abbb' 
         ))  
         #['abbb'] 
        
 
         print 
         (re.findall( 
         'ab{1,}' 
         , 
         'abbb' 
         ))  
         #['abbb']  #'ab{1,}' ===> 'ab+' 
        
 
         print 
         (re.findall( 
         'ab{0,}' 
         , 
         'abbb' 
         ))  
         #['abbb']  #'ab{0,}' ===> 'ab*' 
        

            
        
 
         #[] 
        
 
         print 
         (re.findall( 
         'a[1*-]b' 
         , 
         'a1b a*b a-b' 
         ))  
         #[]內的都爲普通字符了，且如果-沒有被轉義的話，應該放到[]的開頭或結尾  #['a1b', 'a*b', 'a-b'] 
        
 
         print 
         (re.findall( 
         'a[^1*-]b' 
         , 
         'a1b a*b a-b a=b' 
         ))  
         #[]內的^表明的意思是取反，因此結果爲['a=b'] 
        
 
         print 
         (re.findall( 
         'a[0-9]b' 
         , 
         'a1b a*b a-b a=b' 
         ))  
         #結果爲['a1b'] 
        
 
         print 
         (re.findall( 
         'a[a-z]b' 
         , 
         'a1b a*b a-b a=b aeb' 
         ))  
         #結果爲['aeb'] 
        
 
         print 
         (re.findall( 
         'a[a-zA-Z]b' 
         , 
         'a1b a*b a-b a=b aeb aEb' 
         ))  
         #結果爲['aeb', 'aEb'] 
        

            
        
 
         #\ (反斜槓)  # print(re.findall('a\\c','a\c')) #對於正則來說a\\c確實可以匹配到a\c,但是在python解釋器讀取a\\c時，會發生轉義，然後交給re去執行，所以拋出異常 
        
 
         print 
         (re.findall(r 
         'a\\c' 
         , 
         'a\c' 
         ))  
         #r表明告訴解釋器使用rawstring，即原生字符串，把咱們正則內的全部符號都當普通字符處理，不要轉義  ['a\\c'] 
        
 
         print 
         (re.findall( 
         'a\\\\c' 
         , 
         'a\c' 
         ))  
         #同上面的意思同樣，和上面的結果同樣都是['a\\c'] 
        

            
        
 
         #():分組 
        
 
         print 
         (re.findall( 
         'ab+' 
         , 
         'ababab123' 
         ))  
         #['ab', 'ab', 'ab'] 
        
 
         print 
         (re.findall( 
         '(ab)+123' 
         , 
         'ababab123' 
         ))  
         #['ab']，匹配到末尾的ab123中的ab 
        
 
         print 
         (re.findall( 
         '(?:ab)+123' 
         , 
         'ababab123' 
         ))  
         #findall的結果不是匹配的所有內容，而是組內的內容,?:可讓結果爲匹配的所有內容  ['ababab123'] 
        
 
         print 
         (re.findall( 
         'href="(.*?)"' 
         , 
         '<a href="http://www.baidu.com">點擊</a>' 
         )) 
         #['http://www.baidu.com'] 
        
 
         print 
         (re.findall( 
         'href="(?:.*?)"' 
         , 
         '<a href="http://www.baidu.com">點擊</a>' 
         )) 
         #['href="http://www.baidu.com"'] 
        

            
        
 
         #| 
        
 
         print 
         (re.findall( 
         'compan(?:y|ies)' 
         , 
         'Too many companies have gone bankrupt, and the next one is my company' 
         ))    
         #['companies', 'company'] 
        

            
        
 
         # =================================補充================================= 
        

            
        
 
         print 
         (re.findall( 
         "<(?P<tag_name>\w+)>\w+</(?P=tag_name)>" 
         , 
         "<h1>hello</h1>" 
         ))  
         #['h1'] 
        
 
         print 
         (re.search( 
         "<(?P<tag_name>\w+)>\w+</(?P=tag_name)>" 
         , 
         "<h1>hello</h1>" 
         ).group())  
         #<h1>hello</h1> 
        
 
         print 
         (re.search( 
         "<(?P<tag_name>\w+)>\w+</(?P=tag_name)>" 
         , 
         "<h1>hello</h1>" 
         ).groupdict())  
         #{'tag_name': 'h1'} 
        

            
        
 
         print 
         (re.search(r 
         "<(\w+)>\w+</(\w+)>" 
         , 
         "<h1>hello</h1>" 
         ).group())  
         #<h1>hello</h1> 
        
 
         print 
         (re.search(r 
         "<(\w+)>\w+</\1>" 
         , 
         "<h1>hello</h1>" 
         ).group())   
         #<h1>hello</h1> 
        

            
        

            
        
 
         #補充二 
        

            
        
 
         #使用|，先匹配的先生效，|左邊是匹配小數，而findall最終結果是查看分組，所以即使匹配成功小數也不會存入結果 
        
 
         #而不是小數時，就去匹配(-?\d+)，匹配到的自然就是非小數的數，在此處即整數 
        

            
        

            
        
 
         print 
         (re.findall(r 
         "-?\d+\.\d*|(-?\d+)" 
         , 
         "1-2*(60+(-40.35/5)-(-4*3))" 
         ))  
         #找出全部整數['1', '-2', '60', '', '5', '-4', '3'] 
        

            
        
 
         #找到全部數字: 
        
 
         print 
         (re.findall( 
         '\D?(\-?\d+\.?\d*)' 
         , 
         "1-2*(60+(-40.35/5)-(-4*3))" 
         ))  
         # ['1','2','60','-40.35','5','-4','3'] 
        

            
        
 
         #計算器做業參考：http://www.cnblogs.com/wupeiqi/articles/4949995.html 
        
 
         expression 
         = 
         '1-2*((60+2*(-3-40.0/5)*(9-2*5/3+7/3*99/4*2998+10*568/14))-(-4*3)/(16-3*2))' 
        

            
        
 
         content 
         = 
         re.search( 
         '\(([\-\+\*\/]*\d+\.?\d*)+\)' 
         ,expression).group() 
        
 
         print 
         (content)   
         #(-3-40.0/5) 
        

            
        

            
        
 
         #爲什麼一樣的表達式search與findall卻有不一樣結果: 
        
 
         print 
         (re.search( 
         '\(([\+\-\*\/]*\d+\.?\d*)+\)' 
         , 
         "1-12*(60+(-40.35/5)-(-4*3))" 
         ).group())  
         #(-40.35/5) 
        
 
         print 
         (re.findall( 
         '\(([\+\-\*\/]*\d+\.?\d*)+\)' 
         , 
         "1-12*(60+(-40.35/5)-(-4*3))" 
         ))  
         #['/5', '*3'] 
        

            
        
 
         #看這個例子:(\d)+相當於(\d)(\d)(\d)(\d)...,是一系列分組 
        
 
         print 
         (re.search( 
         '(\d)+' 
         , 
         '123' 
         ).group()) 
         #123  #group()不帶參數時返回的是整個正則匹配到的完整字符串 
        
 
         print 
         (re.findall( 
         '(\d)+' 
         , 
         '123' 
         ))  
         #['3']  #findall結果是組內的結果,重複的分組只保留最後一次匹配到的內容 
        

re模塊提供的方法

 
         # ===========================re模塊提供的方法介紹=========================== 
        
 
         import 
         re 
        
 
         #1 
        
 
         print 
         (re.findall( 
         'e' 
         , 
         'rose like play' 
         ) )    
         #['e', 'e'],返回全部知足匹配條件的結果,放在列表裏 
        
 
         #2 
        
 
         print 
         (re.search( 
         'e' 
         , 
         'rose like play' 
         ).group())  
         #e,只找到第一個匹配然後返回一個包含匹配信息的對象,該對象可以通過調用group()方法得到匹配的字符串,如果字符串沒有匹配，則返回None。 
        

            
        
 
         #3 
        
 
         print 
         (re.match( 
         'e' 
         , 
         'rose like play' 
         ))     
         #None,同search,不過只在字符串開始處進行匹配,完全可以用search+^代替match 
        

            
        
 
         #4 
        
 
         print 
         (re.split( 
         '[ab]' 
         , 
         'abcd' 
         ))      
         #['', '', 'cd']，先按'a'分割獲得''和'bcd',再對''和'bcd'分別按'b'分割 
        

            
        
 
         #5 
        
 
         print 
         ( 
         '===>' 
         ,re.sub( 
         'a' 
         , 
         'A' 
         , 
         'rose like play' 
         ))  
         #===> rose like plAy，不指定n，默認替換全部 
        
 
         print 
         ( 
         '===>' 
         ,re.sub( 
         'a' 
         , 
         'A' 
         , 
         'rose like play' 
         , 
         1 
         ))  
         #===> rose like plAy 
        
 
         print 
         ( 
         '===>' 
         ,re.sub( 
         'a' 
         , 
         'A' 
         , 
         'rose like play' 
         , 
         2 
         ))  
         #===> rose like plAy 
        
 
         print 
         ( 
         '===>' 
         ,re.sub( 
         '^(\w+)(.*?\s)(\w+)(.*?\s)(\w+)(.*?)$' 
         ,r 
         '\5\2\3\4\1' 
         , 
         'rose like play' 
         ))  
         #===> play like rose 
        

            
        
 
         print 
         ( 
         '===>' 
         ,re.subn( 
         'a' 
         , 
         'A' 
         , 
         'rose like play' 
         ))  
         #===> ('rose like plAy', 1),結果帶有總共替換的個數 
        

            
        
 
         #6 
        
 
         obj 
         = 
         re. 
         compile 
         ( 
         '\d{2}' 
         ) 
        

            
        
 
         print 
         (obj.search( 
         'abc123eeee' 
         ).group())  
         #12 
        
 
         print 
         (obj.findall( 
         'abc123eeee' 
         ))  
         #['12'], 重用了obj