py12 正則表達式 re模塊

時間 2019-12-06

原文原文鏈接

Python3 正則表達式

Python 自 1.5 版本起增加了 re 模塊，它提供 Perl 風格的正則表達式模式。

re 模塊使 Python 語言擁有所有的正則表達式功能。

re 模塊源碼

# --------------------------------------------------------------------
# public interface

def match(pattern, string, flags=0):
    """Match *pattern* against the beginning of *string*.

    Returns a match object when the pattern applies at the start of
    the string, or None if no match was found.
    """
    compiled = _compile(pattern, flags)
    return compiled.match(string)

def fullmatch(pattern, string, flags=0):
    """Match *pattern* against the whole of *string*.

    Returns a match object when the pattern applies to all of the
    string, or None if no match was found.
    """
    compiled = _compile(pattern, flags)
    return compiled.fullmatch(string)

def search(pattern, string, flags=0):
    """Scan *string* for a location where *pattern* matches.

    Returns a match object, or None if no match was found.
    """
    compiled = _compile(pattern, flags)
    return compiled.search(string)

def sub(pattern, repl, string, count=0, flags=0):
    """Replace the leftmost non-overlapping matches of *pattern* in *string*.

    *repl* is either a string, in which case backslash escapes in it
    are processed, or a callable, which is passed the match object and
    must return the replacement string to be used.  *count* is handed
    on unchanged to the compiled pattern's ``sub`` method.

    Returns the string obtained after the replacements.
    """
    compiled = _compile(pattern, flags)
    return compiled.sub(repl, string, count)

def subn(pattern, repl, string, count=0, flags=0):
    """Like a substitution, but also report how many replacements were made.

    Returns a 2-tuple ``(new_string, number)``: *new_string* is the
    source string with the leftmost non-overlapping occurrences of
    *pattern* replaced by *repl*, and *number* is the number of
    substitutions that were made.

    *repl* is either a string, in which case backslash escapes in it
    are processed, or a callable, which is passed the match object and
    must return the replacement string to be used.
    """
    compiled = _compile(pattern, flags)
    return compiled.subn(repl, string, count)

def split(pattern, string, maxsplit=0, flags=0):
    """Break *string* apart at the occurrences of *pattern*.

    The resulting substrings are returned as a list.  When capturing
    parentheses are used in the pattern, the text of all groups is
    returned as part of that list as well.  A nonzero *maxsplit* caps
    the number of splits; the remainder of the string then becomes
    the final element of the list.
    """
    compiled = _compile(pattern, flags)
    return compiled.split(string, maxsplit)

def findall(pattern, string, flags=0):
    """Collect every non-overlapping match of *pattern* in *string*.

    The result is a list.  With one or more capturing groups in the
    pattern it is a list of groups, and a list of tuples when the
    pattern has more than one group.

    Empty matches are included in the result.
    """
    compiled = _compile(pattern, flags)
    return compiled.findall(string)

def finditer(pattern, string, flags=0):
    """Iterate over every non-overlapping match of *pattern* in *string*.

    The returned iterator yields one match object per match.

    Empty matches are included in the result.
    """
    compiled = _compile(pattern, flags)
    return compiled.finditer(string)

def compile(pattern, flags=0):
    """Compile a regular expression pattern and return the pattern object."""
    compiled = _compile(pattern, flags)
    return compiled

def purge():
    """Clear the regular expression caches.

    Empties ``_cache`` and resets the cache attached to
    ``_compile_repl``; both objects are defined elsewhere in the module.
    """
    _cache.clear()
    _compile_repl.cache_clear()

def template(pattern, flags=0):
    """Compile a template pattern and return the pattern object.

    The module-level ``T`` flag is OR-ed into *flags* before the
    pattern is compiled.
    """
    template_flags = flags | T
    return _compile(pattern, template_flags)

re 模塊各函數參數的含義

正則表達式修飾符 - 可選標誌（flags）

正則表達式可以包含一些可選標誌修飾符來控制匹配的模式。修飾符被指定爲一個可選的標誌。多個標誌可以通過按位 OR(|) 組合起來指定，如 re.I | re.M 表示同時設置 I 和 M 標誌。

re模塊中各類方法函數

re.match(pattern, string, flags=0)

只匹配一個，成功返回 Match object，失敗返回 None（只從字符串開頭匹配）

re.fullmatch(pattern, string, flags=0)

完全匹配 string，只有 pattern 能匹配整個 string 纔算匹配上，失敗返回 None

re.search(pattern, string, flags=0)

只匹配一個（掃描整個字符串，返回第一個匹配），成功返回 Match object，失敗返回 None

re.findall(pattern, string, flags=0)

查找所有匹配成功的字符串，並返回 list

re.finditer(pattern, string, flags=0)

查找所有匹配的字符串，並返回 iterator（每次迭代得到一個 Match object）

re.split(pattern, string[, maxsplit=0, flags=0])

按照能夠匹配的子串將字符串分割後返回列表；maxsplit 表示最大分割次數，默認爲 0，表示不限制分割次數

re.sub(pattern, repl, string, count=0, flags=0)

用於替換字符串中的匹配項

repl : 替換的字符串，也可爲一個函數

count : 模式匹配後替換的最大次數，默認 0 表示替換所有的匹配

re.compile(pattern[, flags])

用於編譯正則表達式，生成一個正則表達式（ Pattern ）對象，供 match() 和 search() 這兩個函數使用

原生字符串

與大多數編程語言相同，正則表達式裏使用"\"作爲轉義字符，這就可能造成反斜槓困擾。假如你需要匹配文本中的字符"\"，那麼使用編程語言表示的正則表達式裏將需要 4 個反斜槓"\\\\"：前兩個和後兩個分別用於在編程語言裏轉義成反斜槓，轉換成兩個反斜槓後再在正則表達式裏轉義成一個反斜槓。Python 裏的原生字符串很好地解決了這個問題，這個例子中的正則表達式可以使用 r"\\" 表示。同樣，匹配一個數字的"\\d"可以寫成 r"\d"。有了原生字符串，你再也不用擔心是不是漏寫了反斜槓，寫出來的表達式也更直觀。

常用匹配模式（元字符）

# ================================= Matching patterns =================================
# One-to-one (literal) matching with plain str methods:
# 'hello'.replace(old, new)
# 'hello'.find('pattern')

# Regular-expression matching
import re

# NOTE: patterns containing regex escapes (\w, \s, \d, \A, ...) are written as
# raw strings so the backslash reaches the re module untouched; in a normal
# string literal they are invalid Python escapes and trigger a warning.

# \w and \W: word characters / non-word characters
print(re.findall(r'\w', 'hello egon 123'))  # ['h', 'e', 'l', 'l', 'o', 'e', 'g', 'o', 'n', '1', '2', '3']
print(re.findall(r'\W', 'hello egon 123'))  # [' ', ' ']

# \s and \S: whitespace / non-whitespace
print(re.findall(r'\s', 'hello  egon  123'))  # [' ', ' ', ' ', ' ']
print(re.findall(r'\S', 'hello  egon  123'))  # ['h', 'e', 'l', 'l', 'o', 'e', 'g', 'o', 'n', '1', '2', '3']

# \n and \t are whitespace too, so \s matches both of them
print(re.findall(r'\s', 'hello \n egon \t 123'))  # [' ', '\n', ' ', ' ', '\t', ' ']

# \n and \t: the plain and the raw spelling of the pattern match the same text
# ('\n' is a real newline character, r'\n' is the regex escape for a newline)
print(re.findall('\n', 'hello egon \n123'))  # ['\n']
print(re.findall(r'\n', 'hello egon \n123'))  # ['\n']
print(re.findall('\t', 'hello egon\t123'))  # ['\t']
print(re.findall(r'\t', 'hello egon\t123'))  # ['\t']

# \d and \D: digits / non-digits
print(re.findall(r'\d', 'hello egon 123'))  # ['1', '2', '3']
print(re.findall(r'\D', 'hello egon 123'))  # ['h', 'e', 'l', 'l', 'o', ' ', 'e', 'g', 'o', 'n', ' ']

# \A and \Z: start / end of the whole string
print(re.findall(r'\Ahe', 'hello egon 123'))  # ['he'], \A acts like ^ (without re.M)
print(re.findall(r'123\Z', 'hello egon 123'))  # ['123'], \Z acts roughly like $

# ^ and $
print(re.findall('^h', 'hello egon 123'))  # ['h']
print(re.findall('3$', 'hello egon 123'))  # ['3']

# Repetition: . | * | ? | .* | .*? | + | {n,m}
# .  matches any single character except a newline
print(re.findall('a.b', 'a1b'))  # ['a1b']
print(re.findall('a.b', 'a1b a*b a b aaab'))  # ['a1b', 'a*b', 'a b', 'aab']
print(re.findall('a.b', 'a\nb'))  # []
print(re.findall('a.b', 'a\nb', re.S))  # ['a\nb'], with re.S the dot also matches a newline
print(re.findall('a.b', 'a\nb', re.DOTALL))  # ['a\nb'], re.DOTALL is the long name of re.S
print(re.findall('a..b', 'a12b'))  # ['a12b'], two dots match two non-newline characters
print(re.findall('a...b', 'a123b'))  # ['a123b'], n dots match n non-newline characters

# *  zero or more of the preceding item
print(re.findall('ab*', 'bbbbbbb'))  # []
print(re.findall('ab*', 'a'))  # ['a']
print(re.findall('ab*', 'abbbb'))  # ['abbbb']

# ?  zero or one of the preceding item
print(re.findall('ab?', 'a'))  # ['a']
print(re.findall('ab?', 'abbb'))  # ['ab']
# Match every number, including decimals (raw string keeps \d and \. intact)
print(re.findall(r'\d+\.?\d*', "asdfasdf123as1.13dfa12adsf1asdf3"))  # ['123', '1.13', '12', '1', '3']

# .*  is greedy by default
print(re.findall('a.*b', 'a1b22222222b'))  # ['a1b22222222b']
secret_code = 'hadkfalifexxIxxfasdjifja134xxlovexx23345sdfxxyouxx8dfse'
# The result is printed; a bare findall() call in a script discards it.
print(re.findall('xx.*xx', secret_code))  # ['xxIxxfasdjifja134xxlovexx23345sdfxxyouxx']


# .*?  is the non-greedy form (recommended)
print(re.findall('a.*?b', 'a1b22222222b'))  # ['a1b']
print(re.findall('xx.*?xx', secret_code))  # ['xxIxx', 'xxlovexx', 'xxyouxx']


# +  one or more of the preceding item
print(re.findall('ab+', 'a'))  # []
print(re.findall('ab+', 'abbb'))  # ['abbb']

# {n,m}  between n and m repetitions (greedy)
print(re.findall('ab{2}', 'abbb'))  # ['abb']
print(re.findall('ab{2,4}', 'abbb'))  # ['abbb'], takes as many b's as are available, up to 4
print(re.findall('ab{1,}', 'abbb'))  # ['abbb'], 'ab{1,}' is equivalent to 'ab+'
print(re.findall('ab{0,}', 'abbb'))  # ['abbb'], 'ab{0,}' is equivalent to 'ab*'

# []  character sets
# Inside [] most metacharacters are ordinary characters; an unescaped '-' has
# to sit at the very start or end of the set to be taken literally.
print(re.findall('a[1*-]b', 'a1b a*b a-b'))  # ['a1b', 'a*b', 'a-b']
print(re.findall('a[^1*-]b', 'a1b a*b a-b a=b'))  # ['a=b'], a leading ^ inside [] negates the set
print(re.findall('a[0-9]b', 'a1b a*b a-b a=b'))  # ['a1b'], 0-9 is a digit range
print(re.findall('a[a-z]b', 'a1b a*b a-b a=b aeb'))  # ['aeb'], a-z is the lowercase range
print(re.findall('a[a-zA-Z]b', 'a1b a*b a-b a=b aeb aEb'))  # ['aeb', 'aEb'], ranges can be combined

# \  matching a literal backslash
# print(re.findall('a\\c', 'a\\c'))
# The line above raises re.error: Python turns 'a\\c' into the regex a\c
# before re sees it, and \c is not a valid regex escape.
print(re.findall(r'a\\c', 'a\\c'))  # ['a\\c'], the raw string hands a\\c to re unchanged
print(re.findall('a\\\\c', 'a\\c'))  # ['a\\c'], same regex spelled without a raw string

# ()  grouping; note that findall and search report groups differently
print(re.findall('ab+', 'ababab123'))  # ['ab', 'ab', 'ab']
print(re.findall('(ab)+123', 'ababab123'))  # ['ab'], only the group is returned (the last 'ab' before 123)
# With a capturing group findall returns the group, not the whole match;
# (?:...) is non-capturing, so the whole match is returned instead.
print(re.findall('(?:ab)+123', 'ababab123'))  # ['ababab123']
print(re.findall('href="(.*?)"', '<a href="http://www.baidu.com">點擊</a>'))  # ['http://www.baidu.com']
print(re.findall('href="(?:.*?)"', '<a href="http://www.baidu.com">點擊</a>'))  # ['href="http://www.baidu.com"']
print(re.search(r'(dsf){2}(\|\|=){1}', 'adsfdsf||='))  # <re.Match object; span=(1, 10), match='dsfdsf||='>
print(re.findall(r'(dsf){2}(\|\|=){1}', 'adsfdsf||='))  # [('dsf', '||=')]

# (?P<name>...)  named groups; groupdict() maps each group name to its text.
# (groupdict's optional argument is only the default for groups that did not
# take part in the match, so it is left out here.)
print(re.search(r"(?P<province>[0-9]{4})(?P<city>[0-9]{2})(?P<birthday>[0-9]{4})", "371481199306143242").groupdict())  # {'province': '3714', 'city': '81', 'birthday': '1993'}


# |  alternation
print(re.findall('compan(?:y|ies)', 'Too many companies have gone bankrupt, and the next one is my company'))  # ['companies', 'company']

re模塊的各函數

re.match函數

re.match 嘗試從字符串的起始位置匹配一個模式，如果不是在起始位置匹配成功的話，match() 就返回 None。

匹配成功re.match方法返回一個匹配的對象，不然返回None。

我們可以使用匹配對象的 group(num) 或 groups() 方法來獲取匹配到的內容。

匹配對象方法	描述
group(num=0)	匹配的整個表達式的字符串，group() 可以一次輸入多個組號，在這種情況下它將返回一個包含那些組所對應值的元組。
groups()	返回一個包含所有小組字符串的元組，從 1 到所含的小組號。

實例

import re

line = "Cats are smarter than dogs"

# Group 1 is greedy, group 2 is lazy; re.match anchors at the start of `line`.
matchObj = re.match(pattern=r'(.*) are (.*?) .*', string=line, flags=re.M | re.I)

if matchObj is None:
    print("No match!!")
else:
    print("matchObj.group() : ", matchObj.group())
    print("matchObj.group(1) : ", matchObj.group(1))
    print("matchObj.group(2) : ", matchObj.group(2))
    # print("matchObj.group(3) : ", matchObj.group(3))  # the pattern has only two groups, so there is no group 3

以上實例執行結果如下：

matchObj.group() :  Cats are smarter than dogs
matchObj.group(1) :  Cats
matchObj.group(2) :  smarter

re.search方法

re.search 掃描整個字符串並返回第一個成功的匹配。

re.match與re.search的區別

re.match 只匹配字符串的開始，如果字符串開始不符合正則表達式，則匹配失敗，函數返回 None；而 re.search 匹配整個字符串，直到找到一個匹配。

import re

line = "Cats are smarter than dogs"

# re.match only tries the start of the string, where "dogs" does not occur.
matchObj = re.match(pattern=r'dogs', string=line, flags=re.M | re.I)
if matchObj is None:
    print("No match!!")
else:
    print("match --> matchObj.group() : ", matchObj.group())

# re.search scans the whole string and finds "dogs" at the end.
matchObj = re.search(pattern=r'dogs', string=line, flags=re.M | re.I)
if matchObj is None:
    print("No match!!")
else:
    print("search --> matchObj.group() : ", matchObj.group())

re.sub

re.sub(pattern, repl, string, count=0, flags=0) 用於替換字符串中的匹配項。

import re

phone = "2004-959-559 # 這是一個電話號碼"

# Strip the trailing comment: everything from '#' to the end of the string.
num = re.sub(pattern=r'#.*$', repl="", string=phone)
print("電話號碼 : ", num)

# Drop every character that is not a digit.
num = re.sub(pattern=r'\D', repl="", string=phone)
print("電話號碼 : ", num)

電話號碼 :  2004-959-559 
電話號碼 :  2004959559