Python學習雜記_15_正則表達式

時間 2019-12-20
原文原文鏈接
正則表達式

正則表達式就是用來查找字符串的，它可以查找規則比較複雜的字符串。
使用正則表達式首先要導入re模塊
import re

s = "besttest is good！besttest best"
print(re.match(r'best',s))
# 返回對象爲 <re.Match object; span=(0, 4), match='best'>（Python 3.6 及更早版本顯示爲 <_sre.SRE_Match object; span=(0, 4), match='best'>）
# match方法接收3個參數，第一個是匹配的規則（待匹配的樣本字符串），也就是正則表達式；
# 第二個是被檢查的字符串；
# 第三個參數不是必填的，用於控制正則表達式的匹配方式，看下面正則表達式的匹配模式。
# match方法只從字符串的開頭位置開始匹配（從第一個字符起，而不是「第一個單詞」），若是匹配到返回一個匹配對象，若是匹配不到，則返回None

print(re.search(r'is',s))
# 返回對象爲 <_sre.SRE_Match object; span=(9, 11), match='is'>
# search方法的參數和match同樣，不同的是，match只從字符串的開頭位置匹配，只看開頭，
# 而search方法則是從字符串的整個內容裏面找，若是找到了就返回第一個，找不到就返回None。
# 如果想列出找到的內容須要用.group()方法 print(re.search(r'is',s).group()),這樣結果就是is

# 注意到了嗎？在正則表達式的前面我加了一個‘r’,它表明按字符原樣來匹配，不然當表達式字符串中包含轉義字符的話就會被轉義

print(re.findall(r'best',s))
# findall方法的參數和 match、search 同樣，不同的是，findall 是從字符串全部內容裏找，找到就返回，直到找完。
# findall 方法返回的是一個列表，並且直接返回，直接print就能夠看到，不用group()方法。

print(re.sub(r'best','BEST',s))
# sub 方法和字符串的replace方法同樣，是用來替換字符串的;
# 第二個參數是用來替換的新字符串，用它替換掉字符串中被正則表達式匹配到的內容；
# 會返回一個新的整個字符串，若是匹配不到的話，返回原來的字符串。

數量詞
數量詞能夠與match、search、findall等方法一塊兒使用，下面用findall方法來演示

星號 *
# 匹配星號*前面的一個字符0次或多次，僅僅是*號前的一個字符。
print(re.findall(r'a*b','ab a abbbbbb b'))
結果是 ['ab', 'ab', 'b', 'b', 'b', 'b', 'b', 'b']

加號 +
# 匹配加號+前面的一個字符1次或多次，僅僅是+號前面的一個字符。
print(re.findall(r'st+','besttest is best s'))
結果是 ['stt', 'st', 'st']

問號 ?
# 匹配問號?前面的一個字符0次或1次，僅僅是?號前面的一個字符。
print(re.findall(r'st?','besttest is best'))
結果是 ['st', 'st', 's', 'st']

指定匹配次數 {n}
# 匹配花括號前面的一個字符n次，僅僅是花括號前面的一個字符。
print(re.findall(r't{3}er', 'besttest is best  lettter letter'))
結果是 ['ttter']

指定匹配次數的範圍 {m,n}
# 匹配花括號前面的一個字符m到n次，僅僅是花括號前面的一個字符。
print(re.findall(r't{1,3}er', 'besterst is best  lettter letter'))
結果是 ['ter', 'ttter', 'tter']


通常字符

'.'     # 默認匹配除\n以外的任意一個字符
print(re.findall(r'b.','besttest is good ba bf bo'))
結果： ['be', 'ba', 'bf', 'bo']

'\'     # 轉義符，前面的* + ?這樣的字符都有特殊含義了，若是你就是想找它的話，那就得轉義了
        # 意思就是說若是你想讓特殊字符失去之前的含義，那麼就得給它前面加上'\'
print(re.findall(r'\?','besttest is best????'))
結果： ['?', '?', '?', '?']

'|'     # 匹配|左或|右的字符
print(re.findall(r'best|is','besttest is best'))
結果： ['best', 'is', 'best']

'[]'    # 字符集合，裏面是某些字符的集合，匹配的時候是這個集合裏面的任意一個就能夠
print(re.findall(r'be[stacj]','besttest is best bejson bed'))
結果： ['bes', 'bes', 'bej']
# 在[]裏面若是用^的話表明取反，也就是不包括的這些字符串的
print(re.findall(r'be[^stac]','besttest is best bejson'))
結果： ['bej']


邊界匹配

'^'         # 數字6的上鍵位，它表明匹配以什麼字符串開頭，多行狀況下匹配每一行的開頭。
print(re.findall(r'^b','besttest is good\nbestest',re.M))
print(re.findall(r'^http://','http://www.baidu.com is good\nbestest'))
結果： ['b', 'b']  和  ['http://']

'$'         # 匹配以什麼字符結尾,多行狀況下（re.M）匹配每一行的結尾；
            # 不使用多行模式時，只匹配整個字符串的結尾（或結尾換行符以前的位置）。
print(re.findall(r'd$','besttest is good'))
print(re.findall(r'\.jpeg$|\.gif$|\.png$','touxiang.png\nabc.gif',re.M))
結果： ['d'] 和 ['.png', '.gif']

'\A'        # 僅以什麼字符開頭，和^不一樣的是它不能用多行模式。
print(re.findall(r'\Ahttp://','http://www.baidu.com is good\nhttp://www.sohu.com',re.M))
print(re.findall(r'\Ab','besttest is good'))
結果：['http://'] 和 ['b']

'\Z'        # 僅以什麼字符結尾，和$不一樣的是它不受多行模式影響；
            # 即便字符串有多行，也只匹配整個字符串的結尾（也就是最後一行的結尾）。
print(re.findall(r'\.jpeg\Z|\.png\Z|\.gif\Z','touxiang.png\nqq.gif\nabc.jpeg',re.M))
結果： ['.jpeg']


預定義字符集合


'\d'        # 匹配數字0-9
print(re.findall(r'\d+','sdf2342312sdfs342332sdf'))
結果： ['2342312', '342332']
print(re.findall(r'\d','sdf2342312sdfs342332sdf'))
結果： ['2', '3', '4', '2', '3', '1', '2', '3', '4', '2', '3', '3', '2']

'\D'        # 匹配非數字
print(re.findall(r'\D+','sdf2342312sdfs……&（）'))
結果： ['sdf', 'sdfs……&（）']
print(re.findall(r'\D','sdf2342312sdfs……&（）'))
結果： ['s', 'd', 'f', 's', 'd', 'f', 's', '…', '…', '&', '（', '）']

'\w'        # 匹配[A-Za-z0-9_],也就是全部的字母、數字和下劃線，以及中文等Unicode文字字符
print(re.findall(r'\w+','sdf234%^2312sdfs&你好你好'))
結果： ['sdf234', '2312sdfs', '你好你好']

'\W'        # 匹配不是[A-Za-z0-9_]的字符，也就是不是字母、數字、下劃線和中文字符的字符
print(re.findall(r'\W+','sdf234%^2312sdfs&你好你好！。。'))
結果： ['%^', '&', '！。。']

'\s'        # 匹配空白字符如 \t、\n、\r,空格
print(re.findall('\s','axss\n\tsdf\t\r\t sdfsd sdf '))
結果： ['\n', '\t', '\t', '\r', '\t', ' ', ' ', ' ']

'\S'        # 匹配非空白字符,不是\t、\n、\r,空格
print(re.findall('\S+','axss\n\tsdf\t\r\t sdfsd sdf '))
結果： ['axss', 'sdf', 'sdfsd', 'sdf']

[A-Za-z]#大小寫字母（注意不要寫成[A-z]，它還會匹配位於Z和a之間的 [ \ ] ^ _ ` 這幾個符號）
[a-z]#小寫字母
[A-Z]#大寫字母
[0-9]#全部的數字

print('new_str:',new_str)
print(re.findall(r'[A-Z]',new_str))
print(re.findall(r'[a-z]',new_str))
print(re.findall(r'[0-9]',new_str))

插播程序栗子：

需求：  生成指定個數的 8位字符串，每一個字符串必須包含大寫字母、小寫字母和數字，
        把這些字符串保存在名爲passwd的文件裏。

import re,string,random

# Alphabet the passwords are drawn from: ASCII letters plus digits.
sub_str = string.ascii_letters + string.digits


def passwds(n, path='passwd'):
    """Write ``n`` random 8-character passwords to ``path``, one per line.

    Every password contains at least one uppercase letter, one lowercase
    letter and one digit. Characters are picked with ``random.sample``, so
    no character repeats within a single password.

    Args:
        n: Number of passwords to generate. ``n <= 0`` leaves an empty file.
        path: Output file name; it is created or truncated on every call.
    """
    # NOTE: ``random`` is not a cryptographically secure generator; these
    # strings are fine for a regex exercise but not for real credentials.
    required = (r'[A-Z]', r'[a-z]', r'[0-9]')
    # ``with`` closes (and therefore flushes) the file even if an error
    # occurs, so the passwords are on disk as soon as the function returns.
    with open(path, 'w') as fw:
        count = 0
        while count < n:
            new_str = ''.join(random.sample(sub_str, 8))
            # Keep only candidates that contain every required class.
            if all(re.search(pattern, new_str) for pattern in required):
                fw.write(new_str + '\n')
                count += 1


passwds(5)

分組匹配

'(...)'     # 分組匹配，把某些規則寫成在一個括號括起來的組裏，這樣就能夠直接對這個組進行一些匹配了。
# 舉個例子，若是要匹配ip地址的話，
# ip地址是類似這樣的 192.168.5.1 每一段都是1位到3位的數字，數字後面有個隔點，
# 正常寫的話,要這樣寫正則匹配：
# print(re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',"192.168.1.3"))
# >>> ['192.168.1.3']
# 這樣寫有點麻煩了，從ip地址咱們能夠發現規律，除了第一段，以後的每段都是'\.\d{1,3}'，一個點加1到3個數字，
# 寫重複的代碼是低級的，這樣的話就能夠用分組來簡化上面寫的正則匹配，
# 把'\.\d{1,3}'當作一個總體，而後讓它出現3次就ok了，能夠改爲以下的樣子：
# print(re.search(r'\d{1,3}(\.\d{1,3}){3}',"192.168.1.3").group())
# 這裏用了search方法，結果和上面的同樣：
# >>> 192.168.1.3
# 若是用findall方法：
# print(re.findall(r'\d{1,3}(\.\d{1,3}){3}',"192.168.1.3"))
# 發現結果是下面這樣的，不是咱們想要的，
# >>> ['.3']
# 爲啥會這樣呢，用match方法和search方法都是正常的，findall方法這裏有個坑：
# 就是若是findall方法裏面有分組的話，默認結果就只是分組裏面的內容，也就是匹配到小括號裏面的內容，
# 若是想讓結果正確的話就在分組最前面寫上'?:'，一個問號和一個冒號就能夠了，啓用「不捕捉模式」
# print(re.findall(r'\d{1,3}(?:\.\d{1,3}){3}',"192.168.1.3"))
# >>> ['192.168.1.3']
# findall方法裏面有分組的話，默認結果就只是分組裏面的內容
# findall方法裏面有分組的話，默認結果就只是分組裏面的內容
# findall方法裏面有分組的話，默認結果就只是分組裏面的內容