#!/usr/bin/env python
#-*- coding:UTF-8 -*-
#####################################################
# Author: sunfx xingrhce@163.com
# Last modified: 2014/11/18
# Filename: re.py
# QQ group: 236147801
#####################################################
"""Guided tour of the standard ``re`` module, printed section by section.

Running the script prints the result of each regular-expression
experiment: searching, compiling, repetition, character sets, anchors,
groups, flags, look-around assertions, back-references, substitution and
splitting.

Every ``print`` is written as a call with one pre-formatted argument so
the script produces the same text under Python 2 and Python 3.

NOTE(review): the header records the filename as ``re.py``. A script with
that name can shadow the standard-library module that ``import re`` is
supposed to load; consider saving it under a different name.
"""

import codecs
import re
import sys

# 1. Find a pattern inside a text

pattern = 'this'
text = 'Does this text match the pattern?'

match = re.search(pattern, text)

s = match.start()
e = match.end()

print('Found "%s"\nin "%s"\nfrom %d to %d ("%s")' %
      (match.re.pattern, match.string, s, e, text[s:e]))

# match.re.pattern  the pattern that was searched for
# match.string      the text that was searched
# s                 index where the match starts
# e                 index just past the end of the match
# text[s:e]         the matched characters

# 2. Compiled expressions

# Compile each literal once into a reusable pattern object.
regexes = [re.compile(p) for p in ['this', 'that']]

print('Text: %r\n' % text)

for regex in regexes:
    # regex.pattern is the source string the object was compiled from.
    outcome = 'match!' if regex.search(text) else 'no match'
    print('Seeking "%s"-> %s' % (regex.pattern, outcome))

# 3. Multiple matches

text = 'abbaaabbbbaaaaa'

pattern = 'ab'

# findall() returns the matched substrings directly.
for match in re.findall(pattern, text):
    print('Found: "%s"' % match)

# finditer() yields match objects, which also carry the positions.
for match in re.finditer(pattern, text):
    s = match.start()
    e = match.end()
    print('Found "%s" at %d:%d' % (text[s:e], s, e))

# 4. Pattern syntax


def test_patterns(text, patterns=()):
    """Print every match of each pattern in ``text``.

    ``patterns`` is an iterable of ``(pattern, description)`` pairs. For
    each pair the pattern, its description and the text are printed,
    followed by one line per match in which the match is preceded by a
    run of dots as long as its offset in the text.

    Returns None; the results are only printed.
    """
    for pattern, desc in patterns:
        print('Pattern %r (%s) \n' % (pattern, desc))
        print(' %r' % text)
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            substr = text[s:e]
            # The text is shown with %r, which doubles every backslash,
            # so add one extra dot per backslash before the match.
            n_backslashes = text[:s].count('\\')
            prefix = '.' * (s + n_backslashes)
            print(' %s%r' % (prefix, substr))
        print('')
    return


# The name starts with "test_" but this is a demo helper, not a test
# case; keep pytest from collecting it.
test_patterns.__test__ = False

test_patterns('abbaaabbbbaaaaa',
              [('ab', "'a' followed by 'b'")])

# Repetition is greedy by default (it consumes as much as possible);
# appending "?" makes it non-greedy (it consumes as little as possible).
#
#   *       zero or more repetitions
#   +       one or more repetitions
#   ?       zero or one repetition
#
#   ab*       'a' followed by zero or more 'b'
#   ab+       'a' followed by one or more 'b'
#   ab?       'a' followed by zero or one 'b'
#   ab{3}     'a' followed by exactly three 'b'
#   ab{2,3}   'a' followed by two to three 'b'
#
#   ab*?  ab+?  ab??  ab{3}?  ab{2,3}?   non-greedy forms of the above

# Usage (the builtin name ``str`` is deliberately not reused here):

sample = 'absdsdsdsdsd'

print(re.findall('ab*', sample))
# ['ab']

print(re.findall('ab*?', sample))
# ['a']

# 5. Character sets
#
#   [ab]       either 'a' or 'b'
#   a[ab]+     'a' followed by one or more 'a' or 'b'
#   a[ab]+?    the same, non-greedy
#   [^...]     any character that is NOT listed
#   [a-z]      any lower-case ASCII letter
#   [A-Z]      any upper-case ASCII letter
#   [a-zA-Z]   any ASCII letter
#   [A-Za-z]   any ASCII letter (same set, different spelling)

sample = 'aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'

print(re.findall('[ab]', sample))
print(re.findall('a[ab]+', sample))
print(re.findall('a[ab]+?', sample))
print(re.findall('[^_]', sample))

sample = 'China,lovE'

print(re.findall('[a-z][A-Z]', sample))   # ['vE']
print(re.findall('[A-Z][a-z]', sample))   # ['Ch']

print(re.findall('[A-Z][a-z]+', sample))  # ['China']
print(re.findall('[a-z][A-Z]+', sample))  # ['vE']

print(re.findall('[A-Z][a-z]*', sample))  # ['China', 'E']
print(re.findall('[a-z][A-Z]*', sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

print(re.findall('[A-Z][a-z]?', sample))  # ['Ch', 'E']
print(re.findall('[a-z][A-Z]?', sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

# The "." metacharacter matches any single character (except a newline
# unless DOTALL is set):
#
#   a.     'a' followed by any one character
#   b.     'b' followed by any one character
#   a.*b   'a', then anything, then 'b' (greedy)
#   a.*?b  'a', then anything, then 'b' (non-greedy)

c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'

print(re.findall('a.', c))     # ['ai', 'aw', 'as', 'aa', 'ab']
print(re.findall('b.', c))     # ['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']
# Greedy: runs from the first 'a' to the last 'b'.
print(re.findall('a.*b', c))   # ['aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb']
# Non-greedy: "?" stops ".*" at the nearest 'b', giving the shortest
# possible matches.
print(re.findall('a.*?b', c))  # ['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab']

# 6. Escape codes
#
#   \d   a digit
#   \D   a non-digit
#   \s   whitespace (tab, space, newline, ...)
#   \S   non-whitespace
#   \w   an alphanumeric character
#   \W   a non-alphanumeric character

# 7. Anchoring
#
#   ^    start of the string, or of a line
#   $    end of the string, or of a line
#   \A   start of the string
#   \Z   end of the string
#   \b   empty string at the beginning or end of a word
#   \B   empty string not at the beginning or end of a word

# 8. Constraining the search: match() versus search()

text = 'This is some text --with punctuation.'

pattern = 'is'

print('Text : %s' % text)
print('pattern: %s' % pattern)

# match() only looks at the beginning of the text; "is" is not there,
# so nothing is found.
m = re.match(pattern, text)
print('Match : %s' % (m,))

# search() scans the whole text, so "is" is found.
s = re.search(pattern, text)
print('Search : %s' % (s,))

pattern = re.compile(r'\b\w*is\w*\b')

print('Text: %s' % text)

pos = 0
while True:
    # Resume searching where the previous match ended.
    match = pattern.search(text, pos)
    if not match:
        break
    s = match.start()
    e = match.end()
    print(' %d : %d = "%s"' % (s, e - 1, text[s:e]))
    pos = e

# 9. Dissecting matches with groups (any expression can be turned into
#    a group and nested inside a larger expression)
regex = re.compile(r'(\bt\w+)\W+(\w+)')

print('Input text : %s' % text)

print('Pattern : %s' % regex.pattern)

match = regex.search(text)
# group(0) is the whole match; sub-groups are numbered from 1.
print('Entire match : %s' % match.group(0))
print('World start with "t": %s' % match.group(1))
print('World after "t" word : %s' % match.group(2))

# Python extends the basic group syntax with named groups:
# (?P<name>pattern)

print(text)
print('')
for pattern in [r'^(?P<first_word>\w+)',
                r'(?P<last_word>\w+)\S*$',
                r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
                r'(?P<ends_with_t>\w+t)\b',
                ]:
    regex = re.compile(pattern)
    match = regex.search(text)
    print('Matching "%s"' % pattern)
    print('  %s' % (match.groups(),))     # values of all groups
    print('  %s' % (match.groupdict(),))  # group name -> matched text
    print('')


def test_patterns(text, patterns=()):
    """Print every match of each pattern in ``text``, with its groups.

    This redefines the earlier helper of the same name. ``patterns`` is
    an iterable of ``(pattern, description)`` pairs. For each match the
    matched text is printed under its position in ``text`` together with
    ``match.groups()``; when the pattern has named groups,
    ``match.groupdict()`` is printed on the following line.

    Returns None; the results are only printed.
    """
    for pattern, desc in patterns:
        print('Pattern %r (%s)\n' % (pattern, desc))
        print(' %r' % text)
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            prefix = ' ' * s  # pad so the match sits under its position
            print(' %s%r%s %s' % (prefix, text[s:e],
                                  ' ' * (len(text) - e), match.groups()))
            if match.groupdict():
                # Fixed: this used to read "match,groupdict()", which
                # raised NameError for any pattern with named groups.
                print('%s%s' % (' ' * (len(text) - s), match.groupdict()))
        print('')
    return


# Demo helper, not a test case; keep pytest from collecting it.
test_patterns.__test__ = False

# The helper returns None, so call it without printing the result.
test_patterns(text, [(r'(a(a*)(b*))', 'a followerd by 0-n a and 0-n b')])

# "|" matches either the expression on its left or the one on its right.
# The left side is always tried first, and the right side is skipped as
# soon as the left side succeeds. When "|" is not enclosed in a group it
# applies to the whole expression.
#
#   (?:pattern)   a non-capturing group


# 10. Search options - case-insensitive matching
#
#   re.IGNORECASE   ignore case

text = 'This is some text -- with punctuation.'
pattern = r'\bT\w+'
with_case = re.compile(pattern)
without_case = re.compile(pattern, re.IGNORECASE)

print('Text: \n %r' % text)
print('Pattern:\n %s' % pattern)
print('Case-sensitive:')
for match in with_case.findall(text):
    print(' %r' % match)
print('Case-insensitive:')
for match in without_case.findall(text):
    print(' %r' % match)

# 11. Input with multiple lines
#
#   re.MULTILINE   "^" and "$" also match at the start/end of each line

text = 'This is some text -- with punctuation.\nA secone lines.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
multiline = re.compile(pattern, re.MULTILINE)
print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
print('Single Line :')
for match in single_line.findall(text):
    print(' %r' % (match,))
print('MULTILINE :')
for match in multiline.findall(text):
    print(' %r' % (match,))

#   re.DOTALL   let "." match the newline character as well

pattern = r'.+'
no_newlines = re.compile(pattern)
dotall = re.compile(pattern, re.DOTALL)

print('Text :\n %r' % text)
print('Pattern:\n %s' % pattern)
print('No newlines :')
for match in no_newlines.findall(text):
    print(' %r' % match)
print('Dotall :')
for match in dotall.findall(text):
    print(' %r' % match)

# 12. Unicode matching
#
#   re.UNICODE   make \w, \b, ... follow the Unicode character database

# Set standard output encoding to UTF-8. The original assigned the
# writer to the non-existent attribute "sys.output", which had no
# effect. Only Python 2 needs the wrapper; Python 3's sys.stdout
# already accepts text.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)

pattern = u'\\w+'
# re.ASCII exists only on Python 3, where text patterns are Unicode-aware
# by default; on Python 2 the fallback 0 keeps the default ASCII rules.
ascii_pattern = re.compile(pattern, getattr(re, 'ASCII', 0))
unicode_pattern = re.compile(pattern, re.UNICODE)

print('Text : %s' % text)
print('Pattern : %s' % pattern)
print('ASCII : %s' % u', '.join(ascii_pattern.findall(text)))
print('Unicode : %s' % u', '.join(unicode_pattern.findall(text)))

#   re.VERBOSE   ignore whitespace and allow comments inside the
#                pattern, which makes it easier to read

address = re.compile(
    r'''
    [\w\d.+-]+       #username
    @
    ([\w\d.]+\.)+    #domain name prefix
    (com|org|edu)    #TODO:support more top-level domains
    ''',
    re.UNICODE | re.VERBOSE)

candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
]

for candidate in candidates:
    match = address.search(candidate)
    print('%-30s %s' % (candidate, 'Matche' if match else 'no match'))


address = re.compile(
    r'''
    #A name is made up of letters,and may include "."
    #for title abbreviations and middle initials.
    ((?P<name>
       # Words are separated by whitespace (\s+). The original had \S+
       # here, which nests two overlapping repetitions and can
       # backtrack catastrophically.
       ([\w.,]+\s+)*[\w.,]+)
       \s*
       # Email addresses are wrapped in angle
       # brackets: <> but only if a name is
       # found, so keep the start bracket in this
       # group.
       <
    )? # the entire name is optional

    # the address itself:username@domain.tld
    (?P<email>
      [\w\d.+-]+       #username
      @
      ([\w\d.]+\.)+    #domain name prefix
      (com|org|edu)    #TODO:support more top-level domains
    )
    >? # optional closing angle bracket
    ''',
    re.UNICODE | re.VERBOSE)

# Every entry ends with a comma: without it adjacent string literals are
# silently concatenated into a single candidate.
candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
    u'Fist Last <first.last@example.com>',
    u'NO Brackets first.last@example',
    u'First Last',
    u'First Middle Last <first.last@example.com>',
    u'First M. Last <first.last@example.com>',
    u'<first.last@example.com>',
]

for candidate in candidates:
    print('candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print(' Name: %s' % match.groupdict()['name'])
        print(' Email: %s' % match.groupdict()['email'])
    else:
        print(' No match')

# Regular-expression flags and their one-letter abbreviations
#
#   Flag         Abbreviation   Meaning
#   IGNORECASE   i              ignore case
#   MULTILINE    m              multi-line matching
#   DOTALL       s              "." also matches a newline
#   UNICODE      u              Unicode-aware matching
#   VERBOSE      x              readable patterns with comments
#
# Embedding the abbreviations in the pattern, e.g. (?imu), switches the
# corresponding options on.
text = 'This is some text -- with punctuation.'
pattern = r'(?i)\bT\w+'
regex = re.compile(pattern)

print('Text : %s' % text)
print('Pattern : %s' % pattern)
print('Matches : %s' % (regex.findall(text),))

# 13. Looking ahead and looking behind

address = re.compile(
    r'''
    # A name is made up of letters, and may include "."
    # for title abbreviations and middle initials
    ((?P<name>
       ([\w.,]+\s+)*[\w.,]+
     )
     \s+
    ) # name is no longer optional

    # LOOKAHEAD
    # Email address are wrapped in angle brackets, but only
    # if they are both present or neither is .
    (?= (<.*>$)
        |
        ([^<].*[^>]$)
    )

    <? # optional opening angle bracket

    # The address itself: username@domain.tld
    (?P<email>
      [\w\d.+-]+
      @
      ([\w\d.]+\.)+
      (com|org|edu)
    )

    >?
    ''',
    re.UNICODE | re.VERBOSE)

# The last two candidates carry only one of the two brackets so that the
# look-ahead actually has something to reject; the original samples
# were both balanced despite their labels.
candidates = [
    u'First Last <first.last@example.com>',
    u'No Brackets first.last@example.com',
    u'Open Brackets <first.last@example.com',
    u'Close Brackets first.last@example.com>',
]
for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print(' Name : %s' % match.groupdict()['name'])
        print(' Email : %s' % match.groupdict()['email'])
    else:
        print(' No match')

# Skip the "noreply" addresses that automated systems commonly use.
#
#   (?!pattern)    negative look-ahead: succeed only if pattern does
#                  NOT match at this position, e.g. (?!noreply@.*$)
#   (?<!pattern)   negative look-behind: succeed only if the text just
#                  before this position does NOT match pattern
#   (?<=pattern)   positive look-behind: succeed only if the text just
#                  before this position matches pattern
address = re.compile(
    r'''
    ^
    # An address: username@domain.tld

    # Ignore noreply address
    (?!noreply@.*$)

    [\w\d.+-]+       # username
    @
    ([\w\d.]+\.)+    # domain name prefix
    (com|org|edu)    # limit the allowed top-level domains

    $
    ''',
    re.UNICODE | re.VERBOSE)

candidates = [
    u'first.last@example.com',
    u'noreply@example.com',
]

for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print(' Match: %s' % candidate[match.start():match.end()])
    else:
        print(' No match')

twitter = re.compile(
    r'''
    # A twitter handle: @username
    (?<=@)
    ([\w\d_]+)       # username
    ''',
    re.UNICODE | re.VERBOSE)

text = ''' This text includes two Twitter handles.
One for @TheSF,and one for the author,@doughellmann.
'''
print(text)
for match in twitter.findall(text):
    print('handle: %s' % match)

# 14. Self-referencing expressions: a group that matched earlier can be
#     referred to by its number later in the same pattern.

# The pattern must be a raw string: in an ordinary string literal "\1"
# and "\4" are octal escapes for the characters chr(1) and chr(4), not
# back-references.
address = re.compile(
    r'''
    (\w+)               # first name
    \s+
    (([\w.]+)\s+)?      # optional middle name or initial
    (\w+)               # last name

    \s+
    <

    # The address: first_name.last_name@domain.tld
    (?P<email>
      \1                #first name
      \.
      \4                #last name
      @
      ([\w\d.]+\.)+
      (com|org|edu)
    )
    >
    ''',
    re.UNICODE | re.VERBOSE | re.IGNORECASE)

candidates = [
    u'First Last <first.last@example.com>',
    u'Different Name <first.last.example.com>',
    u'First Middle Last <first.last@example.com>',
]
for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print(' Match name: %s %s' % (match.group(1), match.group(4)))
    else:
        print(' No match')

# The pattern syntax also provides (?P=name), which refers to the text
# matched earlier by the named group "name".

address = re.compile(
    r'''

    # The regular name
    (?P<first_name>\w+)
    \s+
    (([\w.]+)\s+)?
    (?P<last_name>\w+)
    \s+
    <

    # The address: first_name.last_name@domain.tld
    (?P<email>
      (?P=first_name)
      \.
      (?P=last_name)
      @
      ([\w\d.]+\.)+
      (com|org|edu)
    )
    >
    ''',
    re.UNICODE | re.VERBOSE | re.IGNORECASE)

candidates = [
    u'First last <first.last@example.com>',
    u'Different Name <first.last@example.com>',
    u'First Middle last <first.last@example.com>',
    u'First M. Last<first.last@example.com>',
]

for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        groups = match.groupdict()
        # First and last name share one line, as in the numbered
        # back-reference example.
        print(' Match name: %s %s' % (groups['first_name'],
                                      groups['last_name']))
        print(' Match email: %s' % groups['email'])
    else:
        print('No match')

# 15. Modifying strings with patterns
#
# re can use a regular expression as the search mechanism for changing
# text, and the replacement may refer to groups matched by the pattern.
bold = re.compile(r'\*{2}(.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text: %s' % text)
print('Bold: %s' % bold.sub(r'<b>\1</b>', text))

# Named groups can be used in the replacement as \g<name>.
# "count" limits the number of substitutions.
# subn() works like sub() but returns both the modified string and the
# number of substitutions made.

bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}', re.UNICODE)

print('Text: %s' % text)
print('Bold: %s' % bold.sub(r'<b>\g<bold_text></b>', text, count=1))

# 16. Splitting with patterns
#
# str.split() is one of the most common ways to break a string apart,
# but it only accepts a literal separator.

text = '''Paragraph one
one tuo lines.

Paragraph two.

Paragraph three.'''

print('With findall:')
# A paragraph ends at a run of two or more newlines or at the end of
# the text. The alternation is grouped: written as '.+?\n{2,}|$' the
# "|" split the whole pattern in two, which dropped the final paragraph
# and produced an empty match instead.
paragraphs = re.findall(r'(.+?)(?:\n{2,}|$)', text, flags=re.DOTALL)
for num, para in enumerate(paragraphs):
    print('%d %r' % (num, para))
    print('')

print('')
print('With split:')
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print('%d %r' % (num, para))
    print('')