re模塊詳解

  1 #!/usr/bin/env python
  2 #-*- coding:UTF-8 -*-
  3 #####################################################
  4 # Author: sunfx   xingrhce@163.com
  5 # Last modified:  2014/11/18
  6 # Filename:  re.py
  7 # Q  Q  羣:  236147801
  8 #####################################################
  9  
 10 import re
 11  
 12 #1.查找文本中的字符
 13  
 14 pattern = 'this'
 15 text = 'Does this text match the pattern?'
 16  
 17 match = re.search(pattern,text)
 18  
 19 s = match.start()
 20 e = match.end()
 21  
 22 print 'Found "%s"\nin "%s"\nfrom %d to %d ("%s")' %\
 23       (match.re.pattern,match.string,s,e,text[s:e])
 24  
 25 '''
 26 match.re.pattern 使用的正則表達式(模式)
 27 match.string 被搜索的原始字符串
 28 s  匹配內容的開始索引
 29 e  匹配內容的結束索引(不包含該位置)
 30 text[s:e] 匹配到的子串
 31 '''
 32  
 33 #2.編譯表達式
 34  
 35 regexes = [ re.compile(p)
 36             for p in ['this','that']              
 37 ] #把字符轉換Regexobject格式
 38  
 39  
 40  
 41 print 'Text: %r\n' % text #輸出text內容
 42  
 43 for regex in regexes:
 44  
 45     print 'Seeking "%s"->' % regex.pattern,  #regex.pattern 要匹配的字符
 46  
 47     if regex.search(text): #在text中搜索this or that
 48  
 49         print 'match!'
 50  
 51     else:
 52  
 53         print 'no match'
 54  
 55 #3.多重匹配
 56  
 57 text = 'abbaaabbbbaaaaa'
 58  
 59 pattern = 'ab'
 60  
 61 for match in re.findall(pattern,text):
 62  
 63     print 'Found: "%s"' % match
 64  
 65 #findall 直接返回字符串
 66  
 67  
 68 for match in re.finditer(pattern,text):
 69     s = match.start()
 70     e = match.end()
 71     print 'Found "%s" at %d:%d' % (text[s:e],s,e)
 72  
 73 #finditer 返回Match對象的迭代器,可從中取得每個匹配在原字符串中的位置
 74  
 75 #4.模式語法
 76  
 77 def test_patterns(text,patterns=[]):
 78  
 79     for pattern,desc in patterns: 
 80         print 'Pattern %r (%s) \n' %(pattern,desc) 
 81         print '   %r' % text
 82         for match in re.finditer(pattern,text):
 83             s = match.start()
 84             e = match.end()
 85             substr = text[s:e] #匹配到的字符
 86             n_backslashes = text[:s].count('\\') #查找文本:s座標以前的包含多少\\
 87             prefix = '.' * ( s + n_backslashes ) 
 88             print '    %s%r' % (prefix,substr) 
 89         print
 90     return
 91  
 92 test_patterns('abbaaabbbbaaaaa',
 93             [('ab',"'a' followed by 'b'")]
 94     )
 95  
 96 #貪婪與非貪婪模式:重複符號默認是貪婪的,會儘可能多地匹配;在其後加上 ? 則變爲非貪婪,儘可能少地匹配
 97 '''
 98      *                '匹配零次到多次'
 99      +                '匹配一次到多次'
100      ?                '匹配零次或一次'
101      ab*,             'a followed by zero or more b'),  #b匹配0次或者更多次
102      ab+,             'a followed by one or more b'),   #b最少匹配一次
103      ab?,             'a followed by zero or one b'),   #b匹配0次或者1次
104      ab{3},           'a followed by three b'),         #b正好匹配三次
105      ab{2,3},         'a followed by two to three b')   #b匹配兩至三次
106  
107  
108      ab*?,            'a followed by zero or more b'),  #同上,非貪婪:b儘可能少匹配
109      ab+?,            'a followed by one or more b'),   #同上,非貪婪:b只匹配一次即可
110      ab??,            'a followed by zero or one b'),   #同上,非貪婪:優先匹配0次
111      ab{3}?,          'a followed by three b'),         #b正好匹配三次(非貪婪與貪婪結果相同)
112      ab{2,3}?,        'a followed by two to three b')   #同上,非貪婪:優先匹配兩次
113 '''
114  
115 #用法如下:
116  
117 str = 'absdsdsdsdsd'
118  
119 print re.findall('ab*',str)
120 #['ab']
121  
122 print re.findall('ab*?',str)
123 #['a']
124  
125 #5.字符集
126  
127 '''
128 [ab]     'either a or b 匹配a或者b'
129 a[ab]+   'a followed by 1 or more a or b 匹配a後面跟一個或多個a或b'
130 a[ab]+?  'a followed by 1 or more a or b, not greedy 同上,但非貪婪,儘可能少匹配'
131 [^...]   '匹配不在集合中的任意字符'
132 [a-z]    '所有小寫ASCII字母'
133 [A-Z]    '所有大寫ASCII字母'
134 [a-zA-Z] '任意一個小寫或大寫ASCII字母'
135 [A-Za-z] '任意一個大寫或小寫ASCII字母'
136 '''
137 str ='aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'
138  
139 print re.findall('[ab]',str)
140 print re.findall('a[ab]+',str)
141 print re.findall('a[ab]+?',str)
142 print re.findall('[^_]',str)
143  
144 str = 'China,lovE'
145  
146 print re.findall('[a-z][A-Z]',str)  #['vE'] 
147 print re.findall('[A-Z][a-z]',str)  #['Ch']
148  
149 print re.findall('[A-Z][a-z]+',str) #['China']
150 print re.findall('[a-z][A-Z]+',str) #['vE']
151  
152 print re.findall('[A-Z][a-z]*',str) #['China', 'E']
153 print re.findall('[a-z][A-Z]*',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE']
154  
155 print re.findall('[A-Z][a-z]?',str) #['Ch', 'E']
156 print re.findall('[a-z][A-Z]?',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE']
157  
158 '''
159 .      元字符匹配一個字符
160 a.
161 b.
162 a.*b
163 a.*?b
164 '''
165  
166 c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'
167  
168 print re.findall('a.',c)  #['ai', 'aw', 'as', 'aa', 'ab']
169 print re.findall('b.',c)  #['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']
170 print re.findall('a.*b',c)  #['aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb'] #貪婪模式匹配a到b之間的任意字符長度字符
171 print re.findall('a.*?b',c)  #['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab'] # ?結束了* 的貪婪模式,
172                              #它不會到最後一個b再去匹配並且見好就收,匹配可能最短的字符
173  
174  
175 #6.轉義碼
176  
177 '''
178 轉義碼                                   含義
179  \d                                    一個數字
180  \D                                    一個非數字
181  \s                                    空白符(製表符、空格、換行符)
182  \S                                    非空白符(符號、字母、數字)
183  \w                                    字母數字(含下劃線)
184  \W                                    非字母數字(符號、製表符、空格、換行符)
185 '''
186  
187 #7.錨定
188  
189 '''
190 錨定碼                               含義
191   ^                              字符串或行的開始
192   $                              字符串或行結束
193   \A                             字符串開始
194   \Z                             字符串結束
195   \b                             一個單詞開頭或者末尾的空串
196   \B                             不在一個單詞的開頭或末尾的空串
197 '''
198 #8.限制搜索 match、search
199  
200 text = 'This is some text --with punctuation.'
201  
202 pattern = 'is'
203  
204 print 'Text    :',text
205 print 'pattern:',pattern
206  
207 m = re.match(pattern,text)   #由於match是從字符開頭開始匹配 is沒有在開頭因此沒有匹配到.
208 print 'Match :',m   
209  
210 s = re.search(pattern,text) #is在文本中出現了兩次因此匹配到內容
211 print 'Search :',s
212  
213 pattern = re.compile(r'\b\w*is\w*\b') #編譯規則
214  
215 print 'Text:',text
216  
217  
218 pos = 0
219 while  True:
220     match = pattern.search(text,pos) #搜索規則
221     if not match:
222         break
223     s = match.start()
224     e = match.end() 
225     print '  %d : %d = "%s"' % (s,e-1,text[s:e]) 
226     pos = e
227  
228 #9 用組解析匹配(任何一個完整的正則表達式均可轉換爲組,並嵌套在一個更大的表達式中)
229 regex = re.compile(r'(\bt\w+)\W+(\w+)')
230  
231 print 'Input  text      :',text
232  
233 print 'Pattern          :',regex.pattern
234  
235 match = regex.search(text)
236 print 'Entire match     :',match.group(0) #表示整個表達式的字符串,子組從1開始排序
237 print 'World start with "t":',match.group(1) #匹配到的第一組
238 print 'World after "t" word :',match.group(2) #匹配到的第二組
239  
240 #python對基本分組進行了擴展 (?P<name>pattern)
241  
242 print text
243 print
244 for pattern in [ r'^(?P<first_word>\w+)',  #組名和正則表達式組成
245                  r'(?P<last_word>\w+)\S*$',
246                  r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
247                  r'(?P<ends_with_t>\w+t)\b',
248                  ]:
249     regex = re.compile(pattern)
250     match = regex.search(text)
251     print 'Matching "%s"' % pattern
252     print ' ',match.groups()  #匹配到全部的組的值
253     print ' ',match.groupdict() #把組名和字串生成字典 
254     print
255  
256 def test_patterns(text,patterns=[]):
257     '''Given source text and a list of patterns,look for 
258     matches for each pattern within the text and print
259     them to stdout.
260     '''
261     #look for each pattern in the text and print the resuls
262  
263     for pattern,desc in patterns:
264         print 'Pattern %r (%s)\n' % (pattern,desc)
265         print '   %r' % text
266     for match in re.finditer(pattern,text):
267         s = match.start()
268         e = match.end()
269         prefix = ' ' * (s) #'空格 X 次數'
270         print '   %s%r%s' % (prefix,text[s:e],' '*(len(text)-e)),
271         print match.groups()
272         if match.groupdict():
273             print '%s%s' % (' ' * (len(text) -s),match,groupdict())
274             print
275     return
276  
# test_patterns() prints its own report and returns None, so call it directly;
# printing its return value would only append a stray "None" to the output.
test_patterns(text, [(r'(a(a*)(b*))', 'a followerd by 0-n a and 0-n b')])
278  
279 '''
280 |       表示左右表達式任意匹配一個,它總是先嘗試匹配左邊的表達式,一旦成功匹配則
281 跳過匹配右邊的表達式。如果|沒有被包括在()中,則它的範圍是整個正則表達式
282 (?:pattern)  非捕獲組:只用於分組,不保存匹配到的內容
283 '''
284  
285  
286 #10.搜索選項 - 不區分大小寫的匹配
287 '''
288 re.IGNORECASE 忽略大小寫
289 '''
290  
291 text  = 'This is some text  -- with punctuation.'
292 pattern = r'\bT\w+'
293 with_case = re.compile(pattern)
294 whitout_case = re.compile(pattern,re.IGNORECASE) #re.IGNORECASE 忽略大小寫
295  
296 print 'Text: \n  %r' % text
297 print 'Pattern:\n %s' % pattern
298 print 'Case-sensitive:'
299 for match in with_case.findall(text):
300     print '  %r' % match
301 print 'Case-insensitive:'
302 for match in whitout_case.findall(text):
303     print ' %r' % match
304  
305 #11.多行輸入
306 '''
307 MULTILINE  多行匹配
308 '''
309  
310 text = 'This is some text  -- with punctuation.\nA secone lines.'
311 pattern = r'(^\w+)|(\w+\S*$)'
312 single_line = re.compile(pattern)
313 multiline = re.compile(pattern,re.MULTILINE) 
314 print 'Text:\n %r' % text
315 print 'Pattern:\n  %s' % pattern
316 print 'Single Line :'
317 for match in single_line.findall(text):
318     print '  %r' % (match,)
319 print 'MULTILINE  :'
320 for match in multiline.findall(text):
321     print '  %r'  % (match,)
322  
323 '''
324 DOTALL 讓點字符也能夠匹配換行符
325 '''
326  
327 pattern = r'.+'
328 no_newlines = re.compile(pattern)
329 dotall = re.compile(pattern,re.DOTALL)
330  
331 print 'Text :\n   %r' % text
332 print 'Pattern:\n %s' % pattern
333 print 'No newlines :'
334 for match in no_newlines.findall(text):
335     print '  %r' % match
336 print 'Dotall    :'
337 for  match in dotall.findall(text):
338     print '  %r' % match
339  
340 #12 Unicode匹配
341 '''
342 re.UNICODE 匹配Unicode
343 '''
344  
345  
346 import codecs
347 import sys
348  
349 #set standard output encoding to UTF-8
350  
# Rebind sys.stdout itself (Python 2 idiom) so that unicode text written by
# the print statements below is encoded as UTF-8; assigning to any other
# attribute name on sys would leave the real stdout untouched.
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
352  
353 pattern = ur'\w+'
354 ascii_pattern = re.compile(pattern)
355 unicde_pattern = re.compile(pattern,re.UNICODE)
356  
357 print 'Text    :',text
358 print 'Pattern :',pattern
359 print 'ASCII   :',u', '.join(ascii_pattern.findall(text))
360 print 'Unicode :',u', '.join(unicde_pattern.findall(text))
361  
362 '''
363 re.VERBOSE 讓正則更容易讀
364 '''
365  
366 address = re.compile(
367         '''
368         [\w\d.+-]+    #username
369         @ 
370         ([\w\d.]+\.)+ #domain name prefix
371         (com|org|edu) #TODO:support more top-level domains
372         ''',
373         re.UNICODE | re.VERBOSE)
374  
375 candidates = [
376         u'first.last@example.com',
377         u'first.last+category@gmail.com',
378         u'valid-address@mail.example.com',
379         u'not-valid@example.foo'
380 ]
381  
382 for candidate in candidates:
383     match = address.search(candidate)
384     print '%-30s %s' % (candidate,'Matche' if match else 'no match')
385  
386  
# Optional display name followed by an e-mail address.  The pattern is a raw
# string so the regex escapes (\w, \s, \d, \.) reach the re module unchanged.
address = re.compile(
    r'''
    # A name is made up of letters, and may include "."
    # for title abbreviations and middle initials.
    ((?P<name>
        ([\w.,]+\s+)*[\w.,]+)   # words separated by whitespace
        \s*
        # Email addresses are wrapped in angle
        # brackets: <> but only if a name is
        # found, so keep the start bracket in this
        # group.
        <
    )?  # the entire name is optional

    # The address itself: username@domain.tld
    (?P<email>
        [\w\d.+-]+     # username
        @
        ([\w\d.]+\.)+  # domain name prefix
        (com|org|edu)  # TODO: support more top-level domains
    )
    >?  # optional closing angle bracket
    ''',
    re.UNICODE | re.VERBOSE)
411  
# Sample inputs for the name + address pattern.  Every entry ends with a
# comma: adjacent string literals without one are silently concatenated into
# a single candidate.
candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
    u'Fist Last <first.last@example.com>',
    u'NO Brackets first.last@example',
    u'First Last',
    u'First Middle Last <first.last@example.com>',
    u'First M. Last <first.last@example.com>',
    u'<first.last@example.com>',
]
424  
425 for candidate in candidates:
426     print 'candidate:',candidate
427     match = address.search(candidate)
428     if match:
429         print ' Name:',match.groupdict()['name']
430         print ' Email:',match.groupdict()['email']
431     else:
432         print '   No match'
433  
434 '''
435                     正則表達式標誌縮寫表
436  
437     標誌                  縮寫               描述
438  
439   IGNORECASE              i           忽略大小寫
440   MULTILINE                 m           多行匹配
441   DOTALL                    s          讓點字符也能夠匹配換行符
442   UNICODE                  u          匹配Unicode
443   VERBOSE                 x          讓正則更容易讀
444 在模式中嵌入標籤(?imu)會打開相應的選項
445 '''
446 text = 'This is  some text -- with punctuation.'
447 pattern = r'(?i)\bT\w+'
448 regex = re.compile(pattern)
449  
450 print 'Text   :',text
451 print 'Pattern    :',pattern
452 print 'Matches   :',regex.findall(text)
453  
454 #13 前向或後向
455  
456 address = re.compile(
457     '''
458     # A name is made up of letters, and may include "."
459     # for title abbreviations and middle initials
460     ((?P<name>
461         ([\w.,]+\s+)*[\w.,]+
462         )
463     \s+
464     )  # name is no longer optional
465     # LOOKAHEAD
466     # Email address are wrapped in angle brackets, but only
467     # if they are both present or neither is .
468     (?= (<.*>$)
469         |
470         ([^<].*[^>]$)
471     )
472     <? # optional opening angle bracket
473  
474     # The address itself: username@domain.tld
475     (?P<email>
476         [\w\d.+-]+
477         @
478         ([\w\d.]+\.)+
479         (com|org|edu)
480     )
481     >?
482     ''',
483     re.UNICODE | re.VERBOSE)
484  
# Sample inputs for the look-ahead demo: the first two have balanced (or no)
# angle brackets and should match; the last two have only an opening or only
# a closing bracket, which the look-ahead assertion is meant to reject.
candidates = [
    u'First Last <first.last@example.com>',
    u'No Brackets first.last@example.com',
    u'Open Brackets <first.last@example.com',
    u'Close Brackets first.last@example.com>',
]
491 for candidate in candidates:
492     print 'Candidate:',candidate
493     match = address.search(candidate)
494     if match:
495         print ' Name :',match.groupdict()['name']
496         print ' Email :',match.groupdict()['email']
497     else:
498         print '  No match'
499  
500 #自動忽略系統常用的noreply郵件地址
501 '''
502 (?!noreply@.*$) 否定前向斷言:寫在username之前,忽略以noreply@開頭的郵件地址
503 (?<!noreply)    否定後向斷言:寫在username之後,要求剛匹配的文本不以noreply結尾
504 (?<=pattern)    肯定後向斷言:查找前面緊挨着某個模式的文本
505 '''
506 address = re.compile(
507     '''
508     ^
509     # An address: username@domain.tld
510  
511     # Ignore noreply address
512     (?!noreply@.*$)
513  
514     [\w\d.+-]+     # username
515     @
516     ([\w\d.]+\.)+  # domain name prefix
517     (com|org|edu)  # limit the allowed top-level domains
518  
519     $
520     ''',
521     re.UNICODE | re.VERBOSE)
522  
523 candidates = [
524  
525     u'first.last@example.com',
526     u'noreply@example.com',
527 ]
528  
529 for candidate in candidates:
530     print 'Candidate:',candidate
531     match = address.search(candidate)
532     if match:
533         print '  Match:',candidate[match.start():match.end()]
534     else:
535         print '  No match'
536  
537 twitter = re.compile(
538     '''
539     # A twitter handle: @username
540     (?<=@)
541     ([\w\d_]+)   # username
542     ''',
543     re.UNICODE | re.VERBOSE)
544  
545 text = ''' This text includes two Twitter handles.
546 One for @TheSF,and one for the author,@doughellmann.
547 '''
548 print text
549 for match in twitter.findall(text):
550     print 'handle:',match
551  
552 #14 自引用表達式 #能夠把表達式編號後面來引用
553  
554 address = re.compile(
555     '''
556     (\w+)          # first name
557     \s+
558     (([\w.]+)\s+)?  # optional middle name or initial
559     (\w+)           # last name
560  
561     \s+
562     <
563  
564     # The address: first_name.last_name@domain.tld
565     (?P<email>
566         \1         #first name
567         \.
568         \4         #last name
569         @
570         ([\w\d.]+\.)+
571         (com|org|edu)
572         )            
573     >
574     ''',
575     re.UNICODE | re.VERBOSE | re.IGNORECASE)
576  
# Sample inputs for the back-reference demo.  "Different Name" carries a
# well-formed address so that it is rejected by the \1 / \4 back-references
# (name and address disagree) rather than by a missing "@".
candidates = [
    u'First Last <first.last@example.com>',
    u'Different Name <first.last@example.com>',
    u'First Middle Last <first.last@example.com>',
]
for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    # Report inside the loop so every candidate gets a verdict, not only the
    # last one.
    if match:
        print('  Match name: %s %s' % (match.group(1), match.group(4)))
    else:
        print(' No match')
589  
590 #正則表達式解析包括一個擴展,可使用(?P=name)指示表達式先前匹配的一個命名組的值.
591  
592 address = re.compile(
593     '''
594  
595     # The regular name
596     (?P<first_name>\w+)
597     \s+
598     (([\w.]+)\s+)?
599     (?P<last_name>\w+)
600     \s+
601     <
602  
603     # The address: first_name.last_name@domain.tld
604     (?P<email>
605         (?P=first_name)
606         \.
607         (?P=last_name)
608         @
609         ([\w\d.]+\.)+
610         (com|org|edu)
611         )
612     >
613     ''',
614     re.UNICODE | re.VERBOSE | re.IGNORECASE)
615  
616 candidates = [
617     u'First last <first.last@example.com>',
618     u'Different Name <first.last@example.com>',
619     u'First Middle last <first.last@example.com>',
620     u'First M. Last<first.last@example.com>',
621 ]
622  
623 for candidate in candidates:
624     print 'Candidate:',candidate
625     match = address.search(candidate)
626     if match:
627         print '  Match name:',match.groupdict()['first_name']
628         print match.groupdict()['last_name']
629         print '  Match email:',match.groupdict()['email']
630  
631     else:
632         print 'No match'
633  
634 #15 用模式修改字符串
635 '''
636 re支持使用正則表達式作爲搜索機制來修改文本,並且替換文本可以引用正則表達式中的匹配組。
637 '''
638 bold = re.compile(r'\*{2}(.*?)\*{2}')
639 text = 'Make this **bold**. This **too**.'
640 print 'Text:',text
641 print 'Bold:',bold.sub(r'<b>\1</b>',text)
642  
643 '''
644 使用命名組來替換
645 count 來限制替換次數
646 subn 工作原理和sub類似,但subn同時返回修改後的字符串和完成的替換次數
647 '''
648  
649 bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}',re.UNICODE,)
650  
651 print 'Text:',text
652 print 'Bold:',bold.sub(r'<b>\g<bold_text></b>',text,count=1)
653  
654 #16 利用模式拆分
655  
656 '''
657 str.split() 是分解字符串來完成解析的最常用方法之一,但它只支持用字面值作爲分隔符
658 '''
659  
660 text = '''Paragraph one
661 one tuo lines.
662  
663 Paragraph two.
664  
665 Paragraph three.'''
666  
667 print 'With findall:'
668 for num,para in enumerate(re.findall(r'.+?\n{2,}|$',
669                                     text,
670                                     flags = re.DOTALL)
671                             ):
672     print num,repr(para)
673     print
674  
675 print 
676 print 'With split:'
677 for num,para in enumerate(re.split(r'\n{2,}',text)):
678     print num,repr(para)
679     print
相關文章
相關標籤/搜索