每一個程序都會涉及到文本處理,如拆分字符串、搜索、替換、詞法分析等。許多任務均可以通過內建的字符串方法來輕鬆解決,但更複雜的操作就需要正則表達式來解決。
In [1]: line = 'asdf fjdk; afed, fjek,asdf, foo' #使用正則模塊 In [2]: import re #使用正則split方法能夠匹配多分割符 In [3]: re.split(r'[;,\s]\s*',line) Out[3]: ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo'] #使用捕獲組分割會將匹配的文本也包含在最終結果中 In [4]: re.split(r'(;|,|\s)\s*',line) Out[4]: ['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo'] #若是不想在結果中看到分隔符,能夠受用?:的形式使用非捕獲組 In [5]: re.split(r'(?:,|;|\s)\s*',line) Out[5]: ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
In [6]: url = 'https://www.baidu.com' #檢查字符串的結尾,匹配則返回True In [7]: url.endswith('.com') Out[7]: True In [8]: url.endswith('.cn') Out[8]: False #檢查字符串的開頭,匹配則返回true In [10]: url.startswith('https:') Out[10]: True In [11]: url.startswith('http:') Out[11]: False In [1]: import os In [2]: filenames = os.listdir('.') In [3]: filenames Out[3]: ['.tcshrc', '.bash_logout', '.mysql_history', 'Python-3.7.0.tgz', '.bash_history', '.cache', 'anaconda-ks.cfg', '.ipython', '.cshrc', '.bashrc', '.viminfo', 'Python-3.7.0', 'mysql-boost-8.0.12.tar.gz', 'heapq_queue.py', 'mysql-8.0.12', '.bash_profile'] #應用匹配開頭字符串過濾 In [4]: [i for i in filenames if i.startswith('.')] Out[4]: ['.tcshrc', '.bash_logout', '.mysql_history', '.bash_history', '.cache', '.ipython', '.cshrc', '.bashrc', '.viminfo', '.bash_profile'] #多個結果匹配時使用元組集合 In [5]: [i for i in filenames if i.endswith(('.py','.gz','.tgz'))] Out[5]: ['Python-3.7.0.tgz', 'mysql-boost-8.0.12.tar.gz', 'heapq_queue.py'] #判斷目錄中是否有.py結尾的文件 In [6]: any(i.endswith('.py') for i in filenames) Out[6]: True #應用列子解析網頁或文本內容 from urllib.request import urlopen def read_date(name): if name.startswith(('http:','https:','ftp:')): return urlopen(name).read().decode() else: with open(name) as f: return f.read() result = read_date('test.txt') print(result) #也可使用正則匹配 In [9]: import re In [10]: re.match('http:|https:|ftp:','https://www.baidu.com') Out[10]: <re.Match object; span=(0, 6), match='https:'>
#利用fnmatch模塊中的fnmatch和fnmatchcase函數匹配文本 In [12]: from fnmatch import fnmatch,fnmatchcase In [13]: fnmatch('foo.txt','*.txt') Out[13]: True In [14]: fnmatch('foo.txt','?oo.txt') Out[14]: True In [15]: names = ['dat01.csv','dat99.csv','config.ini','foo.py'] In [18]: [i for i in names if fnmatch(i,'dat[0-9]*.csv')] Out[18]: ['dat01.csv', 'dat99.csv'] #對於Windows操做系統時使用fnmatch函數時,匹配它不區分大小寫,這時咱們可使用fnmatchcase函數來代替,它徹底按照提供的字符串來匹配 In [2]: fnmatch('foo.txt','*.TXT') Out[2]: True In [3]: fnmatchcase('foo.txt','*.TXT') Out[3]: False #推倒式過濾文件名的字符串 In [20]: addresses = [ ...: '5412 N CLARK ST', ...: '1060 W ADDISON ST', ...: '1039 W GRANVILLE AVE', ...: '2122 N CLARK ST', ...: '4802 N BROADWAY', ...: ] In [21]: [i for i in addresses if fnmatch(i,'*ST')] Out[21]: ['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST'] In [22]: [i for i in addresses if fnmatch(i,'*CLARK*')] Out[22]: ['5412 N CLARK ST', '2122 N CLARK ST']
對於簡單的文字匹配,我們只需要使用基本的字符串方法str.find()、str.endswith()、str.startswith();對於更複雜的匹配,就需要使用正則表達式模塊re來匹配了。
In [23]: text1 = 'today is 10/19/2018.Pycon starts 3/13/2019.' In [24]: import re In [25]: re.match(r'\d+/\d+/\d+',text1) In [28]: re.findall(r'\d+/\d+/\d+',text1) Out[28]: ['10/19/2018', '3/13/2019'] In [29]: text2 = '11/20/2018' In [30]: re.match(r'\d+/\d+/\d+',text2) Out[30]: <re.Match object; span=(0, 10), match='11/20/2018'> In [31]: result = re.match(r'(\d+)/(\d+)/(\d+)',text2) In [32]: result.groups() Out[32]: ('11', '20', '2018') In [33]: result.group(0) Out[33]: '11/20/2018' In [34]: result.group(1) Out[34]: '11' In [35]: result.group(2) Out[35]: '20' In [36]: result.group(3) Out[36]: '2018' #分組取出全部的日期並按格式輸出 In [39]: text1 = 'today is 10/19/2018.Pycon starts 3/13/2019.' In [40]: for month,day,year in re.findall(r'(\d+)/(\d+)/(\d+)',text1): ...: print('{}-{}-{}'.format(year,month,day)) ...: 2018-10-19 2019-3-13 #若是文本數據比較大可使用finditer()方法,以迭代器的方法匹配 In [43]: text1 = 'today is 10/19/2018.Pycon starts 3/13/2019.' In [44]: for i in re.finditer(r'(\d+)/(\d+)/(\d+)',text1): ...: print(i.groups()) ...: ('10', '19', '2018') ('3', '13', '2019')
對於簡單的文本模式,使用str.replace()方法即可。
In [45]: text = 'abcabcabcabc' In [46]: text.replace('a','ee') Out[46]: 'eebceebceebceebc'
針對更爲複雜的匹配,可以使用re模塊中的sub()方法。
In [47]: text3 = 'today is 10/19/2018. pycon starts 3/13/2013.' In [49]: re.sub(r'(\d+)/(\d+)/(\d+)',r'\3-\1-\2',text3) Out[49]: 'today is 2018-10-19. pycon starts 2013-3-13.'
更爲複雜的例子,如把日期換成字符格式:
In [54]: text3 = 'today is 10/19/2018. pycon starts 3/13/2013.' In [55]: from calendar import month_abbr In [56]: def change_date(m): ...: mon_name = month_abbr[int(m.group(1))] ...: return '{} {} {}'.format(m.group(2),mon_name,m.group(3)) ...: ...: In [57]: re.sub(r'(\d+)/(\d+)/(\d+)',change_date,text3) Out[57]: 'today is 19 Oct 2018. pycon starts 13 Mar 2013.' #subn()能夠返回完成了多少次替換 In [58]: re.subn(r'(\d+)/(\d+)/(\d+)',change_date,text3) Out[58]: ('today is 19 Oct 2018. pycon starts 13 Mar 2013.', 2)
要進行不區分大小寫的文本操作時,使用re模塊進行操作時都要加上re.IGNORECASE標記。
In [60]: text = 'UPPER PYTHON,lower python, mixed Python' In [61]: re.findall('python',text,flags=re.IGNORECASE) Out[61]: ['PYTHON', 'python', 'Python']
import re


def matchcase(word):
    """Build a ``re.sub`` callback that inserts *word* in the case of each match.

    The returned function receives a match object and returns *word*:

    * upper-cased when the matched text is all upper case,
    * lower-cased when the matched text is all lower case,
    * capitalized when the matched text starts with an upper-case letter,
    * unchanged otherwise (including when the match is empty).
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        if text.islower():
            return word.lower()
        # Slice rather than index: a pattern that can match the empty string
        # would make ``text[0]`` raise IndexError.
        if text[:1].isupper():
            return word.capitalize()
        return word
    return replace


# Example: replace while keeping upper case, lower case or a leading capital.
text = 'UPPER PYTHON,lower python,Mixed Python'
print(re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE))
str_pat = re.compile(r'\"(.*)\"') text1 = 'computer says "no."' str_pat.findall(text1) Out[18]: ['no.'] text2 = 'computer says "no." phone says "yes."' str_pat.findall(text2) #在使用.*貪婪匹配時它將匹配儘量多的匹配項 Out[20]: ['no." phone says "yes.'] str_pat = re.compile(r'\"(.*?)\"') #只須要在多匹配後加上?號,就會以最少的匹配模式進行匹配 str_pat.findall(text2) Out[22]: ['no.', 'yes.']
comment = re.compile(r'python(.*?)end') text1 = 'python is ver good \n so so end' comment.findall(text1) #.*匹配不到換行符 Out[27]: [] comment = re.compile(r'python(.*?)end',flags=re.DOTALL) #加上標記re.DOTALL將匹配全部的字符包括換行符 comment.findall(text1) Out[29]: [' is ver good \n so so '] comment = re.compile(r'python((?:.|\n)*?)end') #(?:.|\n)會指定一個非捕獲組,它只作匹配但不捕獲結果,也不分配組號 comment.findall(text1) Out[31]: [' is ver good \n so so ']
s1 = 'spicy\u00f1o'     #它使用的是全組成的(fully composed)字符ñ(U+00F1)
s2 = 'spicyn\u0303o'    #它使用的是拉丁字母n加上組合字符~(U+0303)組合而成
s1 == s2                #兩者顯示相同,但碼位序列不同,所以字符比較是不相等的
Out[35]: False
s1
Out[36]: 'spicyño'
s2
Out[37]: 'spicyño'
#比較前可以先用unicodedata.normalize('NFC', s)把兩個字符串統一爲同一種表示形式
#strip()方法用來從字符串的開始和結尾處去掉字符,lstrip()和rstrip()分別從左或右開始執行去除字符操做,默認去除的是空格符,也能夠指定 In [21]: s = ' hello world \n' In [22]: s.strip() Out[22]: 'hello world' In [23]: s.lstrip() Out[23]: 'hello world \n' In [24]: s.rstrip() Out[24]: ' hello world' In [25]: t = '-----hello=====' In [26]: t.lstrip('-') #指定去除字符 Out[26]: 'hello=====' In [27]: t.strip('-=') #能夠指定多個字符 Out[27]: 'hello' #使用上面的方法不能去除中間的字符,要去除中間的字符可使用replace()方法或正則表達式替換 In [28]: s.replace(' ','') Out[28]: 'helloworld\n' In [29]: re.sub('\s+', '',s) Out[29]: 'helloworld'
#對於基本的字符串對齊,可以使用字符串方法ljust()、rjust()和center(),分別表示左對齊、右對齊和居中對齊,它們還可以接受可選的填充字符參數
In [31]: text = 'hello world'
In [32]: text.ljust(30)
Out[32]: 'hello world                   '
In [33]: text.rjust(30)
Out[33]: '                   hello world'
In [34]: text.center(30)
Out[34]: '         hello world          '
In [35]: text.center(30,'=')
Out[35]: '=========hello world=========='
#format()函數也可以用來完成對齊任務,需要做的就是合理利用'<'、'>'和'^'字符分別表示左對齊、右對齊和居中對齊,並提供一個期望的寬度值;如果想指定填充字符,可以在對齊符前指定:
In [36]: format(text,'>20')
Out[36]: '         hello world'
In [37]: format(text,'<20')
Out[37]: 'hello world         '
In [38]: format(text,'^20')
Out[38]: '    hello world     '
In [39]: format(text,'=^20')
Out[39]: '====hello world====='
In [40]: format(text,'=^20s')
Out[40]: '====hello world====='
In [41]: format(text,'*^20s')
Out[41]: '****hello world*****'
#當格式化多個值時,也可以使用format()方法
In [42]: '{:>10s}{:<10s}'.format('hello','world')
Out[42]: '     helloworld     '
In [43]: '{:#>10s} {:&<10s}'.format('hello','world')
Out[43]: '#####hello world&&&&&'
#合併的字符串在一個序列或可迭代對象中,最好的方法是使用join()方法 In [44]: data = ['I','like','is','python'] In [45]: ' '.join(data) Out[45]: 'I like is python' In [46]: ','.join(data) Out[46]: 'I,like,is,python' #利用生成器表達式轉換後連接字符串會更高效 In [47]: ','.join(str(d) for d in data) Out[47]: 'I,like,is,python'
#在字符串中給變量賦值通常常見的處理方式是使用format()方法 In [5]: str_variable = "{name} today {num} old year" In [6]: str_variable.format(name='zhang',num=20) Out[6]: 'zhang today 20 old year' #另外一種方式是使用format_map()和vars()聯合匹配當前環境中的變量名 In [7]: name = 'python' In [8]: num = 18 In [9]: str_variable.format_map(vars()) Out[9]: 'python today 18 old year' #vars()還可用在類實例上 In [10]: class info: ...: def __init__(self,name,num): ...: self.name = name ...: self.num = num ...: In [11]: a = info('shell',23) In [12]: str_variable.format_map(vars(a)) Out[12]: 'shell today 23 old year' #對於傳遞參數不夠時將會拋出異常,能夠定義一個帶有__missing__()方法的字典類來處理 In [13]: class safesub(dict): ...: def __missing__(self,key): ...: return '{' + key + '}' ...: In [14]: del num In [15]: str_variable.format_map(safesub(vars())) Out[15]: 'python today {num} old year'
#textwrap模塊能夠以多種方式從新格式化字符串: >>> import textwrap >>> s = "look into eyes, look into my eyes, the eyes,the eyes, \ ... the eyes, not around the eyes, don't look around the eyes, \ ... look into my eyes, you're under." >>> print(textwrap.fill(s,70) ... ) look into eyes, look into my eyes, the eyes,the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under. >>> print(textwrap.fill(s,40)) look into eyes, look into my eyes, the eyes,the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under. >>> print(textwrap.fill(s,40,initial_indent=' ')) look into eyes, look into my eyes, the eyes,the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under. >>> print(textwrap.fill(s,40,subsequent_indent=' ')) look into eyes, look into my eyes, the eyes,the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under. #能夠經過os.get_terminal_size()來獲取終端的尺寸大小 >>> import os >>> print(textwrap.fill(s,os.get_terminal_size().columns)) look into eyes, look into my eyes, the eyes,the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under. >>> print(os.get_terminal_size()) os.terminal_size(columns=105, lines=32)
#使用html.escape()函數將HTML特殊字符轉義爲實體文本
In [1]: s = 'Elements are written aa "<tag>text</tag>".'
In [2]: import html
In [3]: s
Out[3]: 'Elements are written aa "<tag>text</tag>".'
In [4]: html.escape(s)
Out[4]: 'Elements are written aa &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.'
#不轉義引號
In [5]: html.escape(s,quote=False)
Out[5]: 'Elements are written aa "&lt;tag&gt;text&lt;/tag&gt;".'
#處理含有實體的ASCII文本
In [6]: s1 = 'Spicy &quot;Jalape&#241;o&quot;.'
In [7]: from html.parser import HTMLParser
In [9]: p = HTMLParser()
#注意:HTMLParser.unescape()已在Python 3.9中移除,新版本請改用html.unescape(s1)
In [11]: p.unescape(s1)
Out[11]: 'Spicy "Jalapeño".'
#生成ASCII文本
In [12]: s2 = p.unescape(s1)
In [13]: s2.encode('ascii',errors='xmlcharrefreplace')
Out[13]: b'Spicy "Jalape&#241;o".'
#處理XML實體
In [14]: s3 = 'the prompt is &gt;&gt;&gt;'
In [15]: from xml.sax.saxutils import unescape
In [16]: unescape(s3)
Out[16]: 'the prompt is >>>'
#從左到右將字符串解析爲標記流(stream of tokens)
In [17]: text = 'foo = 23 + 42 * 10'
In [18]: tokens= [('NAME','foo'),('EQ','='),('NUM','23'),('PLUS','+'),('NUM','42'),('TIMES','*'),('NUM','10')]
In [19]: import re
#使用正則表達式
In [20]: NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
In [21]: NUM = r'(?P<NUM>\d+)'
In [22]: PLUS = r'(?P<PLUS>\+)'
In [23]: TIMES = r'(?P<TIMES>\*)'
In [24]: EQ = r'(?P<EQ>=)'
In [25]: WS = r'(?P<WS>\s+)'
In [26]: master_pat = re.compile('|'.join([NAME,NUM,PLUS,TIMES,EQ,WS]))
#使用模式對象的scanner()方法來完成分詞操作
In [27]: scanner = master_pat.scanner('foo = 42')
#在給定的文本中重複調用match()方法,一次匹配一個模式,下面是匹配過程
In [28]: scanner.match()
Out[28]: <re.Match object; span=(0, 3), match='foo'>
In [29]: _.lastgroup,_.group()
Out[29]: ('NAME', 'foo')
In [30]: scanner.match()
Out[30]: <re.Match object; span=(3, 4), match=' '>
In [31]: _.lastgroup,_.group()
Out[31]: ('WS', ' ')
In [32]: scanner.match()
Out[32]: <re.Match object; span=(4, 5), match='='>
In [33]: _.lastgroup,_.group()
Out[33]: ('EQ', '=')
In [34]: scanner.match()
Out[34]: <re.Match object; span=(5, 6), match=' '>
In [35]: _.lastgroup,_.group()
Out[35]: ('WS', ' ')
In [36]: scanner.match()
Out[36]: <re.Match object; span=(6, 8), match='42'>
In [37]: _.lastgroup,_.group()
Out[37]: ('NUM', '42')
#通過生成器函數來轉化爲代碼的形式
In [40]: from collections import namedtuple
In [41]: token = namedtuple('token',['type','value'])
In [42]: def generate_tokens(pat,text):
    ...:     scanner = pat.scanner(text)
    ...:     for m in iter(scanner.match,None):
    ...:         yield token(m.lastgroup,m.group())
    ...:
In [43]: for tok in generate_tokens(master_pat,'foo = 42'):
    ...:     print(tok)
    ...:
token(type='NAME', value='foo')
token(type='WS', value=' ')
token(type='EQ', value='=')
token(type='WS', value=' ')
token(type='NUM', value='42')
#過濾空格標記
In [45]: tokens = (tok for tok in generate_tokens(master_pat,text) if tok.type != 'WS')
In [46]: for tok in tokens:print(tok)
token(type='NAME', value='foo')
token(type='EQ', value='=')
token(type='NUM', value='23')
token(type='PLUS', value='+') token(type='NUM', value='42') token(type='TIMES', value='*') token(type='NUM', value='10')
import re
import collections

# Token specification: one named group per token type.
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES, DIVIDE,
                                  LPAREN, RPAREN, WS]))

Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(text):
    """Yield a ``Token`` for every non-whitespace token found in *text*.

    Raises:
        SyntaxError: when the scan stops before the end of *text* because a
            character matches none of the token patterns. The error is raised
            lazily, when the generator reaches that character.
    """
    scanner = master_pat.scanner(text)
    pos = 0
    for m in iter(scanner.match, None):
        pos = m.end()
        if m.lastgroup != 'WS':
            yield Token(m.lastgroup, m.group())
    # scanner.match() returns None both at end of input and at the first
    # unmatchable character; only the former is a clean finish.
    if pos != len(text):
        raise SyntaxError(
            'Unexpected character {!r} at position {}'.format(text[pos], pos))


class ExpressionEvaluator:
    """Recursive-descent evaluator for integer arithmetic expressions.

    Grammar implemented by the methods below::

        expr   ::= term { ('+' | '-') term }
        term   ::= factor { ('*' | '/') factor }
        factor ::= NUM | '(' expr ')'
    """

    def parse(self, text):
        """Evaluate *text* and return the numeric result.

        Raises:
            SyntaxError: on malformed input, including tokens left over after
                a complete expression has been read.
        """
        self.tokens = generate_tokens(text)
        self.tok = None        # last token consumed
        self.nexttok = None    # one-token lookahead
        self._advance()
        result = self.expr()
        # A complete expression must consume all input; otherwise something
        # like '2 3' would silently evaluate to 2.
        if self.nexttok is not None:
            raise SyntaxError(
                'Unexpected token {!r}'.format(self.nexttok.value))
        return result

    def _advance(self):
        """Shift the lookahead into ``tok`` and read the next token."""
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        """Consume the lookahead if it has type *toktype*; report success."""
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        return False

    def _expect(self, toktype):
        """Consume a token of type *toktype* or raise ``SyntaxError``."""
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    def expr(self):
        """Evaluate ``term { ('+' | '-') term }``."""
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        """Evaluate ``factor { ('*' | '/') factor }``."""
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                # True division: the result becomes a float.
                termval /= right
        return termval

    def factor(self):
        """Evaluate ``NUM | '(' expr ')'``."""
        if self._accept('NUM'):
            return int(self.tok.value)
        if self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        raise SyntaxError('Expected NUMBER or LPAREN')


if __name__ == '__main__':
    e = ExpressionEvaluator()
    print(e.parse('2'))
    print(e.parse('2 + 3'))
    print(e.parse('2 + 3 * 4'))
    print(e.parse('2 + (3 + 4) * 5'))
In [2]: data = b'hello world' In [3]: data Out[3]: b'hello world' #切片 In [4]: data[0:5] Out[4]: b'hello' #分隔 In [6]: data.split() Out[6]: [b'hello', b'world'] #替換 In [7]: data.replace(b'hello',b'python') Out[7]: b'python world' #在返回單個切片時將返回ASCII字節表對應的位置 In [8]: data[0] Out[8]: 104