re 模塊常用函數

時間 2020-02-16

標籤：模塊、常用函數　（简体版）

原文原文鏈接

1. findall() 函數

1 re.findall('正則表達式', '待匹配的字符串')  # 返回所有匹配到的字符串，並存放在列表中

　　詳解見：正則表達式（http://www.javashuo.com/article/p-ofgfrghx-bc.html）

1 import re
2 
3 ret = re.findall('www.(baidu|oldboy).com', 'www.oldboy.com')
4 print(ret)  # ['oldboy']     這是因為 findall 會優先返回分組裏的內容；如果想要完整的匹配結果，用 (?:...) 取消分組的優先權即可
5 
6 ret = re.findall('www.(?:baidu|oldboy).com', 'www.oldboy.com')
7 print(ret)  # ['www.oldboy.com']

練習 findall() 與分組

2. search() 函數

1 import re
2 ret = re.search('abc', 'this is abcABC').group()
3 print(ret) #結果 : 'abc'
4 # search() 會在字符串內按規則匹配，找到第一個匹配結果時就結束查找，然後返回一個包含匹配信息的對象；對該對象調用 group() 方法即可得到匹配的字符串。
5 # 如果沒有匹配到，search() 返回 None，此時直接調用 .group() 會拋出 AttributeError，所以應先判斷返回值是否為 None。

1 obj = re.compile('\d{3}')  #將正則表達式編譯成爲一個 正則表達式對象，規則要匹配的是3個數字
2 ret = obj.search('abc123eeee') #正則表達式對象調用search，參數爲待匹配的字符串
3 print(ret.group())  #結果 ： 123

3. match() 函數

1 import re
2 ret = re.match('this', 'this is abcABC').group()
3 print(ret) #結果 : 'this'
4 # match() 只從字符串開頭處按規則匹配，其他用法與 search() 函數相同

4. finditer() 函數

1 import re
2 ret = re.finditer('\d', 'ds3sy4784a')   #finditer返回一個存放匹配結果的迭代器
3 print(ret)  # <callable_iterator object at 0x10195f940>
4 print(next(ret).group())  #查看第一個結果  '3'
5 print(next(ret).group())  #查看第二個結果  '4'
6 print([i.group() for i in ret])  #查看剩餘的匹配結果 ['7', '8', '4']

5. sub() 和 subn() 函數

1 import re
2 ret = re.sub('\d', 'H', 'eva3egon4yuan4', 1)#將數字替換成'H'，參數1表示只替換1個
3 print(ret) #evaHegon4yuan4
4 
5 ret = re.subn('\d', 'H', 'eva3egon4yuan4')
6 print(ret)
7 #將數字替換成'H'，返回元組(替換的結果,替換了多少次),即('evaHegonHyuanH', 3)

6. split() 函數

1 ret = re.split('[ab]', 'abcd')  # 先按'a'分割得到''和'bcd'，再對''和'bcd'分別按'b'分割
2 print(ret)  # ['', '', 'cd']

1 ret=re.split("\d+","eva3egon4yuan")
2 print(ret) #結果 ： ['eva', 'egon', 'yuan']
3 
4 ret=re.split("(\d+)","eva3egon4yuan")
5 print(ret) #結果 ： ['eva', '3', 'egon', '4', 'yuan']
6 
7 #在匹配部分加上（）之後所切出的結果是不一樣的，
8 #沒有（）的不保留所匹配的項，但是有（）的卻能夠保留匹配的項，
9 #這在某些需要保留匹配部分的使用場景中是非常重要的。

練習-split() 與分組

擴展練習

 1 ret = re.search("<(?P<tag_name>\w+)>\w+</(?P=tag_name)>","<h1>hello</h1>")
 2 #還可以在分組中利用 ?P<name> 的形式給分組起名字
 3 #獲取的匹配結果可以直接用 group('名字') 拿到對應的值
 4 print(ret.group('tag_name'))  #結果 ：h1
 5 print(ret.group())  #結果 ：<h1>hello</h1>
 6 
 7 ret = re.search(r"<(\w+)>\w+</\1>","<h1>hello</h1>")
 8 #如果不給組起名字，也可以用  \序號   來找到對應的組，表示要找的內容和前面的組內容一致
 9 #獲取的匹配結果可以直接用 group(序號) 拿到對應的值
10 print(ret.group(1)) #結果 ：h1
11 print(ret.group())  #結果 ：<h1>hello</h1>

匹配標籤

 1 import re
 2 
 3 #匹配數字
 4 ret=re.findall(r"\d+","1-2*(60+(-40.35/5)-(-4*3))")
 5 print(ret) #['1', '2', '60', '40', '35', '5', '4', '3']
 6 
 7 #匹配整數
 8 ret=re.findall(r"-?\d+\.\d*|(-?\d+)","1-2*(60+(-40.35/5)-(-4*3))")
 9 print(ret) #['1', '-2', '60', '', '5', '-4', '3']
10 ret.remove("")
11 print(ret) #['1', '-2', '60', '5', '-4', '3']
12 
13 #匹配實數
14 ret=re.findall(r"-?\d+\.\d*|-?\d+","1-2*(60+(-40.35/5)-(-4*3))")
15 print(ret) #['1', '-2', '60', '-40.35', '5', '-4', '3']

匹配數字/整數/實數

 1 import re
 2 from urllib.request import urlopen
 3 
 4 def getPage(url):
 5     response = urlopen(url)
 6     return response.read().decode('utf-8')
 7 
 8 def parsePage(s):
 9     ret = re.findall(
10         '<div class="item">.*?'
11         '<div class="pic">.*?'
12         '<em .*?>'
13         '(?P<id>\d+).*?'
14         '<span class="title">(?P<title>.*?)</span>.*?'
15         '<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?'
16         '<span>(?P<comment_num>.*?)評價</span>',s,re.S)
17     return ret
18 
19 def main(num):
20     url = 'https://movie.douban.com/top250?start=%s&filter=' % num    # %s標記每頁的首條信息
21     response_html = getPage(url)    #獲取網頁的所有內容
22     ret = parsePage(response_html) #解析所需求的信息
23     print(ret)
24 
25 count = 0
26 for i in range(10):   # 10頁
27     main(count)
28     count += 25        # 每頁25條信息

爬取網頁-簡單版

 1 from urllib.request import urlopen
 2 import re
 3 import json
 4 
 5 def getPage(url):
 6     response = urlopen(url)
 7     return response.read().decode('utf-8')
 8 def parsePage(s):
 9     com = re.compile(
10         '<div class="item">.*?'
11         '<div class="pic">.*?'
12         '<em .*?>'
13         '(?P<id>\d+).*?'
14         '<span class="title">(?P<title>.*?)</span>.*?'
15         '<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?'
16         '<span>(?P<comment_num>.*?)評價</span>', re.S)
17     ret = com.finditer(s)   # finditer返回一個存放匹配結果的迭代器
18     for i in ret:
19         yield {
20             "id": i.group("id"),
21             "title": i.group("title"),
22             "rating_num": i.group("rating_num"),
23             "comment_num": i.group("comment_num"),
24         }
25 def main(num):
26     url = 'https://movie.douban.com/top250?start=%s&filter=' % num
27     response_html = getPage(url)
28     ret = parsePage(response_html)
29     #print(ret)
30 
31     f = open("move_info7", "a", encoding="utf-8")
32     for obj in ret:
33         print(obj)
34         data = json.dumps(obj, ensure_ascii=False)
35         f.write(data + "\n")
36     f.close()
37 
38 if __name__ == '__main__':
39     count = 0
40     for i in range(10):
41         main(count)
42         count += 25