re 模塊是 Python 提供的一套用於處理正則表達式的模塊。
re.search 的作用:掃描整個字符串,返回第一個與正則表達式匹配的結果;沒有匹配時返回 None。
import re

# re.search scans the whole string and returns the first match object.
res = re.search(r"o", "hello world")
print(res.group())  # o

# With no match, re.search returns None; calling .group() on it directly
# would raise AttributeError: 'NoneType' object has no attribute 'group',
# so test the result before using it.
s = re.search(r"c", "hello world")
if s is None:
    print("no match")
else:
    print(s.group())
re.match 的作用:只從字符串的開頭開始匹配;開頭不匹配時返回 None。
import re

# re.match only succeeds when the pattern matches at the start of the string.
res = re.match(r"h", "hello world")
print(res.group())  # h

# "hello world" does not start with "c", so re.match returns None; calling
# .group() on it directly would raise
# AttributeError: 'NoneType' object has no attribute 'group'.
s = re.match(r"c", "hello world")
if s is None:
    print("no match")
else:
    print(s.group())
re.findall 的作用:查找所有匹配項,返回 list。正則表達式中含有捕獲分組時,只返回分組匹配到的內容;使用 (?:...) 可以取消分組捕獲,返回完整的匹配結果。
import re

# re.findall returns every non-overlapping match as a list of strings.
lst = re.findall(r"\d+", "name Tom age 18 phone 2354786")
print(lst)  # ['18', '2354786']
import re

# When the pattern contains a capturing group, re.findall returns only the
# text captured by that group, not the whole match.
lst = re.findall(r"www\.(baidu|qq)\.com", "www.baidu.com")
print(lst)  # ['baidu']
import re

# (?:...) groups without capturing, so re.findall returns the whole match.
lst = re.findall(r"www\.(?:baidu|qq)\.com", "www.baidu.com")
print(lst)  # ['www.baidu.com']
re.finditer 的作用:查找所有匹配項,返回迭代器。
import re

# re.finditer yields match objects one at a time instead of building a list.
matches = re.finditer(r"\d+", "name Tom age 18 phone 2354786")
for el in matches:
    print(el.group())

# Output:
# 18
# 2354786
re.split 的作用:按正則表達式分割字符串,返回 list 對象。如果正則表達式中含有捕獲分組,分隔符本身也會保留在結果中。
import re

# [abc] matches any single one of "a", "b" or "c", so the string is cut at
# every occurrence of any of those characters in one pass.
ret = re.split(r"[abc]", "qwerafjbfcd")
print(ret)  # ['qwer', 'fj', 'f', 'd']
import re

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
# Without a capturing group the separators are dropped from the result.
ret = re.split(r"\d+", "eva3egon4yuan")
print(ret)  # ['eva', 'egon', 'yuan']
import re

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
# Wrapping the separator in a capturing group keeps it in the result.
ret = re.split(r"(\d+)", "eva3egon4yuan")
print(ret)  # ['eva', '3', 'egon', '4', 'yuan']
re.sub 的作用:將匹配到的內容替換為指定字符串,返回替換後的結果。
import re

# re.sub replaces every match of the pattern with the replacement string.
ret = re.sub(r"\s", "__", "hello world")
print(ret)  # hello__world
re.subn 的作用:替換,返回元組(替換的結果,替換次數)。
import re

# re.subn works like re.sub but returns (new_string, number_of_replacements).
ret = re.subn(r"\s", "__", "name age gender phone")
print(ret)  # ('name__age__gender__phone', 3)
re.compile 的作用:將正則表達式編譯成一個正則表達式對象,進行預加載,之後可以重複使用該對象進行匹配。
import re

# Compile the pattern once; the resulting object exposes search/match/etc.
obj = re.compile(r"\d{3}")
ret = obj.search("abc333eee")
print(ret.group())  # 333
正則表達式中,"."表示匹配除"\n"之外的所有字符。如果字符串中有換行,"."無法跨行匹配,匹配到的結果會被換行分成多個字符串;而使用 re.S 後,"."可以匹配"\n",即可把整個字符串作為一個整體來匹配。
from urllib.request import urlopen
import re

# Template values: "url", the pattern and the group name below are stand-ins
# to be replaced with a real address, regular expression and group name.
url = "url"

# Fetch the whole page; the context manager closes the HTTP response.
with urlopen(url) as response:
    content = response.read().decode()

# Pre-compile the regular expression.
obj = re.compile(r"正則表達")

# Extract the wanted part; search() returns None when nothing matches, so
# guard before calling .group().
match = obj.search(content)
res = match.group("組名") if match is not None else None
from urllib.request import urlopen
import re

# Pre-compiled pattern with four named groups (name, director, score, people).
# re.S lets "." match newlines, so one entry can span several lines of HTML.
# NOTE(review): the literals "導演" and "人評價" are Traditional Chinese; if the
# target pages are served in Simplified Chinese ("导演" / "人评价") the pattern
# will never match -- confirm against a real response.
obj = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?導演: (?P<director>.*?) '
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<people>.*?)人評價</span>',
    re.S,
)


def get_content(url):
    """
    Download a page and return its body decoded as UTF-8.

    :param url: address of the page to fetch
    :return: the full page text
    """
    # The context manager closes the HTTP response once the body is read.
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def parse_content(content):
    """
    Lazily extract every entry matched by ``obj`` from the page text.

    :param content: full page text
    :return: generator of dicts with the keys name, director, score, people
    """
    for el in obj.finditer(content):
        yield {
            "name": el.group("name"),
            "director": el.group("director"),
            "score": el.group("score"),
            "people": el.group("people"),
        }


def main():
    """
    Fetch ten pages (start=0, 25, ..., 225), parse them and append each
    entry to movie.txt, one dict per line.

    :return: None
    """
    # Open the output file once instead of re-opening it for every page.
    with open("movie.txt", mode="a", encoding="utf-8") as f:
        for i in range(10):
            url = "https://movie.douban.com/top250?start=%s&filter=" % (i * 25)
            for el in parse_content(get_content(url)):
                f.write(str(el) + "\n")


if __name__ == "__main__":
    main()