^ 以某個開頭
javascript
$ 以某個結尾
html
* 前面的字符重複任意多次,大於等於0次
java
? 取消貪婪匹配(非貪婪模式),可以理解爲改成從左到右匹配,匹配到第一個符合的位置爲止
app
+ 某個至少爲一次,大於等於1
url
{ }例,{2,5},某個出現2到5次.....{2},{2,}等
spa
| 或者(a|b),選a或者b
code
[ ]有三種意思,1.[13567]中括號內的字符任選一個-------2.[0-9],[a-z]表示區間-------3.[.]就表示.號本身,不再表示任意字符
htm
[^]例,[^1]非1
ip
[a-z]同上
utf-8
\s 空白字符(空格)
\S 非空白字符
\w 表示[A-Za-z0-9_]
\W 表示非[A-Za-z0-9_]
[\u4E00-\u9FA5]表示漢字
( )略
\d 數字
import re

# Sample birthday strings written in several date formats.
line1 = "你出生在2016-09-01"
line2 = "你出生在2016-9-1"
line3 = "你出生在2016/09/01"
line4 = "你出生在2016年9月1號"
line5 = "你出生在2016-09"

# Raw string: "\d" must reach the regex engine untouched; in a normal string
# literal it is an invalid escape sequence and triggers a warning.
# Pattern: 4-digit year, a separator (-, / or 年), a 1-2 digit month, then
# either end-of-string or a separator (月, / or -) followed by a 1-2 digit day
# that is terminated by end-of-string or 號.
regex_str = r"(你出生在\d{4}(-|/|年)\d{1,2}($|(月|/|-)\d{1,2}($|號)))"

match_obj = re.match(regex_str, line5)
if match_obj:
    print(match_obj.group(1))
# All five sample lines are matched by this pattern.
# -*- coding: utf-8 -*-
# @Author: Lai
# Crawler script: starting from a listing page, follow every book link, then
# every chapter link, and write each chapter's content to an .html file under
# BASE_PATH/<book title>/.
import os
import re

import requests

BASE_PATH = "E:/EW/"
# Seconds to wait for an HTTP response; without a timeout requests.get() can
# block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Characters that are not allowed in Windows file names (BASE_PATH is a
# Windows path), plus ASCII control characters such as newlines.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _fetch_text(url, encoding=None):
    """Download *url* and return the response body as text.

    When *encoding* is given it overrides the encoding requests would
    otherwise use to decode the body.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if encoding is not None:
        response.encoding = encoding
    return response.text


def _first_group(pattern, text, flags=0, default="空"):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    found = re.search(pattern, text, flags)
    return found.group(1) if found else default


def _safe_filename(name):
    """Replace characters that cannot appear in a file name with '_'."""
    return _INVALID_FILENAME_CHARS.sub("_", name)


def get_block_link(url):
    """Return the href of every matching book anchor on the listing page *url*."""
    reg = r'<a target="_blank" href="(.*?)" class="l mr10"><img onerror'
    return re.findall(reg, _fetch_text(url), re.S)


def get_child_link(url):
    """Return the "reader" link found on each book page linked from *url*.

    Book pages on which no reader link can be found are skipped instead of
    aborting the whole crawl with an IndexError.
    """
    reg = r'<a href="(.*?)" class="reader" title=".*?">.*?</a>'
    links = []
    for block_url in get_block_link(url):
        reader_link = _first_group(reg, _fetch_text(block_url), default=None)
        if reader_link is not None:
            links.append(reader_link)
    return links


def get_charterName_and_content(url):
    """Return ``(chapter_name, content)`` scraped from the chapter page *url*.

    The page is decoded as GBK. Either element falls back to "空" when its
    pattern does not match.
    """
    page = _fetch_text(url, encoding="GBK")
    reg_content = (
        r'id="content"><script type="text/javascript">style5\(\);</script>'
        r'(.*?)<script type="text/javascript">'
    )
    content = _first_group(reg_content, page, re.S)
    reg = r' class="article_title">.*?</a> >.*?\s(.*?)</div>'
    charter_name = _first_group(reg, page, re.S)
    return (charter_name, content)


def get_child_in_info(url):
    """Save every chapter of every book reachable from the listing page *url*.

    For each book, the title is scraped from its chapter index (decoded as
    GBK, falling back to "空") and each chapter is written to
    ``BASE_PATH/<title>/<chapter name>.html``.
    """
    reg_title = (
        r'<div class="chapName"><span class="r">.*?</span>'
        r'<strong>(.*?)</strong><div class="clear">'
    )
    reg_url = r'<li><a href="(.*?)" title=".*?">.*?</a></li>'
    for link in get_child_link(url):
        page = _fetch_text(link, encoding="GBK")
        title = _first_group(reg_title, page, re.S)
        chapter_urls = re.findall(reg_url, page)
        if not chapter_urls:
            # Nothing to save, so do not create an empty directory.
            continue
        path = os.path.join(BASE_PATH, _safe_filename(title))
        # Also creates BASE_PATH itself when it does not exist yet.
        os.makedirs(path, exist_ok=True)
        for chapter_url in chapter_urls:
            chart_name, content = get_charterName_and_content(chapter_url)
            position = os.path.join(path, _safe_filename(chart_name) + ".html")
            # Explicit encoding: the locale default may be unable to encode
            # the Chinese text.
            with open(position, "w", encoding="utf-8") as f:
                f.write(content)


if __name__ == "__main__":
    real_url = "http://www.quanshuwang.com/list/1_1.html"
    get_child_in_info(real_url)