I noticed that a certain site has quite a lot of articles, so I crawled every article name and link and saved them to a txt file for easy reference later.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib,urllib2,re,requests
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

domain = ['http://linux.linuxidc.com/']
name_url = []       # level-1 page url/name pairs
name_url2 = []      # level-2 page url/name pairs
name_url3 = []      # level-3 page url/name pairs
name_url4 = []      # level-4 page url/name pairs

def get():
    hd = {"User-Agent":"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}
    url = 'http://linux.linuxidc.com/index.php'
    html = requests.get(url,headers=hd).text
    #print html
    url_content = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)',re.S)  # compile the folder-block pattern
    url_contents = re.findall(url_content,html)     # match blocks on the level-1 page
    #print url_contents
    for i in url_contents:
        url_reg = re.compile(r'<a href="(.*?)">')                  # extract the folder link
        name_reg = re.compile(r'<a href=".*?">(.*?)</a></div>')    # extract the folder name
        url_items = re.findall(url_reg,i)
        name_items = re.findall(name_reg,i)
        #print name_items[0]
        # build the absolute URL
        url = domain + url_items
        url_items = [''.join(url)]
        #print url_items[0]
        for i,v in zip(name_items,url_items):
            name_url.append([i,v])
            #print i,v

    for j in name_url:      # j[0]=name  j[1]=url
        if j[1] == 'http://linux.linuxidc.com/index.php?folder=cHVi':                  # skip the pub directory
            continue
        elif j[1] == 'http://linux.linuxidc.com/index.php?folder=MjAxMcTq18rBzw==':    # skip the 2011 archive directory
            continue
        else:
            # crawl the remaining directories
            #print i[0]
            html2 = requests.get(j[1], headers=hd).text
            # print html2
            url_content2 = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)',re.S)
            url_contents2 = re.findall(url_content2, html2)     # match blocks on the level-2 page
            #print url_contents2
            for p in url_contents2:
                url_reg2 = re.compile(r'<a href="(.*?)">')                  # extract level-2 folder links
                name_reg2 = re.compile(r'<a href=".*?">(.*?)</a></div>')    # extract level-2 folder names
                url_items2 = re.findall(url_reg2, p)
                name_items2 = re.findall(name_reg2, p)
                #print name_items2,url_items2
                # build the absolute URL
                url2 = domain + url_items2
                url_items2 = [''.join(url2)]
                #print name_items2[0],url_items2[0]
                for m,n in zip(name_items2,url_items2):
                    name_url2.append([m,n])
                    #print m,n

    for k in name_url2:     # k[0]=name  k[1]=url
        html3 = requests.get(k[1], headers=hd).text
        #print html3
        url_content3 = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)',re.S)
        url_contents3 = re.findall(url_content3,html3)      # match blocks on the level-3 page
        #print url_contents3
        for p in url_contents3:
            url_reg3 = re.compile(r'<a href="(.*?)">')                  # extract level-3 folder links
            name_reg3 = re.compile(r'<a href=".*?">(.*?)</a></div>')    # extract level-3 folder names
            url_items3 = re.findall(url_reg3, p)
            name_items3 = re.findall(name_reg3, p)
            #print name_items3,url_items3
            # build the absolute URL
            url3 = domain + url_items3
            url_items3 = [''.join(url3)]
            #print name_items3[0],url_items3[0]
            for m, n in zip(name_items3, url_items3):
                name_url3.append([m, n])
                #print m,n

    for l in name_url3:     # l[0]=name  l[1]=url
        html4 = requests.get(l[1],headers=hd).text
        #print html4
        url_content4 = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)',re.S)
        url_contents4 = re.findall(url_content4, html4)     # match blocks on the level-4 page
        # print url_contents4
        for p in url_contents4:
            url_reg4 = re.compile(r'<a href="(.*?)">')                  # extract level-4 article links
            name_reg4 = re.compile(r'<a href=".*?">(.*?)</a></div>')    # extract level-4 article names
            url_items4 = re.findall(url_reg4, p)
            name_items4 = re.findall(name_reg4, p)
            # print name_items4,url_items4
            # build the absolute URL
            url4 = domain + url_items4
            url_items4 = [''.join(url4)]
            # print name_items4[0],url_items4[0]
            for m, n in zip(name_items4, url_items4):
                name_url4.append([m, n])
                f = open('get_list.txt','a+')
                print "Saving -- %s" %m
                print >> f,"%s,%s" %(m,n)

if __name__ == "__main__":
    get()
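Since all four levels are matched with exactly the same block, link and name regexes and only the starting page changes, the per-level code could also be folded into one recursive helper. Below is a minimal sketch of that variant, not the original script: the names crawl_level, max_depth and SKIP are mine, the skip list mirrors the pub/2011 folders excluded above, and the output matches the "%s,%s" lines the script writes.

# -*- coding: UTF-8 -*-
# Sketch only: the four copy-pasted levels of the script above, folded into
# one recursive helper (Python 2 style, like the original post).
import re, requests

domain = 'http://linux.linuxidc.com/'
hd = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}

block_reg = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)', re.S)
url_reg = re.compile(r'<a href="(.*?)">')
name_reg = re.compile(r'<a href=".*?">(.*?)</a></div>')

SKIP = ['http://linux.linuxidc.com/index.php?folder=cHVi',                # pub directory
        'http://linux.linuxidc.com/index.php?folder=MjAxMcTq18rBzw==']    # 2011 archive directory

def crawl_level(page_url, depth, max_depth, out):
    # collect the folder blocks on this page, then descend until max_depth
    html = requests.get(page_url, headers=hd).text
    for block in block_reg.findall(html):
        link = domain + url_reg.findall(block)[0]    # build the absolute URL
        name = name_reg.findall(block)[0]
        if link in SKIP:
            continue
        if depth == max_depth:
            # deepest level reached: record "name,link" just like the script above
            out.write(("%s,%s\n" % (name, link)).encode('utf-8'))
        else:
            crawl_level(link, depth + 1, max_depth, out)

if __name__ == "__main__":
    with open('get_list.txt', 'a+') as f:
        crawl_level('http://linux.linuxidc.com/index.php', 1, 4, f)

With max_depth=4 this walks the same index, directory, sub-directory, article hierarchy that the four name_url lists cover.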
Execution process:
The saved file (get_list.txt) is generated in the same directory as the script:
File contents: article title, article link
Error:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='linux.linuxidc.com', port=80): Max retries exceeded with url: /index.php?folder=MjAxN8Tq18rBzy8z1MIvMjXI1Q== (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x0000000002B6D198>: Failed to establish a new connection: [Errno 10060] ',))
Cause: too many HTTP connections were opened and never closed.
Solution: use requests' Session (client) mode so that one long-lived, keep-alive connection is reused.
# Define a session once
request = requests.Session()
# then replace every requests.get(...) call in the code with
html = request.get(url,headers=hd).text
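For reference, a minimal sketch of how the fix slots into the script above. Moving the User-Agent header onto the session and wrapping the call in a small fetch() helper are my own tidy-ups, not part of the original change; keeping headers=hd at each call works just as well.

import requests

# one client reused for every request; connections are pooled and kept alive
request = requests.Session()
request.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"})

def fetch(url):
    # every former requests.get(url, headers=hd).text call becomes:
    return request.get(url).text

# ... crawl as before, then release the pooled connections when done
# request.close()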