It has been two months since I last wrote a blog post. With some free time today, I decided to act on an idea I had when I first started playing with Python crawlers: scrape a novel straight off a website, which saves quite a bit of data traffic (^_^).
Since this was only written in my spare time, please bear with me; if anything falls short, don't hold back ^_^.......
Alright, here it comes '(*>﹏<*)'
from bs4 import BeautifulSoup
import urllib.request
import re
import os, time

def getUrls(url):
    # Collect the chapter links from the novel's table-of-contents page
    urls = []
    # url = 'http://www.qu.la/book/1258/'
    req = urllib.request.Request(url)
    page = urllib.request.urlopen(req)
    html = page.read()
    soup = BeautifulSoup(html, 'html.parser')
    i = 0
    for k in soup.find_all(href=re.compile('.html')):
        # print('www.qu.la' + k['href'], k.get_text())
        if i != 0:  # skip the first matching link, which is not a chapter
            urls.append('http://www.qu.la' + k['href'])
        i = i + 1
    return urls

def getContent(url):
    # Fetch one chapter page and return its title and body text
    # url = 'http://www.qu.la/book/1258/759251.html'
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    html = opener.open(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('div', id='content')  # chapter body lives in <div id="content">
    title = soup.find('h1')                   # chapter title is the page's <h1>
    return title.get_text(), content.get_text()

if __name__ == '__main__':
    urls = getUrls('http://www.qu.la/book/1258/')
    # print(urls)
    fp = open("異界之魔武流氓.txt", "w")
    for url in urls:
        print(url)
        title, content = getContent(url)
        fp.write(title + "\n")
        # runs of spaces mark paragraph breaks in the scraped text
        fp.write(content.replace('    ', '\n') + "\n")
        time.sleep(2)  # pause between requests
    fp.close()
    print("Done")
The getUrls() function collects the links to the individual chapters from the novel's table-of-contents page (the chapter titles can also be picked up in this step (⊙o⊙)); getContent() takes a chapter link and pulls the chapter title and body text out of that page.
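As a side note on grabbing the titles together with the links: a minimal sketch of how that could look is below. It is not part of the original code; getUrlsWithTitles is a hypothetical name, and it assumes the same qu.la table-of-contents markup that getUrls() relies on.

from bs4 import BeautifulSoup
import urllib.request
import re

def getUrlsWithTitles(url):
    # Sketch only: return (chapter title, chapter url) pairs from the contents page
    html = urllib.request.urlopen(urllib.request.Request(url)).read()
    soup = BeautifulSoup(html, 'html.parser')
    pairs = []
    for k in soup.find_all(href=re.compile('.html')):
        # k.get_text() is the chapter title shown in the table of contents
        pairs.append((k.get_text(), 'http://www.qu.la' + k['href']))
    return pairs[1:]  # drop the first link, like the i != 0 check above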
The site probably has some anti-crawler protection: during testing, the code above failed to fetch all of the chapters ...... ^_^|||
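One simple way to cope with chapters that occasionally fail is to retry after a pause. This is just an illustrative sketch I'm adding here, not something from the original code; the retry count and delay are arbitrary.

import time

def getContentWithRetry(url, retries=3, delay=5):
    # Sketch: wrap getContent() and retry when a request is rejected
    for attempt in range(retries):
        try:
            return getContent(url)
        except Exception as e:  # e.g. an HTTPError from the anti-crawler block
            print('retry', attempt + 1, url, e)
            time.sleep(delay)
    raise RuntimeError('failed to fetch ' + url)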
After adding a list of user_agents so that requests appear to come from several different browsers, the problem went away. I also changed what gets written to the txt file, stripping the advertisement that comes attached to the chapter text. The modified code is below.
from bs4 import BeautifulSoup
import urllib.request
import re
import os, time, random

def getUrls(url):
    # Collect the chapter links from the novel's table-of-contents page
    urls = []
    # url = 'http://www.qu.la/book/1258/'
    req = urllib.request.Request(url)
    page = urllib.request.urlopen(req)
    html = page.read()
    soup = BeautifulSoup(html, 'html.parser')
    i = 0
    for k in soup.find_all(href=re.compile('.html')):
        # print('www.qu.la' + k['href'], k.get_text())
        if i != 0:  # skip the first matching link, which is not a chapter
            urls.append('http://www.qu.la' + k['href'])
        i = i + 1
    return urls

def getContent(url):
    # url = 'http://www.qu.la/book/1258/759251.html'
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ',
    ]
    agent = random.choice(user_agents)  # pick a random User-Agent for each request
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", agent), ("Accept", "*/*")]
    html = opener.open(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('div', id='content')
    title = soup.find('h1')
    return title.get_text(), content.get_text()

if __name__ == '__main__':
    urls = getUrls('http://www.qu.la/book/1258/')
    # print(urls)
    fp = open("異界之魔武流氓.txt", "w")
    for url in urls:
        print(url)
        title, content = getContent(url)
        fp.write(title + "\n")
        content = content.replace('    ', '\n')
        # drop the fixed-length advertisement appended to every chapter
        fp.write(content[0:-71] + "\n")
        # time.sleep(2)
    fp.close()
    print("Done")
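A quick note on content[0:-71]: it simply chops a fixed number of characters off the end of each chapter, which works only as long as the appended ad text never changes length. A keyword-based filter is a bit more robust. The sketch below is my own addition, and the keywords in it are guesses about what the ad lines might contain, not something verified against the site.

AD_KEYWORDS = ('qu.la', '天才一秒記住', '手機用戶請瀏覽')  # hypothetical ad markers

def stripAds(text):
    # Sketch: drop lines that look like ads instead of slicing a fixed length
    lines = text.replace('    ', '\n').split('\n')
    kept = [ln for ln in lines if not any(k in ln for k in AD_KEYWORDS)]
    return '\n'.join(kept)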