看到一個抓取新聞的爬蟲,原文地址:http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/
修改了下加入了一點改進,重要的是:能在python3下運行啦~
附上源碼:
# -*- coding:utf-8 -*-
"""Crawler for the NetEase (163.com) news ranking pages.

Downloads the ranking index, writes each channel's (title, url) list to a
tab-separated text file, and stores up to 50 article pages per channel as
HTML files.  All scraped pages are GB18030-encoded, so files are written
with that encoding too.  Runs under Python 3.
"""
import os
import re
import sys
import time
import urllib
from urllib import request

from lxml import etree

# Characters Windows forbids in file names; compiled once instead of on
# every loop iteration.
_BAD_FILENAME_CHARS = re.compile(r"[\/\\\:\*\?\"\<\>\|]")


def StringListSave(save_path, filename, slist):
    """Write (title, url) pairs to save_path/filename.txt, one per line.

    Creates save_path (including parents) if it does not exist.
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    path = save_path + "/" + filename + ".txt"
    with open(path, "w+", encoding='GB18030') as fp:
        for s in slist:
            fp.write("%s\t\t%s\n" % (s[0], s[1]))


def CellPage(save_path, filename, slist):
    """Store each news article of one channel as an HTML file.

    slist is an iterable of (title, url) pairs; at most 50 articles are
    saved per channel.  Titles are sanitised so they are valid Windows
    file names.
    """
    folder = save_path + '/' + filename
    print(folder)
    if not os.path.exists(folder):
        os.mkdir(folder)
    # enumerate replaces the original hand-maintained counter.
    for count, (item, url) in enumerate(slist):
        # Cap how many articles are stored per channel.
        if count >= 50:
            break
        # Strip characters that are illegal in Windows file names.
        newitem = _BAD_FILENAME_CHARS.sub("", item)
        print(item)
        with open(folder + '/' + newitem + '.html', "w+", encoding='GB18030') as fp:
            PageContent = request.urlopen(url).read().decode("GB18030")
            fp.write("%s\n" % PageContent)


def Page_Info(myPage):
    """Extract (channel title, channel url) pairs from the ranking index page."""
    mypage_Info = re.findall(
        r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more">'
        r'<a href="(.*?)">.*?</a></div></div>',
        myPage, re.S)
    return mypage_Info


def New_Page_Info(new_page):
    """Extract (title, url) pairs from one channel page via XPath.

    Returns a list rather than a lazy zip so callers can iterate the
    result more than once (the original returned a one-shot zip, which
    forced Spider to parse the same page twice).
    """
    dom = etree.HTML(new_page)
    new_items = dom.xpath('//tr/td/a/text()')
    new_urls = dom.xpath('//tr/td/a/@href')
    # Explicit check instead of assert: assert is stripped under -O.
    if len(new_items) != len(new_urls):
        raise ValueError("mismatched counts of article titles and links")
    return list(zip(new_items, new_urls))


def Spider(url):
    """Crawl the ranking index at *url*, saving channel lists and articles.

    Output goes to a date-stamped directory "news-YYYYMMDD" in the
    current working directory.
    """
    print("downloading ", url)
    myPage = request.urlopen(url).read().decode("GB18030")
    myPageResults = Page_Info(myPage)
    ntime = time.strftime("%Y%m%d", time.localtime(time.time()))
    save_path = "news-" + ntime
    # Index 0 is reserved for the ranking index itself.
    StringListSave(save_path, "0_" + u"Ranking", myPageResults)
    # Renamed loop variable: the original shadowed the `url` parameter.
    for i, (item, channel_url) in enumerate(myPageResults, start=1):
        print("downloading ", channel_url)
        new_page = request.urlopen(channel_url).read().decode("GB18030")
        # One parse now serves both uses, since New_Page_Info returns a
        # reusable list instead of a single-pass iterator.
        newPageResults = New_Page_Info(new_page)
        filename = str(i) + "_" + item
        StringListSave(save_path, filename, newPageResults)
        CellPage(save_path, filename, newPageResults)


if __name__ == '__main__':
    print("start")
    start_url = "http://news.163.com/rank/"
    Spider(start_url)
    print("end")
而後把它打包成exe文件,此次使用的是Pyinstaller
1. http://www.pyinstaller.org/ 在官網下載Pyinstaller
2. pyinstaller 依賴一些windows組件,須要在http://sourceforge.net/projects/pywin32/ 下載相應版本,如pywin32-220.win32-py3.5.exe。若是出現「Unable to find vcvarsall.bat 」,能夠直接下載編譯好的http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml
3. 命令窗進入pyinstaller目錄,運行
python pyinstaller.py --console --onefile hello.py
4. 文件就打包好了
5. 運行exe文件的時候有時候由於環境問題運行不起來,好比報如下錯誤:
只須要安裝下VC2015便可。VC下載地址:http://pan.baidu.com/s/1o66GMLk