協程提取上海人民法院網站信息 (coroutine-based scraping of the Shanghai People's Court website): python
# Coroutine-based scraper for the Shanghai People's Court hearing-announcement
# search page.  Five gevent greenlets each take a window of result pages,
# drive a PhantomJS browser through the site's JS pagination, and append the
# parsed table rows to one shared output file.

import gevent
import gevent.monkey

# Patch blocking stdlib calls (sockets, sleep, ...) BEFORE importing modules
# that use them, so greenlets can cooperatively switch during network waits.
gevent.monkey.patch_all()

import selenium
import selenium.webdriver
import urllib.request  # kept from original source; not referenced below
from bs4 import BeautifulSoup
import time  # kept from original source; not referenced below


def download(url, start, end, file):
    """Scrape result pages [start, end) and append each table row to *file*.

    Parameters:
        url   -- search page hosting the paginated ``#report`` table
        start -- first page number handed to the site's ``goPage()`` JS helper
        end   -- one past the last page number for this greenlet
        file  -- binary-mode file object shared by all greenlets

    NOTE(review): PhantomJS support was removed from Selenium 3.8+; migrating
    to headless Chrome/Firefox is advisable.  The executable path below is
    Windows-specific.
    """
    driver = selenium.webdriver.PhantomJS(
        executable_path=r"D:\python爬蟲視頻\爬蟲代碼\seleniumText\phantomjs-2.1.1-windows\bin\phantomjs.exe")
    try:
        driver.get(url)
        gevent.sleep(10)  # let the first page render; also yields to other greenlets
        # Page windows per greenlet, e.g. 0-235, 235-470, ...
        for page in range(start, end):
            js = "javascript:goPage('" + str(page) + "')"
            driver.execute_script(js)  # site paginates via goPage(); ~1175 pages total
            print("js is run", page)
            gevent.sleep(10)  # wait for the AJAX reload before parsing

            # Parse the rendered page and extract every data row of the table.
            soup = BeautifulSoup(driver.page_source, "lxml")
            table = soup.find("table", attrs={"id": "report"})
            # First <tr> is the header; find_next_siblings() yields the data rows.
            rows = table.find("tr").find_next_siblings()
            for row in rows:
                cells = row.find_all("td")
                # " # " after every cell so the line can be split back apart later.
                line = "".join(cell.text + " # " for cell in cells) + "\r\n"
                print(line)
                # Shared file: write the full line in one call; ignore
                # characters that cannot be encoded.
                file.write(line.encode("utf-8", errors="ignore"))
    except Exception as exc:
        # Best effort per greenlet: log instead of silently swallowing
        # (the original bare ``except: pass`` hid every failure).
        print("download(%d-%d) aborted: %r" % (start, end, exc))
    finally:
        driver.quit()  # always release the browser process


if __name__ == "__main__":
    url = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc"
    # One shared output file; the context manager guarantees it is closed
    # even if joinall raises.
    with open("save.text", "wb") as file:
        # Split the ~1175 result pages into five ranges and scrape concurrently.
        gevent.joinall([
            gevent.spawn(download, url, 0, 235, file),
            gevent.spawn(download, url, 235, 470, file),
            gevent.spawn(download, url, 470, 705, file),
            gevent.spawn(download, url, 705, 940, file),
            gevent.spawn(download, url, 940, 1175, file),
        ])