PHP一直沒有一個好用的多線程機制,雖然可以使用一些trick的手段來實現並行的效果(例如藉助apache或者nginx服務器等,或者fork一個子進程,或者直接動態生成多個PHP腳本多進程運行),但是不管從代碼結構上,還是從使用的複雜程度上,用起來都不是那麼順手。還聽說過一個pthreads的PHP擴展,這是一個真正可以實現PHP多線程的擴展,看github上它的介紹:Absolutely, this is not a hack, we don't use forking or any other such nonsense, what you create are honest to goodness posix threads that are completely compatible with PHP and safe ... this is true multi-threading :)
#!/usr/bin/env python
# coding=utf-8


class config:
    """Search settings for the crawler scripts.

    The other scripts load this with ``from search_config import config``
    and append a start number (as a string) to ``url``, which is why the
    URL deliberately ends with an empty ``s=`` parameter.
    """

    # Search term sent as the ``q`` query parameter.
    keyword = '青島'
    # Value of the ``search_type`` query parameter.
    search_type = 'shop'
    # Search URL prefix; callers append the value of the trailing ``s=``.
    url = ''.join((
        'http://s.taobao.com/search?q=',
        keyword,
        '&commend=all&search_type=',
        search_type,
        '&sourceId=tb.index&initiative_id=tbindexz_20131207&app=shopsearch&s=',
    ))
#!/usr/bin/env python
# coding=utf-8
import requests
from search_config import config


class Scrapy():
    """Sequential fetcher: requests one search-result page per instance."""

    def __init__(self, threadname, start_num):
        """Store the label and start number, then announce creation.

        threadname -- label prefixed to the progress messages.
        start_num  -- string appended to ``config.url`` to form the request.
        """
        self.threadname = threadname
        self.start_num = start_num
        print(threadname + 'start.....')

    def run(self):
        """Request ``config.url + start_num`` and report completion.

        The response is not used; this version only exercises the request.
        """
        requests.get(config.url + self.start_num)
        print(self.threadname + 'end......')


def main():
    """Run one fetcher after another for start numbers 0, 6 and 12."""
    for offset in (0, 6, 12):
        Scrapy('scrapy', str(offset)).run()


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# coding=utf-8
import requests
from search_config import config
import threading


class Scrapy(threading.Thread):
    """Fetcher thread: requests one search-result page per thread."""

    def __init__(self, threadname, start_num):
        """Initialise the thread, store its inputs and announce creation.

        threadname -- used both as the Thread name and as the message prefix.
        start_num  -- string appended to ``config.url`` to form the request.
        """
        threading.Thread.__init__(self, name=threadname)
        self.threadname = threadname
        self.start_num = start_num
        print(threadname + 'start.....')

    # Override Thread.run: this is the body executed after start().
    def run(self):
        """Request ``config.url + start_num`` and report completion.

        The response is not used; this version only exercises the request.
        """
        requests.get(config.url + self.start_num)
        print(self.threadname + 'end......')


def main():
    """Start one fetcher thread for each of the start numbers 0, 6 and 12."""
    for offset in (0, 6, 12):
        Scrapy('scrapy', str(offset)).start()


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# coding=utf-8
import requests
from BeautifulSoup import BeautifulSoup
from search_config import config
from Queue import Queue
import threading


def _extract_stores(body):
    """Return a list of store dicts found in the search-result HTML *body*.

    Each dict may carry 'rank', 'title' and 'href'. An empty list is
    returned when the page has no ``<ul id="list-container">`` element.
    """
    soup = BeautifulSoup(body)
    container = soup.find('ul', {'id': 'list-container'})
    if container is None:
        # find() yields None when nothing matches; treat it as "no stores"
        # instead of failing on the attribute access below.
        return []

    stores = []
    for item in container.findAll('li', {'class': 'list-item'}):
        store = {}
        try:
            for anchor in item.findAll('a', {'trace': 'shop'}):
                attrs = dict(anchor.attrs)
                if 'class' in attrs and 'rank' in attrs['class']:
                    # The rank is taken from the last two characters of the
                    # class string; a leading '-' means it is one character.
                    rank_num = attrs['class'][-2:]
                    if rank_num[0] == '-':
                        store['rank'] = rank_num[-1]
                    else:
                        store['rank'] = rank_num
                if 'title' in attrs:
                    store['title'] = attrs['title']
                    # An anchor without href must not abort the whole page.
                    if 'href' in attrs:
                        store['href'] = attrs['href']
        except AttributeError:
            # Best-effort guard kept from the original: keep whatever was
            # collected for this item and move on.
            pass
        if store:
            stores.append(store)
    return stores


class Scrapy(threading.Thread):
    """Fetcher thread: takes one start number from a queue and downloads it."""

    def __init__(self, threadname, queue, out_queue):
        """Initialise the thread and announce creation.

        threadname -- Thread name and prefix of the progress messages.
        queue      -- input queue of start-number strings.
        out_queue  -- output queue that receives the response (or None).
        """
        threading.Thread.__init__(self, name=threadname)
        self.sharedata = queue
        self.out_queue = out_queue
        self.threadname = threadname
        print(threadname + 'start.....')

    def run(self):
        """Fetch one page and hand the response to ``out_queue``.

        On a request failure ``None`` is queued instead, so that the parser
        waiting on ``out_queue`` is released rather than blocking forever.
        """
        url = config.url + self.sharedata.get()
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException as err:
            print(self.threadname + 'failed: %s' % err)
            response = None
        self.out_queue.put(response)
        print(self.threadname + 'end......')


class Parse(threading.Thread):
    """Parser thread: takes one response from a queue and prints its stores."""

    def __init__(self, threadname, queue, out_queue):
        """Initialise the thread and announce creation.

        threadname -- Thread name and prefix of the progress messages.
        queue      -- input queue of responses produced by Scrapy.
        out_queue  -- stored but not written to by this class.
        """
        threading.Thread.__init__(self, name=threadname)
        self.sharedata = queue
        self.out_queue = out_queue
        self.threadname = threadname
        print(threadname + 'start.....')

    def run(self):
        """Parse one response and print ``title rank`` for each store.

        A ``None`` response (failed fetch) is skipped. Stores lacking a
        title or a rank are not printed.
        """
        response = self.sharedata.get()
        if response is not None:
            # The page bytes are decoded as GBK and re-encoded as UTF-8.
            body = response.content.decode('gbk').encode('utf-8')
            for store in _extract_stores(body):
                if 'title' in store and 'rank' in store:
                    print(store['title'] + ' ' + store['rank'])
        print(self.threadname + 'end......')


def main():
    """Start one fetcher/parser thread pair per start number 0, 6 and 12."""
    # queue:   raw requests (start numbers)
    # targets: fetched content waiting to be parsed
    # stores:  parsed results; for simplicity the threads print directly,
    #          so this queue is not actually used
    queue = Queue()
    targets = Queue()
    stores = Queue()
    for i in range(0, 13, 6):
        queue.put(str(i))
        Scrapy('scrapy', queue, targets).start()
        Parse('parse', targets, stores).start()


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# coding=utf-8
from BeautifulSoup import BeautifulSoup
import redis


class Parse():
    """Parse one fetched search-result page and print the stores it lists."""

    def __init__(self, threadname, content):
        """Store the inputs and announce creation.

        threadname -- prefix of the progress messages.
        content    -- raw page bytes (GBK-encoded), or a falsy value.
        """
        self.threadname = threadname
        self.content = content
        print(threadname + 'start.....')

    def run(self):
        """Print ``title rank`` for every store found in ``self.content``.

        Does nothing when the content is empty. A page without the
        ``<ul id="list-container">`` element is treated as having no stores,
        so one unexpected page no longer aborts the caller's loop.
        """
        response = self.content
        if not response:
            return

        # The page bytes are decoded as GBK and re-encoded as UTF-8.
        body = response.decode('gbk').encode('utf-8')
        soup = BeautifulSoup(body)
        container = soup.find('ul', {'id': 'list-container'})
        if container is None:
            # find() yields None when nothing matches.
            items = []
        else:
            items = container.findAll('li', {'class': 'list-item'})

        stores = []
        for item in items:
            store = {}
            try:
                for anchor in item.findAll('a', {'trace': 'shop'}):
                    attrs = dict(anchor.attrs)
                    if 'class' in attrs and 'rank' in attrs['class']:
                        # The rank is taken from the last two characters of
                        # the class string; a leading '-' means one character.
                        rank_num = attrs['class'][-2:]
                        if rank_num[0] == '-':
                            store['rank'] = rank_num[-1]
                        else:
                            store['rank'] = rank_num
                    if 'title' in attrs:
                        store['title'] = attrs['title']
                        # An anchor without href must not abort the page.
                        if 'href' in attrs:
                            store['href'] = attrs['href']
            except AttributeError:
                # Best-effort guard kept from the original: keep whatever
                # was collected for this item and move on.
                pass
            if store:
                stores.append(store)

        for store in stores:
            # Stores lacking a title or a rank are silently skipped.
            if 'title' in store and 'rank' in store:
                print(store['title'] + ' ' + store['rank'])
        print(self.threadname + 'end......')


def main():
    """Pop pages from the redis list 'targets' until it is empty, parsing each."""
    r = redis.StrictRedis(host='localhost', port=6379)
    while True:
        content = r.lpop('targets')
        if not content:
            # lpop returned nothing (or an empty value): stop draining.
            break
        Parse('parse', content).run()


if __name__ == '__main__':
    main()
GIL既然是針對一個python解釋器進程而言的,那麼,如果解釋器可以多進程解釋執行,那就不存在GIL的問題了。同樣,它也不會導致你的多個解釋器跑在同一個核上。 所以,最好的解決方案,是多線程+多進程結合。通過多線程來跑I/O密集型程序,通過控制合適數量的進程來跑CPU密集型的操作,這樣就可以跑滿CPU了:)