Same amount of work as yesterday in half the time, but it still felt a bit inefficient: combining the two pages (the listing page and the detail pages) meant spending a lot of time on list operations.
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}


def get_html(url):
    # fetch a page and let requests guess the encoding from the content
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    return html


def parse_html(html):
    # parse one listing page: collect the detail-page urls and the per-job fields
    informations = []
    urls = []
    html_element = etree.HTML(html)
    kinds = html_element.xpath('(//tr[@class="even"]|//tr[@class="odd"])/td[2]/text()')
    '''
    kinds:
    ['技术类', '设计类', '技术类', '技术类', '技术类', '技术类', '技术类', '技术类', '技术类', '产品/项目类']
    '''
    nums = html_element.xpath('(//tr[@class="even"]|//tr[@class="odd"])//td[3]/text()')
    '''
    nums:
    ['2', '1', '2', '1', '2', '2', '1', '2', '1', '1']
    '''
    addresses = html_element.xpath('(//tr[@class="even"]|//tr[@class="odd"])//td[4]/text()')
    '''
    addresses:
    ['深圳', '深圳', '深圳', '深圳', '深圳', '深圳', '深圳', '深圳', '深圳', '深圳']
    '''
    times = html_element.xpath('(//tr[@class="even"]|//tr[@class="odd"])//td[5]/text()')
    '''
    times:
    ['2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04', '2018-08-04']
    '''
    names = html_element.xpath('(//tr[@class="even"]|//tr[@class="odd"])//a/text()')

    detail_url = html_element.xpath('(//tr[@class="even"]|//tr[@class="odd"])//a/@href')
    for str_url in detail_url:
        # the href is relative, so prepend the site root
        url = 'https://hr.tencent.com/' + str(str_url)
        urls.append(url)
    '''
    urls:
    ['https://hr.tencent.com/position_detail.php?id=42917&keywords=python&tid=0&lid=0',
     'https://hr.tencent.com/position_detail.php?id=42908&keywords=python&tid=0&lid=0',
     ......
     'https://hr.tencent.com/position_detail.php?id=42832&keywords=python&tid=0&lid=0',
     'https://hr.tencent.com/position_detail.php?id=42628&keywords=python&tid=0&lid=0']
    '''
    for index, name in enumerate(names):
        information = {}
        information['name'] = name
        information['url'] = urls[index]
        information['kind'] = kinds[index]
        information['nums_of_need'] = nums[index]
        information['address'] = addresses[index]
        informations.append(information)
    # print(informations)
    # print(urls)
    return urls, informations


def parse_detail_page(url):
    # one detail page
    html = get_html(url)
    return html


def get_all_page(page_nums):
    # walk the listing pages (10 jobs per page) and scrape every detail page
    works = []
    informations = []
    for page in range(0, page_nums):
        url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start={0}#a'.format(page * 10)
        html = get_html(url)
        page_urls, page_informations = parse_html(html)
        informations.extend(page_informations)
        # print(informations)
        for detail_url in page_urls:
            html_detail = parse_detail_page(detail_url)
            html_element = etree.HTML(html_detail)
            work_intro = html_element.xpath('//td[@class="l2"]//text()')
            for index, text in enumerate(work_intro):
                if text.startswith('工作职责:'):
                    # collect every line until the "工作要求:" heading
                    text = text.replace('工作职责:', '')
                    works_detail = {}
                    intros = []
                    for x in range(index + 1, len(work_intro)):
                        intro = work_intro[x].strip()
                        if work_intro[x].startswith('工作要求:'):
                            break
                        intros.append(intro)
                    while '' in intros:
                        intros.remove('')
                    works_detail['1_____工作职责:'] = intros
                    works.append(works_detail)
                    # print(intros)
                    '''
                    ['负责NLP与深度学习相关技术的研究与实现;',
                     '负责建设基础的语义分析工具和平台;',
                     '负责搜索系统、知识图谱系统、问答与对话系统的设计与搭建;',
                     '结合实际业务需求与数据,研发高效、稳健、完备的NLP解决方案。']
                    '''

                if text.startswith('工作要求:'):
                    # collect every line until the "申请岗位" button text
                    text = text.replace('工作要求:', '')
                    works_detail = {}
                    requirements = []
                    for x in range(index + 1, len(work_intro)):
                        intro = work_intro[x].strip()
                        if work_intro[x].startswith('申请岗位'):
                            break
                        requirements.append(intro)
                    while '' in requirements:
                        requirements.remove('')
                    works_detail['2_____工作要求:'] = requirements
                    works.append(works_detail)
                    # print(requirements)
                    '''
                    ['三年以上自然语言处理经验包括语义表示、搜索、知识图谱、对话系统等;',
                     '扎实的编程基础,至少精通一种编程语言,如C++,Java,python等;',
                     '熟悉深度学习以及常见机器学习算法的原理与算法,能熟练运用聚类、分类、回归、排序等模型解决有挑战性的问题;',
                     '对自然语言处理相关的分词、词性标注、实体识别、句法分析、语义分析等有深入的实践经验;',
                     '有强烈求知欲,对人工智能领域相关技术有热情;',
                     '具有良好的数学基础,良好的英语阅读能力;',
                     '有项目管理经验,与他人合作良好,能够独立有效推进复杂项目。']
                    '''
    return works, informations


def main():
    works, informations = get_all_page(1)
    for index, information in enumerate(informations):
        # each job produced two dicts in works: its 工作职责 and its 工作要求
        duty = []
        duty.append(works[index * 2])
        duty.append(works[index * 2 + 1])
        information['duty'] = duty
        print(information)


if __name__ == '__main__':
    main()
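The key trick on the listing page is the XPath union (//tr[@class="even"]|//tr[@class="odd"]), which covers both zebra-stripe row classes with one expression. Below is a minimal, self-contained check of what such a selector returns, run against a made-up table fragment shaped roughly like the listing page; the markup and values are illustrative, not the real page.

from lxml import etree

# A made-up fragment shaped like the listing table (columns: name, kind,
# headcount, city, date). Only for illustrating the selector, not the real markup.
sample = '''
<table>
  <tr class="even"><td><a href="position_detail.php?id=1">Python后台开发</a></td>
      <td>技术类</td><td>2</td><td>深圳</td><td>2018-08-04</td></tr>
  <tr class="odd"><td><a href="position_detail.php?id=2">Python运维开发</a></td>
      <td>技术类</td><td>1</td><td>深圳</td><td>2018-08-04</td></tr>
</table>
'''

root = etree.HTML(sample)
# the union picks up rows of either class, so one expression covers every job row
print(root.xpath('(//tr[@class="even"]|//tr[@class="odd"])//a/text()'))
# ['Python后台开发', 'Python运维开发']
print(root.xpath('(//tr[@class="even"]|//tr[@class="odd"])/td[2]/text()'))
# ['技术类', '技术类']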
Sublime still can't take Chinese input at the moment, so I pasted the outputs into the code as comments, which makes the structure easier to read.
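For inspecting that nested structure in the console, the standard-library pprint module prints dicts and lists with indentation; this is a small optional aid, and the record below is a hypothetical one shaped like the script's output, not real scraped data.

from pprint import pprint

# hypothetical record shaped like one entry of the script's output, for illustration only
record = {'name': 'SNG11-高级研究员(深圳)', 'address': '深圳', 'kind': '技术类',
          'duty': [{'1_____工作职责:': ['负责NLP与深度学习相关技术的研究与实现;']},
                   {'2_____工作要求:': ['三年以上自然语言处理经验;']}]}
pprint(record, width=80)   # each key and list item lands on its own line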
Running result:
The part circled in red is one dict, containing the info from the listing page (job title, url, location) plus the detail-page sections (工作职责 / job duties and 工作要求 / job requirements). The nesting may be a bit convoluted, but I haven't thought of a more concise way yet.
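One possible way to flatten the nesting, sketched under the assumption that the detail-page text still uses the 工作职责:/工作要求:/申请岗位 headings: parse each detail page into a single dict and attach it to the matching job record inside the loop, so main() no longer needs the works[index*2] / works[index*2+1] pairing. This reuses get_html and parse_html from above; parse_detail and crawl are names introduced here, and the sketch is untested against the live site.

def parse_detail(url):
    # pull all text from the description cell and sort each line into its section
    texts = [t.strip() for t in etree.HTML(get_html(url)).xpath('//td[@class="l2"]//text()') if t.strip()]
    detail = {'工作职责': [], '工作要求': []}
    current = None
    for line in texts:
        if line.startswith('工作职责:'):
            current = '工作职责'
        elif line.startswith('工作要求:'):
            current = '工作要求'
        elif line.startswith('申请岗位'):
            break
        elif current:
            detail[current].append(line)
    return detail


def crawl(page_nums):
    jobs = []
    for page in range(page_nums):
        list_url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start={0}#a'.format(page * 10)
        urls, informations = parse_html(get_html(list_url))
        for url, information in zip(urls, informations):
            information['detail'] = parse_detail(url)   # one dict per job, no index arithmetic
            jobs.append(information)
    return jobs

With this shape each record is a single self-contained dict, and the pairing between listing rows and detail urls is made explicit by zip instead of index math.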