爬取智聯招聘崗位並根據描述生成詞雲

轉自:https://blog.csdn.net/qq_36381299/article/details/80634451

前言:正則表達式

根據搜索相關的職位,獲取職位數量,由職位數量獲得職位相關頁碼鏈接,再由相關頁碼鏈接得到每一個職位鏈接,最後由職位鏈接獲取詳細的職位描述。以上鏈接和職位描述的提取均由正則表達式完成。

環境:win7、pycharm、python2

所用到的庫:urllib2、re、urllib、time、jieba、matplotlib、wordcloud、numpy、PIL

文件組成:

   main.py ----主要函數文件,包括獲取頁碼鏈接、獲取每頁職位鏈接、獲取職位描述,並將爬取到的職位描述保存爲txt文本

   zhaopin_wordcloud.py ----根據保存的文本信息生成詞雲

   msyh.ttf ----爲生成詞雲準備的字體文件

   info.txt ----保存職位描述的txt文本

代碼如下:

main.py

# coding: utf-8
"""Crawl Zhaopin job postings for a keyword and append the descriptions to info.txt.

Flow: search-result count -> result-page URLs -> job-detail URLs -> job
description text.  All HTML extraction is done with regular expressions.

The script was written for Python 2; the syntax and the import shim below keep
it loadable on Python 3 as well.
"""
import re
import time
from contextlib import closing

try:  # Python 2 module names, as used by the original script
    import urllib2
    import urllib
except ImportError:  # Python 3: bind the same names to their successors
    import urllib.request as urllib2
    import urllib.parse as urllib

# Endpoint used once to read the total number of matching jobs.
SEARCH_URL = "https://sou.zhaopin.com/jobs/searchresult.ashx?"
# Prefix of every result-page URL; the jl value is a fixed percent-encoded
# location filter.
PAGE_URL_PREFIX = (
    "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B5%8E%E5%8D%97&"
)
# Browser-like header sent with every request.
HEADERS = {
    "User-Agent":
        "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
}
# Number of postings the site lists on one result page.
JOBS_PER_PAGE = 60

_COUNT_RE = re.compile(r"<em>(\d+)</em>", re.IGNORECASE)
_JOB_LINK_RE = re.compile(
    r'<a style="font-weight: bold" '
    r'par="ssidkey=y&ss=201&ff=03&sg=.*?;so=.*?" '
    r'href="(\bhttp[\s\S]..\bjobs.\w+.\w+.\w+.\w+)',
    re.IGNORECASE,
)
_DESCRIPTION_RE = re.compile(
    r'<div class="tab-inner-cont">([\s\S]*?)<b>', re.IGNORECASE
)


def _fetch_html(url):
    """Download *url* with the shared headers and return the body as text."""
    request = urllib2.Request(url, headers=HEADERS)
    request.add_header("Connection", "keep-alive")
    # closing() releases the connection even if read() raises; the Python 2
    # response object is not a context manager on its own.
    with closing(urllib2.urlopen(request)) as response:
        # Decode once at the I/O boundary so every regex works on text.
        return response.read().decode("utf-8", "replace")


def build_page_urls(name, job_count, per_page=JOBS_PER_PAGE):
    """Return the result-page URLs needed to list *job_count* postings.

    Args:
        name: search keyword; it is URL-encoded before being embedded.
        job_count: total number of postings reported by the site.
        per_page: postings shown on one result page.

    Returns:
        A list of URLs for pages 1..ceil(job_count / per_page); empty when
        job_count is zero or negative.
    """
    page_count = -(-job_count // per_page)  # ceiling division
    keyword = urllib.urlencode({"kw": name})
    return [
        PAGE_URL_PREFIX + keyword + "&p=" + str(page)
        for page in range(1, page_count + 1)
    ]


def getpagelist(name):
    """Return the result-page URLs for the search keyword *name*.

    The first <em>N</em> element of the search page is taken as the number
    of matching jobs.  An empty list is returned when no such element is
    found, instead of failing with IndexError.
    """
    # NOTE(review): this count request carries no jl (location) filter while
    # the generated page URLs do, so the count may overstate the number of
    # pages that actually exist for that location -- confirm intent.
    html = _fetch_html(SEARCH_URL + urllib.urlencode({"kw": name}))
    counts = _COUNT_RE.findall(html)
    if not counts:
        return []
    # int() rather than eval(): the value is scraped from a remote page.
    job_count = int(counts[0])
    print("++++++++++++++++")
    print(job_count)
    return build_page_urls(name, job_count)


def get_url_list(url):
    """Return every job-detail link found on the result page at *url*."""
    return _JOB_LINK_RE.findall(_fetch_html(url))


def parse_description(html):
    """Extract the job description from the HTML text of a job-detail page.

    Returns the text between the "tab-inner-cont" div and the next <b> tag,
    stripped of surrounding whitespace and with <p>/</p> tags removed, or an
    empty string when the page does not contain that div.
    """
    match = _DESCRIPTION_RE.search(html)
    if match is None:
        return u""
    return match.group(1).strip().replace("<p>", "").replace("</p>", "")


def get_zhiwei(url):
    """Download the job-detail page at *url* and return its description text."""
    return parse_description(_fetch_html(url))


def wirtetxt(info):
    """Append the text *info* to info.txt, encoded as UTF-8."""
    with open("info.txt", "ab") as out:
        out.write(info.encode("utf-8"))


def main():
    """Crawl every posting for the keyword "python" and save the descriptions."""
    for page_url in getpagelist("python"):
        for job_url in get_url_list(page_url):
            time.sleep(1)  # pause between job-page requests
            description = get_zhiwei(job_url)
            if not description:
                continue  # page layout did not match; nothing to save
            print(description)
            print("正在寫入....")
            wirtetxt(description)


if __name__ == "__main__":
    main()

zhaopin_wordcloud.py

# coding: utf-8
"""Render a word cloud from the job descriptions saved in info.txt."""
import io

import jieba
import matplotlib
import matplotlib.pyplot as plt
import numpy as np  # converts the mask image into an array
import wordcloud
from PIL import Image  # image loading
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Substrings removed from the text before segmentation: leftover HTML/CSS
# tokens and very common job-posting words.
#
# The original order, including repeated entries, is kept on purpose: the
# removals run one after another, so an earlier removal changes what later
# entries can match ("Calii" is presumably what is left of "Calibri" once
# "br" has been stripped).
#
# NOTE(review): the Chinese entries are in Traditional script; if info.txt
# holds Simplified text they will not match -- confirm against real data.
NOISE_WORDS = (
    "family", "span", "font", "color", "14px", "rgb",
    "size", "br", "0px", u"宋體", "margin", "line",
    "style", "line", "height", "white",
    u"熟悉", "nbsp", "background", "normal", "margin", u"平臺",
    "space", "padding", "bottom", "top",
    # u"工作" replaces the garbled, non-existent word "工做" of the original.
    u"技術", u"工作", "text", "indent", "letter", "stretch",
    "25px", u"應用", "simsun", "strong",
    u"系統", "Yahei", "indent", "left", "data", u"熟練",
    "Calii", "Microsoft", "Sans", "div", "serif", "19px",
    u"設計", u"公司", u"開發", u"瞭解",
    u"熟悉", u"進行", u"仿宋", u"負責", "border", u"專業",
    "space", "padding", u"優先", "top",
    u"技術", u"工作", u"研發", u"要求", u"任職", u"相關",
    u"崗位職責", u"計算", u"上學", u"學歷",
)


def strip_noise_words(text):
    """Return *text* with every NOISE_WORDS entry removed, in listed order."""
    for word in NOISE_WORDS:
        text = text.replace(word, u"")
    return text


def build_wordcloud(text, mask_path="1.jpg", font_path="msyh.ttf",
                    use_image_colors=False):
    """Build a WordCloud from space-separated *text*.

    Args:
        text: words separated by spaces.
        mask_path: image file used as the mask for the cloud.
        font_path: font file used to draw the words.
        use_image_colors: when true, recolor the words with an
            ImageColorGenerator built from the mask image.  The original
            script created that generator but never applied it, so the
            default keeps the library's own colors.

    Returns:
        The generated WordCloud object.
    """
    with Image.open(mask_path) as image:
        mask = np.array(image)
    cloud = WordCloud(
        background_color="white",
        mask=mask,
        stopwords=STOPWORDS,  # the library's built-in stop-word set
        font_path=font_path,
        max_font_size=100,
        random_state=30,  # fixed seed for the layout/color randomness
        scale=1,
    ).generate(text)
    if use_image_colors:
        cloud.recolor(color_func=ImageColorGenerator(mask))
    return cloud


def main():
    """Read info.txt, clean and segment it, then display the word cloud."""
    # info.txt is written as UTF-8 by main.py; decode it explicitly instead
    # of relying on the platform default encoding.
    with io.open("info.txt", encoding="utf-8") as source:
        raw_text = source.read()
    # cut_all=True is jieba's full mode.
    tokens = jieba.cut(strip_noise_words(raw_text), cut_all=True)
    cloud = build_wordcloud(u" ".join(tokens))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    main()

main.py 運行效果:

zhaopin_wordcloud.py  運行生成詞雲圖片:

相關文章
相關標籤/搜索