python爬蟲獲取112報修單數據

時間 2019-11-12
原文鏈接
python+beautifulsoup

爬取112報修單信息，寫入文檔保存。
  1 #!/usr/bin/python
  2 # -*- coding: utf-8 -*-
  3 import requests
  4 import sys
  5 import http.cookiejar as cookielib
  6 from bs4 import BeautifulSoup
  7 import re
  8 import time 
  9 import random
 10 
 11 s = requests.session()
 12 s.cookies = cookielib.LWPCookieJar(filename = "LoginCookies-112.txt")
 13 
 14 def login_post():
 15     userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
 16     header = {'Origin':'http://112.efoxconn.com','Referer':'http://112.efoxconn.com/Home/Index','User-Agent':userAgent}
 17     
 18     login_url = 'http://112.efoxconn.com/Login/Index'
 19     login_data = {'username':'H2605177','password':'123'}
 20 
 21     r = s.post(login_url,data=login_data,headers = header,allow_redirects = False)
 22     time.sleep(random.random()*2)
 23     login_cookies = s.cookies
 24     cok = requests.utils.dict_from_cookiejar(login_cookies)
 25     #print(cok)
 26     #print(login_cookies)
 27     #print(r.text)
 28     print("登錄狀態:",r.status_code)
 29     s.cookies.save()
 30 
 31 def get_bynum_1(url,num): #獲取承辦人信息、申請人信息、滿意度信息。
 32     userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
 33     header = {'Host':'112.efoxconn.com','Referer':'http://112.efoxconn.com/UserCenter/ByMe','User-Agent':userAgent}
 34     bynum_url= "http://112.efoxconn.com"+url+"/"+num
 35     r1= s.get(bynum_url)
 36     time.sleep(random.random()*2)
 37     soup1 = BeautifulSoup(r1.text, features='html.parser')
 38     #re_11 = soup1.td.get_text()
 39     re_12 = soup1.find_all("td",class_=re.compile("value"),limit=16)
 40     list_1 = []
 41     for i in range (len(re_12)):
 42         re_13 = re_12[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 43         if i == 4:
 44             list_1.append(re_13)
 45         elif i == 5:
 46             list_1.append(re_13)
 47         elif i == 6:
 48             list_1.append(re_13)
 49         elif i == 7:
 50             list_1.append(re_13)
 51         elif i == 11:
 52             list_1.append(re_13)
 53         elif i == 12:
 54             list_1.append(re_13)
 55         elif i == 13:
 56             list_1.append(re_13)
 57         #elif i == 15:
 58             #list_1.append(re_13)
 59 
 60     re_12 = soup1.find_all("td",attrs={"class":"r-value","colspan":"7"})#同時使用多個屬性值過濾出滿意度評價。
 61     if re_12:
 62         re_13 = re_12[0].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","") 
 63         list_1.append(re_13)
 64     else:
 65         list_1.append("未評價")
 66     #re_12 = re.findall(r'<td.*?>(.*?)</td>',re_11,re.S|re.M)#使用re.findall模塊和正則表達式，匹配過濾出目標字符串中的<td>標籤內容
 67     #re_12 = re.findall(r'(?<=<td>).+?(?=</td>)',re_11,re.S|re.M)
 68     return list_1
 69 
 70 def get_bynum_2(url,num): #獲取申請內容信息
 71     userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
 72     header = {'Host':'112.efoxconn.com','Referer':'http://112.efoxconn.com/UserCenter/ByMe','User-Agent':userAgent}
 73     bynum_url= "http://112.efoxconn.com"+url+"/"+num
 74     r1= s.get(bynum_url)
 75     time.sleep(random.random()*2)
 76     soup1 = BeautifulSoup(r1.text, features='html.parser')
 77     #re_11 = soup1.td.get_text()
 78     list_2 = []
 79     re_12 = soup1.find_all("td",class_=re.compile("nettype-frame-title")) #獲取通訊單-申請信息標題
 80     for i in range (len(re_12)):
 81         re_13 = re_12[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 82         if i == 0:
 83             list_2.append(re_13)
 84 
 85     re_12 = soup1.find_all("div",class_=re.compile("model-header")) #獲取特殊單-申請信息標題
 86     for i in range (len(re_12)):
 87         re_13 = re_12[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 88         if i == 0:
 89             list_2.append(re_13)
 90 
 91     re_122 = soup1.find_all("td",class_=re.compile("color-blue r-value")) #獲取通訊單數量（新增分機、遷移分機）
 92     for i in range (len(re_122)):
 93         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 94         if i == 0:
 95             list_2.append(re_13)
 96 
 97     re_122 = soup1.find_all("span",style=re.compile("color: blue; font-weight: bold;")) #獲取通訊單報修數量（網點不通、電話不通）
 98     for i in range (len(re_122)):
 99         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
100         if i == 0:
101             list_2.append(re_13)
102 
103     re_122 = soup1.find_all("div",style=re.compile("padding-left: 3px;")) #獲取通訊單功能設定類標題信息
104     for i in range (len(re_122)):
105         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
106         if i == 0:
107             list_2.append(re_13)
108 
109     re_122 = soup1.find_all("div",class_=re.compile("tb-memo")) #獲取通訊單《其餘》標題下的需求內容描述,特殊單《其餘》標題下需求描述
110     for i in range (len(re_122)):
111         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
112         if i == 0:
113             list_2.append(re_13)
114 
115     re_122 = soup1.find_all("td",class_=re.compile("r-value tb-memo"),limit=2) #獲取特殊單需求描述
116     for i in range (len(re_122)):
117         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
118         if i == 1:
119             list_2.append(re_13)
120 
121     re_122 = soup1.find_all("td",rowspan="1",limit=2) #獲領料單信息 
122     for i in range (len(re_122)):
123         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
124         if i == 0:
125             list_2.append(re_13)
126         elif i ==1:
127             list_2.append(re_13)
128 
129     return list_2
130 
def get_bynum_3(url, num):
    """Fetch the sign-off progress (signers and timestamps) of one ticket.

    Requests ``http://112.efoxconn.com<url>/<num>`` through the shared
    session ``s`` and reads every ``tr`` element whose class matches
    ``pro-item``.

    Args:
        url: Path part of the ticket page (concatenated after the host).
        num: Ticket number, appended as the last path segment.

    Returns:
        A list with one entry per matching row; each entry is the list of
        that row's ``td`` texts with CR, LF, NUL and space characters
        removed. Empty when the page has no matching rows.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    )
    header = {
        'Host': '112.efoxconn.com',
        'Referer': 'http://112.efoxconn.com/UserCenter/ByMe',
        'User-Agent': user_agent,
    }
    bynum_url = "http://112.efoxconn.com" + url + "/" + num
    # Send the prepared headers with the request, as the POST requests in
    # this script do.
    r1 = s.get(bynum_url, headers=header)
    # Random pause of up to two seconds after the request.
    time.sleep(random.random() * 2)
    soup1 = BeautifulSoup(r1.text, features='html.parser')

    # One-pass deletion of CR, LF, NUL and space characters.
    strip_table = str.maketrans("", "", "\r\n\000 ")

    # Search each row directly instead of serialising it and parsing it
    # again into a second soup.
    progress_rows = soup1.find_all("tr", class_=re.compile("pro-item"))
    return [
        [cell.get_text().translate(strip_table) for cell in row.find_all("td")]
        for row in progress_rows
    ]
157 
def get_byme():
    """Export the tickets listed on the "ByMe" page to ``my_check_list.txt``.

    Posts a query for the first page of up to 100 tickets through the
    shared session ``s``, splits the bracketed part of the response into
    one chunk per ticket, and for every ticket fetches its details with
    ``get_bynum_1``, ``get_bynum_2`` and ``get_bynum_3``. One line per
    ticket is appended to ``my_check_list.txt``: the ticket URL path, the
    ticket number, four cells from the sign-off rows, then the applicant
    and request-content fields, each followed by `` ~`` and separated by a
    single space. Field values are written verbatim.

    Raises:
        IndexError: If a ticket chunk has fewer fields than the fixed
            positions read below, or the sign-off table has fewer than two
            rows.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    )
    header = {
        'Host': '112.efoxconn.com',
        'Referer': 'http://112.efoxconn.com/UserCenter/ByMe',
        'User-Agent': user_agent,
    }

    byme_url = 'http://112.efoxconn.com/UserCenter/ByMe'
    post_data = {'sonbr': '', 'formtype': '', 'startdate': '', 'enddate': '', 'page': '1', 'pagesize': '100'}

    r = s.post(byme_url, data=post_data, headers=header, allow_redirects=False)
    # Random pause of up to three seconds after the request.
    time.sleep(random.random() * 3)
    print("獲取查詢結果:", r.status_code)
    print('*************************************************************')

    # Pull out the bracketed part of the response with a regex, then split
    # it into one chunk per ticket.
    re_1 = str(re.findall(r"\[(.+)\]", r.text))
    re_2 = re.split(r"\{|\},|\[\'|\'\]", re_1)
    # Drop the empty strings left over by the split.
    re_2 = [item for item in re_2 if item != '']

    sign_marker = "/Sign"
    for i, record in enumerate(re_2):
        print("報修單序號:" + str(i))
        # Break the chunk into its individual fields.
        re_3 = re.split(r"[,\"]", str(record))

        if re_3[1] == "SO_NBR":  # Check which field ordering this chunk uses.
            # The URL is the last field that contains "/Sign".
            sign_fields = [field for field in re_3 if sign_marker in field]
            if not sign_fields:
                # Without a URL the ticket cannot be fetched; skip it rather
                # than reuse the previous ticket's URL.
                print("未找到包含/Sign的鏈接，跳過此單")
                continue
            a = sign_fields[-1]
            b = re_3[3]
            print(a, b, re_3[13])
        else:
            a = re_3[13]
            b = re_3[18]
            print(a, b, re_3[28])

        list_1 = get_bynum_1(a, b)  # Applicant info for this ticket.
        list_2 = get_bynum_2(a, b)  # Request content for this ticket.
        list_3 = get_bynum_3(a, b)  # Sign-off records for this ticket.
        # Four cells picked from the sign-off rows by fixed position; this
        # needs at least two rows.
        list_4 = [list_3[-1][-3], list_3[1][-1], list_3[-2][2], list_3[-1][-1]]

        fields = [a, b] + list_4 + list_1 + list_2
        # Join the raw strings directly so that punctuation inside the
        # scraped values is kept and nothing is repr-escaped.
        line = " ".join(field + " ~" for field in fields)
        with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            # One ticket per line.
            f.write(line + '\n')

    print('#############################################################')
237 
238 
# Log in first so the shared session carries the login cookies, then export
# the ticket list.
login_post()
get_byme()
# Debug example:
# print(get_bynum_2("/Network/Sign", "21118061100034"))