Scrape the 112 repair-ticket information and save it to a file.

The script below logs in to http://112.efoxconn.com, queries the "ByMe" ticket list, fetches each ticket's detail page (applicant info, application content, and approval records), and appends one "~"-separated record per ticket to my_check_list.txt.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import sys
import http.cookiejar as cookielib
from bs4 import BeautifulSoup
import re
import time
import random

s = requests.session()
s.cookies = cookielib.LWPCookieJar(filename="LoginCookies-112.txt")

def login_post():
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {'Origin': 'http://112.efoxconn.com', 'Referer': 'http://112.efoxconn.com/Home/Index', 'User-Agent': userAgent}

    login_url = 'http://112.efoxconn.com/Login/Index'
    login_data = {'username': 'H2605177', 'password': '123'}

    r = s.post(login_url, data=login_data, headers=header, allow_redirects=False)
    time.sleep(random.random() * 2)
    login_cookies = s.cookies
    cok = requests.utils.dict_from_cookiejar(login_cookies)
    #print(cok)
    #print(login_cookies)
    #print(r.text)
    print("Login status:", r.status_code)
    s.cookies.save()

def get_bynum_1(url, num):  # Get handler info, applicant info and the satisfaction rating of one ticket.
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {'Host': '112.efoxconn.com', 'Referer': 'http://112.efoxconn.com/UserCenter/ByMe', 'User-Agent': userAgent}
    bynum_url = "http://112.efoxconn.com" + url + "/" + num
    r1 = s.get(bynum_url, headers=header)
    time.sleep(random.random() * 2)
    soup1 = BeautifulSoup(r1.text, features='html.parser')
    #re_11 = soup1.td.get_text()
    re_12 = soup1.find_all("td", class_=re.compile("value"), limit=16)
    list_1 = []
    for i in range(len(re_12)):
        re_13 = re_12[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i in (4, 5, 6, 7, 11, 12, 13):  # only these cells hold the fields we need (index 15 is left out)
            list_1.append(re_13)

    # Filter on two attributes at once to pick out the satisfaction comment.
    re_12 = soup1.find_all("td", attrs={"class": "r-value", "colspan": "7"})
    if re_12:
        re_13 = re_12[0].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        list_1.append(re_13)
    else:
        list_1.append("Not rated")
    # Alternative: extract the <td> contents with re.findall and a regular expression.
    #re_12 = re.findall(r'<td.*?>(.*?)</td>', re_11, re.S | re.M)
    #re_12 = re.findall(r'(?<=<td>).+?(?=</td>)', re_11, re.S | re.M)
    return list_1

def get_bynum_2(url, num):  # Get the application-content details of one ticket.
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {'Host': '112.efoxconn.com', 'Referer': 'http://112.efoxconn.com/UserCenter/ByMe', 'User-Agent': userAgent}
    bynum_url = "http://112.efoxconn.com" + url + "/" + num
    r1 = s.get(bynum_url, headers=header)
    time.sleep(random.random() * 2)
    soup1 = BeautifulSoup(r1.text, features='html.parser')
    #re_11 = soup1.td.get_text()
    list_2 = []

    re_12 = soup1.find_all("td", class_=re.compile("nettype-frame-title"))  # application-section title on a telecom ticket
    for i in range(len(re_12)):
        re_13 = re_12[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)

    re_12 = soup1.find_all("div", class_=re.compile("model-header"))  # application-section title on a special ticket
    for i in range(len(re_12)):
        re_13 = re_12[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)

    re_122 = soup1.find_all("td", class_=re.compile("color-blue r-value"))  # quantity on a telecom ticket (new extension, relocated extension)
    for i in range(len(re_122)):
        re_13 = re_122[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)

    re_122 = soup1.find_all("span", style=re.compile("color: blue; font-weight: bold;"))  # repair quantity on a telecom ticket (network port down, phone down)
    for i in range(len(re_122)):
        re_13 = re_122[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)

    re_122 = soup1.find_all("div", style=re.compile("padding-left: 3px;"))  # feature-setting section title on a telecom ticket
    for i in range(len(re_122)):
        re_13 = re_122[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)

    re_122 = soup1.find_all("div", class_=re.compile("tb-memo"))  # requirement description under the "Other" heading on telecom and special tickets
    for i in range(len(re_122)):
        re_13 = re_122[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)

    re_122 = soup1.find_all("td", class_=re.compile("r-value tb-memo"), limit=2)  # requirement description on a special ticket
    for i in range(len(re_122)):
        re_13 = re_122[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 1:
            list_2.append(re_13)

    re_122 = soup1.find_all("td", rowspan="1", limit=2)  # material-requisition ticket info
    for i in range(len(re_122)):
        re_13 = re_122[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        if i == 0:
            list_2.append(re_13)
        elif i == 1:
            list_2.append(re_13)

    return list_2

def get_bynum_3(url, num):  # Get the approval progress: each approver and the signing times.
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {'Host': '112.efoxconn.com', 'Referer': 'http://112.efoxconn.com/UserCenter/ByMe', 'User-Agent': userAgent}
    bynum_url = "http://112.efoxconn.com" + url + "/" + num
    r1 = s.get(bynum_url, headers=header)
    time.sleep(random.random() * 2)
    #print("GET status:", r1.status_code)
    soup1 = BeautifulSoup(r1.text, features='html.parser')

    list_1 = []
    list_2 = []
    re_1 = soup1.find_all("tr", class_=re.compile("pro-item"))
    for i in range(len(re_1)):
        #re_11 = re_1[i].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
        re_11 = str(re_1[i])
        soup2 = BeautifulSoup(re_11, features='html.parser')
        re_12 = soup2.find_all("td")
        for j in range(len(re_12)):
            re_13 = re_12[j].get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")
            list_2.append(re_13)
        list_1.append(list_2)
        list_2 = []

    #print(list_1)
    return list_1

def get_byme():
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {'Host': '112.efoxconn.com', 'Referer': 'http://112.efoxconn.com/UserCenter/ByMe', 'User-Agent': userAgent}

    byme_url = 'http://112.efoxconn.com/UserCenter/ByMe'
    post_data = {'sonbr': '', 'formtype': '', 'startdate': '', 'enddate': '', 'page': '1', 'pagesize': '100'}

    r = s.post(byme_url, data=post_data, headers=header, allow_redirects=False)
    time.sleep(random.random() * 3)
    print("Query status:", r.status_code)
    print('*************************************************************')

    # Use regular expressions to pull the record list out of the response.
    re_1 = str(re.findall(r"\[(.+)\]", r.text))
    re_2 = re.split(r"\{|\},|\[\'|\'\]", re_1)
    # Drop the empty strings with filter() (which returns an iterator), then rebuild a list.
    re_2 = [item for item in filter(lambda x: x != '', re_2)]
    for i in range(len(re_2)):
        print("Ticket index: " + str(i))
        re_3 = re.split(r"[,\"]", str(re_2[i]))  # split the fields of this record into list re_3
        #print(re_3)
        bb = "/Sign"
        # Print the key fields of each ticket, e.g. ticket number and detail URL.
        #print(re_3[13], re_3[18], re_3[28])
        #print(re_3[53], re_3[3], re_3[13])
        if re_3[1] == "SO_NBR":  # the field order differs between the two record layouts
            for aa in re_3:
                if bb in aa:  # find the element that contains /Sign, i.e. the detail-page URL
                    a = aa
                    b = re_3[3]
                    print(a, b, re_3[13])
        else:
            a = re_3[13]
            b = re_3[18]
            print(a, b, re_3[28])

        list_1 = get_bynum_1(a, b)  # applicant info for this ticket number
        list_2 = get_bynum_2(a, b)  # application content for this ticket number
        list_3 = get_bynum_3(a, b)  # approval records for this ticket number
        list_4 = [list_3[-1][-3], list_3[1][-1], list_3[-2][2], list_3[-1][-1]]
        list_text = list_4 + list_1 + list_2  # join everything that has to be written into one list
        list_f = [a, "~", b, "~"]

        for xx in list_text:
            list_f.append(xx)
            list_f.append("~")
        # Turn the list into a string and strip the unwanted punctuation.
        list_ft = str(list_f).replace("[", "").replace("'", "").replace("]", "").replace(",", "")
        with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(list_ft)

        '''
        Earlier version: write each field with its own f.write() call.
        with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(list_3[-1][-3])
            f.write("~")
            f.write(list_3[1][-1])
            f.write("~")
            f.write(list_3[-2][2])
            f.write("~")
            f.write(list_3[-1][-1])
            f.write("~")

        list_1 = get_bynum_1(a, b)  # applicant info for this ticket number
        #print(list_1)
        for x in list_1:
            with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(x)
                f.write("~")

        list_2 = get_bynum_2(a, b)  # application content for this ticket number
        #print(list_2)
        for x in list_2:
            with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(x)
                f.write("~")
        '''

        with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:  # newline after a complete record
            f.write('\n')

        print('#############################################################')


login_post()
get_byme()
#print(get_bynum_2("/Network/Sign", "21118061100034"))
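Each record in my_check_list.txt is one line of "~"-separated fields. As a small follow-up, here is a minimal sketch (not part of the original script; the output name my_check_list.csv is an arbitrary choice) that converts the tilde-delimited file into a CSV so it opens cleanly in Excel, assuming one record per line as written by get_byme():

import csv

# Sketch: convert the "~"-separated records into a CSV file.
# Assumes my_check_list.txt already exists and holds one record per line;
# my_check_list.csv is a made-up name for the converted output.
with open('my_check_list.txt', encoding='utf-8') as src, \
        open('my_check_list.csv', 'w', newline='', encoding='utf-8-sig') as dst:
    writer = csv.writer(dst)
    for line in src:
        fields = [field.strip() for field in line.rstrip('\n').split('~')]
        writer.writerow(fields)

utf-8-sig is used so Excel detects the encoding of the Chinese field values; the trailing "~" in each record simply becomes one empty final column.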