"""Scrape customer reviews of an App Store app from the iTunes RSS JSON feed.

For every review page the script extracts the title, the review text and the
star rating with regular expressions and appends them to a text file.

To scrape a different app only the app id has to change, e.g. pass
``app_id='444934666'`` to ``main`` instead of the default ``414478124``.

This program is modelled on https://www.cnblogs.com/sea-ocean/p/6601421.html
"""
import re

# Feed URL; ``page`` and ``app_id`` are filled in per request.
FEED_URL = ('https://itunes.apple.com/rss/customerreviews/page={page}'
            '/id={app_id}/sortby=mostrecent/json?l=en&&cc=cn')

# re.S lets "." match newlines as well, so a match may span several lines of
# the response instead of being confined to a single line.
# "\s*" after the separating comma accepts both compact JSON ('},"rights"')
# and the spaced form ('}, "rights"').
APP_NAME_PATTERN = re.compile(
    r'\{"im:name":\{"label":(.*?)\},\s*"rights"', re.S)
TITLE_PATTERN = re.compile(
    r'"title":\{"label":(.*?)\},\s*"content"', re.S)
CONTENT_PATTERN = re.compile(
    r'"content":\{"label":(.*?),\s*"attributes":\{"type":"text"\}\}', re.S)
RATING_PATTERN = re.compile(
    r'"im:rating":\{"label":(.*?)\},\s*"id"', re.S)

# Start-of-title marker used to trim the first title match (see fillUnivlist).
TITLE_MARKER = '"title":{"label":'


def getHTMLText(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded body, or ``''`` when the request fails or the server
        answers with an HTTP error status.
    """
    # Imported here (its only user) so the parsing and file helpers stay
    # importable without the third-party HTTP dependency.
    import requests

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Use the encoding detected from the content rather than the header.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return ''


def printAPPName(html):
    """Return ``'APPName:'`` followed by the list of app names found in *html*.

    The list is rendered with ``str()``, so the result looks like
    ``APPName:['"WeChat"']`` or ``APPName:[]`` when nothing matches.
    """
    names = APP_NAME_PATTERN.findall(str(html))
    return 'APPName:' + str(names)


def fillUnivlist(titles, comments, stars, html):
    """Extract reviews from *html* and append them to the three lists.

    Args:
        titles: List that receives ``'title:<title>'`` strings.
        comments: List that receives ``'content:<text>'`` strings.
        stars: List that receives ``'star:<rating>'`` strings.
        html: Raw feed text for one page.

    The number of title matches is printed. Only complete records are
    stored: if the three patterns yield different numbers of matches, the
    surplus matches are ignored so the lists always stay the same length.
    """
    text = str(html)
    found_titles = TITLE_PATTERN.findall(text)
    found_comments = CONTENT_PATTERN.findall(text)
    found_stars = RATING_PATTERN.findall(text)

    print(len(found_titles))

    if found_titles:
        # The first match may start at an earlier "title" key and run on to
        # the first review; keep only what follows the next title marker.
        # When no marker is present the title is left untouched.
        _, marker, tail = found_titles[0].partition(TITLE_MARKER)
        if marker:
            found_titles[0] = tail

    for title, comment, star in zip(found_titles, found_comments, found_stars):
        titles.append('title:' + title)
        comments.append('content:' + comment)
        stars.append('star:' + star)


def writeText(titleText, fpath):
    """Append *titleText* and a blank line to the file *fpath* (UTF-8).

    Writing is best-effort: returns ``''`` if the file cannot be opened or
    written, ``None`` on success.
    """
    try:
        with open(fpath, 'a', encoding='utf-8') as f:
            f.write(str(titleText) + '\n\n')
    except OSError:
        return ''


def writeUnivlist(titles, comments, stars, fpath, num):
    """Append the first *num* reviews to the file *fpath* (UTF-8).

    Each review is written as rating, title and text, each followed by a
    separator line of 10, 50 and 100 asterisks respectively.

    Raises:
        IndexError: If *num* exceeds the length of any of the lists.
        OSError: If the file cannot be opened or written.
    """
    with open(fpath, 'a', encoding='utf-8') as f:
        for i in range(num):
            f.write(
                str(stars[i]) + '\n'
                + '*' * 10 + '\n'
                + str(titles[i]) + '\n'
                + '*' * 50 + '\n'
                + str(comments[i]) + '\n'
                + '*' * 100 + '\n'
            )


def main(app_id='414478124', output_file='D:/StockInfo.txt', pages=10):
    """Scrape *pages* review pages of app *app_id* into *output_file*.

    Args:
        app_id: App Store id of the app whose reviews are fetched.
        output_file: Text file the results are appended to.
        pages: Number of review pages to fetch, starting at page 1.
    """
    # The app name is taken from the first page and written as a header.
    html = getHTMLText(FEED_URL.format(page=1, app_id=app_id))
    writeText(printAPPName(html), output_file)

    for page in range(1, pages + 1):
        titles = []
        comments = []
        stars = []
        html = getHTMLText(FEED_URL.format(page=page, app_id=app_id))
        fillUnivlist(titles, comments, stars, html)
        writeUnivlist(titles, comments, stars, output_file, len(titles))
        print("\r當前進度: {:.2f}%".format(page * 100 / pages), end="")


if __name__ == '__main__':
    main()