最近想做一個小 web 應用,就是把豆瓣讀書和亞馬遜等寫有書評的網站上關於某本書的打分記錄下來,這樣自己買書的時候可以當作參考。
這篇日誌只是以豆瓣網爲例,只討論簡單的功能。
這很好處理:找到網站的搜索框,然後填入相關信息,提交後查看 url 即可。
這裏以豆瓣爲例,當我在 http://book.douban.com 頁面的搜索框中輸入 現代操做系統 後,得到下面的 url:
http://book.douban.com/subject_search?search_text=%E7%8E%B0%E4%BB%A3%E6%93%8D%E4%BD%9C%E7%B3%BB%E7%BB%9F&cat=1001
這樣就知道如何向服務器提交查詢請求了。注意 search_text 後面的一串字符就是書名,只是經過了 URL 編碼(百分號編碼)而已。
詳見下面代碼:
# Build the Douban search URL for one book title and download the result page.
book_name = '現代操做系統'
douban_book = 'http://book.douban.com/subject_search?'
# cat=1001 is the value that appears in the URL produced by the site's own
# search box; the title is reused from book_name instead of being repeated.
search = [('search_text', book_name), ('cat', '1001')]
# urlencode() percent-encodes the title and joins the pairs with '&'.
getbook = douban_book + urllib.urlencode(search)
content = urllib2.urlopen(getbook).read()
代碼寫得很亂,一些語法還不是很熟悉。我是以寫代碼來學習 Python 的,什麼不懂就查什麼。
# -*- coding: utf-8 -*-
import urllib2
import urllib
from sgmllib import SGMLParser


class BookInfo(SGMLParser):
    """Collect title, publication info and rating text from a search page.

    The parser keeps a handful of 0/1 flags that record which tag it is
    currently inside, and routes every piece of text to the matching key
    ('name', 'pub' or 'star') of ``self.temp``.  Records are appended to
    ``self.info`` from ``end_div``.
    """

    def reset(self):
        """Reset the base parser and clear all flags and collected data."""
        SGMLParser.reset(self)
        # Flags marking which tag is currently open.
        self.is_subject = 0        # inside <li class="subject-item">
        self.is_subject_info = 0   # inside <div class="info">
        self.is_subject_h2 = 0     # inside <h2 class="">
        self.is_subject_pub = 0    # inside <div class="pub">
        self.is_subject_star = 0   # inside <div class="star clearfix">
        self.temp = {}             # record currently being filled
        self.info = []             # every record collected so far

    def start_li(self, attrs):
        """Enter a search result when <li class="subject-item"> opens."""
        if ('class', 'subject-item') in attrs:
            self.is_subject = 1

    def end_li(self):
        """Leave the current search result."""
        self.is_subject = 0

    def start_h2(self, attrs):
        """Start collecting the title: an <h2 class=""> inside a result."""
        if self.is_subject == 1 and ('class', '') in attrs:
            self.is_subject_h2 = 1

    def end_h2(self):
        """Stop collecting the title."""
        self.is_subject_h2 = 0

    def start_div(self, attrs):
        """Set the flag that matches the class of the opening <div>."""
        # Only the first class attribute is considered.
        css_class = next((v for k, v in attrs if k == 'class'), '')
        if css_class == 'info' and self.is_subject == 1:
            self.is_subject_info = 1
        elif css_class == 'pub' and self.is_subject_info == 1:
            self.is_subject_pub = 1
        elif css_class == 'star clearfix' and self.is_subject_info == 1:
            self.is_subject_star = 1

    def end_div(self):
        """Clear the innermost div flag; store the record when none is set.

        NOTE: this handler runs for *every* closing </div> in the page and
        cannot tell which <div> is being closed.  Whenever neither the pub
        nor the star flag is set, the current (possibly empty) record is
        appended, so nested or unrelated <div> elements are not tracked
        correctly.  The output loop below skips the empty records.
        """
        if self.is_subject_star:
            self.is_subject_star = 0
        elif self.is_subject_pub:
            self.is_subject_pub = 0
        else:
            self.is_subject_info = 0
            self.info.append(self.temp)
            self.temp = {}

    def handle_data(self, data):
        """Append stripped text to the field selected by the active flag."""
        if self.is_subject_h2:
            key = 'name'
        elif self.is_subject_pub:
            key = 'pub'
        elif self.is_subject_star:
            key = 'star'
        else:
            return
        text = data.strip()
        if text:
            # Text of one element may arrive in several chunks; concatenate.
            self.temp[key] = self.temp.get(key, '') + text


book_name = '現代操做系統'
douban_book = 'http://book.douban.com/subject_search?'
search = [('search_text', book_name), ('cat', '1001')]
getbook = douban_book + urllib.urlencode(search)
print(getbook)

response = urllib2.urlopen(getbook)
try:
    content = response.read()
finally:
    response.close()

# Keep a copy of the raw page for inspection; the file is closed even if
# the write fails.
with open('book.txt', 'w') as fobj:
    fobj.write(content)
# The script has always created books.txt without writing anything to it;
# that side effect is kept.
open('books.txt', 'w').close()

book = BookInfo()
book.feed(content)
book.close()  # flush any data still buffered by the parser

for record in book.info:
    if not record:
        # end_div() stores empty records for unrelated </div> tags.
        continue
    print('*************************************************')
    if 'name' in record:
        print('書名:%s' % record['name'])
    if 'pub' in record:
        print('出版信息:%s' % record['pub'])
    if 'star' in record:
        print('評價:%s' % record['star'])
這只是開頭的第一步,以後的日子裏還要不斷地學習和實踐……
上面的代碼其實還是有問題的,只是沒有被發現。當只標記第一個 div 標籤時的確沒有問題,可是當出現第二個 div 標籤時,如果第二個是第一個的子元素,那麼在處理第二個(子)標籤的 </div> 閉合標籤的時候就會出錯。
一個小小的改進。這個程序嚴格要求輸入的是正確的書名,這樣處理的結果才是正確的。如果不是完全正確的書名,我的代碼量就成幾何倍數增長了。在豆瓣讀書中,評價數小於特定的數目時,是沒有評論的(表明這個版次的書通常是很久以前的,上個世紀的書了),那麼就沒有參考價值了。
下面是修改後的代碼:
# -*- coding: utf-8 -*-
import urllib2
import urllib
from sgmllib import SGMLParser


class BookInfo(SGMLParser):
    """Collect title, publication info and rating of exactly-matching books.

    The parser keeps 0/1 flags that record which tag it is currently inside
    and routes text to the 'name', 'pub' or 'star' key of ``self.temp``.
    A record is stored in ``self.info`` only when all three keys are present
    at the moment the rating <div> closes.

    The title filter in ``handle_data`` reads the module-level ``book_name``,
    which must be defined before ``feed()`` is called.
    """

    def reset(self):
        """Reset the base parser and clear all flags and collected data."""
        SGMLParser.reset(self)
        # Flags marking which tag is currently open.
        self.is_subject = 0          # inside <li class="subject-item">
        self.is_subject_info = 0     # inside <div class="info">
        self.is_subject_h2 = 0       # inside <h2 class="">
        self.is_subject_pub = 0      # inside <div class="pub">
        self.is_subject_star = 0     # inside <div class="star clearfix">
        self.is_subject_rating = 0   # an 'allstar45' <span> was seen
        self.temp = {}               # record currently being filled
        self.info = []               # every record collected so far

    def start_li(self, attrs):
        """Enter a search result when <li class="subject-item"> opens."""
        if ('class', 'subject-item') in attrs:
            self.is_subject = 1

    def end_li(self):
        """Leave the current search result."""
        self.is_subject = 0

    def start_h2(self, attrs):
        """Start collecting the title: an <h2 class=""> inside a result."""
        if self.is_subject == 1 and ('class', '') in attrs:
            self.is_subject_h2 = 1

    def end_h2(self):
        """Stop collecting the title."""
        self.is_subject_h2 = 0

    def start_div(self, attrs):
        """Set the flag that matches the class of the opening <div>."""
        # Only the first class attribute is considered.
        css_class = next((v for k, v in attrs if k == 'class'), '')
        if css_class == 'info' and self.is_subject == 1:
            self.is_subject_info = 1
        elif css_class == 'pub' and self.is_subject_info == 1:
            self.is_subject_pub = 1
        elif css_class == 'star clearfix' and self.is_subject_info == 1:
            self.is_subject_star = 1

    def end_div(self):
        """Close the innermost tracked <div>; store complete records.

        Closing tags seen outside <div class="info"> are ignored, so
        unrelated <div> elements no longer produce records.
        """
        if not self.is_subject_info:
            return
        if self.is_subject_pub:
            if self.is_subject_star:
                self.is_subject_star = 0
                self.is_subject_rating = 0
            else:
                self.is_subject_pub = 0
        elif self.is_subject_star:
            self.is_subject_star = 0
            self.is_subject_rating = 0
            # The rating <div> is the last field: keep the record only when
            # name, pub and star were all collected, then start afresh so
            # fields of different books are never mixed.
            if len(self.temp) == 3:
                self.info.append(self.temp)
            self.temp = {}
        else:
            self.is_subject_info = 0

    def start_span(self, attrs):
        """Enable rating capture when an 'allstar45' <span> opens."""
        # NOTE(review): only the literal class 'allstar45' is accepted, so
        # results carrying any other rating class are dropped -- confirm
        # whether this is intended.
        classes = [v for k, v in attrs if k == 'class']
        if self.is_subject_star and 'allstar45' in classes:
            print(classes)
            self.is_subject_rating = 1

    def handle_data(self, data):
        """Route stripped text to the field selected by the active flag."""
        text = data.strip()
        if not text:
            return
        if self.is_subject_h2:
            if 'name' in self.temp:
                # Further chunks of a title that already matched.
                self.temp['name'] += text
            elif text == book_name:
                self.temp['name'] = text
            else:
                # First title chunk is not the requested book: discard.
                self.temp = {}
        elif self.is_subject_pub:
            self.temp['pub'] = self.temp.get('pub', '') + text
        elif self.is_subject_star and self.is_subject_rating:
            self.temp['star'] = self.temp.get('star', '') + text
            print(text)


book_name = '現代操做系統'
douban_book = 'http://book.douban.com/subject_search?'
search = [('search_text', book_name), ('cat', '1001')]
getbook = douban_book + urllib.urlencode(search)
print(getbook)

response = urllib2.urlopen(getbook)
try:
    content = response.read()
finally:
    response.close()

# Keep a copy of the raw page for inspection; the file is closed even if
# the write fails.
with open('book.txt', 'w') as fobj:
    fobj.write(content)
# The script has always created books.txt without writing anything to it;
# that side effect is kept.
open('books.txt', 'w').close()

book = BookInfo()
book.feed(content)
book.close()  # flush any data still buffered by the parser

for record in book.info:
    if not record:
        continue
    print('*************************************************')
    if 'name' in record:
        print('書名:%s' % record['name'])
    if 'pub' in record:
        print('出版信息:%s' % record['pub'])
    if 'star' in record:
        print('評價:%s' % record['star'])
下面是輸出結果:
以後程序的修改就是將這本書所有版本的評價綜合起來,再加上亞馬遜的評價,就可以了。
-end-