I recently started playing with Python, and while learning about web scraping I got the urge to crawl the Douban Movie TOP250 and Book TOP250 lists. This post records the process for my own reference.
```python
# Douban Movie TOP250 scraper: walks the paginated list and writes one
# text record per film to movie.txt.
import time

import requests
from bs4 import BeautifulSoup


def getlist(list_url):
    time.sleep(2)  # be polite: pause between page requests
    res = requests.get(list_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    movie_list = soup.select('.grid_view li')
    for m in movie_list:
        rank = m.select('em')[0].text
        score = m.select('.rating_num')[0].text
        title = m.select('.title')[0].text
        direct = m.select('.info .bd p')[0].text.strip()
        # Split the crew line so director, cast and year each end up on their own line.
        actor = '\n主演:'.join(direct.split(' 主演:'))
        director = '年代:'.join(actor.split(' '))
        if m.select('.inq'):
            comments = m.select('.inq')[0].text.strip()
        else:
            comments = 'None'
        movie.append('排名: ' + rank + '\n' +
                     '評分: ' + score + '\n' +
                     '片名: ' + title + '\n' +
                     director + '\n' +
                     '評論: ' + comments + '\n' + '\n')
    # Follow the "next page" link until the last page is reached.
    if soup.select('.next a'):
        asoup = soup.select('.next a')[0]['href']
        next_page = seed_url + asoup
        getlist(next_page)
    else:
        print('結束')  # "done": no further pages
    return movie


def write(movies):
    with open('movie.txt', 'w', encoding='utf8') as m:
        for a in movies:
            m.write(a)


def main():
    write(getlist(seed_url))


if __name__ == '__main__':
    seed_url = 'https://movie.douban.com/top250'
    movie = []  # collected records; getlist() appends to this module-level list
    main()
```
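One caveat: the script above uses requests' default User-Agent, and Douban has been known to reject such bare requests. If the fetch comes back empty, a more defensive fetch helper along these lines may help. This is only a sketch; the `fetch_page` name and the header value are mine, not part of the original script.

```python
import requests
from bs4 import BeautifulSoup

# Hypothetical helper, not in the original script: sends a browser-like
# User-Agent and a timeout so a hung connection does not block the crawl.
HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; douban-top250-demo)'}

def fetch_page(url):
    res = requests.get(url, headers=HEADERS, timeout=10)
    res.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(res.text, 'html.parser')

# Inside getlist() this would replace the two lines that build `soup`:
#     soup = fetch_page(list_url)
```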
```python
# Douban Book TOP250 scraper: collects one dict per book, ranks the books
# by score and writes the result to good_books.txt.
import re
from operator import itemgetter

import bs4
import requests
from bs4 import BeautifulSoup


def getHtmlText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parserText(text, book_list):
    soup = BeautifulSoup(text, 'html.parser')
    # Each book on the list page sits in its own full-width <table>.
    for table in soup('table', {'width': '100%'}):
        if isinstance(table, bs4.element.Tag):
            tds = table.find('tr')('td')
            divs = tds[1]('div')
            content = {}
            for div in divs:
                if isinstance(div, bs4.element.Tag):
                    if div.find('a'):
                        name = div.find('a').attrs['title']
                        content.update({"書名": name})
                    if div.select('.rating_nums'):
                        score = div.select('.rating_nums')[0].text
                        content.update({"評分": score})
                    if div.select('.pl'):
                        people_num = div.select('.pl')[0].text
                        regex = re.compile(r'[\d]{1,10}')
                        content.update({"評價人數": regex.findall(people_num)[0]})
            ps = tds[1]('p')
            for p in ps:
                if isinstance(p, bs4.element.Tag):
                    if p.attrs['class'][0] == 'quote':
                        description = p.find('span').string
                        content.update({"介紹": description})
                    if p.attrs['class'][0] == 'pl':
                        author = p.string
                        content.update({"作者信息": author})
            book_list.append(content)
    # Recurse into the next page until the "next" link disappears.
    next_books = soup.find('span', {'class': 'next'})
    if next_books and next_books.find('a'):
        a = next_books.find('a').attrs['href']
        text = getHtmlText(a)
        parserText(text, book_list)
    return book_list


def sortedBookTop250(book_list):
    tmp = sorted(book_list, key=itemgetter('評分'), reverse=True)
    for i in range(len(tmp)):
        tmp[i].update({"排名": i + 1})
    return tmp


def writeToFile(book_list):
    with open('good_books.txt', 'w', encoding='utf8') as book_file:
        for book in book_list:
            for key, value in book.items():
                book_file.write(f'{key}:{value}\n')
            book_file.write('\n')


def main():
    text = getHtmlText(seed_url)
    book_list = parserText(text, books)
    writeToFile(sortedBookTop250(book_list))


if __name__ == '__main__':
    seed_url = "https://book.douban.com/top250"
    books = []  # shared list that parserText() fills page by page
    main()
```
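A small note on the ranking step: `sortedBookTop250` sorts on the '評分' field, which at that point is still a string. That happens to work because Douban scores all share the same 'X.X' format, but converting to float first is safer if the format ever varies. A minimal sketch of that variant, using made-up sample records in the same shape `parserText` builds:

```python
# Illustrative records only; the values are strings, exactly as the parser stores them.
sample = [{"書名": "A", "評分": "9.4"}, {"書名": "B", "評分": "8.7"}]

# Sorting on float('評分') matches the string sort here, but stays correct
# even if scores ever differ in digit count (e.g. '10.0' vs '9.9').
ranked = sorted(sample, key=lambda b: float(b["評分"]), reverse=True)
for i, book in enumerate(ranked, start=1):
    book["排名"] = i
```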
> - Click to view my GitHub
> - Click to view my personal blog
> - Make a little progress every day; don't expect overnight success.
The code above is pasted as-is. These are two very simple scripts that mainly rely on the requests and BeautifulSoup libraries. Feel free to take them, or grab movies.txt and good_books.txt directly from my GitHub.