We can read back the data saved earlier with pandas:
import pandas as pd

newsdf = pd.read_csv(r'F:\duym\gzccnews.csv')
1. Saving the scraped content to an SQLite3 database
import sqlite3

with sqlite3.connect('gzccnewsdb.sqlite') as db:
    newsdf.to_sql('gzccnews', con=db)

with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df2 = pd.read_sql_query('SELECT * FROM gzccnews', con=db)
Saving to a MySQL database
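A minimal sketch of this step, assuming a local MySQL server, the pymysql driver, and SQLAlchemy (the user, password, and database name are placeholders, not from the original):

from sqlalchemy import create_engine

# placeholder credentials and database name -- replace with your own
engine = create_engine('mysql+pymysql://user:password@localhost:3306/gzccnews?charset=utf8')
newsdf.to_sql('gzccnews', con=engine, if_exists='append', index=False)
# read it back the same way as with sqlite
df3 = pd.read_sql_query('SELECT * FROM gzccnews', con=engine)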
As a sports-loving guy, I chose to scrape NBA playoff news.
Here is the code that creates the scraper (it fetches a page and returns a BeautifulSoup object):
import requests
import chardet
from bs4 import BeautifulSoup

def creat_bs(url):
    result = requests.get(url)
    # set the encoding of the response to the webpage's detected encoding
    e = chardet.detect(result.content)['encoding']
    result.encoding = e
    c = result.content
    soup = BeautifulSoup(c, 'lxml')
    return soup
A function that builds the collection of page URLs to fetch:
def build_urls(prefix, suffix):
    urls = []
    for item in suffix:
        url = prefix + item
        urls.append(url)
    return urls
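For example (the prefix and suffixes here are made up, just to show the call):

urls = build_urls('https://example.com/nba-', ['page1.html', 'page2.html'])
# ['https://example.com/nba-page1.html', 'https://example.com/nba-page2.html']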
The scraping function:
import numpy as np

def find_title_link(soup):
    titles = []
    links = []
    try:
        contanier = soup.find('div', {'class': 'container_padd'})
        ajaxtable = contanier.find('form', {'id': 'ajaxtable'})
        page_list = ajaxtable.find_all('li')
        for page in page_list:
            titlelink = page.find('a', {'class': 'truetit'})
            # some titles are wrapped in a <b> tag, leaving the link text empty
            if not titlelink.text:
                title = titlelink.find('b').text
            else:
                title = titlelink.text
            # randomly keep roughly 10% of the posts
            if np.random.uniform(0, 1) > 0.90:
                link = titlelink.get('href')
                titles.append(title)
                links.append(link)
    except AttributeError:
        print('have no value')
    return titles, links
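Connecting the helpers above, a hypothetical driver loop might look like this (the forum URL pieces are placeholders; the reply-scraping pass that fills reply_group is not shown in this excerpt):

title_group = []
link_group = []
for url in build_urls('https://example.com/nba-forum-', ['1', '2', '3']):
    soup = creat_bs(url)
    titles, links = find_title_link(soup)
    title_group.extend(titles)
    link_group.extend(links)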
Saving the data:
wordlist = str()
for title in title_group:
    wordlist += title
for reply in reply_group:
    wordlist += reply

def savetxt(wordlist):
    f = open('wordlist.txt', 'wb')
    f.write(wordlist.encode('utf8'))
    f.close()

savetxt(wordlist)
The generated word cloud:
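A minimal sketch of how the word cloud could be generated from wordlist.txt, assuming the jieba and wordcloud packages (the font path is a placeholder; a Chinese font is needed to render Chinese text):

import jieba
from wordcloud import WordCloud

with open('wordlist.txt', 'r', encoding='utf8') as f:
    text = f.read()
# segment the Chinese text so WordCloud counts words, not single characters
seg_text = ' '.join(jieba.lcut(text))
wc = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white')
wc.generate(seg_text)
wc.to_file('wordcloud.png')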