Scraped data needs to be saved; it can be stored either in files or in a database.
Reference for efficient Python dictionary operations:
http://jianwl.com/2017/08/22/%E9%AB%98%E6%95%88%E5%AE%9E%E7%94%A8Python%E5%AD%97%E5%85%B8%E7%9A%84%E6%B8%85%E5%8D%95/
Reference for Python file reading and writing:
http://www.javashuo.com/article/p-dyjkekrw-cv.html
Structure of all_house: all_house = [{'house_area': dd, 'price': dd, 'build_year': dd}, {}, {} ...]
f = open('net_saving_data.txt', 'w')
for item in all_house:
    # house_area = item['house_area']
    # price = item['price']
    output = '\t'.join([str(item['house_area']), str(item['price']), str(item['build_year'])])
    f.write(output)
    f.write('\n')
f.close()
The result looks like this:
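As a small aside, the same write can be expressed with a with block so the file is closed automatically even if an error occurs mid-loop; this is just a minimal variation on the code above, using the same all_house list:

# Minimal variation: the context manager closes the file automatically.
with open('net_saving_data.txt', 'w') as f:
    for item in all_house:
        line = '\t'.join([str(item['house_area']), str(item['price']), str(item['build_year'])])
        f.write(line + '\n')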
'\t'.join(["house_area","price","build_year"])
,注意join()內是個列表。CSV(Comma-Separated_values),以逗號分隔值的文件格式,文件以純文本格式存儲表格數據(數字和文本),每一行以換行符分隔,列與列之間用逗號分隔。與txt比較,可以存儲的數據大小差很少,可是數據以逗號分隔較整齊,全部python網絡爬蟲常常用此來存儲數據。json
import csv

f = open('net_saving_data.csv', 'w', newline='')   # newline='' avoids blank rows on Windows under Python 3
csv_write = csv.writer(f)
for item in all_house:
    csv_write.writerow([item.get('house_area', None), item.get('price', None), item.get('build_year', None)])
    # f.write('\n')
f.close()
The result looks like this:
If you also want to write the key names into the CSV, do it like this:
csv_write.writerow(['house_area', item.get('house_area', None), 'price', item.get('price', None), 'build_year', item.get('build_year', None)])
The result looks like this:
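An alternative worth knowing (not used in the code above, just a sketch with the standard csv module) is csv.DictWriter, which writes the key names once as a header row instead of repeating them in every record; the file name here is made up for illustration:

import csv

# Sketch: write the dict keys once as a header row, then one row per house.
fieldnames = ['house_area', 'price', 'build_year']
with open('net_saving_data_dict.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for item in all_house:
        writer.writerow({k: item.get(k) for k in fieldnames})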
houses = [['2edr', 'ser', 'sge'], ['as', 'hi', 'hioh', 'aaajio']]

f = open('saving_data.csv', 'w')
csv_write = csv.writer(f)
for house in houses:
    csv_write.writerow([item for item in house])
f.close()
The result looks like this:
## Writing JSON
import json

with open("anjuke_salehouse.json", "w", encoding='utf-8') as f:
    json.dump(all_house, f, ensure_ascii=False)
print('Finished writing to file...')
- Reference: http://www.javashuo.com/article/p-dyjkekrw-cv.html
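If the JSON file should stay human-readable, json.dump() also accepts an indent parameter; a minimal variation on the write above:

# Minimal variation: pretty-print the JSON with 4-space indentation.
with open("anjuke_salehouse.json", "w", encoding='utf-8') as f:
    json.dump(all_house, f, ensure_ascii=False, indent=4)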
import csv

houses = []
with open('net_saving_data.csv', 'r') as openscv:
    csv_reader = csv.reader(openscv)
    for row in csv_reader:
        houses.append(row)
print(houses)
The original data looks like this:
The data read back looks like this:
## Reading JSON
with open("anjuke_salehouse.json", 'r', encoding='utf-8') as f:
    load_dict = json.load(f)
print(load_dict)
There is nothing tricky here: json.load() directly returns a Python object whose structure is exactly the same as the JSON file. The encoding='utf-8' and ensure_ascii=False parameters are mainly there to prevent garbled characters.
Reference: https://blog.csdn.net/shandong_chu/article/details/70173952
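Since the loaded object is an ordinary Python list of dicts, it can be iterated directly; a small sketch assuming the all_house structure described earlier:

# Iterate over the loaded list of house dicts and print a few fields per house.
for house in load_dict:
    print(house.get('house_area'), house.get('price'), house.get('build_year'))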
with open('net_saving_data.txt', 'r') as opentxt:
    txt_reader = opentxt.readlines()
    for lin in txt_reader:
        print(lin)
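Because the txt file was written with tab-separated fields, each line can also be split back into the original fields; a minimal sketch (the field order matches the write code above, the variable names are only for illustration):

# Rebuild the list of dicts from the tab-separated text file.
houses = []
with open('net_saving_data.txt', 'r') as f:
    for line in f:
        house_area, price, build_year = line.rstrip('\n').split('\t')
        houses.append({'house_area': house_area, 'price': price, 'build_year': build_year})
print(houses)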
Creating a database and basic CRUD (insert, delete, update, select): since the sections below involve some database operations, here is a quick review of the basic statements.
(1) Create a database and a table
create database scraping;
use scraping;
create table urls (
    id int NOT NULL auto_increment,
    url varchar(1000) NOT NULL,
    content varchar(4000) NOT NULL,
    created_time timestamp default current_timestamp,
    primary key(id)
);
(2) Inspect the table structure or list the databases
describe urls; show databases;
(3) Insert data into the table
insert into urls(url,content)values("www.baidu.com","這是內容。") select * from urls where id=1;
(4) Select data from the table
insert into urls(url,content)values("www.blog.com","博客網址。"); select * from urls ;
(5) Delete data
delete from urls where url='www.baidu.com'; select * from urls;
(6) Update data
Change the content of the row with id=2 to "博客園":
insert into urls(url,content)values("www.santostang.com","Santos blog"); update urls set url='www.blog.com',content="博客園" where id=2; select * from urls ;
(7) Statement reference: https://blog.csdn.net/ljxfblog/article/details/52066006
(select * from a order by id) union (select * from b order by id);
-- Join two tables via the WHERE clause
SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons, Orders WHERE Persons.Id_P = Orders.Id_P;
-- The same query written with an explicit join (INNER JOIN)
SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons INNER JOIN Orders ON Persons.Id_P = Orders.Id_P ORDER BY Persons.LastName;
-- LEFT JOIN: every row of the left table appears in the result; where the right table has no match, its columns are NULL.
SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons LEFT JOIN Orders ON Persons.Id_P = Orders.Id_P ORDER BY Persons.LastName;
-- RIGHT JOIN: every row of the right table appears in the result; where the left table has no match, its columns are NULL.
SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons RIGHT JOIN Orders ON Persons.Id_P = Orders.Id_P ORDER BY Persons.LastName;
-- FULL JOIN: a row is produced whenever either table has a match, with the other table's columns NULL. (Note: MySQL does not support FULL JOIN directly; it is usually emulated with a UNION of a LEFT JOIN and a RIGHT JOIN.)
SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons FULL JOIN Orders ON Persons.Id_P = Orders.Id_P ORDER BY Persons.LastName;
alter table urls add created_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;   # add a column
alter table test modify content char(10);                                # change a column's type
Reference: http://www.javashuo.com/article/p-dnxfmvts-gb.html
For database access, Python 2 generally uses MySQLdb, but MySQLdb is no longer supported on Python 3; instead we can use pymysql or mysql.connector. All operations in this article were done with pymysql on Python 3. Install it with: python -m pip install pymysql
mysql -u root -p
use mysql;
select host, user from mysql.user;
This shows MySQL's host, user, password and related connection information.
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
cur = conn.cursor()            # create a cursor from the connection
sql = 'select * from urls'
recount = cur.execute(sql)     # execute() runs the SQL statement and returns the affected row count
data = cur.fetchall()          # fetch the results; returned as a tuple of tuples
print(data)
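pymysql cursors also work as context managers, so the cursor is closed automatically at the end of the with block; a minimal sketch of the same query under the same connection parameters:

import pymysql

# Sketch: the cursor used as a context manager is closed automatically.
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
with conn.cursor() as cur:
    cur.execute('select * from urls')
    for row in cur.fetchall():
        print(row)
conn.close()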
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping') creates the database connection, with the parameters (user name, password, host, database) given inside. cur = conn.cursor() creates a cursor from the connection, and the cursor's execute() method is then used to run raw SQL statements. After the MySQL operations are finished, both the cursor cur and the connection conn need to be closed.

conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
cur = conn.cursor()

sql1 = 'insert into urls(url, content) values (%s, %s)'
params = ('www.sinlang.com', 'Sina Weibo')
recount = cur.execute(sql1, params)

## executemany: batch insert
li = [('www.blogs.com', 'first batch-inserted row'), ('www.sou.com', 'second batch-inserted row')]
sql2 = 'insert into urls(url, content) values (%s, %s)'
recount = cur.executemany(sql2, li)

sql3 = 'select * from urls'
recount = cur.execute(sql3)
data = cur.fetchall()

conn.commit()   # commit() must be called, otherwise the inserts are not persisted
cur.close()
conn.close()
print(data)     # returns tuples: ((1, '', '', time), (2, '', '', time) ... (6, '', '', time))
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
cur = conn.cursor(cursor=pymysql.cursors.DictCursor)   # DictCursor returns each row as a dict
sql = 'select * from urls'
recount = cur.execute(sql)
data = cur.fetchall()
cur.close()
conn.close()
print(recount)
print(data)
# Returns a list of dicts:
# [{'url': 'www.baidu.com', 'content': 'xxx', 'id': 1, 'created_time': datetime.datetime(2018, 8, 22, 22, 2, 23)}, {...}, {...}]
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
cur = conn.cursor(cursor=pymysql.cursors.DictCursor)
sql = 'select * from urls'
recount = cur.execute(sql)
data = cur.fetchall()
for i in range(len(data)):
    print(data[i])
cur.close()
conn.close()
print(recount)
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
cur = conn.cursor(cursor=pymysql.cursors.DictCursor)
sql = 'select * from urls'
recount = cur.execute(sql)
for i in range(recount):
    data = cur.fetchone()   # fetch one row at a time
    print(data)
cur.close()                 # close only after all rows have been fetched
conn.close()
print(recount)
Both approaches produce the same result:
(1) First create the database and the corresponding table from the MySQL command line:
create database anjuke;
use anjuke;
create table anjuke (
    id int not null auto_increment,
    house_title varchar(1000) not null,
    house_layout varchar(1000) not null,
    house_area int not null,
    house_levers int not null,
    brokername varchar(1000),
    address varchar(2000),
    price int not null,
    primary key(id)
);
(2) Insert the data into the database. The scraped data has the form [{}, {}, {}, {}]. Loop over the list with a for loop, extract the fields from each dict, build the SQL statement, and pass the parameters to execute().
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='anjuke')
cur = conn.cursor()
for item in all_house:
    house_title = item['house_title']
    house_layout = item['house_layout']
    house_area = item['house_area']
    house_levers = item['house_levers']
    brokername = item['brokername']
    house_address = item['house_address']
    price = item['price']
    sql = 'insert into anjuke(house_title, house_layout, house_area, house_levers, address, brokername, price) values (%s, %s, %s, %s, %s, %s, %s)'
    # parme = (house_title, house_layout, house_area, house_levers, house_address, brokername, price)
    # cur.execute(sql, parme)
    cur.execute(sql, (house_title, house_layout, house_area, house_levers, house_address, brokername, price))
conn.commit()
cur.close()
conn.close()
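The same insert can also be done in one call with executemany(), which was shown earlier for the urls table; a minimal sketch for the anjuke data, assuming the same all_house structure:

import pymysql

# Sketch: build the parameter list once, then batch-insert with executemany().
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='anjuke')
cur = conn.cursor()
sql = ('insert into anjuke(house_title, house_layout, house_area, house_levers, '
       'address, brokername, price) values (%s, %s, %s, %s, %s, %s, %s)')
rows = [(item['house_title'], item['house_layout'], item['house_area'], item['house_levers'],
         item['house_address'], item['brokername'], item['price'])
        for item in all_house]
cur.executemany(sql, rows)
conn.commit()
cur.close()
conn.close()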
(3) Read the scraped data back from the MySQL database; it can be printed either as [{}, {}, {} ... {}] or as one dict per line, {}\n{}\n{}\n...\n{}.
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='anjuke')
cur = conn.cursor(cursor=pymysql.cursors.DictCursor)
sql = 'select * from anjuke'
cur.execute(sql)
data = cur.fetchall()   # [{}, {}, {} ... {}]
print(data)
cur.close()             # close only after fetching the results
conn.close()
conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='anjuke')
cur = conn.cursor(cursor=pymysql.cursors.DictCursor)
sql = 'select * from anjuke'
recount = cur.execute(sql)
for i in range(recount):
    data = cur.fetchone()
    print(data)
cur.close()
conn.close()
To summarise, database access always goes through four steps: create the connection with conn = pymysql.connect(), get a cursor with cur = conn.cursor(), run SQL with cur.execute(sql), and fetch the results with cur.fetchall(). Plain reads do not need a commit, but inserts and other writes need conn.commit(); in every case, finish by closing the cursor with cur.close() and the connection with conn.close().
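Putting the four steps together, here is a minimal sketch of the whole pattern (connection parameters and table as in the earlier examples, the inserted row is made-up data), with try/finally so the cursor and connection are closed even if a statement fails:

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='scraping')
cur = conn.cursor()
try:
    # Write: needs a commit to be persisted.
    cur.execute('insert into urls(url, content) values (%s, %s)', ('www.example.com', 'example row'))
    conn.commit()
    # Read: no commit needed.
    cur.execute('select * from urls')
    for row in cur.fetchall():
        print(row)
finally:
    cur.close()
    conn.close()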