幸虧我從原來的文章中提取出了一部分還能用的信息。
步驟:
1, 從未被刪除的數據中恢復了三千多個城市,分別插入到idcard_address表中。
2, 發現恢復的數據是新版GB/T 2260的編碼,不少舊版本的編碼沒有了。行政區劃編碼標準GB/T 2260有9個版本,不少城市有了新的編碼,可是使用舊編碼的身份證仍然有效,因此要找齊全部版本的編碼纔算齊全。無奈年代久遠,網絡資料有限,能找到的最早版本爲1999年版,是一個掃描生成的PDF文檔,沒法複製粘貼,須要一個一個手打到Excel裏面,而後用python從Excel裏面逐行讀取,速度很慢,因此只截取了幾個省份(北京、天津、河北、山西、內蒙古、江西等)。
3, 把文件的數據和idcard_address表的數據一同插入到新表idcard_address_all中。
如下:
建表
# Partial table: the administrative division codes recovered from the
# surviving data.
CREATE TABLE `idcard_address` (
  `id`           INT(4) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '自增主鍵',
  `address_id`   INT(6) UNSIGNED NOT NULL DEFAULT '0'    COMMENT '行政區劃編號',
  `address_name` VARCHAR(40)     NOT NULL DEFAULT ''     COMMENT '行政區劃名稱',
  PRIMARY KEY (`id`),
  KEY `idx_ai_an` (`address_id`, `address_name`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8 COMMENT='行政區劃表';

# Combined table: same layout, filled with the file data and the
# idcard_address rows together.
CREATE TABLE `idcard_address_all` (
  `id`           INT(4) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '自增主鍵',
  `address_id`   INT(6) UNSIGNED NOT NULL DEFAULT '0'    COMMENT '行政區劃編號',
  `address_name` VARCHAR(40)     NOT NULL DEFAULT ''     COMMENT '行政區劃名稱',
  PRIMARY KEY (`id`),
  KEY `idx_ai_an` (`address_id`, `address_name`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8 COMMENT='行政區劃表';
代碼
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Filename: cityZoneDB.py
'''Build the complete table of national administrative division codes.

Division codes read from a text file (one "code name" record per line) are
merged with the codes already stored in the ``idcard_address`` table, and
the union is written to ``idcard_address_all``.

The script runs on Python 2 and Python 3; every ``print`` call takes a
single argument so it behaves the same on both.
'''
import io
import os
import re
import sys

try:
    import MySQLdb
except ImportError:
    # Keep the module importable without the driver (file parsing and merging
    # do not need it); connect() reports the failure when a database is used.
    MySQLdb = None

if sys.version_info[0] < 3:
    # Python 2 only: make implicit str/unicode conversions use UTF-8.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')


class CZDB(object):
    '''Administrative division code database helper.

    A connection is opened for each operation and closed right after it,
    instead of being opened once in ``__init__``.
    '''

    # First run of six ASCII digits on a line is taken as the division code.
    _CODE_RE = re.compile(r'[0-9]{6}')

    def __init__(self):
        self.host = 'localhost'
        self.port = '3306'
        self.username = 'root'
        self.password = '123456'
        self.dbname = 'python'
        self.charset = 'utf8'
        self.oldtable = 'idcard_address'
        self.newtable = 'idcard_address_all'
        # Default input file for getFileData().
        self.filepath = "/home/c80k2/桌面/爬蟲/行政區劃/測試"
        self.dbConnection = None
        self.cursor = None

    def connect(self):
        '''Open a connection and a cursor.

        Returns:
            True when ``self.dbConnection`` and ``self.cursor`` are ready,
            False otherwise (the reason is printed).
        '''
        if MySQLdb is None:
            print('數據庫鏈接失敗')
            return False
        try:
            self.dbConnection = MySQLdb.connect(
                host=self.host, port=int(self.port), user=self.username,
                passwd=self.password, db=self.dbname, charset=self.charset)
            self.cursor = self.dbConnection.cursor()
        except (MySQLdb.Error, ValueError) as e:
            # ValueError covers a non-numeric self.port.
            self.dbConnection = None
            self.cursor = None
            print('數據庫鏈接失敗')
            print(e)
            return False
        print('數據庫鏈接成功')
        return True

    def _close(self):
        '''Close the current connection, if any, and forget it.'''
        if self.dbConnection is not None:
            self.dbConnection.close()
        self.dbConnection = None
        self.cursor = None

    def insertData(self, cities):
        '''Insert ``cities`` into ``self.newtable`` in one transaction.

        Args:
            cities: iterable of sequences whose first item is the division
                code (int or numeric string) and second item is the name.

        Nothing is inserted when the connection cannot be opened; a failed
        insert is printed and rolled back.
        '''
        values = [(int(city[0]), city[1]) for city in cities]
        if not self.connect():
            return
        # Table names cannot be bound as parameters; %%s keeps the two value
        # placeholders for the driver.
        sql = ("insert into %s(`address_id`,`address_name`) values(%%s,%%s)"
               % self.newtable)
        try:
            self.cursor.executemany(sql, values)
            self.dbConnection.commit()
            print('成功插入%d條數據到%s表' % (len(values), self.newtable))
        except Exception as e:
            print(e)
            self.dbConnection.rollback()
        finally:
            self._close()

    def getFileData(self, path=None):
        '''Read division codes from a UTF-8 text file.

        Args:
            path: file to read; defaults to ``self.filepath``.

        Returns:
            dict mapping the six-digit code (string) to the name. Lines
            without a six-digit code (including blank lines) are skipped.

        Raises:
            IOError/OSError: when the file cannot be opened.
        '''
        if path is None:
            path = self.filepath
        filedict = {}
        with io.open(path, 'r', encoding='utf-8') as fo:
            for line in fo:
                match = self._CODE_RE.search(line)
                if match is None:
                    continue
                code = match.group()
                # Remove only the leading code so digits inside the name
                # survive, then trim every kind of surrounding whitespace.
                filedict[code] = line.replace(code, '', 1).strip()
        print('成功獲取%s的數據' % path)
        return filedict

    def getDBData(self):
        '''Load every row of ``self.oldtable``.

        Returns:
            dict mapping the division code (int) to the name; empty when the
            connection fails, partial when fetching fails midway.
        '''
        dbdict = {}
        if not self.connect():
            print("Error: unable to fecth data")
            return dbdict
        try:
            self.cursor.execute(
                "select address_id,address_name from %s" % self.oldtable)
            for row in self.cursor.fetchall():
                dbdict[int(row[0])] = row[1]
            print('成功獲取DB數據')
        except Exception as e:
            print("Error: unable to fecth data")
            print(e)
        finally:
            self._close()
        return dbdict

    @staticmethod
    def _mergeData(fileData, dbData):
        '''Return the union of both sources as a code-sorted list of pairs.

        On a code present in both, the database entry wins.
        '''
        merged = dict(dbData)
        for code, name in fileData.items():
            merged.setdefault(int(code), name)
        return sorted(merged.items())

    def getAllData(self):
        '''Return file and database records merged, sorted by code.

        Returns:
            list of ``(code, name)`` tuples with ``code`` as int.
        '''
        fileData = self.getFileData()
        dbData = self.getDBData()
        sortedCityDict = self._mergeData(fileData, dbData)
        print('成功獲取所有數據')
        return sortedCityDict

    def insertAllData(self):
        '''Merge both sources and insert the result into the new table.'''
        cities = self.getAllData()
        self.insertData(cities)


if __name__ == '__main__':
    czdb = CZDB()
    czdb.insertAllData()
運行結果
成功獲取/home/c80k2/桌面/爬蟲/行政區劃/測試的數據 數據庫鏈接成功 成功獲取DB數據 成功獲取所有數據 數據庫鏈接成功 成功插入3928條數據到idcard_address_all表
mysql> select count(1) from idcard_address; +----------+ | count(1) | +----------+ | 3744 | +----------+ 1 row in set (0.01 sec) mysql> select count(1) from idcard_address_all; +----------+ | count(1) | +----------+ | 3928 | +----------+ 1 row in set (0.00 sec)
總結:
連接MySQL時不要直接在初始化方法(__init__)中建立連接,否則會造成MySQL 2006 'MySQL server has gone away' 的錯誤。