Scanning all files with Python and storing them in a database


In the previous post, "Monitoring memory usage with memory_profiler", I wrote a script that walks all the files under a directory tree, about 260,000 files, in roughly 90 seconds.
Now I want to store each file's name and path in a database, with the requirement that when a file is added a record is inserted, and when a file is removed the corresponding record is deleted.

Part 1: Write the files into the database.

1. The code:

root@WJL-SH4031667: # cat setFileToSql.py


from os import walk
from os.path import join,getmtime
import sqlite3

@profile
def scan():
    dbName,tblName = "test.db","fmanages"
    conn = sqlite3.connect(dbName)
    cursor = conn.cursor()
    cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))
    ret = {}
    for path, _, files in walk("./"):
        for f in files:
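            # note: endswith("") is always True, so this check filters nothing; replace "" with a real suffix such as ".log" to limit the file types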
            if not f.endswith(""):
                continue


            filename = join(path, f)

            try:
                cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'{}\',\'{}\', \'study\')".format(tblName, filename.rstrip(), path.rstrip(), getmtime(filename)))
            except:
                continue

    cursor.close()
    conn.commit()
    conn.close()

if __name__ == "__main__":
    scan()

2. Test the runtime and memory usage

root@WJL-SH4031667: # time python -m memory_profiler setFileToSql.py


Line #    Mem usage    Increment   Line Contents
================================================
     5   29.820 MiB   29.820 MiB   @profile
     6                             def scan():
     7   29.824 MiB    0.004 MiB       dbName,tblName = "test.db","fmanages"
     8   29.984 MiB    0.160 MiB       conn = sqlite3.connect(dbName)
     9   29.988 MiB    0.004 MiB       cursor = conn.cursor()
    10   30.473 MiB    0.484 MiB       cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))

    12   37.578 MiB -48094.824 MiB       for path, _, files in walk("./"):
    13   37.578 MiB -479957.762 MiB           for f in files:
    14   37.578 MiB -431869.512 MiB               if not f.endswith(""):
    15                                             continue

    20   37.578 MiB -431869.512 MiB               try:
    21   37.578 MiB -431869.469 MiB                   cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'{}\',\'{}\', \'study\')".format(tblName, f.rstrip(), path.rstrip(), getmtime(f)))
    22   37.578 MiB -431727.805 MiB               except:
    23   37.578 MiB -431727.809 MiB                   continue
    24
    25   31.953 MiB   -5.625 MiB       cursor.close()
    26   31.953 MiB    0.000 MiB       conn.commit()
    27   31.953 MiB    0.000 MiB       conn.close()
    
python -m memory_profiler setFileToSql.py  94.52s user 253.09s system 99% cpu 5:49.69 total

3. Optimization

As the output shows, the database version uses about 30 MB of memory. Compared with the previous post, "Monitoring memory usage with memory_profiler", where the dictionary-based storage used about 100 MB, that is roughly 70 MB less; dictionaries really are memory hogs. What about speeding up SQLite's writes? Comparing against the previous post, the two runs take roughly the same time, so there is little room left to optimize: the 90 or so seconds of user time are spent almost entirely on file I/O.
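
That said, if the insert loop itself ever became the bottleneck, the usual first step would be to batch the rows and let sqlite3 fill in the values with ? placeholders. Below is a minimal sketch of that idea; it keeps the table layout and the 'study' tag from the script above, but the function name scan_batched and the batch size are my own assumptions, not part of the original code.

from os import walk
from os.path import join, getmtime
import sqlite3

def scan_batched(root="./", db_name="test.db", tbl_name="fmanages", batch_size=1000):
    # open the database and make sure the table exists
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('create table if not exists {} (id varchar(20) primary key, fname varchar(30), '
                   'path varchar(80), tm FLOAT, tag varchar(80))'.format(tbl_name))

    insert_sql = "insert into {} (fname, path, tm, tag) values (?, ?, ?, ?)".format(tbl_name)
    batch = []
    for path, _, files in walk(root):
        for f in files:
            filename = join(path, f)
            try:
                batch.append((filename, path, getmtime(filename), "study"))
            except OSError:
                continue  # the file vanished between walk() and getmtime()
            if len(batch) >= batch_size:
                # one executemany() call per batch; the ? placeholders also cope
                # with filenames that contain quotes
                cursor.executemany(insert_sql, batch)
                batch = []

    if batch:
        cursor.executemany(insert_sql, batch)
    conn.commit()
    cursor.close()
    conn.close()

if __name__ == "__main__":
    scan_batched()

Since sqlite3 keeps everything in one transaction until conn.commit(), the gain is mainly fewer Python-level execute() calls; that is consistent with the conclusion above that the wall-clock time is dominated by filesystem I/O rather than by SQLite.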

4. Check the size of the generated database file (230,072 records, about 42 MB)

root@WJL-SH4031667:# ls -lh test.db
-rwxrwxrwx 1 root root 42M Nov 1 09:01 test.db

root@WJL-SH4031667: # sqlite3 test.db


sqlite> select count(*) from fmanages;
230072

Part 2: After files are added or removed, the corresponding database records are added or deleted accordingly.

1. The main addition is a comparison between two sets: the first is the set of all filenames recorded in the database from the previous run, and the second is the set of filenames found by the current scan. Taking the set differences gives the added and removed files (a small standalone demo follows the snippet below):


    # load the existing file records from the database
    extRet = dict(cursor.execute('select fname,tm from {}'.format(tblName)).fetchall())

    # build the previous and the current filename sets
    prekeys = extRet.keys()
    nowkeys = ret.keys()

    # check for deleted files
    delFile = prekeys - nowkeys
    ...

    # check for newly added files
    newFile = nowkeys - prekeys
    ...
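
For anyone unfamiliar with the trick used above: dict.keys() returns a view object that supports set operations directly, so no explicit set() conversion is needed. A tiny self-contained illustration (the filenames and timestamps here are made up):

previous = {"./a.txt": 1.0, "./b.txt": 2.0}   # filenames stored in the database last run
current  = {"./b.txt": 2.0, "./c.txt": 3.0}   # filenames found by the current walk

deleted = previous.keys() - current.keys()    # {'./a.txt'}  -> rows to delete
added   = current.keys() - previous.keys()    # {'./c.txt'}  -> rows to insert
print(deleted, added)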

2. The code:

root@WJL-SH4031667: # cat setFileToSql.py


from os import walk
from os.path import join,getmtime
import sqlite3
 
@profile
def scan():
    dbName,tblName = "test.db","fmanages"
    conn = sqlite3.connect(dbName)
    cursor = conn.cursor()
    ret,extRet = {},{}
    

    i = 1
    for path, _, files in walk("./"):
        for f in files:
            if not f.endswith(""):
                continue


            filename = join(path, f)
            ret[filename] = getmtime(filename)
            i += 1
            
    ############### New code below; the key part is comparing the two sets to find additions and deletions ############
    # check whether the table already exists
    try:
        cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))
    except:
        extRet = dict(cursor.execute('select fname,tm from {}'.format(tblName)).fetchall())
        
    # build the previous and the current filename sets
    prekeys = extRet.keys()
    nowkeys = ret.keys()

    # check for deleted files
    delFile = prekeys - nowkeys
    print("Deleted files: {}".format(delFile))

    # check for newly added files
    newFile = nowkeys - prekeys
    print("Added files: {}".format(newFile))
    
    try:
        for filename in delFile:
            cursor.execute("delete from {} where fname=\'{}\'".format(tblName, filename))
            print("刪除的文件{}".format(filename))
    except:
        pass


    try:
        for filename in newFile:
            cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'new\',\'{}\', \'study\')".format(tblName, filename.rstrip(), getmtime(filename)))
            print("新增的文件{}".format(filename))
    except:
        pass
    ############################################

    cursor.close()
    conn.commit()
    conn.close()
    return ret


if __name__ == "__main__":
    scan()
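
One caveat with the string-formatted SQL in the sync step: a filename containing a single quote makes the statement invalid, and the bare except silently skips it. If that matters, the same two loops can be rewritten with ? placeholders and executemany(). A minimal sketch, assuming delFile, newFile, tblName and cursor exist exactly as in the script above:

    # remove records for files that disappeared since the last run
    cursor.executemany("delete from {} where fname = ?".format(tblName),
                       [(filename,) for filename in delFile])

    # insert records for files that appeared since the last run
    cursor.executemany(
        "insert into {} (fname, path, tm, tag) values (?, 'new', ?, 'study')".format(tblName),
        [(filename, getmtime(filename)) for filename in newFile])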

3. After adding the add/delete handling, observe the memory usage (about 20 MB more, and total runtime about one minute longer)

(First, delete one file)


Filename: setFileToSql.py

Line #    Mem usage    Increment   Line Contents
================================================
     6   29.824 MiB   29.824 MiB   @profile
     7                             def scan():
     8   29.828 MiB    0.004 MiB       dbName,tblName = "test.db","fmanages"
     9   29.988 MiB    0.160 MiB       conn = sqlite3.connect(dbName)
    10   29.992 MiB    0.004 MiB       cursor = conn.cursor()
    11   29.992 MiB    0.000 MiB       ret,extRet = {},{}
    12                                 # check whether the table already exists
    13   29.992 MiB    0.000 MiB       try:
    14   30.477 MiB    0.484 MiB           cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))
    15                                 except:
    16                                     extRet = dict(cursor.execute('select fname,tm from {}'.format(tblName)).fetchall())
    17
    18   30.477 MiB    0.000 MiB       i = 1
    19   97.188 MiB -5072.457 MiB       for path, _, files in walk("./"):
    20   97.188 MiB -35328.441 MiB           for f in files:
    21   97.188 MiB -31115.449 MiB               if not f.endswith(""):
    22                                             continue
    23
    24
    25   97.188 MiB -31114.238 MiB               filename = join(path, f)
    26   97.188 MiB -31074.410 MiB               ret[filename] = getmtime(filename)
    27   97.188 MiB -31111.219 MiB               i += 1
    28
    29                                 # build the previous and the current filename sets
    30   97.188 MiB    0.000 MiB       prekeys = extRet.keys()
    31   97.188 MiB    0.000 MiB       nowkeys = ret.keys()
    32
    33                                 # check for deleted files
    34   97.188 MiB    0.000 MiB       delFile = prekeys - nowkeys
    35   97.195 MiB    0.008 MiB       print("Deleted files: {}".format(delFile))
    36
    37                                 # check for newly added files
    38  111.578 MiB   14.383 MiB       newFile = nowkeys - prekeys
    39  114.059 MiB    2.480 MiB       print("Added files: {}".format(newFile))
    40
    41  114.059 MiB    0.000 MiB       try:
    42  114.059 MiB    0.000 MiB           for filename in delFile:
    43                                         cursor.execute("delete from {} where fname=\'{}\'".format(tblName, filename))
    44                                         print("Deleted file: {}".format(filename))
    45                                 except:
    46                                     pass
    47
    48
    49  114.059 MiB    0.000 MiB       try:
    50  114.109 MiB    0.000 MiB           for filename in newFile:
    51  114.109 MiB    0.043 MiB               cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'new\',\'{}\', \'study\')".format(tblName, filename.rstrip(), getmtime(filename)))
    52  114.109 MiB    0.008 MiB               print("Added file: {}".format(filename))
    53  114.109 MiB    0.000 MiB       except:
    54  114.109 MiB    0.000 MiB           pass
    55
    56  114.109 MiB    0.000 MiB       cursor.close()
    57  114.109 MiB    0.000 MiB       conn.commit()
    58  114.109 MiB    0.000 MiB       conn.close()
    59  114.109 MiB    0.000 MiB       return ret
    
python -m memory_profiler setFileToSql.py  95.17s user 289.77s system 97% cpu 6:32.95 total

4. Check the change in the number of database records: 230071 (230,071 records, indeed one fewer than last time)

root@WJL-SH4031667:# sqlite3 test.db


sqlite> select count(*) from fmanages;
230071

5. Screenshot of the result

