I finished this piece of work on 2018-07-10. It was simple, but I was still happy to get it done. After working out how to save data with pickle, I tried saving the information from the "HEADER" records of PDB files. All files are stored on the lab server (97.73.198.168) under /home/RaidDisk/BiodataTest/zyh_pdb_test/tests. This post records the workflow and shares the resulting statistics.
Let me paste the code first:
from PDBParseBase import PDBParserBase
from DBParser import ParserBase
import os
import json
import datetime
import pickle


def length_counter(seqres_info):
    """Return [pdb_id, total_residue_count] for one SEQRES info dict."""
    pdb_id = seqres_info['pdb_id']
    number = 0
    for item in seqres_info.keys():
        if item != 'pdb_id' and item != 'SEQRES_serNum':
            number += int(seqres_info[item]['SEQRES_numRes'])
    id_and_length = [pdb_id, number]
    return id_and_length


def find_all_headers(rootdir, saveFilePath, saveFilePath2):
    """Walk rootdir, collect the HEADER info of every PDB file (via PDBParserBase)
    and pickle the whole list; filenames that fail to parse go into a second pickle."""
    pdbbase = PDBParserBase()
    start = datetime.datetime.now()
    count = 0
    f = open(saveFilePath, 'wb')
    f2 = open(saveFilePath2, 'wb')
    header_info_all = []
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            count = count + 1
            try:
                header_info = pdbbase.get_header_info(os.path.join(parent, filename))
                header_info_all.append(header_info)
                if count % 1000 == 0:
                    print(count)
                    end = datetime.datetime.now()
                    print(end - start)
            except:
                # Parsing failed: report it and remember the offending filename (e.g. pdb1ov7.ent).
                print(filename)
                end = datetime.datetime.now()
                print(end - start)
                pickle.dump(filename, f2)
            else:
                if count % 1111 == 0:
                    print(count)
    # protocol=2 keeps the pickle readable from Python 2 as well.
    pickle.dump(header_info_all, f, protocol=2)
    end = datetime.datetime.now()
    print(end - start)
    print("Done")
    return header_info_all


def hd_ctr_clsfcsh(listpath):
    """header_counter_classification: return (classification, count) pairs,
    largest first, for every classification containing 'MEMBRAN'."""
    with open(listpath, 'rb') as f:
        header_list = pickle.load(f)
    dic = {}
    for item in header_list:
        classification = item['HEADER_classification']
        print(classification)
        if 'MEMBRAN' in classification:
            if classification in dic.keys():
                dic[classification] = dic[classification] + 1
            else:
                dic[classification] = 1
    sorted_counts = sorted(dic.items(), key=lambda d: d[1], reverse=True)
    print(sorted_counts)
    return sorted_counts


def get_filenames(listpath, keyword, resultpath):
    """Collect the pdb_id of every entry whose classification contains keyword
    and pickle the resulting list to resultpath."""
    with open(listpath, 'rb') as f:
        header_list = pickle.load(f)
    filenames = []
    for item in header_list:
        classification = item['HEADER_classification']
        if keyword in classification:
            pdb_id = item['pdb_id']
            filenames.append(pdb_id)
    print(len(filenames))
    print(filenames)
    f2 = open(resultpath, 'wb')
    pickle.dump(filenames, f2, protocol=2)
    print("done")
    return filenames


if __name__ == "__main__":
    rootdir = "/home/BiodataTest/updb"
    # saveFilePath stores the header list; saveFilePath2 stores the filenames that failed to parse.
    saveFilePath = "/home/BiodataTest/test_picale/header_counter.pic"
    saveFilePath2 = "/home/BiodataTest/test_picale/header_counter_wrang.pic"
    #all_header_list = find_all_headers(rootdir, saveFilePath, saveFilePath2)
    HEADER_LIST_FILE = "/home/BiodataTest/test_picale/header_counter_list.pic"
    hd_ctr_clsfcsh_dic = hd_ctr_clsfcsh(HEADER_LIST_FILE)
    resultpath = "/home/BiodataTest/test_picale/Membrane_Filename_list.pic"
    #wanted_filenames = get_filenames(HEADER_LIST_FILE, 'MEMBRANE', resultpath)
I wrote this code yesterday and had already forgotten it by the next day.
Function one: find_all_headers(rootdir, saveFilePath, saveFilePath2). Given a root directory, it walks the PDB files and extracts every "HEADER" record, which contains the file date, the protein classification, and the protein name, and saves everything to a pickle file as a list. The one tricky point is the third argument of pickle.dump, which has to be set to protocol=2.
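PDBParserBase.get_header_info itself is not shown in this post. As a rough idea of what it has to extract, a HEADER record is a fixed-column line, so a minimal stand-alone sketch could look like this (the sample line, the file name, and the HEADER_depDate key are made up for illustration; HEADER_classification and pdb_id mirror the keys used in the script above):

def parse_header_line(line):
    """Parse one fixed-column 'HEADER' record from a PDB file."""
    # Example record (illustrative only):
    # HEADER    MEMBRANE PROTEIN                        10-JUL-18   1ABC
    return {
        'HEADER_classification': line[10:50].strip(),  # columns 11-50: classification
        'HEADER_depDate': line[50:59].strip(),         # columns 51-59: deposition date (key name assumed)
        'pdb_id': line[62:66].strip(),                 # columns 63-66: PDB ID code
    }

with open("pdb1abc.ent") as fh:  # hypothetical uncompressed PDB file
    for line in fh:
        if line.startswith("HEADER"):
            print(parse_header_line(line))
            break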
Function two: hd_ctr_clsfcsh(listpath), short for header_counter_classification. It takes the list produced above and counts how many entries fall under each classification. The output pairs each classification name with the number of entries in that classification, sorted from most to least common.
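The same tally could also be written with collections.Counter; a minimal equivalent sketch, assuming the same pickled header list as input:

import pickle
from collections import Counter

def count_classifications(listpath, keyword='MEMBRAN'):
    """Count how many HEADER entries fall under each classification containing keyword."""
    with open(listpath, 'rb') as f:
        header_list = pickle.load(f)
    counter = Counter(item['HEADER_classification'] for item in header_list
                      if keyword in item['HEADER_classification'])
    return counter.most_common()  # list of (classification, count), largest first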
Function three: get_filenames(listpath, keyword, resultpath). It collects the file names whose classification contains a given keyword. Here I pick out every protein whose classification contains "MEMBRANE", gather the names into a list, and write the list to a file. (The protein name is the same as the file name.)
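Called on its own, the function would be used roughly like this (paths taken from the __main__ block above; this snippet only illustrates the call):

HEADER_LIST_FILE = "/home/BiodataTest/test_picale/header_counter_list.pic"
resultpath = "/home/BiodataTest/test_picale/Membrane_Filename_list.pic"

# Collect every PDB ID whose classification mentions MEMBRANE and pickle the list.
membrane_ids = get_filenames(HEADER_LIST_FILE, 'MEMBRANE', resultpath)
print(len(membrane_ids))   # number of membrane-related entries
print(membrane_ids[:10])   # peek at the first few IDs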
That is all for the dry code; now for something nicer to look at: a big pie chart.
This pie chart shows the distribution of protein classifications in the PDB database. Strictly speaking it is not accurate: some proteins belong to both the membrane-protein and the transport-protein classes, but since they are annotated as "MEMBRANE PROTEIN, TRANSPORT PROTEIN", we simply count that as a category of its own.
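The chart itself is an image in the original post and the plotting code is not part of the scripts above; as a rough sketch of how such a pie chart could be drawn from the pickled header list (matplotlib is assumed to be available, and the top-15 cutoff and output file name are arbitrary choices):

import pickle
import matplotlib.pyplot as plt

# Reuse the pickled header list and count every classification
# (hd_ctr_clsfcsh above only keeps the ones containing 'MEMBRAN').
with open("/home/BiodataTest/test_picale/header_counter_list.pic", 'rb') as f:
    header_list = pickle.load(f)

counts = {}
for item in header_list:
    cls = item['HEADER_classification']
    counts[cls] = counts.get(cls, 0) + 1

# Keep the largest slices and lump the long tail into "OTHER" so the chart stays readable.
pairs = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
top, rest = pairs[:15], pairs[15:]
labels = [name for name, _ in top] + ["OTHER"]
sizes = [num for _, num in top] + [sum(num for _, num in rest)]

plt.figure(figsize=(8, 8))
plt.pie(sizes, labels=labels, autopct="%1.1f%%")
plt.title("PDB HEADER classification distribution")
plt.savefig("classification_pie.png")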
We counted 140946 files in total, which fall into 96 (x ≥ 100) + 400 (100 > x > 9) + 1606 (10 > x > 1) + 1863 (x = 1) = 2965 classifications, where x is the number of files in a classification. Let's take the top 1% and see what they are:
Here they are. Together they account for 105057 files, 74.53% of the 140946 total. In other words, the top one percent of the classifications holds roughly seventy-five percent of the data. (Which looks rather brutal.)
The second part concerns my own research topic: how many membrane proteins are there, exactly? Well, 1836, which is rather miserable. So I widened the search and extracted everything whose classification contains "MEMBRANE". Note the "E" at the end of that word: without the "E" the count grows by five, because five files are labelled "MEMBRANCE PROTEIN".
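The difference is plain substring matching; a toy example with made-up classification strings shows why dropping the final "E" also picks up the misspelled entries:

classifications = ["MEMBRANE PROTEIN", "MEMBRANCE PROTEIN", "TRANSPORT PROTEIN"]
print(sum("MEMBRANE" in c for c in classifications))  # 1 -- misses the misspelled entry
print(sum("MEMBRAN" in c for c in classifications))   # 2 -- also matches "MEMBRANCE"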
Once they were found, we had 2335 membrane-protein file names in total; the next step is to uncompress those files and put them into the intended folder.
from PDBParseBase import PDBParserBase
from DBParser import ParserBase
#import DataStorage as DS
import os
import json
import time
import datetime
import logging
import gzip
import configparser
import pickle


def UnzipFile(fileName, unzipFileName):
    """Uncompress a gzipped PDB file to unzipFileName."""
    try:
        zipHandle = gzip.GzipFile(mode='rb', fileobj=open(fileName, 'rb'))
        with open(unzipFileName, 'wb') as fileHandle:
            fileHandle.write(zipHandle.read())
    except IOError:
        raise IOError("Unzip file failed!")


def mkdir(path):
    """Create the uncompress target folder if it does not exist yet."""
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
        print(path + " Created folder successfully!")
        return True
    else:
        #print("this path already exists")
        return False


if __name__ == "__main__":
    #rootdir = "/home/BiodataTest/updb"
    #rootdir = "/home/BiodataTest/pdb"
    rootdir = "/home/BiodataTest/membraneprotein"
    count = 0
    countcon = 0
    start = datetime.datetime.now()
    saveFilePath = "/home/BiodataTest/picale_zyh_1000/picale.pic"
    #Storage = DS.DataStorage('PDB_20180410')

    # Load the pickled list of membrane-protein PDB IDs produced by get_filenames().
    mempt_path = "/home/BiodataTest/test_picale/Membrane_Filename_list.pic"
    with open(mempt_path, 'rb') as f:
        memprtn_file = pickle.load(f)
    print(memprtn_file)

    count = 0
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            count = count + 1
            # File names look like pdb1abc.ent.gz: characters 4-5 give the two-letter
            # subdirectory, characters 3-6 give the PDB ID.
            dirname = filename[4:6]
            filename_for_membrane_1 = filename[3:7]
            filename_for_membrane = filename_for_membrane_1.upper()
            print(filename_for_membrane)
            if filename_for_membrane in memprtn_file:
                print(filename_for_membrane)
                filename_with_rootdir = "/home/BiodataTest/pdb/pdb/" + str(dirname) + "/" + str(filename)
                unzipFileName = "/home/BiodataTest/membraneprotein/" + str(dirname) + "/" + str(filename)
                mkdir("/home/BiodataTest/membraneprotein/" + str(dirname) + "/")
                try:
                    # Strip the trailing ".gz" from the output name.
                    UnzipFile(filename_with_rootdir, unzipFileName[:-3])
                except:
                    continue

    end = datetime.datetime.now()
    print("alltime = ")
    print(end - start)
    print(count)
    print("Done")
And with that, we have the membrane proteins.
On line 30, the extra-letter variant "MEMBRANCE" sits right there in plain sight... Writing up the work properly is something that needs to be done as well; otherwise I would probably forget it later.