在工作當中,有時候我們知道染色體編號以及染色體的起始、終止座標,想知道這段序列對應的鹼基是什麼。
其一,我們通常會到 UCSC 的 Genome Browser 裏面去查詢;其實也可以透過 UCSC 的接口解析網頁,然後再提取序列信息。
比如 chr17:7676091,7676196,那麼只需要構造下面這樣一個網頁地址: http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment=chr17:7676091,7676196
其中 hg38 可以更換成 hg19,dna?segment= 後面的內容可以按照標準格式(chr染色體編號:起始,終止)更換,就可以返回我們想要的序列了。網頁返回的是一個 XML 格式的信息,現在用 Python 解析一下:
"""Fetch a DNA segment from the UCSC DAS server and write it to .xls files.

The script requests ``chr<N>:<start>,<end>`` from
``http://genome.ucsc.edu/cgi-bin/das/<genome>/dna``, extracts the header
fields and the sequence from the returned XML, writes the sequence to a
workbook 50 bases per row, and then saves a second workbook in which a row
range is enclosed in square brackets.
"""
import os
import re
import sys

import requests
import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlutils.copy import copy

# Working directory at start-up; the sequence workbook is written here.
cwd = os.getcwd()

# Number of bases written into each spreadsheet row.
_BASES_PER_ROW = 50

# Patterns for the two header fragments kept from the DAS response.
_SEQUENCE_HEADER_RE = re.compile(r'SEQUENCE([\s\S]*?version="1.00")')
_DNA_LENGTH_RE = re.compile(r'DNA.*?length="(\d*)"')


def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request.

    Returns:
        The decoded response body, or ``""`` when the request fails
        (connection problem, timeout, or an HTTP error status). The failure
        reason is reported on stderr.
    """
    print("111111")
    header = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: keep the "empty string on failure" contract, but only
        # for request errors and without hiding why the request failed.
        print(f"request failed: {exc}", file=sys.stderr)
        return ""
    r.encoding = r.apparent_encoding
    print("get 222222222222222")
    return r.text


def fillDNAList(dnalist, html):
    """Extract header fields and the sequence from a DAS ``dna`` response.

    Three strings are appended to *dnalist*, in this order: the matched
    ``SEQUENCE ... version="1.00"`` header text, the matched
    ``DNA ... length="N"`` text, and the sequence itself with whitespace
    removed and converted to upper case.

    Args:
        dnalist: List to append to; it is modified in place.
        html: Response text as returned by :func:`getHTMLText`.

    Returns:
        The same *dnalist* object.

    Raises:
        ValueError: If *html* lacks either header fragment or a ``<DNA>``
            element containing text (for example when the download failed
            and *html* is empty). Nothing is appended in that case.
    """
    # Locate the header fragments with regular expressions.
    sequence_header = _SEQUENCE_HEADER_RE.search(html)
    length_header = _DNA_LENGTH_RE.search(html)
    if sequence_header is None or length_header is None:
        raise ValueError(
            "response does not contain the expected SEQUENCE/DNA headers")
    print("match ok")

    # Read the text content of the <DNA> element with BeautifulSoup.
    print("BeautifulSoup tag屬性 獲取dna標籤屬性的字符串部分")
    soup = BeautifulSoup(html, 'html.parser')
    dna_tag = soup.dna
    if dna_tag is None or dna_tag.string is None:
        raise ValueError(
            "response does not contain a <DNA> element with sequence text")

    # Drop line breaks (and any other whitespace) and upper-case the bases.
    seq = "".join(dna_tag.string.split()).upper()

    dnalist.extend([sequence_header.group(), length_header.group(), seq])
    print("final======>", dnalist)
    return dnalist


def write_excell(dnalist, chrnum, pos, genome='hg19'):
    """Write the sequence to ``chrmosome<chrnum>_<pos>.xls``.

    Row 0 of the sheet holds a ``>genome header`` line; the sequence follows
    from row 1 onwards, 50 bases per row, all in column 0.

    Args:
        dnalist: ``[sequence header, length header, sequence]`` as produced
            by :func:`fillDNAList`.
        chrnum: Chromosome identifier, used in the file name.
        pos: Position, used in the file name.
        genome: Assembly name written into the header line.

    Returns:
        Path of the saved workbook inside ``cwd``.

    Raises:
        ValueError: If *dnalist* has fewer than three entries.
    """
    if len(dnalist) < 3:
        raise ValueError(
            "dnalist must hold the sequence header, length header and sequence")
    sequence_header, length_header, dna = dnalist[:3]
    head = '>' + genome + ' ' + sequence_header + length_header

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test8', cell_overwrite_ok=True)

    # Header line first, then the sequence in fixed-width chunks.
    sheet.write(0, 0, head)
    print('=====', dna, type(dna))
    offsets = range(0, len(dna), _BASES_PER_ROW)
    for row, offset in enumerate(offsets, start=1):
        sheet.write(row, 0, dna[offset:offset + _BASES_PER_ROW])

    out_file = 'chrmosome%s_%s.xls' % (chrnum, pos)
    out_file_dir = os.path.join(cwd, out_file)
    # Save to the very path that is returned so the two cannot diverge.
    book.save(out_file_dir)
    return out_file_dir


def modify_excell(out_file_dir, chrnum, pos, first_row=10, last_row=11):
    """Bracket a row range and save a copy as ``new_chrmosome<chrnum>_<pos>.xls``.

    ``[`` is prepended to column 0 of *first_row* and ``]`` is appended to
    column 0 of *last_row* (0-based row indexes; the defaults are the 11th
    and 12th sheet rows). The input workbook is left unchanged.

    Args:
        out_file_dir: Path of the workbook written by :func:`write_excell`.
        chrnum: Chromosome identifier, used in the output file name.
        pos: Position, used in the output file name.
        first_row: Index of the row that receives the opening bracket.
        last_row: Index of the row that receives the closing bracket.

    Raises:
        ValueError: If the row range does not lie inside the first sheet.
    """
    rb = xlrd.open_workbook(out_file_dir)
    sheet = rb.sheet_by_index(0)
    if not 0 <= first_row <= last_row < sheet.nrows:
        raise ValueError(
            "rows %s-%s are outside the sheet (%s rows)"
            % (first_row, last_row, sheet.nrows))

    first_value = sheet.cell_value(first_row, 0)
    last_value = sheet.cell_value(last_row, 0)

    # xlrd workbooks are read-only; edit a writable copy instead.
    wb = copy(rb)
    ws = wb.get_sheet(0)
    if first_row == last_row:
        ws.write(first_row, 0, '[' + first_value + ']')
    else:
        ws.write(first_row, 0, '[' + first_value)
        ws.write(last_row, 0, last_value + ']')

    modify_file = 'new_chrmosome%s_%s.xls' % (chrnum, pos)
    wb.save(modify_file)


def main():
    """Fetch 500 bases either side of chr17:7676091 and write both workbooks."""
    # NOTE(review): the text introducing this script builds its example URL
    # with hg38 for this same chr17 coordinate - confirm that hg19 is the
    # assembly intended here.
    genome = "hg19"
    chrnum = 17
    pos = 7676091
    flank = 500
    start = pos - flank
    end = pos + flank

    url = f"http://genome.ucsc.edu/cgi-bin/das/{genome}/dna?segment=chr{chrnum}:{start},{end}"
    print(url)

    html = getHTMLText(url)
    dnalist = fillDNAList([], html)
    out_file_dir = write_excell(dnalist, chrnum, pos, genome)
    # With a 500-base flank and 50 bases per row, sequence offset 500 (the
    # base at `pos`) is the first base of sheet row 11, so the default rows
    # 10-11 bracket offsets 450-549.
    modify_excell(out_file_dir, chrnum, pos)


if __name__ == "__main__":
    main()
結果如下:
http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr17:7675591,7676591 match ok BeautifulSoup tag屬性 獲取dna標籤屬性的字符串部分 final======> ['SEQUENCE id="chr17" start="7675591" stop="7676591" version="1.00"', 'DNA length="1001"', 'CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC'] ===== 
CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC <class 'str'>