六框翻譯-生信人必練的200個數據處理任務-生信技能樹 http://www.biotrainee.com/thread-1444-1-1.html(出處: 生信技能樹)
密碼子是按3個鹼基為一組進行翻譯的,因此從第一位開始翻譯會獲得一個氨基酸序列,從第二位開始翻譯會獲得一個不一樣的氨基酸序列,從第三位開始又會獲得另一個不一樣的序列。從第四位開始翻譯,得到的序列就會和從第一位開始翻譯的序列相同(因為位移正好是3的倍數,兩者處於同一個閱讀框,只是少了第一個氨基酸)。因此對於單鏈,會有3種翻譯方式;同樣的情況在互補鏈上也會有3種,所以一共有六種翻譯方式。讀出這六條序列,即6框翻譯。 關於真核生物翻譯的起始: 首先是核糖體40S小亞基以及一些真核翻譯起始因子結合mRNA的5'帽結構形成複合體,之後就會向下游滑動,尋找AUG。翻譯起始還要受到AUG附近序列的控制,比如Kozak序列:CC(A/G)CCAUGG。如果找到了AUG,並且附近的序列又比較適合起始,那麼翻譯就開始了。
# Six-frame translation of a DNA sequence given as the first command-line
# argument.
#
# Usage:  perl six_frame.pl DNA_SEQUENCE
#
# Output: six tab-separated lines, "+1".."+3" for the three reading frames
# of the sequence as given and "-1".."-3" for the three reading frames of
# its reverse complement.
use strict;
use warnings;

# Standard genetic code: upper-case DNA codon => one-letter amino acid.
# '*' marks a stop codon.
my %codon = (
    'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',    # Serine
    'TTC' => 'F', 'TTT' => 'F',                                # Phenylalanine
    'TTA' => 'L', 'TTG' => 'L',                                # Leucine
    'TAC' => 'Y', 'TAT' => 'Y',                                # Tyrosine
    'TAA' => '*', 'TAG' => '*',                                # Stop
    'TGC' => 'C', 'TGT' => 'C',                                # Cysteine
    'TGA' => '*',                                              # Stop
    'TGG' => 'W',                                              # Tryptophan
    'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L',    # Leucine
    'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',    # Proline
    'CAC' => 'H', 'CAT' => 'H',                                # Histidine
    'CAA' => 'Q', 'CAG' => 'Q',                                # Glutamine
    'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',    # Arginine
    'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                  # Isoleucine
    'ATG' => 'M',                                              # Methionine
    'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',    # Threonine
    'AAC' => 'N', 'AAT' => 'N',                                # Asparagine
    'AAA' => 'K', 'AAG' => 'K',                                # Lysine
    'AGC' => 'S', 'AGT' => 'S',                                # Serine
    'AGA' => 'R', 'AGG' => 'R',                                # Arginine
    'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',    # Valine
    'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',    # Alanine
    'GAC' => 'D', 'GAT' => 'D',                                # Aspartic Acid
    'GAA' => 'E', 'GAG' => 'E',                                # Glutamic Acid
    'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',    # Glycine
);

# reverse_complement($dna)
# Returns the reverse complement of $dna.  A/C/G/T are complemented in
# either case; every other character is only reversed, not changed.
sub reverse_complement {
    my ($dna) = @_;
    my $revcom = scalar reverse $dna;
    $revcom =~ tr/ACGTacgt/TGCAtgca/;
    return $revcom;
}

# translate_from($dna, $offset)
# Translates $dna codon by codon, starting at the 0-based $offset, and
# returns the protein string.  Trailing bases that do not form a complete
# codon are ignored.
sub translate_from {
    my ($dna, $offset) = @_;

    # The codon table is upper case only, so normalise the input first;
    # otherwise a lower-case sequence would translate to nothing.
    $dna = uc $dna;

    my $protein = '';
    for (my $i = $offset; $i < length($dna) - 2; $i += 3) {
        my $triplet = substr($dna, $i, 3);

        # Emit 'X' for a triplet that is not in the table (e.g. one that
        # contains N) so later residues keep their positions instead of
        # silently shifting left.
        $protein .= exists $codon{$triplet} ? $codon{$triplet} : 'X';
    }
    return $protein;
}

# six_frame_translation($dna)
# Returns six [label, protein] pairs in the order +1, +2, +3, -1, -2, -3.
sub six_frame_translation {
    my ($dna) = @_;
    my $revcom = reverse_complement($dna);

    my @frames;
    for my $offset (0 .. 2) {
        push @frames, [ '+' . ($offset + 1), translate_from($dna, $offset) ];
    }
    for my $offset (0 .. 2) {
        push @frames, [ '-' . ($offset + 1), translate_from($revcom, $offset) ];
    }
    return @frames;
}

# Run the command-line interface only when this file is executed directly,
# not when it is loaded with require/do.
unless (caller) {
    my $dna = shift @ARGV;
    die "Usage: $0 DNA_SEQUENCE\n" unless defined $dna && length $dna;

    for my $frame (six_frame_translation($dna)) {
        my ($label, $protein) = @{$frame};
        print "$label\t$protein\n";
    }
}

1;
注:還可以參照下面的代碼:
# Six-frame translation of the DNA sequence stored in a FASTA-style file.
#
# The script asks for a file name on standard input, reads the sequence from
# that file and prints the protein translation of six reading frames:
# frames 1-3 are read from the sequence as given, frames 4-6 from its
# reverse complement.
use strict;
use warnings;

# Standard genetic code: upper-case DNA codon => one-letter amino acid.
# '_' marks a stop codon.  Built once here instead of on every lookup.
my %genetic_code = (
    'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',    # Serine
    'TTC' => 'F', 'TTT' => 'F',                                # Phenylalanine
    'TTA' => 'L', 'TTG' => 'L',                                # Leucine
    'TAC' => 'Y', 'TAT' => 'Y',                                # Tyrosine
    'TAA' => '_', 'TAG' => '_',                                # Stop
    'TGC' => 'C', 'TGT' => 'C',                                # Cysteine
    'TGA' => '_',                                              # Stop
    'TGG' => 'W',                                              # Tryptophan
    'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L',    # Leucine
    'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',    # Proline
    'CAC' => 'H', 'CAT' => 'H',                                # Histidine
    'CAA' => 'Q', 'CAG' => 'Q',                                # Glutamine
    'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',    # Arginine
    'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                  # Isoleucine
    'ATG' => 'M',                                              # Methionine
    'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',    # Threonine
    'AAC' => 'N', 'AAT' => 'N',                                # Asparagine
    'AAA' => 'K', 'AAG' => 'K',                                # Lysine
    'AGC' => 'S', 'AGT' => 'S',                                # Serine
    'AGA' => 'R', 'AGG' => 'R',                                # Arginine
    'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',    # Valine
    'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',    # Alanine
    'GAC' => 'D', 'GAT' => 'D',                                # Aspartic Acid
    'GAA' => 'E', 'GAG' => 'E',                                # Glutamic Acid
    'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',    # Glycine
);

# main()
# Reads the sequence and prints the six translations, wrapped at 70
# residues per line.
sub main {
    my @filedata = get_file_data();
    my $dna      = extract_sequence_from_fasta_data(@filedata);
    my @proteins = six_frame_proteins($dna);

    for my $frame (1 .. 6) {
        print "\n---------------------Reading Frame $frame-----------------\n";
        print_sequence($proteins[ $frame - 1 ], 70);
    }
    return;
}

# six_frame_proteins($dna)
# Returns the six translations of $dna as a list: frames 1-3 of the given
# strand followed by frames 1-3 of its reverse complement (reported as
# frames 4-6).
sub six_frame_proteins {
    my ($dna) = @_;

    # Frames 4-6 must be read from the opposite strand.  Reading the
    # forward strand at offsets 4-6 would merely repeat frames 1-3 with
    # the first codon removed.
    my $revcom = revcom($dna);

    return (
        ( map { translate_frame($dna,    $_) } 1 .. 3 ),
        ( map { translate_frame($revcom, $_) } 1 .. 3 ),
    );
}

# revcom($dna)
# Returns the reverse complement of $dna.  A/C/G/T are complemented in
# either case; every other character is only reversed, not changed.
sub revcom {
    my ($dna) = @_;
    my $revcom = scalar reverse $dna;
    $revcom =~ tr/ACGTacgt/TGCAtgca/;
    return $revcom;
}

# get_file_data()
# Prompts for a file name on STDIN and returns all lines of that file.
# Dies if no name can be read or the file cannot be opened.
sub get_file_data {
    print "please input the Path just like this f:\\\\perl\\\\data.txt\n";

    my $dna_filename = <STDIN>;
    die "no file name given!\n" unless defined $dna_filename;
    chomp $dna_filename;

    # Three-argument open with a lexical handle: the name is never
    # interpreted as an open mode, and the handle cannot leak globally.
    open(my $fh, '<', $dna_filename)
        or die "can not open the file '$dna_filename': $!\n";
    my @filedata = <$fh>;
    close $fh;

    return @filedata;
}

# extract_sequence_from_fasta_data(@lines)
# Returns the sequence contained in @lines as one string without any
# whitespace.  Blank lines, lines whose first non-blank character is '#',
# and lines starting with '>' (FASTA headers) are skipped; all remaining
# lines are concatenated.
sub extract_sequence_from_fasta_data {
    my (@fasta_file_data) = @_;

    my $sequence = '';
    for my $line (@fasta_file_data) {
        next if $line =~ /^\s*$/;     # blank line
        next if $line =~ /^\s*#/;     # comment line
        next if $line =~ /^>/;        # FASTA header line
        $sequence .= $line;
    }
    $sequence =~ s/\s//g;

    return $sequence;
}

# print_sequence($sequence, $length)
# Prints $sequence in lines of at most $length characters.
sub print_sequence {
    my ($sequence, $length) = @_;

    # A non-positive width would never advance the position below.
    die "print_sequence: line length must be positive\n" unless $length > 0;

    for (my $pos = 0; $pos < length($sequence); $pos += $length) {
        print substr($sequence, $pos, $length), "\n";
    }
    return;
}

# codon2aa($codon)
# Returns the one-letter amino acid for a three-base DNA codon, matched
# case-insensitively.  Dies with a "Bad codon" message when the codon is
# not in the genetic code table.
sub codon2aa {
    my ($codon) = @_;

    $codon = uc $codon;
    return $genetic_code{$codon} if exists $genetic_code{$codon};

    # die (rather than a plain exit) so the failure reaches the caller as
    # an error and the process ends with a non-zero status.
    die "Bad codon \"$codon\"!!\n";
}

# dna2peptide($dna)
# Translates $dna from its first base, codon by codon, and returns the
# protein string.  Trailing bases that do not form a complete codon are
# ignored.
sub dna2peptide {
    my ($dna) = @_;

    my $protein = '';
    for (my $i = 0; $i < length($dna) - 2; $i += 3) {
        $protein .= codon2aa(substr($dna, $i, 3));
    }
    return $protein;
}

# translate_frame($seq, $start, $end)
# Translates the part of $seq from the 1-based position $start up to and
# including the 1-based position $end.  $end defaults to the end of the
# sequence when omitted (or false).
sub translate_frame {
    my ($seq, $start, $end) = @_;

    $end = length($seq) unless $end;

    # Nothing to translate when the frame starts beyond the sequence;
    # also avoids a "substr outside of string" warning.
    return '' if $start > length($seq);

    return dna2peptide(substr($seq, $start - 1, $end - $start + 1));
}

# Run the interactive program only when this file is executed directly,
# not when it is loaded with require/do.
main() unless caller;

1;