六框翻譯

 

六框翻譯-生信人必練的200個數據處理任務-生信技能樹  http://www.biotrainee.com/thread-1444-1-1.html(出處: 生信技能樹)
密碼子是按3個鹼基翻譯的,
因此從第一位開始翻譯會獲得一個氨基酸序列,
從第二位翻譯會獲得一個不一樣的氨基酸序列
從第三位開始又會獲得一個不一樣的序列。
從第四位開始翻譯則與從第一位開始的閱讀框相同(只是少了第一個氨基酸,因為兩者的密碼子劃分相位一致),
因此相對於單鏈,會有3種翻譯的方式,
一樣的狀況在互補鏈上也會有3種,因此就一共有六種翻譯方式。讀出六條序列,即6框翻譯。
關於真核生物翻譯的起始:
首先是核糖體40S小亞基以及一些真核翻譯起始因子結合mRNA的5'帽結構形成複合體,之後沿mRNA向下游滑動(掃描)尋找AUG;翻譯起始還受到AUG附近序列的控制,例如Kozak序列:CCa/gCCAUGG。如果找到了AUG,並且附近的序列又適合起始,那麼翻譯就開始了。
# Six-frame translation of the DNA sequence passed as the first command-line
# argument.  Prints one line per reading frame as "<frame>\t<protein>":
# "+1".."+3" for the given strand and "-1".."-3" for its reverse complement.
use strict;
use warnings;

# Standard genetic code keyed by upper-case DNA codon; '*' marks a stop codon.
my %codon = (
    'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',    # Serine
    'TTC' => 'F', 'TTT' => 'F',                                # Phenylalanine
    'TTA' => 'L', 'TTG' => 'L',                                # Leucine
    'TAC' => 'Y', 'TAT' => 'Y',                                # Tyrosine
    'TAA' => '*', 'TAG' => '*',                                # Stop
    'TGC' => 'C', 'TGT' => 'C',                                # Cysteine
    'TGA' => '*',                                              # Stop
    'TGG' => 'W',                                              # Tryptophan
    'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L',    # Leucine
    'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',    # Proline
    'CAC' => 'H', 'CAT' => 'H',                                # Histidine
    'CAA' => 'Q', 'CAG' => 'Q',                                # Glutamine
    'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',    # Arginine
    'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                  # Isoleucine
    'ATG' => 'M',                                              # Methionine
    'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',    # Threonine
    'AAC' => 'N', 'AAT' => 'N',                                # Asparagine
    'AAA' => 'K', 'AAG' => 'K',                                # Lysine
    'AGC' => 'S', 'AGT' => 'S',                                # Serine
    'AGA' => 'R', 'AGG' => 'R',                                # Arginine
    'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',    # Valine
    'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',    # Alanine
    'GAC' => 'D', 'GAT' => 'D',                                # Aspartic Acid
    'GAA' => 'E', 'GAG' => 'E',                                # Glutamic Acid
    'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',    # Glycine
);

my $dna = shift @ARGV;
die "Usage: $0 <DNA sequence>\n" unless defined $dna && length $dna;

# The table only has upper-case keys, so normalise the input once; lower-case
# input would otherwise translate to an empty protein.
$dna = uc $dna;

# Translate $sequence codon by codon starting at the 0-based $offset.
# A trailing partial codon is ignored.  Triplets that are not in the table
# (for example ones containing N) become 'X' so that the residues after them
# keep their positions instead of silently shifting left.
my $translate = sub {
    my ($sequence, $offset) = @_;
    my $protein = '';
    for (my $i = $offset; $i < length($sequence) - 2; $i += 3) {
        my $triplet = substr $sequence, $i, 3;
        $protein .= exists $codon{$triplet} ? $codon{$triplet} : 'X';
    }
    return $protein;
};

# Forward strand: frames +1, +2 and +3.
for my $frame (1 .. 3) {
    print "+$frame\t", $translate->($dna, $frame - 1), "\n";
}

# Reverse complement: reverse the string, then swap every base for its partner.
my $revcom = reverse $dna;
$revcom =~ tr/ACGT/TGCA/;

# Reverse strand: frames -1, -2 and -3.
for my $frame (1 .. 3) {
    print "-$frame\t", $translate->($revcom, $frame - 1), "\n";
}

 

注:還可以參照下面的代碼:

 

  1 use strict;  
  2 use warnings;  
  3   
  4   
  5 my $dna      ='';  
  6 my $protein  ='';  
  7 my @file_data=( );  
  8 my @filedata;
  9 my $revcom='';
 10  
 11  
 12 #打開文件 
 13 @filedata  = get_file_data();
 14 #獲得序列
 15 $dna       = extract_sequence_from_fasta_data(@filedata);  
 16  
 17 #六框閱讀翻譯
 18  
 19 print "\n---------------------Reading Frame 1-----------------\n";
 20 $protein=translate_frame($dna,1);
 21 print_sequence($protein,70);
 22  
 23 print "\n---------------------Reading Frame 2-----------------\n";
 24 $protein=translate_frame($dna,2);
 25 print_sequence($protein,70);
 26  
 27 print "\n---------------------Reading Frame 3-----------------\n";
 28 $protein=translate_frame($dna,3);
 29 print_sequence($protein,70);
 30  
 31 print "\n---------------------Reading Frame 4-----------------\n";
 32 $protein=translate_frame($dna,4);
 33 print_sequence($protein,70);
 34  
 35 print "\n---------------------Reading Frame 5-----------------\n";
 36 $protein=translate_frame($dna,5);
 37 print_sequence($protein,70);
 38  
 39 print "\n---------------------Reading Frame 6-----------------\n";
 40 $protein=translate_frame($dna,6);
 41 print_sequence($protein,70);
 42  
 43 sub get_file_data
 44 {  
 45     # A subroutine to get data from a file given its filename
 46     #讀取文件的子序列
 47     my $dna_filename;
 48     my @filedata;
 49     print "please input the Path just like this f:\\\\perl\\\\data.txt\n";   
 50     chomp($dna_filename=<STDIN>); 
 51     open(DNAFILENAME,$dna_filename)||die("can not open the file!");    
 52     @filedata     = <DNAFILENAME>;  
 53     close DNAFILENAME;  
 54     return @filedata;#子函數的返回值必定要記住寫
 55 }
 56  
 57 sub extract_sequence_from_fasta_data  
 58 {  
 59     #*******************************************************************  
 60     # A subroutine to extract FASTA sequence data from an array  
 61     # 獲得其中的序列  
 62     # fasta格式介紹:  
 63     # 包括三個部分  
 64     # 1.第一行中以>開頭的註釋行,後面是名稱和序列的來源  
 65     # 2.標準單字母符號的序列  
 66     # 3.*表示結尾  
 67     #*******************************************************************  
 68   
 69     my (@fasta_file_data) =@_;  
 70     my $sequence =' ';  
 71     foreach my $line (@fasta_file_data)  
 72     {  
 73         #這裏忽略空白行  
 74         if ($line=~/^\s*$/)  
 75         {  
 76             next;  
 77         }  
 78         #忽略註釋行  
 79         elsif($line=~/^\s*#/)  
 80         {  
 81             next;  
 82         }  
 83         #忽略fasta的第一行  
 84         elsif($line=~/^>/)  
 85         {  
 86             next;  
 87         }  
 88         else  
 89         {  
 90             $sequence .=$line;  
 91         }  
 92     }  
 93     $sequence=~s/\s//g;  
 94     return $sequence;  
 95 }  
 96   
 97 sub print_sequence  
 98 {  
 99     # A subroutine to format and print sequence data  
100     my ($sequence, $length) = @_;  
101     for (my $pos =0; $pos<length($sequence);$pos+=$length)  
102     {  
103         print substr($sequence,$pos,$length),"\n";  
104     }  
105 }  
106   
107      
108   
# codon2aa($codon)
#
# Translates one three-letter DNA codon into its one-letter amino acid code
# by hash lookup (version 3 of this routine).  The codon is upper-cased
# first, so the lookup is case-insensitive.  Stop codons translate to '_'.
#
# Dies with the message 'Bad codon "XXX"!!' when the codon is not in the
# table.  Using die rather than a bare exit makes the program finish with a
# non-zero status and lets a caller trap the error with eval.
#
# The table lives in a BEGIN block so that it is built exactly once, at
# compile time, and is already filled when code earlier in the file calls
# this sub; it is not rebuilt for every codon.
BEGIN {
    my %genetic_code = (
        'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',    # Serine
        'TTC' => 'F', 'TTT' => 'F',                                # Phenylalanine
        'TTA' => 'L', 'TTG' => 'L',                                # Leucine
        'TAC' => 'Y', 'TAT' => 'Y',                                # Tyrosine
        'TAA' => '_', 'TAG' => '_',                                # Stop
        'TGC' => 'C', 'TGT' => 'C',                                # Cysteine
        'TGA' => '_',                                              # Stop
        'TGG' => 'W',                                              # Tryptophan
        'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L',    # Leucine
        'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',    # Proline
        'CAC' => 'H', 'CAT' => 'H',                                # Histidine
        'CAA' => 'Q', 'CAG' => 'Q',                                # Glutamine
        'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',    # Arginine
        'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                  # Isoleucine
        'ATG' => 'M',                                              # Methionine
        'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',    # Threonine
        'AAC' => 'N', 'AAT' => 'N',                                # Asparagine
        'AAA' => 'K', 'AAG' => 'K',                                # Lysine
        'AGC' => 'S', 'AGT' => 'S',                                # Serine
        'AGA' => 'R', 'AGG' => 'R',                                # Arginine
        'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',    # Valine
        'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',    # Alanine
        'GAC' => 'D', 'GAT' => 'D',                                # Aspartic Acid
        'GAA' => 'E', 'GAG' => 'E',                                # Glutamic Acid
        'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',    # Glycine
    );

    sub codon2aa {
        my ($codon) = @_;

        # uc = upper-case; the table keys are all upper-case.
        $codon = uc $codon;

        return $genetic_code{$codon} if exists $genetic_code{$codon};

        die "Bad codon \"$codon\"!!\n";
    }
}
204   
# dna2peptide($dna)
#
# Translates $dna into a peptide string by passing each consecutive,
# non-overlapping three-letter codon (starting at the first character) to
# codon2aa.  One or two leftover characters at the end are ignored.
# Returns the peptide; always return it explicitly, otherwise callers get
# nothing useful.
sub dna2peptide {
    my ($dna) = @_;

    my $codon_count = int(length($dna) / 3);    # whole codons only
    my $peptide     = '';

    for my $n (0 .. $codon_count - 1) {
        $peptide .= codon2aa(substr($dna, 3 * $n, 3));
    }

    return $peptide;
}
215  
# translate_frame($seq, $start, $end)
#
# Translates the part of $seq that runs from the 1-based position $start to
# the 1-based position $end (inclusive) by handing it to dna2peptide.
# When $end is omitted (or false) the sequence is read to its last character.
sub translate_frame {
    my ($seq, $start, $end) = @_;

    $end ||= length($seq);

    my $first_index = $start - 1;             # substr is 0-based
    my $span        = $end - $first_index;    # same as $end - $start + 1

    return dna2peptide(substr($seq, $first_index, $span));
}
相關文章
相關標籤/搜索