sub enc
{ &opt_default(output=>'stdout', filename=>'enc.csv', id=>'', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $tag = opt_val("tag");
my %tbox; my $aa; my %relative; my %F; my %cntaa; my %numaa; my %sumF; my %aveF; my $enc;
unless($id){
if($output eq 'stdout'){
msg_send("\nThe effective number of codons (enc)\n",
"enc range from 20 (maximum bias) to 61 (no bias).\n",
"enc\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"enc,$tag\n";
}
my $cds;
foreach $cds ($gb->cds()){
$gb->{$cds}->{enc} = &enc($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds); if($output eq 'stdout'){
msg_send($gb->{$cds}->{enc}, "\t", $gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE $gb->{$cds}->{enc}, ",", $gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
} else {
my $count = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C0',
-translate=>$translate, -del_key=>$del_key);
foreach (keys %{$count}){ $tbox{substr($_, 0, 1)} += $count->{$_}; }
foreach (keys %{$count}){
$aa = substr($_, 0, 1);
if($tbox{$aa} - 1 == 0){ delete $tbox{$aa}; }
else { $relative{$_} = $count->{$_} / $tbox{$aa}; } } foreach (keys %relative){ $F{substr($_, 0, 1)} += $relative{$_} ** 2; } foreach $aa (keys %F){ $F{$aa} = ($tbox{$aa} * $F{$aa} - 1) / ($tbox{$aa} - 1);
}
foreach $aa (keys %F){
if($F{$aa} > 0.0000001){
$sumF{$gb->{degeneracy}->{$aa}} += $F{$aa};
$cntaa{$gb->{degeneracy}->{$aa}} ++;
}
}
foreach (values %{$gb->{degeneracy}}){ $numaa{$_} ++; }
foreach (keys %numaa){
if($_ == 1){ $aveF{$_} = 1; }
elsif($cntaa{$_} && $sumF{$_}){
$aveF{$_} = $sumF{$_} / $cntaa{$_}; }
elsif($_ == 3 && $sumF{2} && $sumF{4}){
$aveF{3} = ($sumF{2} / $cntaa{2} + $sumF{4} / $cntaa{4}) / 2; }
else { $enc = -1; }
}
unless($enc){ foreach (keys %numaa){ $enc += $numaa{$_} / $aveF{$_}; }} #return ($enc > 61) ? 61.00 : sprintf "%.2f", $enc; if($enc < 0){ $enc = undef; }
elsif($enc > 61){ $enc = 61; }
else { $enc = sprintf "%.2f", $enc; }
return $enc;
}
}
=head2 cbi
Name: cbi - calculates the codon bias index (CBI)
Description:
This program calculates the codon bias index (CBI).
Usage:
scalar = &cbi(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'cbi.csv')
-id feature ID or gene group ID ('All') (default: '' for each of all genes)
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120620 added recursive call
20030401 initial posting
References:
Comeron JM, Aguade M. (1998) J Mol Evol. 47(3):268-74.
Morton BR (1993) J.Mol.Evol. 37:273-280.
Requirements: sub codon_compiler
=cut
sub cbi {
&opt_default(output=>'stdout', filename=>'cbi.csv', id=>'', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $tag = opt_val("tag");
unless($id){
if($output eq 'stdout'){
msg_send("\nThe codon bias index (cbi)\n",
"cbi range from 0 (no bias) to 1 (maximum bias).\n",
"cbi\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"cbi,$tag\n";
}
my $cds;
foreach $cds ($gb->cds()){
$gb->{$cds}->{cbi} = &cbi($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds); if($output eq 'stdout'){
msg_send($gb->{$cds}->{cbi}, "\t", $gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE $gb->{$cds}->{cbi}, ",", $gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
} else {
my $aacu = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'A1C4',
-translate=>$translate, -del_key=>$del_key);
my %bias; my $cbi;
foreach (keys %{$gb->{All}->{A1C4}}){
my $aa = substr($_, 0, 1);
$bias{$aa} += ($aacu->{$_} - 1) ** 2 / ($gb->{degeneracy}->{$aa} - 1) if($aacu->{$aa} && length($_) == 4 && $gb->{degeneracy}->{$aa} > 1); }
foreach (keys %bias){ $cbi += $aacu->{$_} * $bias{$_}; }
return sprintf "%.4f", $cbi;
}
}
=head2 scs
Name: scs - calculates the scaled chi-square
Description:
This program calculates the chi-square for deviation from
uniform synonymous codon usage, scaled by gene length.
Usage:
scalar = &scs(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'scs.csv')
-id feature ID or gene group ID ('All') (default: '' for each of all genes)
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120620 added recursive call
20090204 initial posting
References:
Comeron JM, Aguade M. (1998) J Mol Evol. 47(3):268-74.
Shields DC, Sharp PM. (1987) Nucleic Acids Res. 15(19):8023-40.
Requirements: sub codon_compiler
=cut
sub scs {
&opt_default(output=>'stdout', filename=>'scs.csv', id=>'', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $tag = opt_val("tag");
unless($id){
if($output eq 'stdout'){
msg_send("\nThe scaled chi-square (scs)\n",
"scs\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"scs,$tag\n";
}
my $cds;
foreach $cds ($gb->cds()){
$gb->{$cds}->{scs} = &scs($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds); if($output eq 'stdout'){
msg_send($gb->{$cds}->{scs}, "\t", $gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE $gb->{$cds}->{scs}, ",", $gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
} else {
my $observed = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C0',
-translate=>$translate, -del_key=>$del_key);
my %tbox; my $tnc; my %degeneracy;
foreach (keys %{$observed}){ $tbox{substr($_, 0, 1)} += $observed->{$_}; $tnc += $observed->{$_}; $degeneracy{substr($_, 0, 1)} ++; }
my $scs; foreach (sort keys %{$observed}){
my $aa = substr($_, 0, 1);
my $expected = $tbox{$aa} / $gb->{degeneracy}->{$aa}; $scs += ($observed->{$_} - $expected)**2 / $expected; }
return sprintf "%.4f", $scs /= $tnc if($tnc); }
}
=head2 icdi
Name: icdi - calculates the intrinsic codon deviation index (ICDI)
Description:
This program calculates the intrinsic codon deviation index (ICDI).
Usage:
scalar = &icdi(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'icdi.csv')
-id feature ID or gene group ID ('All') (default: '' for each of all genes)
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120620 added recursive call
20030401 initial posting
References:
Comeron JM, Aguade M. (1998) J Mol Evol. 47(3):268-74.
Freire-Picos MA et al. (1994) Gene 139:43-49.
Requirements: sub codon_compiler
=cut
sub icdi {
&opt_default(output=>'stdout', filename=>'icdi.csv', id=>'', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $tag = opt_val("tag");
my $rscu; my %tbox; my %Sk; my $icdi; my $ndaa;
unless($id){
if($output eq 'stdout'){
msg_send("\nThe intrinsic codon deviation index (icdi)\n",
"icdi range from 0 (no bias) to 1 (maximum bias).\n",
"icdi\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"icdi,$tag\n";
}
my $cds;
foreach $cds ($gb->cds()){
$gb->{$cds}->{icdi} = &icdi($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds); if($output eq 'stdout'){
msg_send($gb->{$cds}->{icdi}, "\t", $gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE $gb->{$cds}->{icdi}, ",", $gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
} else {
$rscu = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C3',
-translate=>$translate, -del_key=>$del_key);
foreach (keys %{$gb->{All}->{C3}}){
$tbox{substr($_, 0, 1)} += $rscu->{$_};
}
foreach (keys %{$gb->{All}->{C3}}){
my $aa = substr($_, 0, 1);
$Sk{$aa} += ($rscu->{$_} - 1) ** 2 if($tbox{$aa} > 1);
}
foreach (keys %Sk){
my $k = $gb->{degeneracy}->{$_};
$icdi += $Sk{$_} / ($k * ($k - 1)); $ndaa ++;
}
return sprintf "%.4f", $icdi /= $ndaa if($ndaa); }
}
=head2 fop
Name: fop - calculate the frequency of optimal codons (Fop)
Description:
This program calculates the frequency of optimal codons (Fop), and
inputs in the G instance; i.e. Fop values will be accessible at
$gb->{"FEATURE$i"}->{fop};
Usage:
NULL = &fop(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'fop.csv')
-binary pointer HASH of vectors of binary number, 1 or 0 refering to
the optimal or nonoptimal codon. By default, four amino acids
(Ile, Asn, Phe, and Tyr), where the optimal codons seem to be
the same in all species, are used.
-translate 1 when translates using standard codon table (default: 0)
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20100311 added -tag option
20030401 initial posting
References:
Ikemura (1981) J.Mol.Biol. 151:389-409.
Ikemura (1985) Mol.Biol.Evol. 2(1):13-34.
=cut
sub fop {
&opt_default(output=>'stdout', filename=>'fop.csv', translate=>0, tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my $tag = opt_val("tag");
my %Sc = ('Lcta'=>0,'Lctc'=>0,'Lctg'=>0,'Lctt'=>0,'Ltta'=>0,'Lttg'=>1,
'Raga'=>1,'Ragg'=>0,'Rcga'=>0,'Rcgc'=>0,'Rcgg'=>0,'Rcgt'=>0,
'Sagc'=>0,'Sagt'=>0,'Stca'=>0,'Stcc'=>1,'Stcg'=>0,'Stct'=>1,
'Agca'=>0,'Agcc'=>1,'Agcg'=>0,'Agct'=>1,
'Ggga'=>0,'Gggc'=>1,'Gggg'=>0,'Gggt'=>1,
'Pcca'=>0,'Pccc'=>0,'Pccg'=>0,'Pcct'=>0,
'Taca'=>0,'Tacc'=>1,'Tacg'=>0,'Tact'=>1,
'Vgta'=>0,'Vgtc'=>1,'Vgtg'=>0,'Vgtt'=>1,
'Iata'=>0,'Iatc'=>1,'Iatt'=>1,
'Naac'=>1,'Naat'=>0,
'Dgac'=>1,'Dgat'=>0,
'Ctgc'=>0,'Ctgt'=>0,
'Qcaa'=>0,'Qcag'=>0,
'Egaa'=>1,'Egag'=>0,
'Hcac'=>1,'Hcat'=>0,
'Kaaa'=>0,'Kaag'=>1,
'Fttc'=>1,'Fttt'=>0,
'Ytac'=>1,'Ytat'=>0);
my %Ec = ('Lcta'=>0,'Lctc'=>0,'Lctg'=>1,'Lctt'=>0,'Ltta'=>0,'Lttg'=>0,
'Raga'=>0,'Ragg'=>0,'Rcga'=>0,'Rcgc'=>1,'Rcgg'=>0,'Rcgt'=>1,
'Sagc'=>0,'Sagt'=>0,'Stca'=>0,'Stcc'=>0,'Stcg'=>0,'Stct'=>0,
'Agca'=>1,'Agcc'=>0,'Agcg'=>1,'Agct'=>1,
'Ggga'=>0,'Gggc'=>1,'Gggg'=>0,'Gggt'=>1,
'Pcca'=>0,'Pccc'=>0,'Pccg'=>1,'Pcct'=>0,
'Taca'=>0,'Tacc'=>1,'Tacg'=>0,'Tact'=>1,
'Vgta'=>1,'Vgtc'=>0,'Vgtg'=>1,'Vgtt'=>1, 'Vgta'=>1,'Vgtc'=>0,'Vgtg'=>0,'Vgtt'=>1, 'Iata'=>0,'Iatc'=>1,'Iatt'=>0,
'Naac'=>1,'Naat'=>0,
'Dgac'=>1,'Dgat'=>0,
'Ctgc'=>0,'Ctgt'=>0,
'Qcaa'=>0,'Qcag'=>1,
'Egaa'=>1,'Egag'=>0,
'Hcac'=>1,'Hcat'=>0,
'Kaaa'=>1,'Kaag'=>0,
'Fttc'=>1,'Fttt'=>0,
'Ytac'=>1,'Ytat'=>0);
my %four = ('Iatc'=>1,'Iatt'=>0, 'Naac'=>1,'Naat'=>0,
'Fttc'=>1,'Fttt'=>0,
'Ytac'=>1,'Ytat'=>0);
my $binary = opt_val("binary") ? opt_val("binary") :\% four;
my %degeneracy;
foreach (keys %{$binary}){ $degeneracy{substr($_, 0, 1)} ++; }
if($output eq 'stdout'){
msg_send("\noptimal (1) and nonoptimal (0) codons\n");
foreach (sort keys %$binary){ msg_send("$_\t"); }
msg_send("\n");
foreach (sort keys %$binary){ msg_send("$binary->{$_}\t"); }
msg_send("\n\n",
"frequency of optimal codons (fop)\n",
"Laa: length in amino acids\n",
"Lc: length in codons used\n",
"Laa\tLc\tfop\t$tag\n");
}
if($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"Laa,Lc,fop,$tag\n";
}
foreach my $cds ($gb->cds()){
my $seq = substr($gb->get_geneseq($cds), 3, -3);
my $aaseq;
my $i;
if($translate){ $aaseq = &translate($seq); }
else { $aaseq = substr($gb->{$cds}->{translation}, 1); }
my $Laa = length($aaseq);
my $Lc; my $fop;
$i = 0;
while($aaseq){
my $aa = substr($aaseq, 0, 1);
substr($aaseq, 0, 1) = '';
my $codon = substr($seq, $i, 3);
if($degeneracy{$aa} > 1 && exists $binary->{$aa.$codon}){
$Lc ++;
$fop += $binary->{$aa.$codon};
}
$i += 3;
}
$gb->{$cds}->{fop} = sprintf "%.4f", $fop / $Lc if($Lc); $gb->{$cds}->{Lc} = $Lc;
if($output eq 'stdout'){
msg_send("$Laa\t$Lc\t",
$gb->{$cds}->{fop}, "\t",
$gb->{$cds}->{$tag}, "\n");
}
if($output eq 'f'){
print FILE
"$Laa,$Lc,",
$gb->{$cds}->{fop}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE);
}
=head2 w_value
Name: w_value - calculate the 'relative adaptiveness of each codon' (W)
Description:
This program calculates the 'relative adaptiveness of each codon' (W value)
necessary for CAI analysis. Returned value is a pointer HASH of W values.
Usage:
pointer HASH_w_values = &w_value(G instance);
Options:
-output output option (default: "stdout")
"stdout" for standard output,
"f" for file output in directory "data"
-filename output filename (default: "w_value.csv")
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: "product")
-include regular expression to include genes in a reference set
a reference set in several studies are in-built
1: Nakamura and Tabata, 2: Sharp and Li, 3: Sakai et al.
(default: 1.'ribosomal.*protein')
-exclude regular expression to exclude genes from a reference set
(default: '[Mm]itochondrial')
-sharp set to 1 to use the 40 genes used by Sharp PM et al. (default: 0)
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120701 added -sharp option
20100311 added -tag option
20030401 added -include and -exclude options
20010830 first release (Author: Kazuharu "Gaou" Arakawa)
References:
Sharp PM et al. (2005) Nucleic Acids Res. 33(4):1141-1153
Sakai et al. (2001) J.Mol.Evol. 52:164-170.
Nakamura and Tabata (1997) Microb.Comp.Genomics 2:299-312.
Sharp and Li (1987) Nucleic Acids Res. 15:1281-1295.
=cut
sub w_value {
&opt_default(output=>'stdout', filename=>"w_value.csv", tag=>'product', sharp=>0,
include=>'ribosomal.*protein', exclude=>'[Mm]itochondrial');
my @args=opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $tag = opt_val("tag");
my $include;
my $exclude = opt_val("exclude");
my $sharp = opt_val("sharp");
if(opt_val("include") == 1){ $include = 'ribosomal.*protein'; }
elsif(opt_val("include") == 2){ $include = 'ribosomal.*protein|outer membrane protein|^elongation factor'; }
elsif(opt_val("include") == 3){ $include = 'ribosom.*protein|outer.*membrane|elongation|heat.*shock|RNA.*polymerase.*subunit'; }
else{ $include = opt_val("include"); }
if($output eq 'stdout'){
msg_send("\n",
"Reference set of highly expressed genes\n",
"$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"Reference set of highly expressed genes\n",
"$tag\n";
}
my $aaseq;
my $seq;
my $aa;
my $codon;
my %count;
my $i;
my %max;
my %w_val;
my $cnt;
&geneGroup($gb) if($sharp);
foreach my $cds ($gb->cds()){
if($sharp){
next unless($gb->{$cds}->{geneGroup} =~ /highlyExpressed/);
}else{
next if ($gb->{$cds}->{$tag} !~ /$include/i);
next if ($gb->{$cds}->{$tag} =~ /$exclude/);
}
$cnt ++;
$aaseq = substr($gb->{$cds}->{translation}, 1);
$seq = substr(lc($gb->get_geneseq($cds)), 3, -3);
while($aaseq){
$aa = substr($aaseq, 0, 1);
substr($aaseq, 0, 1) = '';
$codon = substr($seq, $i, 3);
$count{$aa.$codon} ++;
$i += 3;
}
$i = 0;
if($output eq 'stdout'){
msg_send($gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE $gb->{$cds}->{$tag}, "\n";
}
}
&msg_error("\ncai: No reference proteins found (could not define W values).") unless($cnt);
foreach (keys %count){
$aa = substr($_, 0, 1);
$max{$aa} = $count{$_} if($count{$_} > $max{$aa});
}
foreach (keys %count){
$w_val{$_} = sprintf "%.4f", $count{$_} / $max{substr($_, 0, 1)}; }
if($output eq 'stdout'){
msg_send("\nRelative adaptiveness (W) of each codon\n",
"aa\tcodon\tW value\n");
foreach (sort keys %count){
$aa = substr($_, 0, 1);
$codon = substr($_, 1, 3);
msg_send("$aa\t$codon\t$w_val{$_}\n");
}
}
if($output eq 'f'){
print FILE
"\nRelative adaptiveness (W) of each codon\n",
"amino acid,codon,W value\n";
foreach (sort keys %count){
$aa = substr($_, 0, 1);
$codon = substr($_, 1, 3);
print FILE "$aa,$codon,$w_val{$_}\n";
}
close(FILE);
}
return\% w_val;
}
=head2 cai
Name: cai - calculate codon adaptation index (CAI) for each gene
Description:
This program calculates codon adaptation index (CAI) for each gene,
and inputs in the G instance. i.e. CAI values will be accessible at
$gb->{"FEATURE$i"}->{cai};
Usage:
NULL = &cai(G instance);
Options:
-output output option (default: "stdout")
"stdout" for standard output,
"f" for file output in directory "data"
-filename output filename (default: "cai.csv")
-translate 1 when translates using standard codon table (default: 0)
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: "locus_tag")
-tai 1 when calculating tRNA adaptation index (tAI) (default: 0)
-w_output output option for W value (default: "null")
-w_filename filename to output the W values (default: "w_value.csv")
-w_values pointer HASH of W values used in CAI calculation
-w_absent W value of codons absent from a reference set (default: "-")
"-" when excludes such codons from the calculation
a value of 0.5 was suggested by Sharp and Li (1987)
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20140514 added -tai option
20100311 added -tag option
20030401 added -w_absent option
20010830 first release (Author: Kazuharu "Gaou" Arakawa)
References:
Sharp PM, Li WH. 1987 Nucleic Acids Res. 15(3):1281-95.
Requirements: sub w_value
=cut
sub cai {
&opt_default(output=>'stdout', translate=>0, tag=>'locus_tag', tai=>0,
w_output=>'null', w_filename=>"w_value.csv", w_absent=>"-");
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $translate = opt_val("translate");
my $tag = opt_val("tag");
my $w_output = opt_val("w_output");
my $w_fname = opt_val("w_filename");
my $w_abs = opt_val("w_absent");
my $w_val;
my $tai = opt_val("tai");
my $index = ($tai == 1) ? "tRNA adaptation index (tAI)" : "Codon Adaptation Index (CAI)";
my $idx = ($tai == 1) ? "tai" : "cai";
my $filename = ($tai == 1) ? "tai.csv" : "cai.csv";
$w_output = $output if($w_output eq 'stdout' && $output ne 'stdout');
if(opt_val("w_values")){
$w_val = opt_val("w_values");
}elsif($tai == 1){
$w_val = &w_tai($gb);
}else{
$w_val = &w_value($gb, -output=>$w_output, -filename=>$w_fname);
}
if($output eq 'stdout'){
msg_send("\n$index\n$idx\t$tag\n");
}
if($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE "$idx,$tag\n";
}
foreach my $cds ($gb->cds()){
my $seq = substr(lc($gb->get_geneseq($cds)), 3, -3); my $aaseq;
my $i;
my $codon;
if($translate){ $aaseq = &translate($seq); }
else { $aaseq = substr($gb->{$cds}->{translation}, 1); }
my $cai;
my $Lc;
for($i = 0; $i * 3 <= length($seq) - 3; $i ++){
my $aa = substr($aaseq, $i, 1);
$codon = substr($seq, $i * 3, 3);
if($w_val->{$aa.$codon} > 0){ $cai += log($w_val->{$aa.$codon}); }
elsif($w_abs){ next if($w_abs eq '-'); $cai += log($w_abs); }
else { $cai = 999; last; }
$Lc ++;
}
$gb->{$cds}->{$idx} = ($cai == 999) ? 0 : sprintf "%.4f", exp($cai/$Lc) if($Lc);
if($output eq 'stdout'){
msg_send($gb->{$cds}->{$idx}, "\t",
$gb->{$cds}->{$tag}, "\n");
}
if($output eq 'f'){
print FILE
$gb->{$cds}->{$idx}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE);
}
=head2 phx
Name: phx - identify Predicted Highly eXpressed (PHX) genes
Description:
This program calculates codon usage differences between gene classes for
identifying Predicted Highly eXpressed (PHX) and Putative Alien (PA) genes.
The "expression measure" of a gene, E_g, will be accessible at
$gb->{"FEATURE$i"}->{E_g}.
Usage:
NULL = &phx(G instance);
Options:
-output output option (default: 'show')
'stdout' for standard output,
'f' for file output in directory "data",
'g' for graph output in directory "graph",
'show' for graph output and display (default: "show")
-filename output filename (default: "phx.png" for -output=>"g",
"phx.csv" for -output=>"f")
-usage pointer HASH of vectors of amino acid and codon usage
of a set of highly expressed genes. By default, the
ribosomal protein genes are used.
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120628 changed -output option
20100311 added -tag option
20030401 initial posting
Informations stored in the G instance:
$gb->{"FEATURE$i"}->{BgC}: codon bias relative to a collection of all genes B(g|C)
$gb->{"FEATURE$i"}->{BgH}: codon bias relative to a set of highly expressed genes B(g|H)
$gb->{"FEATURE$i"}->{E_g}: expression measure E(g) = B(g|C) / B(g|H) $gb->{"FEATURE$i"}->{phx}: 1 if E(g) > 1.05 $gb->{"FEATURE$i"}->{pa}: 1 if B(g|C) > T(g) & B(g|H) > T(g). T(g) is median[B(g|C)] + 0.1 $gb->{"FEATURE$i"}->{A_g}: combined measure of alien character 0.5 * [B(g|C) + B(g|H)] - T(g)
References: CMBL- PHX/PA user guide http://www.cmbl.uga.edu/software/PHX-PA-guide.htm
Henry I, Sharp PM. 2007 Mol Biol Evol. 24(1):10-2.
Karlin and Mrazek (2000) J.Bacteriol. 182(18):5238-5250.
Requirements: sub codon_compiler; sub geneGroup
=cut
sub phx {
&opt_default(output=>'show', filename=>"phx.png", usage=>"", tag=>'locus_tag',
translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $aacuHigh = opt_val("usage");
my $tag = opt_val("tag");
my $aacuAll = &codon_compiler($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -data=>'A1C2');
&geneGroup($gb);
my $cds;
unless($aacuHigh){
my $number = &geneGroup($gb);
if($$number{highlyExpressed}){
foreach $cds ($gb->cds()){ $gb->{$cds}->{High} = $gb->{$cds}->{geneGroup} =~ /highlyExpressed/ ? 1 : 0; }
$aacuHigh = &codon_compiler($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -data=>'A1C2', -id=>'High');
} else {
warn("\n phx: No reference proteins found.");
$aacuHigh = $aacuAll;
}
}
my (@B_gC, @B_gH);
foreach $cds ($gb->cds()){
my $aacu = &codon_compiler($gb, -output=>'n', -id=>$cds, -data=>'A1C2', -translate=>$translate, -del_key=>$del_key);
my %boxC;
my %boxH;
my $BgC; my $BgH; foreach (keys %{$aacuAll}){
if(length($_) == 4){
my $aa = substr($_, 0, 1);
$boxC{$aa} += abs($aacu->{$_} - $aacuAll->{$_});
$boxH{$aa} += abs($aacu->{$_} - $aacuHigh->{$_});
}
}
foreach (keys %boxC){
$BgC += $aacu->{$_} * $boxC{$_};
$BgH += $aacu->{$_} * $boxH{$_};
}
if($BgH){
$gb->{$cds}->{BgC} = sprintf "%.4f", $BgC;
$gb->{$cds}->{BgH} = sprintf "%.4f", $BgH;
$gb->{$cds}->{E_g} = sprintf "%.4f", $BgC / $BgH; $gb->{$cds}->{phx} = (($gb->{$cds}->{E_g} > 1.05) ? 1 : 0);
}
push(@B_gC, sprintf "%.4f", $BgC);
push(@B_gH, sprintf "%.4f", $BgH);
}
if($output eq 'stdout'){
msg_send("\nPredicted Highly eXpressed (PHX) and Putative Alien (PA) genes\n",
" BgC: codon usage difference from a collection of all genes\n",
" BgH: codon usage difference from a collection of highly expressed genes\n",
" E_g: expression measure (BgC/BgH)\n",
" phx = 1 if E_g > 1.05\n",
" pa = 1 if BgC > T_g & BgH > T_g where T_g is median(BgC) + 0.1\n",
"BgC\tBgH\tE_g\tphx\tpa\t$tag\n");
} elsif($output eq 'f'){
$filename =~ s/\.png$/\.csv/;
mkdir ("data", 0777);
open(FILE,">data/$filename");
print FILE "BgC,BgH,E_g,phx,pa,$tag\n";
}
$T_g = &median(@B_gC) + 0.1;
foreach $cds ($gb->cds()){
$gb->{$cds}->{pa} = (($gb->{$cds}->{BgC} > $T_g && $gb->{$cds}->{BgH} > $T_g) ? 1 : 0);
$gb->{$cds}->{A_g} = sprintf "%.4f", 0.5 * ($gb->{$cds}->{BgC} + $gb->{$cds}->{BgH}) - $T_g;
if($output eq 'stdout'){
msg_send($gb->{$cds}->{BgC}, "\t",
$gb->{$cds}->{BgH}, "\t",
$gb->{$cds}->{E_g}, "\t",
$gb->{$cds}->{phx}, "\t",
$gb->{$cds}->{pa}, "\t",
$gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE
$gb->{$cds}->{BgC}, ",",
$gb->{$cds}->{BgH}, ",",
$gb->{$cds}->{E_g}, ",",
$gb->{$cds}->{phx}, ",",
$gb->{$cds}->{pa}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
if($output eq 'g' || $output eq 'show'){
mkdir ("graph", 0777);
grapher(\@
B_gH,\@B_gC,
-x=>"B(g|H)", -y=>"B(g|C)",
-filename=>"phx.png",
-title=>"Codon bias relative to all genes B(g|C) and highly expressed genes B(g|H)",
-style=>"points",
-output=>$output
);
}
}
=head2 aaui
Name: aaui - calculates various indices of amino acid usage
Description:
This program calculates amino acid usage indices for proteins
(excluding formylmethionine), and inputs in the G instance.
Informations stored in the G instance are:
$gb->{$id}->{ndaa}: number of different amino acids
$gb->{$id}->{Haau}: entropy of amino acid usage
$gb->{$id}->{mmw}: mean molecular weight
$gb->{$id}->{gravy}: mean hydropathic indices of each amino acid
$gb->{$id}->{aroma}: relative frequency of aromatic amino acids
$gb->{$id}->{acidic}: relative frequency of acidic amino acids
$gb->{$id}->{basic}: relative frequency of basic amino acids
$gb->{$id}->{neutral}: relative frequency of neutral amino acids
$gb->{$id}->{polar}: relative frequency of polar amino acids
Usage:
NULL = &aaui(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'aaui.csv')
-id ID of a group of genes or a single gene (default: '')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120620 added recursive call
20100311 added -tag option
20030401 initial posting
References:
Lobry and Gautier (1994) Nucleic Acids Res. 22:3174-3180.
Zavala A et al. (2002) J Mol Evol. 54(5):563-8.
=cut
sub aaui {
&opt_default(output=>'stdout', filename=>'aaui.csv', id=>'', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $tag = opt_val("tag");
unless($id){
if($output eq 'stdout'){
msg_send("\nAmino acid usage indices (excluding formylmethionine)\n",
" Laa: length in amino acids\n",
" ndaa: number of different amino acids\n",
" aroma: relative frequency of aromatic amino acids\n",
" gravy: mean hydropathic indices of each amino acid\n",
" mmw: mean molecular weight\n",
"Laa\tndaa\taroma\tgravy\tmmw\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE "Laa,ndaa,Haau,mmw,gravy,aroma,$tag\n";
}
my $cds;
foreach $cds ($gb->cds()){
&aaui($gb, -id=>$cds);
if($output eq 'stdout'){
msg_send(
$gb->{$cds}->{Laa}, "\t",
$gb->{$cds}->{ndaa}, "\t",
$gb->{$cds}->{aroma}, "\t",
$gb->{$cds}->{gravy}, "\t",
$gb->{$cds}->{mmw}, "\t",
$gb->{$cds}->{$tag}, "\n");
} elsif($output eq 'f'){
print FILE
$gb->{$cds}->{Laa}, ",",
$gb->{$cds}->{ndaa}, ",",
$gb->{$cds}->{Haau}, ",",
$gb->{$cds}->{mmw}, ",",
$gb->{$cds}->{gravy}, ",",
$gb->{$cds}->{aroma}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
} else {
my $aaseq;
if($id =~ /FEATURE/){ $aaseq = substr($gb->{$id}->{translation}, 1); }
else { foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$aaseq .= substr($gb->{$cds}->{translation}, 1);
}
}
my $Laa = $aaseq =~ tr[ACDEFGHIKLMNPQRSTVWY][ACDEFGHIKLMNPQRSTVWY];
if($aaseq){
my %count;
$count{A} = $aaseq =~ tr/A/A/;
$count{C} = $aaseq =~ tr/C/C/;
$count{D} = $aaseq =~ tr/D/D/;
$count{E} = $aaseq =~ tr/E/E/;
$count{F} = $aaseq =~ tr/F/F/;
$count{G} = $aaseq =~ tr/G/G/;
$count{H} = $aaseq =~ tr/H/H/;
$count{I} = $aaseq =~ tr/I/I/;
$count{K} = $aaseq =~ tr/K/K/;
$count{L} = $aaseq =~ tr/L/L/;
$count{M} = $aaseq =~ tr/M/M/;
$count{N} = $aaseq =~ tr/N/N/;
$count{P} = $aaseq =~ tr/P/P/;
$count{Q} = $aaseq =~ tr/Q/Q/;
$count{R} = $aaseq =~ tr/R/R/;
$count{S} = $aaseq =~ tr/S/S/;
$count{T} = $aaseq =~ tr/T/T/;
$count{V} = $aaseq =~ tr/V/V/;
$count{W} = $aaseq =~ tr/W/W/;
$count{Y} = $aaseq =~ tr/Y/Y/;
my ($ndaa, $Haau, $mmw, $hydro);
foreach (qw(A C D E F G H I K L M N P Q R S T V W Y)){
next if($count{$_} == 0);
$ndaa ++;
$Haau += - $count{$_}/$Laa * log($count{$_}/$Laa) / log(2); $mmw += $count{$_} * &G::Seq::AminoAcid::mass($_);
$hydro += $count{$_} * &G::Seq::AminoAcid::hydropathy($_);
}
$gb->{$id}->{Laa} = $Laa;
$gb->{$id}->{ndaa} = $ndaa;
$gb->{$id}->{Haau} = sprintf "%.4f", $Haau;
$gb->{$id}->{mmw} = sprintf "%.2f", $mmw / $Laa; $gb->{$id}->{gravy} = sprintf "%+.4f", $hydro / $Laa; $gb->{$id}->{aroma} = sprintf "%.4f", ($count{F} + $count{Y} + $count{W}) / $Laa; $gb->{$id}->{CMHFYW} = sprintf "%.4f", ($count{C} + $count{M} + $count{H} + $count{F} + $count{Y} + $count{W}) / $Laa; # these six residues have in common the property of being the preferential targets of reactive oxygen species and can act as sinks for radical fluxes through electron tranfer between amino acids in the protein $gb->{$id}->{acidic} = sprintf "%.4f", ($count{D} + $count{E}) / $Laa;
$gb->{$id}->{basic} = sprintf "%.4f", ($count{R} + $count{K} + $count{H}) / $Laa; $gb->{$id}->{neutral} = sprintf "%.4f", ($count{N} + $count{Q} + $count{S} + $count{T} + $count{Y}) / $Laa; $gb->{$id}->{polar} = $gb->{$id}->{acidic} + $gb->{$id}->{basic} + $gb->{$id}->{neutral};
}
}
return;
}
=head2 bui
Name: bui - calculates base usage indices for protein-coding sequences
Description:
This program calculates base usage indices for protein-coding sequences
(excluding start and stop codons), and inputs in the G instance.
Informations stored in the G instance are:
$gb->{$id}->{"acgt$p"}: total number of bases (A+T+G+C)
$gb->{$id}->{"ryr$p"}: purine/pyrimidine ratio (A+G)/(T+C)
$gb->{$id}->{"gac$p"}: G+A content (G+A)/(A+T+G+C) $gb->{$id}->{"gcc$p"}: G+C content (G+C)/(A+T+G+C)
$gb->{$id}->{"gtc$p"}: G+T content (G+T)/(A+T+G+C) $gb->{$id}->{"gcs$p"}: GC skew (C-G)/(C+G)
$gb->{$id}->{"ats$p"}: AT skew (A-T)/(A+T) $gb->{$id}->{"Hbu$p"}: entropy of base usage $gb->{$id}->{"Hgc$p"}: entropy of G+C content (G+C)/(A+T+G+C)
Usage:
NULL = &bui(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'bui.csv')
-id ID of a group of genes or a single gene (default: '')
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-position codon position (default: '');
'' when assessing overall base usage of the gene
'1' when assessing base usage at 1st position of codons
'2' when assessing base usage at 2nd position of codons
'3' when assessing base usage at 3rd position of codons
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120620 added recursive call
20100311 added -tag option
20030401 initial posting
=cut
sub bui {
&opt_default(output=>'stdout',filename=>'bui.csv',id=>'',translate=>0,
del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]',position=>'',tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $p = opt_val("position");
my $tag = opt_val("tag");
unless($id){
if($output eq 'stdout'){
msg_send("\nBase usage indices (excluding start and stop codons)");
msg_send(" at codon position $p") if($p);
msg_send("\n",
" acgt$p: A$p + T$p + G$p + C$p\n",
" ryr$p: purine/pyrimidine ratio (A$p + G$p)/(T$p + C$p)\n",
" gcc$p: G+C content (G$p + C$p)/(A$p + T$p + G$p + C$p)\n",
" Hgc$p: entropy of G+C content (G$p + C$p)/(A$p + T$p + G$p + C$p)\n",
" gcs$p: GC skew (C$p - G$p)/(C$p + G$p)\n",
" ats$p: AT skew (A$p - T$p)/(A$p + T$p)\n",
"acgt$p\tryr$p\tgcc$p\tHgc$p\tgcs$p\tats$p\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE "acgt$p,ryr$p,gcc$p,Hgc$p,gcs$p,ats$p,$tag\n";
}
my $cds;
foreach $cds ($gb->cds()){
&bui($gb, -id=>$cds, -translate=>$translate, -del_key=>$del_key, -position=>$p);
if($output eq 'stdout'){
msg_send($gb->{$cds}->{"acgt$p"}, "\t",
$gb->{$cds}->{"ryr$p"}, "\t",
$gb->{$cds}->{"gcc$p"}, "\t",
$gb->{$cds}->{"Hgc$p"}, "\t",
$gb->{$cds}->{"gcs$p"}, "\t",
$gb->{$cds}->{"ats$p"}, "\t",
$gb->{$cds}->{"$tag"}, "\n");
} elsif($output eq 'f'){
print FILE
$gb->{$cds}->{"acgt$p"}, ",",
$gb->{$cds}->{"ryr$p"}, ",",
$gb->{$cds}->{"gcc$p"}, ",",
$gb->{$cds}->{"Hgc$p"}, ",",
$gb->{$cds}->{"gcs$p"}, ",",
$gb->{$cds}->{"ats$p"}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE) if($output eq 'f');
} else {
my $seq;
my $aaseq;
my $codon;
my $i;
if($id =~ /FEATURE/){ $seq = substr(lc($gb->get_geneseq($id)), 3, -3); if($translate){ $aaseq = &translate($seq); }
else { $aaseq = substr($gb->{$id}->{translation}, 1); }
my $partseq;
$i = 0;
while($aaseq){
my $aa = substr($aaseq, 0, 1);
$partseq .= $p ? substr($seq, $i * 3 + $p - 1, 1) : substr($seq, $i * 3, 3) if($aa !~ /$del_key/);
substr($aaseq, 0, 1) = '';
$i ++;
}
$seq = $partseq;
}
else { my $cumseq;
foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$seq = substr(lc($gb->get_geneseq($cds)), 3, -3); if($translate){ $aaseq = &translate($seq); }
else { $aaseq = substr($gb->{$cds}->{translation}, 1); }
my $partseq;
$i = 0;
while($aaseq){
my $aa = substr($aaseq, 0, 1);
$partseq .= $p ? substr($seq, $i * 3 + $p - 1, 1) : substr($seq, $i * 3, 3) if($aa !~ /$del_key/);
substr($aaseq, 0, 1) = '';
$i ++;
}
$cumseq .= $partseq;
}
$seq = $cumseq;
}
my ($a, $c, $g, $t, $Hbu);
if($seq){
$a = $seq =~ tr/a/a/;
$c = $seq =~ tr/c/c/;
$g = $seq =~ tr/g/g/;
$t = $seq =~ tr/t/t/;
$gb->{$id}->{"acgt$p"} = $a+$c+$g+$t;
if($t+$c){ $gb->{$id}->{"ryr$p"} = sprintf "%.4f", ($a+$g) / ($t+$c); } #else { &msg_error("bui: t + c = 0 for CDS $gb->{$id}->{$tag}\n"); } if($g+$a){ $gb->{$id}->{"gas$p"} = sprintf "%+.4f", ($g-$a)/($g+$a); }
if($c+$t){ $gb->{$id}->{"cts$p"} = sprintf "%+.4f", ($c-$t)/($c+$t); } #else { &msg_error("bui: c + t = 0 for CDS $gb->{$id}->{$tag}\n"); } if($g+$c){ $gb->{$id}->{"gcs$p"} = sprintf "%+.4f", ($c-$g)/($c+$g); }
if($t+$a){ $gb->{$id}->{"ats$p"} = sprintf "%+.4f", ($a-$t)/($a+$t); } #else { &msg_error("bui: t + a = 0 for CDS $gb->{$id}->{$tag}\n"); } if($g+$t){ $gb->{$id}->{"gts$p"} = sprintf "%+.4f", ($g-$t)/($g+$t); }
if($a+$c){ $gb->{$id}->{"acs$p"} = sprintf "%+.4f", ($a-$c)/($a+$c); } #else { &msg_error("bui: a + c = 0 for CDS $gb->{$id}->{$tag}\n"); } foreach($a,$c,$g,$t){ $Hbu -= $_/($a+$c+$g+$t) * log($_/($a+$c+$g+$t)) / log(2) if($_); }
$gb->{$id}->{"Hbu$p"} = sprintf "%.4f", $Hbu;
my $gc = ($g+$c) / ($a+$c+$g+$t); my $gt = ($g+$t) / ($a+$c+$g+$t); $gb->{$id}->{"gac$p"} = sprintf "%.4f", ($g+$a) / ($a+$c+$g+$t); $gb->{$id}->{"gcc$p"} = sprintf "%.4f", $gc;
$gb->{$id}->{"gtc$p"} = sprintf "%.4f", $gt;
$gb->{$id}->{"Hgc$p"} = sprintf "%.4f", - ($gc * log($gc) + (1-$gc) * log(1-$gc)) / log(2) if(0 < $gc && $gc < 1); $gb->{$id}->{"Hgt$p"} = sprintf "%.4f", - ($gt * log($gt) + (1-$gt) * log(1-$gt)) / log(2) if(0 < $gt && $gt < 1); }
}
return;
}
=head2 dinuc
Name: dinuc - calculates dinucleotide usage
Description:
This program calculates dinucleotide usage indices for protein-coding sequences
(excluding start and stop codons), and inputs in the G instance. e.g. CG dinucleotide
usage at the reading frame 1-2 will be accessible at $gb->{"FEATURE$i"}->{cg12}
Dinucleotide usage was computed as the ratio of observed (O) to expected (E)
dinucleotide frequencies.
Informations stored in the G instance are: $gb->{header_dinuc}
Usage:
NULL = &dinuc(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'dinuc.csv')
-id ID of a group of genes or a single gene (default: 'All')
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-position codon position or reading frame (default: '');
'' when assessing all codon positions
'12' when assessing the reading frame 1-2
'23' when assessing the reading frame 2-3
'31' when assessing the reading frame 3-1
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20030401 initial posting
References:
Yew et al. (2004) Virus Genes 28:1,41-53.
=cut
sub dinuc {
&opt_default(output=>'stdout',filename=>'dinuc.csv',id=>'All',position=>'');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $rf = opt_val("position");
my $p = ($rf) ? substr($rf, 0, 1) : 1;
my $q = ($rf) ? substr($rf, 1, 1) : 1;
my $end = ($rf =~ /12|23/) ? 0 : 1;
my %usg;
my $seq;
if($rf){
if($id =~ /FEATURE/){ $seq = substr(lc($gb->get_geneseq($id)), 3, -3); for(my $i; $i * 3 <= length($seq) - 3 - $end; $i ++){
$usg{"mono$p"}->{substr($seq, $i * 3 + $p - 1, 1)} ++;
$usg{"mono$q"}->{substr($seq, $i * 3 + $q - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i * 3 + $p - 1, 2).$rf} ++;
}
}
else { foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$seq = substr(lc($gb->get_geneseq($cds)), 3, -3);
for(my $i; $i * 3 <= length($seq) - 3 - $end; $i ++){
$usg{"mono$p"}->{substr($seq, $i * 3 + $p - 1, 1)} ++;
$usg{"mono$q"}->{substr($seq, $i * 3 + $q - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i * 3 + $p - 1, 2).$rf} ++;
}
}
}
}
else {
if($id =~ /FEATURE/){ $seq = substr(lc($gb->get_geneseq($id)), 3, -3); for(my $i; $i <= length($seq) - 1 - $end; $i ++){
$usg{mono}->{substr($seq, $i + $p - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i + $p - 1, 2)} ++;
}
}
else { foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$seq = substr(lc($gb->get_geneseq($cds)), 3, -3);
for(my $i; $i <= length($seq) - 1 - $end; $i ++){
$usg{mono}->{substr($seq, $i + $p - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i + $p - 1, 2)} ++;
}
}
}
}
my $total;
foreach my $key (keys %usg){
foreach (keys %{$usg{$key}}){ $total += $usg{$key}->{$_}; }
foreach (keys %{$usg{$key}}){ $usg{$key}->{$_} = $usg{$key}->{$_} / $total; } $total = 0; }
my @word = ("aa$rf","ac$rf","ag$rf","at$rf","ca$rf","cc$rf","cg$rf","ct$rf","ga$rf","gc$rf","gg$rf","gt$rf","ta$rf","tc$rf","tg$rf","tt$rf");
foreach (@word){
if($rf){ $usg{diexp}->{$_} = $usg{"mono$p"}->{substr($_,0,1)} * $usg{"mono$q"}->{substr($_,1,1)}; }
else { $usg{diexp}->{$_} = $usg{mono}->{substr($_,0,1)} * $usg{mono}->{substr($_,1,1)}; }
}
my %dioe;
foreach (@word){
if($usg{diexp}->{$_}){
$dioe{$_} = sprintf("%.3f", $usg{diobs}->{$_} / $usg{diexp}->{$_}); $gb->{$id}->{$_} = $dioe{$_}; }
}
$gb->{$id}->{"dinuc$rf"} =\% dioe;
my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}";
if($output =~ /stdout/){
unless($gb->{"header_dinuc$p"}){
msg_send("\n\n");
foreach(@word){ msg_send("$_\t"); }
msg_send("#\tgene");
$gb->{"header_dinuc$p"} = 1;
}
msg_send("\n\n");
foreach(@word){ msg_send("$dioe{$_}\t"); }
msg_send(scalar keys %dioe, "\t$gene");
}
if($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
unless($gb->{"header_dinuc$p"}){
print FILE "\nkeys,";
foreach(@word){ print FILE "$_,"; }
print FILE "gene,";
$gb->{"header_dinuc$p"} = 1;
}
print FILE "\n$id,";
foreach(@word){
print FILE (exists $dioe{$_}) ? "$dioe{$_}," : "0,";
}
print FILE $gb->{$id}->{gene};
close(FILE);
}
}
=head2 Ew
Name: Ew - calculate a measure of synonymous codon usage evenness (Ew)
Description:
This program calculates the 'weighted sum of relative entropy' (Ew) as a
measure of synonymous codon usage evenness for each gene, and inputs in the
G instance; i.e. Ew values will be accessible at $gb->{"FEATURE$i"}->{Ew};
Usage:
NULL = &Ew(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'Ew.csv')
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20100311 added -tag option
20070701 initial posting
References:
Suzuki H. et al. (2004) Gene. 23;335:19-23.
Suzuki H. et al. (2007) EURASIP J Bioinform Syst Biol. 2007:61374.
Requirements: sub codon_compiler
=cut
sub Ew {
&opt_default(output=>'stdout', filename=>'Ew.csv', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $tag = opt_val("tag");
if($output eq 'stdout'){
msg_send("\nThe 'weighted sum of relative entropy' (Ew)\n",
"Ew ranges from 0 (maximum bias) to 1 (no bias).\n",
"Ew\t$tag\n");
} elsif($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"Ew,$tag\n";
}
foreach my $cds ($gb->cds()){
my %Hi; my %Ei; my $Ew; my $aacu = &codon_compiler($gb, -output=>'n', -data=>'A1C2', -id=>$cds, -translate=>$translate, -del_key=>$del_key);
foreach (keys %{$aacu}){
if(length($_) == 4){ $Hi{substr($_, 0, 1)} += - $aacu->{$_} * log($aacu->{$_}) / log(2); } } foreach (keys %Hi){ $Ei{$_} = $Hi{$_} / ( log($gb->{degeneracy}->{$_}) / log(2) ) if($gb->{degeneracy}->{$_} > 1); } foreach (keys %Hi){ if($Ei{$_} > 1){ $Ew = -0.1; } else { $Ew += $Ei{$_} * $aacu->{$_}; } } $gb->{$cds}->{Ew} = sprintf "%.4f", $Ew; # input in the G instance if($output eq 'stdout'){ msg_send($gb->{$cds}->{Ew}, "\t", $gb->{$cds}->{$tag}, "\n"); }
if($output eq 'f'){
print FILE
$gb->{$cds}->{Ew}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE);
}
=head2 P2
Name: P2 - calculate the P2 index of each gene
Description:
This program calculates the P2 index for each gene. This index describes the
proportion of codons conforming to the intermediate strength of codon-anticodon
interaction energy rule of Grosjean and Fiers: P2 = (WWC+SSU)/(WWY+SSY) where W = A or U, S = C or G, and Y = C or U. P2 values will be accessible at $gb->{"FEATURE$i"}->{P2};
Usage:
NULL = &P2(G instance);
Options:
-output output option (default: 'stdout')
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'P2.csv')
-tag feature tag accessible at $gb->{"FEATURE$i"}->{$tag} (default: 'locus_tag')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20100311 added -tag option
20070701 initial posting
References:
Gouy M, Gautier C. (1982) Nucleic Acids Res. 10(22):7055-74.
Requirements: sub codon_compiler
=cut
sub P2 {
&opt_default(output=>'stdout', filename=>'P2.csv', tag=>'locus_tag');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $tag = opt_val("tag");
if($output eq 'stdout'){
msg_send("\nThe P2 index\n",
"P2\t$tag\n");
}
if($output eq 'f'){
mkdir ("data", 0777);
open(FILE,">>data/$filename");
print FILE
"P2,$tag\n";
}
my ($WWC, $SST, $WWY, $SSY);
foreach my $cds ($gb->cds()){
$WWC = $SST = $WWY = $SSY = 0;
my $seq = substr(lc($gb->get_geneseq($cds)), 3, -3); for(my $i = 0; $i * 3 <= length($seq) - 3; $i ++){
my $codon = substr($seq, $i * 3, 3);
$WWC ++ if($codon =~ /[at][at]c/);
$SST ++ if($codon =~ /[cg][cg]t/);
$WWY ++ if($codon =~ /[at][at][ct]/);
$SSY ++ if($codon =~ /[cg][cg][ct]/);
}
$gb->{$cds}->{P2} = sprintf "%.4f", ($WWC+$SST)/($WWY+$SSY) if($WWY+$SSY); # input in the G instance if($output eq 'stdout'){ msg_send($gb->{$cds}->{P2}, "\t", $gb->{$cds}->{$tag}, "\n"); }
if($output eq 'f'){
print FILE
$gb->{$cds}->{P2}, ",",
$gb->{$cds}->{$tag}, "\n";
}
}
close(FILE);
}
=head2 codon_mva
Name: codon_mva - perform multivariate analyses of codon usage data
Description:
This program performs multivariate analyses on codon usage data, and
analyzes correlations between the axes and various gene parameters.
Usage:
NULL = &codon_mva(G instance);
Options:
-output output option (default: 'show')
'stdout' for standard output,
'f' for file output in directory "data",
'g' for graph output in directory "graph",
'show' for graph output and display (default: "show")
-filename output filename (default: "codon_mva.png" for -output=>"g",
"codon_mva.csv" for -output=>"f")
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-data data matrix of amino acid and codon usage (default: 'R0')
-naxis number of axes (default: '4')
-method multivariate analysis method (default: 'wca')
'pca' for principal component analysis
'coa' for correspondence analysis
'wca' for within-group correspondence analysis
-parameter pointer ARRAY of list of gene parameters
(default: ['Laa','aroma','gravy','mmw','gcc3','gtc3','P2']))
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20070101 initial posting
References:
Suzuki H. et al. (2008) DNA Res. 15(6):357-365.
Suzuki H. et al. (2005) FEBS Lett. 579(28):6499-6504.
Charif D. et al. (2005) Bioinformatics 21(4):545-547.
Requirements: sub codon_compiler; sub geneGroup; sub aaui; sub bui; sub dinuc
=cut
sub codon_mva {
&opt_default(output=>'show',filename=>'codon_mva.png',translate=>0,
del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]',data=>'R0',
naxis=>'4',method=>'wca',parameter=>['Laa','aroma','gravy','mmw','gcc3','gtc3','P2']);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
if($output eq 'g' || $output eq 'show'){ mkdir ("graph", 0777); } elsif($output eq 'f'){ mkdir ("data", 0777); $filename =~ s/\.png$/\.csv/; }
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $data = opt_val("data");
my $naxis = opt_val("naxis");
my $method = opt_val("method");
my $parameter = opt_val("parameter");
$parameter = [@$parameter,'High'];
&P2($gb, -output=>'n');
&geneGroup($gb);
my $fh = File::Temp->new;
my $fname = $fh->filename;
my $usageAll = &codon_compiler($gb, -output=>"null", -translate=>$translate, -del_key=>$del_key, -data=>$data, -id=>'All');
my @keyAll = sort keys %$usageAll;
print $fh join(',', (@keyAll, @$parameter) ),"\n";
&geneGroup($gb);
OUTER: foreach my $cds ($gb->cds()){
$gb->{$cds}->{High} = $gb->{$cds}->{geneGroup} =~ /highlyExpressed/ ? 1 : 0;
&aaui($gb, -output=>'n', -id=>$cds);
&bui($gb, -output=>'n', -position=>3, -del_key=>$del_key, -id=>$cds);
foreach(@$parameter){ unless(exists $gb->{$cds}->{$_}){ next OUTER; } }
my $usage = &codon_compiler($gb, -output=>"null", -translate=>$translate, -del_key=>$del_key, -data=>$data, -id=>$cds);
my @tmp;
foreach (@keyAll){ push(@tmp, $$usage{$_} || 0); }
foreach (@$parameter){ push(@tmp, $gb->{$cds}->{$_} || 0); }
print $fh join(',', @tmp), "\n";
}
my $ndc = scalar @keyAll;
my $rcmd= new Rcmd;
my @result = $rcmd->exec(
qq||
output = "$output"
naxis = $naxis
ndc = $ndc
method = "$method"
dat = read.csv("$fname")
usage = as.matrix(dat[,1:ndc])
index = dat[,(ndc+1):(ncol(dat)-1)] # -c("High")
########## haruo
rda = as.dist(1 - cor(t(usage))); eda = dist(usage)
rdh = edh = NA
if(sum(dat[,"High"]) > 1){ usg.h = usage[dat[,"High"] == 1,]; rdh = as.dist(1 - cor(t(usg.h))); edh = dist(usg.h) }
# omits columns where SD is zero
omit = c()
for(i in 1:ncol(usage)) if(sd(usage[,i]) == 0) omit = c(omit, i)
if(length(omit)) usage = usage[,-omit]
# multivariate analysis
if(method == "coa"){ # correspondence analysis
library(ade4);
coa = dudi.coa(as.data.frame(usage), scannf = FALSE, nf = min(dim(usage)));
score = coa\$li[,1:naxis]; #\$
contribution = (100*coa\$eig\/sum(coa\$eig))[1:naxis]; #\$\/
} else if(method == "wca"){ # within-group correspondence analysis
library(ade4);
coa = dudi.coa(as.data.frame(t(usage)), scannf = FALSE, nf = min(dim(usage)));
facaa = as.factor(substring(rownames(coa\$tab),1,1)); #\$
coa = wca(coa, fac = facaa, scannf = FALSE, nf=min(dim(usage)));
score = coa\$co[,1:naxis]; #\$
contribution = (100*coa\$eig\/sum(coa\$eig))[1:naxis]; #\$\/
} else if(method == "pca"){ # principal component analysis
pr = prcomp(usage, center = T, scale = F);
score = pr\$x[,1:naxis]; #\$
contribution = 100*summary(pr)\$importance[2,][1:naxis]; #\$
#factorloading = cor(usage, score)[,1:naxis];
}
#
z.score = apply(score, 2, scale)
zz = rep(NA, naxis); if(sum(dat[,"High"]) > 1) zz = round(abs(apply(z.score[dat[,"High"]==1,], 2, mean)), digits=4)
rr = round(abs(cor(index, z.score, method="pearson")), digits=4) # pearson spearman
# gene parameter with highest r^2 value
key = rownames(rr)[ apply(rr,2,order)[nrow(rr),] ]
# output
if(output == "show"\|\| output == "g"){ #\|\|
png("graph/codon_mva.png");
par(mfrow=c(ceiling(sqrt(naxis)), ceiling(sqrt(naxis))));
for(y in 1:naxis){
x = key[y];
r = rr[x,y];
if(sum(dat[,"High"]) > 1){
plot(dat[,x],z.score[,y],xlab=x, ylab=paste(colnames(z.score)[y]," (",round(contribution[y],digits=1),"% of variance)"), type="n", main=paste("z =", zz[y], "\nr =",r));
text(dat[dat[,"High"]==0,x],z.score[dat[,"High"]==0,y],label="+");
text(dat[dat[,"High"]==1,x],z.score[dat[,"High"]==1,y],label="o",col="red");
} else plot(dat[,x],z.score[,y],xlab=x, ylab=paste(colnames(z.score)[y]," (",round(contribution[y],digits=1),"%) of variance"), main=paste("r =",r));
}
dev.off();
} else if(output == "f"){
out = round(rbind(contribution, rr), digits=4);
write.csv(out, file = "data/$filename", quote = F);
}
########## haruo
out = round(rbind(contribution,abs(rr)), digits=4)
val = round(apply(abs(rr), 2, max), digits=4)
gpd = key # gene parameter detected
gpd[val < 0.7] = "nd"; zc = qnorm(1-0.01)
gfd = gpd # gene feature detected (gpd + Replication + Expression)
if(sum(dat[,"High"]) > 1) gfd[zz > zc & gfd != "nd"] = paste(gfd[zz > zc & gfd != "nd"], "(Expression)")
gfd[zz > zc & gfd == "nd"] = "Expression"
res = gfd
res[substr(res,nchar(res),nchar(res)) == ")"] = "two"
write.csv(rbind(gpd,gfd,res,out,key,val,zz), file = "table$data.csv", quote = F)
# returned value
as.vector(c(mean(rda),mean(eda),mean(rdh),mean(edh), contribution, zz, rr))
|
);
shift @result if($result[0] eq '');
$gb->{rdam} = sprintf "%.4f", shift @result;
$gb->{edam} = sprintf "%.4f", shift @result;
$gb->{rdhm} = sprintf "%.4f", shift @result;
$gb->{edhm} = sprintf "%.4f", shift @result;
for(1..$naxis){ $gb->{"Axis$_"}->{contribution} = sprintf "%.3f", shift @result; }
for(1..$naxis){ $gb->{"Axis$_"}->{z_score} = sprintf "%.3f", shift @result; }
for(1..$naxis){
foreach my $key (@$parameter){
next if($key =~ /High/);
$gb->{"Axis$_"}->{$key} = sprintf "%+.3f", shift @result;
}
}
msg_gimv("graph/codon_mva.png") if($output =~ /show/);
if($output =~ /stdout/){
if($method eq "coa"){ $method = "Correspondence analysis"; }
elsif($method eq "wca"){ $method = "Within-group correspondence analysis (WCA)"; }
elsif($method eq "pca"){ $method = "Principal component analysis (PCA)"; }
msg_send("\n$method of codon usage data $data");
msg_send("\nContribution of each axis (%)\n\t");
for(1..$naxis){ msg_send("Axis$_\t"); }
msg_send("\n\t");
for(1..$naxis){ msg_send($gb->{"Axis$_"}->{contribution}, "\t"); }
msg_send("\nMean standard score (z-score) for highly expressed genes on each axis\n\t");
for(1..$naxis){ msg_send("Axis$_\t"); }
msg_send("\n\t");
for(1..$naxis){ msg_send($gb->{"Axis$_"}->{z_score}, "\t"); }
msg_send("\nCorrelation between each axis and each gene parameter\n\t");
for(1..$naxis){ msg_send("Axis$_\t"); }
foreach my $key (@$parameter){
next if($key =~ /High/);
msg_send("\n$key\t");
for(1..$naxis){ msg_send($gb->{"Axis$_"}->{$key}, "\t"); }
}
msg_send("\n");
}
}
=head2 codon_counter
Name: codon_counter - count codons
Description:
This program calculates codon counts (absolute codon frequencies).
Usage:
(pointer HASH) = &codon_counter(G instance);
Options:
See &codon_compiler
Author:
Koya Mori (mory@g-language.org)
History:
20091203 deprecated
20010721 initial posting
=cut
sub codon_counter {
warn('This method is deprecated. Use &codon_compiler(@_, -data=>"C0") instead.');
return &codon_compiler(@_, -data=>'C0');
}
=head2 amino_counter
Name: amino_counter - count amino acids
Description:
This program calculates absolute amino acid frequencies.
Usage:
(pointer HASH) = &amino_counter(G instance);
Options:
See &codon_compiler
Author:
Koya Mori (mory@g-language.org)
History:
20091203 deprecated
20010721 initial posting
=cut
sub amino_counter {
warn('This method is deprecated. Use &codon_compiler(@_, -data=>"A0") instead.');
return &codon_compiler(@_, -data=>'A0');
}
=head2 codon_usage
Name: codon_usage - calculates codon usage
Description:
This program calculates codon usage.
Usage:
(pointer HASH) = &codon_usage(G instance);
(pointer HASH) = &codon_usage('tRNAscan-SE_output_file');
Options:
-output output option (default: 'show')
'g' for graph, 'show' for showing the graph,
'stdout' for standard output,
'f' for file output in directory "data"
-filename output filename (default: 'codon_usage.png')
-CDSid CDS ID number (default: '')
Author:
Koya Mori (mory@g-language.org)
History:
20090908-01 now accepts tRNAscan-SE output as first argument
20090312-01 fixed bug in filename option
20010721-01 initial posting
=cut
sub codon_usage{
&opt_default(CDSid=>"",filename=>'codon_usage.png',output=>'show',version=>1);
my @args=opt_get(@_);
my $gb=shift @args;
my $key=opt_val("CDSid");
my $filename=opt_val("filename");
my $output=opt_val("output");
my $version = opt_val("version");
my $codon;
my $amino;
my %result;
my %usage;
my %option;
if(-e $gb){
require G::Seq::AminoAcid;
foreach my $line (readFile($gb, 1)){
my @tmp = split(/\s+/, $line);
next unless($tmp[1] =~ /^\d+$/);
my $code = G::Seq::AminoAcid::three2one($tmp[4]);
$code = 'Pseudo' unless length($code);
$usage{$code}{G::Seq::Primitive::complement(lc($tmp[5]))} ++;
}
$usage{'/'}{taa} = 0.0001;
$usage{'/'}{tga} = 0.0001;
$usage{'/'}{tag} = 0.0001;
$option{'-display'} = 'number';
}elsif(length $key){
%usage=codon_amino_counter($gb,$key);
}else{
foreach($gb->cds()){
%result=();
%result=codon_amino_counter($gb,$_);
foreach $amino (keys(%result)){
foreach $codon (keys(%{$result{$amino}})){
$usage{$amino}{$codon}+=$result{$amino}{$codon};
}
}
}
}
if($output eq "f"){
_codon_usage_printer(\%usage,-output=>"f",-filename=>"$filename");
}
if($output!~/[fn]/){
_codon_usage_printer(\%usage,-output=>"stdout");
}
if ($version == 2){
_codon_table2(\%usage,-output=>$output, -filename=>$filename) if($output=~/(g|show)/ && $key eq "");
}else{
_codon_table(\%usage,-output=>$output, -filename=>$filename, %option) if($output=~/(g|show)/ && $key eq "");
}
return\% usage;
}
sub _codon_table{
&opt_default(output=>"show",filename=>"codon_table.png",display=>"percent");
my @args=opt_get(@_);
my $result=shift @args;
my $filename=opt_val("filename");
my $output=opt_val("output");
my $display=opt_val("display");
my ($x, $y, %amino, %data, %per, $amino_total, $codon, $amino, $v, $h);
my %exception;
my $CoDoN;
my %color;
my $im = new GD::Image(500,550);
my $white = $im->colorAllocate(255,255,255);
my $black = $im->colorAllocate(0,0,0);
my $red = $im->colorAllocate(255,0,0);
my $yellow = $im->colorAllocate(200,200,0);
my $green = $im->colorAllocate(0,150,0);
my $blue = $im->colorAllocate(0,0,255);
$color{$_} = $yellow for (qw/D E/);
$color{$_} = $red for (qw/R K H/);
$color{$_} = $blue for (qw/N Q S T Y/);
$color{$_} = $green for (qw/A G V L I P F M W C/);
$color{'/'}=$black;
foreach((10,50,450,490)){
$x=$_;
for($y=10;$y<450;$y++){
$im->setPixel($x,$y,$black);
}
}
foreach((150,250,350)){
$x=$_;
for($y=30;$y<450;$y++){
$im->setPixel($x,$y,$black);
}
}
$y=30;
for($x=50;$x<450;$x++){
$im->setPixel($x,$y,$black);
}
foreach((10,50,150,250,350,450)){
$y=$_;
for($x=10;$x<490;$x++){
$im->setPixel($x,$y,$black);
}
}
$im->string(gdSmallFont,15,25,"first",$red);
$im->string(gdSmallFont,233,15,"second",$green);
$im->string(gdSmallFont,455,25,"third",$blue);
$im->string(gdSmallFont,30,95,"T",$red);
$im->string(gdSmallFont,30,195,"C",$red);
$im->string(gdSmallFont,30,295,"A",$red);
$im->string(gdSmallFont,30,395,"G",$red);
$im->string(gdSmallFont,100,35,"T",$green);
$im->string(gdSmallFont,200,35,"C",$green);
$im->string(gdSmallFont,300,35,"A",$green);
$im->string(gdSmallFont,400,35,"G",$green);
$im->string(gdSmallFont,470,65,"T",$blue);
$im->string(gdSmallFont,470,85,"C",$blue);
$im->string(gdSmallFont,470,105,"A",$blue);
$im->string(gdSmallFont,470,125,"G",$blue);
$im->string(gdSmallFont,470,165,"T",$blue);
$im->string(gdSmallFont,470,185,"C",$blue);
$im->string(gdSmallFont,470,205,"A",$blue);
$im->string(gdSmallFont,470,225,"G",$blue);
$im->string(gdSmallFont,470,265,"T",$blue);
$im->string(gdSmallFont,470,285,"C",$blue);
$im->string(gdSmallFont,470,305,"A",$blue);
$im->string(gdSmallFont,470,325,"G",$blue);
$im->string(gdSmallFont,470,365,"T",$blue);
$im->string(gdSmallFont,470,385,"C",$blue);
$im->string(gdSmallFont,470,405,"A",$blue);
$im->string(gdSmallFont,470,425,"G",$blue);
foreach $amino (keys(%{$result})){
$amino_total=0;
foreach $codon (keys(%{$$result{$amino}})){
$amino_total+=$$result{$amino}{$codon};
}
foreach $codon (keys(%{$$result{$amino}})){
if($$result{$amino}{$codon} > $data{$codon}){
if($data{$codon}!=""){
$exception{$codon}{amino}=$amino{$codon};
$exception{$codon}{per}=$per{$codon};
}
$data{$codon}=$$result{$amino}{$codon};
$amino{$codon}=$amino;
if($display eq 'percent'){
$per{$codon}=sprintf("%.3f",$$result{$amino}{$codon}/$amino_total); }else{
$per{$codon}=sprintf("%s",$$result{$amino}{$codon});
}
}
else{
$exception{$codon}{amino}=$amino;
if($display eq 'percent'){
$exception{$codon}{per}=sprintf("%.3f",$$result{$amino}{$codon}/$amino_total); }else{
$exception{$codon}{per}=sprintf("%s",$$result{$amino}{$codon});
}
}
}
}
$im->string(gdSmallFont,60,65,"TTT $amino{ttt} $per{ttt}",$color{$amino{ttt}});
$im->string(gdSmallFont,60,85,"TTC $amino{ttc} $per{ttc}",$color{$amino{ttc}});
$im->string(gdSmallFont,60,105,"TTA $amino{tta} $per{tta}",$color{$amino{tta}});
$im->string(gdSmallFont,60,125,"TTG $amino{ttg} $per{ttg}",$color{$amino{ttg}});
$im->string(gdSmallFont,60,165,"CTT $amino{ctt} $per{ctt}",$color{$amino{ctt}});
$im->string(gdSmallFont,60,185,"CTC $amino{ctc} $per{ctc}",$color{$amino{ctc}});
$im->string(gdSmallFont,60,205,"CTA $amino{cta} $per{cta}",$color{$amino{cta}});
$im->string(gdSmallFont,60,225,"CTG $amino{ctg} $per{ctg}",$color{$amino{ctg}});
$im->string(gdSmallFont,60,265,"ATT $amino{att} $per{att}",$color{$amino{att}});
$im->string(gdSmallFont,60,285,"ATC $amino{atc} $per{atc}",$color{$amino{atc}});
$im->string(gdSmallFont,60,305,"ATA $amino{ata} $per{ata}",$color{$amino{ata}});
$im->string(gdSmallFont,60,325,"ATG $amino{atg} $per{atg}",$color{$amino{atg}});
$im->string(gdSmallFont,60,365,"GTT $amino{gtt} $per{gtt}",$color{$amino{gtt}});
$im->string(gdSmallFont,60,385,"GTC $amino{gtc} $per{gtc}",$color{$amino{gtc}});
$im->string(gdSmallFont,60,405,"GTA $amino{gta} $per{gta}",$color{$amino{gta}});
$im->string(gdSmallFont,60,425,"GTG $amino{gtg} $per{gtg}",$color{$amino{gtg}});
$im->string(gdSmallFont,160,65,"TCT $amino{tct} $per{tct}",$color{$amino{tct}});
$im->string(gdSmallFont,160,85,"TCC $amino{tcc} $per{tcc}",$color{$amino{tcc}});
$im->string(gdSmallFont,160,105,"TCA $amino{tca} $per{tca}",$color{$amino{tca}});
$im->string(gdSmallFont,160,125,"TCG $amino{tcg} $per{tcg}",$color{$amino{tcg}});
$im->string(gdSmallFont,160,165,"CCT $amino{cct} $per{cct}",$color{$amino{cct}});
$im->string(gdSmallFont,160,185,"CCC $amino{ccc} $per{ccc}",$color{$amino{ccc}});
$im->string(gdSmallFont,160,205,"CCA $amino{cca} $per{cca}",$color{$amino{cca}});
$im->string(gdSmallFont,160,225,"CCG $amino{ccg} $per{ccg}",$color{$amino{ccg}});
$im->string(gdSmallFont,160,265,"ACT $amino{act} $per{act}",$color{$amino{act}});
$im->string(gdSmallFont,160,285,"ACC $amino{acc} $per{acc}",$color{$amino{acc}});
$im->string(gdSmallFont,160,305,"ACA $amino{aca} $per{aca}",$color{$amino{aca}});
$im->string(gdSmallFont,160,325,"ACG $amino{acg} $per{acg}",$color{$amino{acg}});
$im->string(gdSmallFont,160,365,"GCT $amino{gct} $per{gct}",$color{$amino{gct}});
$im->string(gdSmallFont,160,385,"GCC $amino{gcc} $per{gcc}",$color{$amino{gcc}});
$im->string(gdSmallFont,160,405,"GCA $amino{gca} $per{gca}",$color{$amino{gca}});
$im->string(gdSmallFont,160,425,"GCG $amino{gcg} $per{gcg}",$color{$amino{gcg}});
$im->string(gdSmallFont,260,65,"TAT $amino{tat} $per{tat}",$color{$amino{tat}});
$im->string(gdSmallFont,260,85,"TAC $amino{tac} $per{tac}",$color{$amino{tac}});
$im->string(gdSmallFont,260,105,"TAA $amino{taa} $per{taa}",$color{$amino{taa}});
$im->string(gdSmallFont,260,125,"TAG $amino{tag} $per{tag}",$color{$amino{tag}});
$im->string(gdSmallFont,260,165,"CAT $amino{cat} $per{cat}",$color{$amino{cat}});
$im->string(gdSmallFont,260,185,"CAC $amino{cac} $per{cac}",$color{$amino{cac}});
$im->string(gdSmallFont,260,205,"CAA $amino{caa} $per{caa}",$color{$amino{caa}});
$im->string(gdSmallFont,260,225,"CAG $amino{cag} $per{cag}",$color{$amino{cag}});
$im->string(gdSmallFont,260,265,"AAT $amino{aat} $per{aat}",$color{$amino{aat}});
$im->string(gdSmallFont,260,285,"AAC $amino{aac} $per{aac}",$color{$amino{aac}});
$im->string(gdSmallFont,260,305,"AAA $amino{aaa} $per{aaa}",$color{$amino{aaa}});
$im->string(gdSmallFont,260,325,"AAG $amino{aag} $per{aag}",$color{$amino{aag}});
$im->string(gdSmallFont,260,365,"GAT $amino{gat} $per{gat}",$color{$amino{gat}});
$im->string(gdSmallFont,260,385,"GAC $amino{gac} $per{gac}",$color{$amino{gac}});
$im->string(gdSmallFont,260,405,"GAA $amino{gaa} $per{gaa}",$color{$amino{gaa}});
$im->string(gdSmallFont,260,425,"GAG $amino{gag} $per{gag}",$color{$amino{gag}});
$im->string(gdSmallFont,360,65,"TGT $amino{tgt} $per{tgt}",$color{$amino{tgt}});
$im->string(gdSmallFont,360,85,"TGC $amino{tgc} $per{tgc}",$color{$amino{tgc}});
$im->string(gdSmallFont,360,105,"TGA $amino{tga} $per{tga}",$color{$amino{tga}});
$im->string(gdSmallFont,360,125,"TGG $amino{tgg} $per{tgg}",$color{$amino{tgg}});
$im->string(gdSmallFont,360,165,"CGT $amino{cgt} $per{cgt}",$color{$amino{cgt}});
$im->string(gdSmallFont,360,185,"CGC $amino{cgc} $per{cgc}",$color{$amino{cgc}});
$im->string(gdSmallFont,360,205,"CGA $amino{cga} $per{cga}",$color{$amino{cga}});
$im->string(gdSmallFont,360,225,"CGG $amino{cgg} $per{cgg}",$color{$amino{cgg}});
$im->string(gdSmallFont,360,265,"AGT $amino{agt} $per{agt}",$color{$amino{agt}});
$im->string(gdSmallFont,360,285,"AGC $amino{agc} $per{agc}",$color{$amino{agc}});
$im->string(gdSmallFont,360,305,"AGA $amino{aga} $per{aga}",$color{$amino{aga}});
$im->string(gdSmallFont,360,325,"AGG $amino{agg} $per{agg}",$color{$amino{agg}});
$im->string(gdSmallFont,360,365,"GGT $amino{ggt} $per{ggt}",$color{$amino{ggt}});
$im->string(gdSmallFont,360,385,"GGC $amino{ggc} $per{ggc}",$color{$amino{ggc}});
$im->string(gdSmallFont,360,405,"GGA $amino{gga} $per{gga}",$color{$amino{gga}});
$im->string(gdSmallFont,360,425,"GGG $amino{ggg} $per{ggg}",$color{$amino{ggg}});
$im->string(gdSmallFont,15,465,"yellow minus charge",$yellow);
$im->string(gdSmallFont,165,465,"red plus charge",$red);
$im->string(gdSmallFont,285,465,"blue noncharge",$blue);
$im->string(gdSmallFont,400,465,"green nonpolar",$green);
$im->string(gdSmallFont,20,485,"exception",$black);
$v=485;
$h=100;
foreach(sort keys(%exception)){
$color{$exception{$_}{amino}}=$black if($color{$exception{$_}{amino}}=="");
$CoDoN=uc $_;
$im->string(gdSmallFont,$h,$v,"$CoDoN $exception{$_}{amino} $exception{$_}{per}",$color{$exception{$_}{amino}});
$v+=20;
if($v == 545){
$v=485;
$h+=100;
}
}
mkdir ("graph",0777);
open(OUT,'>graph/'."$filename");
binmode OUT;
print OUT $im->png;
close(OUT);
msg_gimv("graph/$filename") if($output eq "show");
}
sub _codon_table2{
&opt_default(output=>"show",filename=>"codon_table.html");
my @args = opt_get(@_);
my $result = shift @args;
my $filename = opt_val("filename");
my $output = opt_val("output");
my (%amino, %data, %per, $amino_total, $amino);
my (%exception, %color);
foreach my $tmp (qw(D E)){ $color{$tmp} = 'gold' }
foreach my $tmp (qw(R K H)){ $color{$tmp} = 'red' }
foreach my $tmp (qw(N Q S T Y)){ $color{$tmp} = 'blue' }
foreach my $tmp (qw(A G V L I P F M W C)){ $color{$tmp} = 'green' }
$color{'/'} = 'black';
foreach $amino (keys(%{$result})){
$amino_total = 0;
foreach my $codon (keys(%{$$result{$amino}})){
$amino_total += $$result{$amino}{$codon};
}
foreach my $codon (keys(%{$$result{$amino}})){
if ($$result{$amino}{$codon} > $data{$codon}){
if ($data{$codon} != ''){
$exception{$codon}{amino} = $amino{$codon};
$exception{$codon}{per} = $per{$codon};
}
$data{$codon} = $$result{$amino}{$codon};
$amino{$codon} = $amino;
$per{$codon} = sprintf("%.3f", $$result{$amino}{$codon} / $amino_total); }
else{
$exception{$codon}{amino} = $amino;
$exception{$codon}{per} = sprintf("%.3f",$$result{$amino}{$codon} / $amino_total); }
}
}
mkdir ('graph', 0777);
open(FILE, '>graph/' . $filename);
print FILE qq(
<HTML>
<HEAD><TITLE>Codon Table v.2</TITLE>
<style type="text/css">
<!--
FONT{font-size:8pt}
-->
</style>
</HEAD>
<BODY bgcolor="white">
<table border=0 cellpadding=0 cellspacing=0><tr><td align=center valign=top>
<table border rules=all cellspacing=0 cellpadding=6>
<tr><td align=center rowspan=2><font color="red">first</font></td>
<td colspan=4 align=center><font color="green">second</font></td>
<td align=center rowspan=2><font color="blue">third</font></td>
</tr><tr>
<td align=center><font color="green">T</font></td>
<td align=center><font color="green">C</font></td>
<td align=center><font color="green">A</font></td>
<td align=center><font color="green">G</font></td>
</tr>
);
foreach my $first (qw(T C A G)){
print FILE qq(<tr><td align=center><font color="red">$first</font></td>);
foreach my $second (qw(T C A G)){
print FILE qq(<td>);
foreach my $third (qw(T C A G)){
my $triplet = $first . $second . $third;
print FILE qq(<table border=0 cellpadding=4 width="100\%"><tr>);
print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}};
print FILE '">', $triplet, '</td>', "\n";
print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}};
print FILE '">', $amino{lc($triplet)}, '</td>', "\n";
print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}};
print FILE '">', $per{lc($triplet)}, '</td>', "\n";
}
print FILE qq(</tr></table></td>);
}
print FILE qq(
<td align=center>
<table border=0 cellpadding=4>
<tr><td><font color="blue">T</font></td></tr>
<tr><td><font color="blue">C</font></td></tr>
<tr><td><font color="blue">A</font></td></tr>
<tr><td><font color="blue">G</font></td></tr>
</table>
</td>
);
}
print FILE qq(
</table><BR>
<font color="gold" style="font-size:9pt">yellow minus charge</font>\&
nbsp;\ \ \ \
<font color="red" style="font-size:9pt">red plus charge</font>\&
nbsp;\ \ \ \
<font color="blue" style="font-size:9pt">blue noncharge</font>\&
nbsp;\ \ \ \
<font color="green" style="font-size:9pt">green nonpolar</font>\&
nbsp;\ \ \ \
<a href="http://prowl.rockefeller.edu/aainfo/contents.htm"
style="font-size:9pt" target="_blank">(learn more here)</a> <BR><BR> <div align=left>exceptions:</div>
<table border=0 cellpadding=10 cellspacing=0><tr>
);
my $i = 0;
foreach my $triplet (sort keys(%exception)){
$color{$exception{$triplet}{amino}} = 'black'
if($color{$exception{$triplet}{amino}} eq '');
print FILE '<td><font color="', $color{$exception{$triplet}{amino}};
print FILE '">', uc $triplet, ' ',
$exception{$triplet}{amino}, ' ';
print FILE $exception{$triplet}{per}, '</font></td>';
if ($i == 4){
print FILE '</tr><tr>';
$i = -1;
}
$i ++;
}
print FILE qq(
</tr></table>
<BR>
<form method="GET" action="http://www.kazusa.or.jp/codon/cgi-bin/spsearch.cgi">
Search <a href="http://www.kazusa.or.jp/codon/">Kazusa Codon Usage Database</a>:
<input size="20" name="species">
<input type=hidden name="c" value="i">
<input type=submit value="Submit">
</form>
);
print FILE "</td></tr></table></body></html>";
close(FILE);
msg_gimv("graph/$filename") if($output eq "show");
}
=head2 delta_enc
Name: delta_enc - calculate the codon usage bias related to translation optimization (delta ENC)
Description:
This method calculates the codon usage bias related to translation optimization
(delta ENC) described in Reference 1. The basic idea is to calculate the Effective
Number of Codons (ENC) for highly-expressed genes (ribosomal genes) and weakly-
expressed genes (all genes), and taking the relative difference between them.
Usage:
$delta_enc = delta_enc($genome);
Options:
None.
References:
1. Rocha EPC (2004) "Codon usage bias from tRNA's point of view: Redundancy,
specialization, and efficient decoding for translation optimization",
Genome Research, 14(11):2279-2286
Author: Kazuharu Arakawa
History:
20110224-01 initial posting
Requirements: sub geneGroup
=cut
sub delta_enc{
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $gb2 = $gb->clone();
&geneGroup($gb2);
for my $cds ($gb2->cds()){ $gb2->{$cds}->{on} = 0 unless($gb2->{$cds}->{geneGroup} =~ /RP/); }
my $interface = msg_ask_interface();
msg_interface('NULL');
my $ENCrib = enc($gb2, -output=>'null', -id=>'All');
my $ENCall = enc($gb, -output=>'null', -id=>'All');
msg_interface($interface);
unless($ENCall){
warn("Problem in codon usage. delta_enc cannot be calculated.");
return 'NA';
}
return ($ENCall - $ENCrib)/$ENCall; }
=head2 S_value
Name: S_value - calculate the strength of selected codon usage bias (S)
Description:
This method calculates the strength of selected codon usage bias (S), also known
as Sharp's S index. Using four codon pairs that are recognized by the same tRNA anticodon, namely, Phe(UUC and UUU), Ile(AUC and AUU), Tyr(UAC and UAU), and Asn(AAC and AAU), since the former in each of the pairs has stronger Watson-Crick pairing, selection towards the former codon can be observed for highly expressed genes. S index is therefore the weighted average of such bias, giving an over-all value for a genome, indicating its strength of selected codon usage bias. See Reference 1 for details.
Sharp originally defined 40 genes as the highly expressed gene group, with tufA, tsf, fusA, rplA-rplF, rplI-rplT, rpsB-rpsT. Since the identificaiton of these genes is not convenient for computational automation, by default, this method uses ribosomal proteins as the highly expressed gene group, as used by Rocha and colleagues in Reference 2.
However, Sharp's gene group can be optionally used with -sharp=>1 option.
With this option, all of the 40 genes must be named accordingly in the given
genome file.
Usage:
$s_value = S_value($genome);
Options:
-sharp set to 1 to use the 40 genes used by Sharp instead of ribosomal proteins
(default: 0)
References:
1. Sharp PM et al. (2005) "Variation in the strength of selected codon usage bias among
bacteria", Nucleic Acids Research, 33(4):1141-1153
2. Vieira-Silva S and Rocha EPC (2010) "The systemic imprint of growth and its uses in
ecological (meta)genomics", PLoS Genetics, 6(1):e1000808
Author: Kazuharu Arakawa
History:
20110224-01 initial posting
Requirements: sub geneGroup
=cut
sub S_value{
opt_default('sharp'=>0);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $gb2 = $gb->clone();
my %opt = opt_val();
&geneGroup($gb2);
for my $cds ($gb2->cds()){
if($opt{'sharp'}){
$gb2->{$cds}->{on} = 0 unless($gb2->{$cds}->{geneGroup} =~ /highlyExpressed/);
}else{
$gb2->{$cds}->{on} = 0 unless($gb2->{$cds}->{geneGroup} =~ /RP/);
}
}
my $Crib = codon_compiler($gb2, -data=>'R1', -output=>'null');
my $Call = codon_compiler($gb, -data=>'R1', -output=>'null');
my ($S, $tot, $error);
for my $pair (qw/Fttc-Fttt Iatc-Iatt Ytac-Ytat Naac-Naat/){
my ($c1, $c2) = split(/-/, $pair, 2);
if(($$Crib{$c1} + $$Crib{$c2}) == 0 ||
($$Call{$c1} + $$Call{$c2}) == 0
){
warn("Problem in codon usage. S cannot be calculated.");
$error = 1;
last;
}
my $Prib = $$Crib{$c1}/($$Crib{$c1}+$$Crib{$c2}); my $Pall = $$Call{$c1}/($$Call{$c1}+$$Call{$c2});
if($Prib == 1 || $Prib == 0){
warn("Problem in codon usage. S cannot be calculated.");
$error = 1;
last;
}
$S += (($$Crib{$c1} + $$Crib{$c2}) * log(($Prib*((1-$Pall)/$Pall))/(1-$Prib)));
$tot += ($$Crib{$c1} + $$Crib{$c2});
}
return $error ? 'NA' : $S/$tot; }
=head2 geneGroup
Name: geneGroup - assign a gene group based on function
Description:
This method assigns a gene group based on function and expression:
'aaRS', Aminoacyl tRNA synthetase
'EF', Translation elongation factor
'RP', Ribosomal protein
'highlyExpressed', 40 highly expressed genes compiled by Sharp (2005)
Usage:
$number = geneGroup($gb);
Options:
-number set to 1 to print the number of genes in each group (default: 0)
References:
1. Sharp PM et al. (2005) "Variation in the strength of selected codon usage bias among
bacteria", Nucleic Acids Research, 33(4):1141-1153
Author: Haruo Suzuki
History:
20110309-01 initial posting
=cut
sub geneGroup{
opt_default('number'=>0);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my %opt = opt_val();
my %number;
for my $cds ($gb->cds()){
my $gene = lcfirst($gb->{$cds}->{gene});
my $product = $gb->{$cds}->{product};
my $gpn = "gene[$gb->{$cds}->{gene}] product[$gb->{$cds}->{product}] note[$gb->{$cds}->{note}]";
if($gpn =~ /elongation.*factor/i && $gpn !~ /transcription elongation factor/i){
$gpn =~ s/,.+//g;
$gpn =~ s/(EF1A|1-alpha|TU)/Tu/;
$gpn =~ s/(EF1B|1-beta|TS)/Ts/;
$gpn =~ s/EF-2/G/;
$product = "Translation elongation factor $1" if($gpn =~ /[Ee]longation [Ff]actor [\W]*(G|P|Tu|Ts)[\W]*/);
$gene = 'fus' if($1 =~ /G/); $gene = 'efp' if($1 =~ /P/);
$gene = 'tuf' if($1 =~ /Tu/);
$gene = 'tsf' if($1 =~ /Ts/);
}elsif($gpn =~ /ribosomal.*protein/i){
$product =~ s/,|putative//g;
$product =~ s/^ | $//g;
$product =~ s/(large subunit |small subunit |^[53]0S |^[LS]SU |^)ribosomal.*protein ([LS]\d{1,2}(\/L12)?)[A-Za-z]{0,2}( \(?rp[lms].+)?( \(.+\))?/Ribosomal protein $2/i;
my $alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
if($product =~ /Ribosomal protein ([LS])(\d{1,2})(\/L12)?$/){
$gene = $2 < 27 ? "rp".lc($1).substr($alphabet,$2-1,1) : "rpm".substr($alphabet,$2-27,1);
$gene = 'rplL' if($product =~ /L7\/L12/);
$gene =~ s/rpsV/sra/;
}
$gene = lcfirst($product) if($product =~ /Rp[lms][A-Z]/);
}
my @geneGroup;
my $COG40 = 'COG0480|COG0081|COG0090|COG0087|COG0088|COG0094|COG0097|COG0359|COG0244|COG0080|COG0222|COG0102|COG0093|COG0200|COG0197|COG0203|COG0256|COG0335|COG0292|COG0052|COG0092|COG0522|COG0098|COG0049|COG0096|COG0103|COG0051|COG0100|COG0048|COG0099|COG0199|COG0184|COG0228|COG0186|COG0238|COG0185|COG0268|COG0264|COG0050|COG0360';
if($gene =~ /fus|tuf|tsf|(rpl[A-F]|rpl[I-T]|rps[B-T])$/){ push(@geneGroup, 'highlyExpressed'); $number{highlyExpressed} ++; }
if($gene =~ /fus|tuf|tsf|efp/){ push(@geneGroup, 'EF'); $number{EF} ++; }
if($product =~ /Ribosomal protein [LS]/){ push(@geneGroup, 'RP'); $number{RP} ++; }
if($gpn =~ /tRNA synthetas/i){ push(@geneGroup, 'aaRS'); $number{aaRS} ++; }
if($gpn =~ /transposase/i){ push(@geneGroup, 'transposase'); $number{transposase} ++; }
$gb->{$cds}->{geneGroup} = join("\t", sort @geneGroup);
}
if($opt{'number'}){ for (sort keys %number){ print "$_=$number{$_}; "; } print "\n"; }
return\% number;
}
=head2 Dmean
Name: Dmean - calculate synonymous codon usage diversity (Dmean)
Description:
This method calculates the level of diversity in synonymous codon usage among genes.
Usage:
$Dmean = Dmean($genome);
Options:
-translate 1 when translates using standard codon table (default: 0)
-del_key regular expression to delete key (i.e. amino acids and nucleotides)
(default: '[^ACDEFGHIKLMNPQRSTVWYacgtU\/]')
-data data matrix of amino acid and codon usage (default: 'R4')
Author:
Haruo Suzuki (haruo@g-language.org)
History:
20120620 initial posting
References:
Suzuki H. et al. (2009) BMC Bioinformatics. 10:167.
Requirements: sub codon_compiler
=cut
sub Dmean{
&opt_default(translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU\/]', data=>'R4');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $data = opt_val("data");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $fh = File::Temp->new;
my $fname = $fh->filename;
my $usageAll = &codon_compiler($gb, -output=>"null", -translate=>$translate, -del_key=>$del_key, -data=>$data, -id=>'All');
my @keyAll = sort keys %$usageAll;
print $fh join(',', @keyAll), "\n";
foreach my $cds ($gb->cds()){
next if(defined $gb->{$cds}->{pseudo}); my $usage = &codon_compiler($gb, -output=>"null", -translate=>$translate, -del_key=>$del_key, -data=>$data, -id=>$cds);
my @tmp;
foreach (@keyAll){ push(@tmp, $$usage{$_} || 0); }
print $fh join(',', @tmp), "\n";
}
if(-z $fname){ warn("$gb->{ACCESSION}: Dmean cannot be calculated without CDS."); return; }
my $rcmd= new Rcmd;
my @result = $rcmd->exec(
qq||
dat = read.csv("$fname")
usage = dat#[apply(dat, 1, sd) > 0,] # pseudo
mean(as.dist(1 - cor(t(usage))))
|
);
my $Dmean = sprintf "%.4f", shift @result;
return $Dmean;
}
=head2 w_tai
Name: w_tai - calculate relative codon adaptiveness (W) values for tAI
Description:
This program calculates the 'relative adaptiveness of each codon' (W value)
necessary for tAI analysis. Returned value is a pointer HASH of W values.
Usage:
pointer HASH_w_tai = &w_tai(G instance);
Options:
-tRNA pointer array of vectors of tRNA gene copy number. By default,
it is retrieved from tRNADB-CE (http://trna.nagahama-i-bio.ac.jp/). -sking Superkingdom: 0-Eukaryota, 1-Prokaryota (default: 1)
Author: Haruo Suzuki (haruo@g-language.org) Kazuharu Arakawa (gaou@g-language.org)
History: 20140514 first release
References: dos Reis M et al. (2004) Nucleic Acids Res. 32(17):5036-44. http://people.cryst.bbk.ac.uk/~fdosr01/tAI/
=cut
sub w_tai {
&opt_default(sking=>1);
my @args=opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $sking = opt_val("sking");
my @tRNA;
if(opt_val("tRNA")){ my $tmp = opt_val("tRNA"); @tRNA = @$tmp; }
else{
my $acc = $gb->{ACCESSION};
$acc = refseq2gbk($acc) if ($acc =~ /^NC_/);
my %count;
my $html = readFile('http://trna.ie.niigata-u.ac.jp/cgi-bin/trnadb/whole_seq_list.cgi?GID=|' . $acc . '&DTYPE=CMP');
$html =~ s/(?:\r|\n)//g;
for my $line (split(/<TR>/, $html)){
if($line =~ /<TD class=\"cellcolor(?:.*?)align=right>(\d+)<\/TD>(?:.*?)align=right>(\d+)<\/TD>(?:.*?)align=right>([+-])<\/TD>(?:.*?)>(\S+)<\/TD>(?:.*?)>(\S+)<\/TD>/){
$count{$5} ++;
}
}
my @anti = ('AAA','GAA','TAA','CAA',
'AGA','GGA','TGA','CGA',
'ATA','GTA','TTA','CTA',
'ACA','GCA','TCA','CCA',
'AAG','GAG','TAG','CAG',
'AGG','GGG','TGG','CGG',
'ATG','GTG','TTG','CTG',
'ACG','GCG','TCG','CCG',
'AAT','GAT','TAT','CAT',
'AGT','GGT','TGT','CGT',
'ATT','GTT','TTT','CTT',
'ACT','GCT','TCT','CCT',
'AAC','GAC','TAC','CAC',
'AGC','GGC','TGC','CGC',
'ATC','GTC','TTC','CTC',
'ACC','GCC','TCC','CCC');
foreach(@anti){ push(@tRNA, $count{$_} || 0); }
}
my @s; @s = (0, 0, 0, 0, 0.5, 0.5, 0.75, 0.5, 0.5); @s = (0.0, 0.0, 0.0, 0.0, 0.41, 0.28, 0.9999, 0.68, 0.89); my @p; foreach(@s){ push(@p, 1 - $_); }
my @W;
for (my $i; $i < 61; $i = $i + 4){
push(@W,
@p[0]*@tRNA[$i] + @p[4]*@tRNA[$i+1], @p[1]*@tRNA[$i+1] + @p[5]*@tRNA[$i], @p[2]*@tRNA[$i+2] + @p[6]*@tRNA[$i], @p[3]*@tRNA[$i+3] + @p[7]*@tRNA[$i+2]) }
$W[34] = $p[8] if($sking == 1);
$W[10] = 0; $W[11] = 0; $W[14] = 0; $W[35] = 0;
my $max; for(@W){ $max = $_ if ($_ > $max); }
my @ws; foreach(@W){ push(@ws, $_/$max); } # substitute ws of 0 by geometric mean (gm) my ($cnt, $sum); for(@ws){ if($_ != 0){ $cnt ++; $sum += log($_); } }
my $gm = exp($sum / $cnt); for(@ws){ if($_ == 0){ $_ = $gm; } }
my @codon = (
'Fttt','Fttc','Ltta','Lttg',
'Stct','Stcc','Stca','Stcg',
'Ytat','Ytac','/taa','/tag',
'Ctgt','Ctgc','/tga','Wtgg',
'Lctt','Lctc','Lcta','Lctg',
'Pcct','Pccc','Pcca','Pccg',
'Hcat','Hcac','Qcaa','Qcag',
'Rcgt','Rcgc','Rcga','Rcgg',
'Iatt','Iatc','Iata','Matg',
'Tact','Tacc','Taca','Tacg',
'Naat','Naac','Kaaa','Kaag',
'Sagt','Sagc','Raga','Ragg',
'Vgtt','Vgtc','Vgta','Vgtg',
'Agct','Agcc','Agca','Agcg',
'Dgat','Dgac','Egaa','Egag',
'Gggt','Gggc','Ggga','Gggg');
my @key_val;
foreach(@codon){ push(@key_val, $_, shift(@ws) ); }
my %w_val = @key_val;
delete $w_val{'/taa'};
delete $w_val{'/tag'};
delete $w_val{'/tga'};
delete $w_val{'Matg'};
return\% w_val;
}
1;} |