sub enc
{ &opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my %tbox; my $aa; my %relative; my %F; my %cntaa; my %numaa; my %sumF; my %aveF; my $enc;
my $count = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C0',
-translate=>$translate, -del_key=>$del_key);
foreach (keys %{$count}){ $tbox{substr($_, 0, 1)} += $count->{$_}; }
foreach (keys %{$count}){
$aa = substr($_, 0, 1);
if($tbox{$aa} - 1 == 0){ delete $tbox{$aa}; }
else { $relative{$_} = $count->{$_} / $tbox{$aa}; } } foreach (keys %relative){ $F{substr($_, 0, 1)} += $relative{$_} ** 2; } foreach $aa (keys %F){ $F{$aa} = ($tbox{$aa} * $F{$aa} - 1) / ($tbox{$aa} - 1);
}
foreach $aa (keys %F){
if($F{$aa} > 0.0000001){
$sumF{$gb->{degeneracy}->{$aa}} += $F{$aa};
$cntaa{$gb->{degeneracy}->{$aa}} ++;
}
}
foreach (values %{$gb->{degeneracy}}){ $numaa{$_} ++; }
foreach (keys %numaa){
if($_ == 1){ $aveF{$_} = 1; }
elsif($cntaa{$_} && $sumF{$_}){
$aveF{$_} = $sumF{$_} / $cntaa{$_}; }
elsif($_ == 3 && $sumF{2} && $sumF{4}){
$aveF{3} = ($sumF{2} / $cntaa{2} + $sumF{4} / $cntaa{4}) / 2; }
else { $enc = -999; }
}
unless($enc){ foreach (keys %numaa){ $enc += $numaa{$_} / $aveF{$_}; }} return ($enc > 61) ? 61.00 : sprintf "%.2f", $enc; }
=head2 cbi
cbi ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: scalar = &cbi(pointer G instance);
Options:
-id ID of a group of genes or a single gene (default:'All')
-translate 1 when translates using regular codon table (default: 0)
-del_key regular expression to delete amino acid and codon keys
(default:'[^ACDEFGHIKLMNPQRSTVWYacgtU]')
Description:
This program calculates the codon bias index (CBI)
Reference: Morton BR (1993) J.Mol.Evol. 37:273-280.
Requirements: sub codon_compiler();
=cut
sub cbi {
&opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $aacu = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'A1C4',
-translate=>$translate, -del_key=>$del_key);
my %bias; my $cbi;
foreach (keys %{$gb->{All}->{A1C4}}){
my $aa = substr($_, 0, 1);
$bias{$aa} += ($aacu->{$_} - 1) ** 2 / ($gb->{degeneracy}->{$aa} - 1) if($aacu->{$aa} && length($_) == 4 && $gb->{degeneracy}->{$aa} > 1); }
foreach (keys %bias){ $cbi += $aacu->{$_} * $bias{$_}; }
return sprintf "%.4f", $cbi;
}
=head2 icdi
icdi ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: scalar = &icdi(pointer G instance);
Options:
-id ID of a group of genes or a single gene (default:'All')
-translate 1 when translates using regular codon table (default: 0)
-del_key regular expression to delete amino acid and codon keys
(default:'[^ACDEFGHIKLMNPQRSTVWYacgtU]')
Description:
This program calculates the intrinsic codon deviation index (ICDI)
Reference: Freire-Picos MA et al. (1994) Gene 139:43-49.
Requirements: sub codon_compiler();
=cut
sub icdi {
&opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $rscu; my %tbox; my %Sk; my $icdi; my $ndaa;
$rscu = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C3',
-translate=>$translate, -del_key=>$del_key);
foreach (keys %{$gb->{All}->{C3}}){
$tbox{substr($_, 0, 1)} += $rscu->{$_};
}
foreach (keys %{$gb->{All}->{C3}}){
my $aa = substr($_, 0, 1);
$Sk{$aa} += ($rscu->{$_} - 1) ** 2 if($tbox{$aa} > 1);
}
foreach (keys %Sk){
my $k = $gb->{degeneracy}->{$_};
$icdi += $Sk{$_} / ($k * ($k - 1)); $ndaa ++;
}
return sprintf "%.4f", $icdi /= $ndaa if($ndaa); }
=head2 fop
fop ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &fop(pointer G instance);
Options:
-output output option (default: 'stdout')
-filename output filename (default: 'fop.csv')
-binary pointer HASH of vectors of binary number, 1 or 0 refering to
the optimal or nonoptimal codon. By default, four amino acids
(Ile, Asn, Phe, and Tyr), where the optimal codons seem to be
the same in all species, are used.
-translate 1 when translates using regular codon table (default: 0)
Description:
This program calculates the frequency of optimal codons (Fop), and
inputs in the G instance. i.e. Fop values will be accessible at
$gb->{"FEATURE$i"}->{fop}
Reference:
Ikemura (1981) J.Mol.Biol. 151:389-409.
Ikemura (1985) Mol.Biol.Evol. 2(1):13-34.
Kanaya et al. (1999) Gene 238:143-155.
Requirements: none.
=cut
sub fop {
&opt_default(output=>'stdout', filename=>'fop.csv', translate=>0);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my %Sc = ('Lcta'=>0,'Lctc'=>0,'Lctg'=>0,'Lctt'=>0,'Ltta'=>0,'Lttg'=>1,
'Raga'=>1,'Ragg'=>0,'Rcga'=>0,'Rcgc'=>0,'Rcgg'=>0,'Rcgt'=>0,
'Sagc'=>0,'Sagt'=>0,'Stca'=>0,'Stcc'=>1,'Stcg'=>0,'Stct'=>1,
'Agca'=>0,'Agcc'=>1,'Agcg'=>0,'Agct'=>1,
'Ggga'=>0,'Gggc'=>1,'Gggg'=>0,'Gggt'=>1,
'Pcca'=>0,'Pccc'=>0,'Pccg'=>0,'Pcct'=>0,
'Taca'=>0,'Tacc'=>1,'Tacg'=>0,'Tact'=>1,
'Vgta'=>0,'Vgtc'=>1,'Vgtg'=>0,'Vgtt'=>1,
'Iata'=>0,'Iatc'=>1,'Iatt'=>1,
'Naac'=>1,'Naat'=>0,
'Dgac'=>1,'Dgat'=>0,
'Ctgc'=>0,'Ctgt'=>0,
'Qcaa'=>0,'Qcag'=>0,
'Egaa'=>1,'Egag'=>0,
'Hcac'=>1,'Hcat'=>0,
'Kaaa'=>0,'Kaag'=>1,
'Fttc'=>1,'Fttt'=>0,
'Ytac'=>1,'Ytat'=>0);
my %Ec = ('Lcta'=>0,'Lctc'=>0,'Lctg'=>1,'Lctt'=>0,'Ltta'=>0,'Lttg'=>0,
'Raga'=>0,'Ragg'=>0,'Rcga'=>0,'Rcgc'=>1,'Rcgg'=>0,'Rcgt'=>1,
'Sagc'=>0,'Sagt'=>0,'Stca'=>0,'Stcc'=>0,'Stcg'=>0,'Stct'=>0,
'Agca'=>1,'Agcc'=>0,'Agcg'=>1,'Agct'=>1,
'Ggga'=>0,'Gggc'=>1,'Gggg'=>0,'Gggt'=>1,
'Pcca'=>0,'Pccc'=>0,'Pccg'=>1,'Pcct'=>0,
'Taca'=>0,'Tacc'=>1,'Tacg'=>0,'Tact'=>1,
'Vgta'=>1,'Vgtc'=>0,'Vgtg'=>1,'Vgtt'=>1, 'Vgta'=>1,'Vgtc'=>0,'Vgtg'=>0,'Vgtt'=>1, 'Iata'=>0,'Iatc'=>1,'Iatt'=>0,
'Naac'=>1,'Naat'=>0,
'Dgac'=>1,'Dgat'=>0,
'Ctgc'=>0,'Ctgt'=>0,
'Qcaa'=>0,'Qcag'=>1,
'Egaa'=>1,'Egag'=>0,
'Hcac'=>1,'Hcat'=>0,
'Kaaa'=>1,'Kaag'=>0,
'Fttc'=>1,'Fttt'=>0,
'Ytac'=>1,'Ytat'=>0);
my %four = ('Iatc'=>1,'Iatt'=>0, 'Naac'=>1,'Naat'=>0,
'Fttc'=>1,'Fttt'=>0,
'Ytac'=>1,'Ytat'=>0);
my $binary = opt_val("binary") ? opt_val("binary") :\% four;
my %degeneracy;
foreach (keys %{$binary}){ $degeneracy{substr($_, 0, 1)} ++; }
if($output eq 'stdout'){
msg_send("\n\n",
"optimal (1) and nonoptimal (0) codons\n",
"-------------------------------------\n"
);
foreach (sort keys %$binary){ msg_send("$_\t"); }
print "\n";
foreach (sort keys %$binary){ msg_send("$binary->{$_}\t"); }
msg_send("\n\n",
"Frequency of optimal codons (Fop; ratio of optimal codons to synonymous codons)\n",
" Laa: length in amino acids\n",
" Ls: length in silent sites (number of synonymous codons)\n",
"--------------------------------------------\n",
"Laa\tLs\tstart\tstop\tFop\tgene\n",
"--------------------------------------------\n");
}
if($output eq 'f'){
open(FILE,">>$filename");
print FILE
"Frequency of optimal codons (Fop; ratio of optimal codons to synonymous codons)\n",
" Laa: length in amino acids\n",
" Ls: length in silent sites (number of synonymous codons)\n",
"Laa,Ls,start codon,stop codon,Fop,gene,product\n";
}
foreach my $cds ($gb->cds()){
my $seq = substr($gb->get_geneseq($cds), $gb->{$cds}->{codon_start} - 1 + 3, -3);
my $aaseq;
my $i;
if($translate){
while($i <= length($seq) - 3){
my $codon = substr($seq, $i, 3);
$aaseq .= ($codon =~ /[^acgt]/) ? '?' : $Table{$codon};
$i += 3;
}
}
else { $aaseq = substr($gb->{$cds}->{translation}, 1); }
my $Laa = length($aaseq);
my $Ls; my $fop;
while($aaseq){
my $aa = substr($aaseq, 0, 1);
substr($aaseq, 0, 1) = '';
my $codon = substr($seq, $i, 3);
if($degeneracy{$aa} > 1 && exists $binary->{$aa.$codon}){
$Ls ++;
$fop += $binary->{$aa.$codon};
}
$i += 3;
}
$gb->{$cds}->{fop} = sprintf "%.4f", $fop / $Ls if($Ls > 1);
if($output eq 'stdout'){
msg_send("$Laa\t$Ls\t",
$gb->startcodon($cds), "\t",
$gb->stopcodon($cds), "\t",
$gb->{$cds}->{fop}, "\t",
$gb->{$cds}->{gene}, "\n");
}
if($output eq 'f'){
print FILE
"$Laa,$Ls,",
$gb->startcodon($cds), ",",
$gb->stopcodon($cds), ",",
$gb->{$cds}->{fop}, ",",
$gb->{$cds}->{gene}, ",",
$gb->{$cds}->{product}, "\n";
}
}
close(FILE);
}
=head2 w_value
w_value ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: pointer HASH_w_values = &w_value(pointer G instance);
Options:
-in regular expression to include genes in a reference set
a reference set in several studies are in-built
1: Nakamura & Tabata, 2: Sharp & Li, 3: Sakai et al.
(default: 1)
-ex regular expression to exclude genes from a reference set
(default: '[Mm]itochondrial')
Description:
Calculates the W value necessary for CAI analysis.
Returned value is a pointer HASH of W values.
Reference:
Sharp and Li (1987) Nucleic Acids Res. 15:1281-1295.
Nakamura and Tabata (1997) Microb.Comp.Genomics 2:299-312.
Sakai et al. (2001) J.Mol.Evol. 52:164-170.
History:
20030401 addition of -in & -out option
20010830 first release (Author: Kazuharu Arakawa)
Requirements: none.
=cut
sub w_value {
&opt_default(output=>'stdout', filename=>"w_value.csv",
in=>'ribosomal.*protein', ex=>'[Mm]itochondrial');
my @args=opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $include;
my $exclude = opt_val("ex");
if(opt_val("in") == 1){ $include = 'ribosomal.*protein'; }
elsif(opt_val("in") == 2){ $include = 'ribosomal.*protein|outer membrane protein|^elongation factor'; }
elsif(opt_val("in") == 3){ $include = 'ribosom.*protein|outer.*membrane|elongation|heat.*shock|RNA.*polymerase.*subunit'; }
else{ $include = opt_val("in"); }
if($output eq 'stdout'){
msg_send("\n",
"Reference set of highly expressed genes\n",
"-----------------------------------------\n",
"length\tgene\tproduct\n",
"-----------------------------------------\n");
}
if($output eq 'f'){
open(FILE,">>$filename");
print FILE
"Reference set of highly expressed genes\n",
"length,gene,product\n";
}
my $aaseq;
my $seq;
my $aa;
my $codon;
my %count;
my $i;
my %max;
my %w_val;
my $cnt;
foreach my $cds ($gb->cds()){
next if ($gb->{$cds}->{note} !~ /$include/i &&
$gb->{$cds}->{product} !~ /$include/i);
next if ($gb->{$cds}->{product} =~ /$exclude/);
$cnt ++;
$aaseq = substr($gb->{$cds}->{translation}, 1);
$seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3);
while($aaseq){
$aa = substr($aaseq, 0, 1);
substr($aaseq, 0, 1) = '';
$codon = substr($seq, $i, 3);
$count{$aa.$codon} ++;
$i += 3;
}
$i = 0;
if($output eq 'stdout'){
msg_send(length($gb->{$cds}->{translation}) - 1, "\t",
$gb->{$cds}->{gene}, "\t",
$gb->{$cds}->{product}, "\n");
}
if($output eq 'f'){
print FILE
length($gb->{$cds}->{translation}) - 1, ",",
$gb->{$cds}->{gene}, ",",
$gb->{$cds}->{product}, "\n";
}
}
&msg_error("\ncai: No reference proteins found (could not define W values).") unless($cnt);
foreach (keys %count){
$aa = substr($_, 0, 1);
$max{$aa} = $count{$_} if($count{$_} > $max{$aa});
}
foreach (keys %count){
$w_val{$_} = sprintf "%.4f", $count{$_} / $max{substr($_, 0, 1)}; }
if($output eq 'stdout'){
msg_send("\nRelative adaptiveness (W) of each codon\n",
"-----------------------------------------\n",
"aa\tcodon\tW value\n",
"-----------------------------------------\n");
foreach (sort keys %count){
$aa = substr($_, 0, 1);
$codon = substr($_, 1, 3);
msg_send("$aa\t$codon\t$w_val{$_}\n");
}
}
if($output eq 'f'){
print FILE
"\nRelative adaptiveness (W) of each codon\n",
"amino acid,codon,W value\n";
foreach (sort keys %count){
$aa = substr($_, 0, 1);
$codon = substr($_, 1, 3);
print FILE "$aa,$codon,$w_val{$_}\n";
}
close(FILE);
}
return\% w_val;
}
=head2 cai
cai ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &cai(pointer G instance);
Options:
-output output option (default: stdout)
-filename output filename (default: "cai.csv")
-translate 1 when translates using regular codon table (default: 0)
-w_output output option for W value (default: stdout)
-w_filename filename to output the W values (default: "w_value.csv")
-w_values pointer HASH of W values used in CAI calculation
-w_absent W value of codons absent from a reference set (default: "-")
a value of 0.5 was suggested by Sharp and Li (1987)
"-" when excludes such codons from the calculation
Description:
This program calculates codon adaptation index (CAI) for each genes,
and inputs in the G instance. i.e. CAI values will be accessible at
$gb->{"FEATURE$i"}->{cai};
History:
20030401 addition of -w_absent option
20010830 first release (Author: Kazuharu Arakawa)
Requirements: sub w_value()
=cut
sub cai {
&opt_default(output=>'stdout', filename=>"cai.csv", translate=>0,
w_output=>'stdout', w_filename=>"w_value.csv", w_absent=>"-");
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my $w_output = opt_val("w_output");
my $w_fname = opt_val("w_filename");
my $w_abs = opt_val("w_absent");
my $w_val;
$w_output = $output if($w_output eq 'stdout' && $output ne 'stdout');
if(opt_val("w_values")){
$w_val = opt_val("w_values");
}else{
$w_val = &w_value($gb, -output=>$w_output, -filename=>$w_fname);
}
if($output eq 'stdout'){
msg_send("\n",
"Codon Adaptation Index (CAI; geometric mean of W values)\n",
"--------------------------------------------------------\n",
"length\tstart\tstop\tCAI\tgene\n",
"--------------------------------------------------------\n");
}
if($output eq 'f'){
open(FILE,">>$filename");
print FILE
"Codon Adaptation Index (CAI; geometric mean of W values)\n",
"length,start codon,stop codon,CAI,gene,product\n";
}
foreach my $cds ($gb->cds()){
my $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); my $aaseq;
my $i;
my $codon;
if($translate){
while($i <= length($seq) - 3){
$codon = substr($seq, $i, 3);
$aaseq .= ($codon =~ /[^acgt]/) ? '?' : $Table{$codon};
$i += 3;
}
$i = 0;
}
else { $aaseq = substr($gb->{$cds}->{translation}, 1); }
my $cai;
my $L;
for($i = 0; $i * 3 <= length($seq) - 3; $i ++){
my $aa = substr($aaseq, $i, 1);
$codon = substr($seq, $i * 3, 3);
if($w_val->{$aa.$codon} > 0){ $cai += log($w_val->{$aa.$codon}); }
elsif($w_abs){ next if($w_abs eq '-'); $cai += log($w_abs); }
else { $cai = 999; last; }
$L ++;
}
$gb->{$cds}->{cai} = ($cai == 999) ? 0 : sprintf "%.4f", exp($cai/$L) if($L);
if($output eq 'stdout'){
msg_send("$L\t",
$gb->startcodon($cds), "\t",
$gb->stopcodon($cds), "\t",
$gb->{$cds}->{cai}, "\t",
$gb->{$cds}->{gene}, "\n");
}
if($output eq 'f'){
print FILE
"$L,",
$gb->startcodon($cds), ",",
$gb->stopcodon($cds), ",",
$gb->{$cds}->{cai}, ",",
$gb->{$cds}->{gene}, ",",
$gb->{$cds}->{product}, "\n";
}
}
close(FILE);
}
=head2 phx
phx ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &phx(pointer G instance);
Options:
-output output option (default: 'stdout')
-filename output filename (default: "phx.csv")
-usage pointer HASH of vectors of amino acid and codon usage
of a set of highly expressed genes. By default,
the ribosomal protein genes are used.
-translate 1 when translates using regular codon table (default: 0)
-del_key regular expression to delete amino acid and codon keys
(default:'[^ACDEFGHIKLMNPQRSTVWYacgtU]')
Description:
Calculates codon usage differences between gene classes for
characterizing predicted highly expressed (PHX) genes, and
inputs in the G instance. e.g. predicted expression level,
E(g), will be accessible at $gb->{"FEATURE$i"}->{pel}
Informations stored in the G instance:
$gb->{"FEATURE$i"}->{cudC}: B(g|C)
$gb->{"FEATURE$i"}->{cudG}: B(g|G)
$gb->{"FEATURE$i"}->{pel}: E(g) [B(g|C) / B(g|G)] $gb->{"FEATURE$i"}->{phx}: 1 if a gene is PHX [i.e. E(g) > 1.05] References: Karlin and Mrazek (2000) J.Bacteriol. 182(18):5238-5250. Jansen et al. (2003) Nucleic Acids Res. 31(8):2242-2251. Requirements: sub codon_compiler()
=cut
sub phx { &opt_default(output=>'stdout', filename=>"phx.csv", usage=>"", translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]'); my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $refset = opt_val("usage");
my $cds;
my $cnt;
unless($refset){
foreach $cds ($gb->cds()){
$gb->{$cds}->{RefSet} = ( ($gb->{$cds}->{product} =~ /ribosomal.*protein/i) ? 1 : 0 );
$cnt += $gb->{$cds}->{RefSet};
}
}
if($cnt){
$refset = &codon_compiler($gb, -output=>'n', -id=>'RefSet', -data=>'A1C2', -del_key=>$del_key);
}
else {
&msg_error("\nphx: No reference proteins found.");
$refset = &codon_compiler($gb, -output=>'n', -data=>'A1C2', -del_key=>$del_key);
}
if($output eq 'stdout'){
msg_send("\n\n",
"Predicted highly expressed (PHX) genes\n",
" B(g|C): codon usage difference relative to collection of all genes\n",
" B(g|G): codon usage difference relative to a set of highly expressed genes\n",
" E(g): predicted expression level [B(g|C) / B(g|G)]\n",
" PHX = 1 if E(g) > 1.05\n",
"------------------------------------------------------------\n",
"length\tstart\tstop\tB(g|C)\tB(g|G)\tE(g)\tPHX\tgene\n",
"------------------------------------------------------------\n");
}
if($output eq 'f'){
open(FILE,">>$filename");
print FILE
"Predicted highly expressed (PHX) genes\n",
" B(g|C): codon usage difference relative to collection of all genes\n",
" B(g|G): codon usage difference relative to a set of highly expressed genes\n",
" E(g): predicted expression level [B(g|C) / B(g|G)]\n",
" PHX = 1 if E(g) > 1.05\n",
"length,start codon,stop codon,B(g|C),B(g|G),E(g),PHX,gene,product\n";
}
foreach $cds ($gb->cds()){
my $aacu = &codon_compiler($gb, -output=>'n', -id=>$cds, -data=>'A1C2', -translate=>$translate, -del_key=>$del_key);
my %boxC;
my %boxG;
my $cudC; my $cudG; foreach (keys %{$gb->{All}->{A1C2}}){
if(length($_) == 4){
my $aa = substr($_, 0, 1);
$boxC{$aa} += abs($aacu->{$_} - $gb->{All}->{A1C2}->{$_});
$boxG{$aa} += abs($aacu->{$_} - $refset->{$_});
}
}
foreach (keys %boxC){
$cudC += $aacu->{$_} * $boxC{$_};
$cudG += $aacu->{$_} * $boxG{$_};
}
if($cudG){
$gb->{$cds}->{cudC} = sprintf "%.4f", $cudC;
$gb->{$cds}->{cudG} = sprintf "%.4f", $cudG;
$gb->{$cds}->{pel} = sprintf "%.4f", $cudC / $cudG; $gb->{$cds}->{phx} = (($gb->{$cds}->{pel} > 1.05) ? 1 : 0);
}
if($output eq 'stdout'){
msg_send(length(substr($gb->{$cds}->{translation}, 1)), "\t",
$gb->startcodon($cds), "\t",
$gb->stopcodon($cds), "\t",
$gb->{$cds}->{cudC}, "\t",
$gb->{$cds}->{cudG}, "\t",
$gb->{$cds}->{pel}, "\t",
$gb->{$cds}->{phx}, "\t",
$gb->{$cds}->{gene}, "\n");
}
if($output eq 'f'){
print FILE
length(substr($gb->{$cds}->{translation}, 1)), ",",
$gb->startcodon($cds), ",",
$gb->stopcodon($cds), ",",
$gb->{$cds}->{cudC}, ",",
$gb->{$cds}->{cudG}, ",",
$gb->{$cds}->{pel}, ",",
$gb->{$cds}->{phx}, ",",
$gb->{$cds}->{gene}, ",",
$gb->{$cds}->{product}, "\n";
}
}
close(FILE);
}
=head2 cui
cui ver.20031001
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &cui(pointer G instance);
Options:
-output output option (default: 'stdout')
-filename output filename (default: 'cui.csv')
-list pointer ARRAY of list of codon usage indices.
['Hcu','Dcu','Dsyn','Hs','Hw','Ew','enc','cbi','icdi','cai','phx']
Description:
Calculates indices of codon usage, and inputs in the G instance.
Requirements:
sub entropy_cu; sub entropy_scu; sub enc; sub cbi; sub icdi;
sub fop; sub w_value; sub cai; sub phx;
=cut
sub cui {
&opt_default(output=>'stdout',filename=>'cui.csv',list=>['Ew','cai','gene']);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $list = opt_val("list");
my %func;
foreach(@$list){
if($_ =~ /Hcu|Dcu|Dsyn/){ $func{entropy_cu} ++; }
elsif($_ =~ /Hs|Hw|Ew/){ $func{entropy_scu} ++; }
elsif($_ =~ /enc|cbi|icdi|fop|cai|phx/){ $func{$_} ++; }
}
foreach ($gb->cds()){
&entropy_cu($gb, -output=>'n', -id=>$_) if($func{entropy_cu});
&entropy_scu($gb, -output=>'n', -id=>$_) if($func{entropy_scu});
$gb->{$_}->{enc} = &enc($gb, -id=>$_) if($func{enc});
$gb->{$_}->{cbi} = &cbi($gb, -id=>$_) if($func{cbi});
$gb->{$_}->{icdi} = &icdi($gb, -id=>$_) if($func{icdi});
}
&cai($gb, -output=>'n', -w_output=>'n') if($func{cai});
&phx($gb, -output=>'n') if($func{phx});
my $cds;
if($output eq 'stdout'){
msg_send("\n\nIndices of codon usage\n");
foreach(@$list){ msg_send("$_\t"); }
foreach $cds ($gb->cds()){
msg_send("\n");
foreach(@$list){ msg_send($gb->{$cds}->{$_}, "\t"); }
}
}
if($output eq 'f'){
open(FILE,">>$filename");
print FILE "Indices of codon usage\n";
foreach(@$list){ print FILE "$_,"; }
foreach $cds ($gb->cds()){
print FILE "\n";
foreach(@$list){ print FILE "$gb->{$cds}->{$_},"; }
}
}
close(FILE);
}
=head2 aaui
aaui ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &aaui(pointer G instance);
Options:
-output output option (default: 'stdout')
-filename output filename (default: 'aaui.csv')
-id ID of a group of genes or a single gene (default:'All')
Description:
Calculates indices of amino acid usage (excluding formylmethionine),
and inputs in the G instance.
Informations stored in the G instance:
$gb->{$id}->{ndaa}: number of distinct amino acids
$gb->{$id}->{Haau}: entropy of amino acid usage
$gb->{$id}->{mmw}: mean molecular weight
$gb->{$id}->{gravy}: mean hydropathic indices of each amino acid
$gb->{$id}->{aroma}: relative frequency of aromatic amino acids
$gb->{$id}->{acidic}: relative frequency of acidic amino acids
$gb->{$id}->{basic}: relative frequency of basic amino acids
$gb->{$id}->{neutral}: relative frequency of neutral amino acids
$gb->{$id}->{polar}: relative frequency of polar amino acids
$gb->{legend_aaui};
Reference: Lobry and Gautier (1994) Nucleic Acids Res. 22:3174-3180.
Requirements: none.
=cut
sub aaui {
&opt_default(output=>'stdout', filename=>'aaui.csv', id=>'All');
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $aaseq;
if($id =~ /FEATURE/){ $aaseq = substr($gb->{$id}->{translation}, 1); }
else { foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$aaseq .= substr($gb->{$cds}->{translation}, 1);
}
}
my $Laa = $aaseq =~ tr[ACDEFGHIKLMNPQRSTVWY][ACDEFGHIKLMNPQRSTVWY];
if($aaseq){
my %count;
$count{A} = $aaseq =~ tr/A/A/;
$count{C} = $aaseq =~ tr/C/C/;
$count{D} = $aaseq =~ tr/D/D/;
$count{E} = $aaseq =~ tr/E/E/;
$count{F} = $aaseq =~ tr/F/F/;
$count{G} = $aaseq =~ tr/G/G/;
$count{H} = $aaseq =~ tr/H/H/;
$count{I} = $aaseq =~ tr/I/I/;
$count{K} = $aaseq =~ tr/K/K/;
$count{L} = $aaseq =~ tr/L/L/;
$count{M} = $aaseq =~ tr/M/M/;
$count{N} = $aaseq =~ tr/N/N/;
$count{P} = $aaseq =~ tr/P/P/;
$count{Q} = $aaseq =~ tr/Q/Q/;
$count{R} = $aaseq =~ tr/R/R/;
$count{S} = $aaseq =~ tr/S/S/;
$count{T} = $aaseq =~ tr/T/T/;
$count{V} = $aaseq =~ tr/V/V/;
$count{W} = $aaseq =~ tr/W/W/;
$count{Y} = $aaseq =~ tr/Y/Y/;
my ($ndaa, $Haau, $mmw, $hydro);
foreach (qw(A C D E F G H I K L M N P Q R S T V W Y)){
next if($count{$_} == 0);
$ndaa ++;
$Haau += - $count{$_}/$Laa * log($count{$_}/$Laa) / log(2); $mmw += $count{$_} * $Mwt{$_};
$hydro += $count{$_} * $KyteDoolittle{$_};
}
$gb->{$id}->{Laa} = $Laa;
$gb->{$id}->{ndaa} = $ndaa;
$gb->{$id}->{Haau} = sprintf "%.4f", $Haau;
$gb->{$id}->{mmw} = sprintf "%.2f", $mmw / $Laa; $gb->{$id}->{gravy} = sprintf "%+.4f", $hydro / $Laa; $gb->{$id}->{aroma} = sprintf "%.4f", ($count{F} + $count{Y} + $count{W}) / $Laa; $gb->{$id}->{CMHFYW} = sprintf "%.4f", ($count{C} + $count{M} + $count{H} + $count{F} + $count{Y} + $count{W}) / $Laa; # these six residues have in common the property of being the preferential targets of reactive oxygen species and can act as sinks for radical fluxes through electron tranfer between amino acids in the protein $gb->{$id}->{acidic} = sprintf "%.4f", ($count{D} + $count{E}) / $Laa;
$gb->{$id}->{basic} = sprintf "%.4f", ($count{R} + $count{K} + $count{H}) / $Laa; $gb->{$id}->{neutral} = sprintf "%.4f", ($count{N} + $count{Q} + $count{S} + $count{T} + $count{Y}) / $Laa; $gb->{$id}->{polar} = $gb->{$id}->{acidic} + $gb->{$id}->{basic} + $gb->{$id}->{neutral};
}
my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}";
if($output eq 'stdout'){
unless($gb->{legend_aaui}){
msg_send("\n\n",
"Indices of amino acid usage (excluding formylmethionine)\n",
" Laa: length in amino acids\n",
" ndaa: number of distinct amino acids\n",
" Haau: entropy of amino acid usage\n",
" mmw: mean molecular weight\n",
" gravy: mean hydropathic indices of each amino acid\n",
"----------------------------------------------------------------------------\n",
"Laa\tndaa\tHaau\tmmw\tgravy\taroma\tacidic\tbasic\tneutral\tgene\n",
"----------------------------------------------------------------------------\n");
$gb->{legend_aaui} = 1;
}
msg_send(
$gb->{$id}->{Laa}, "\t",
$gb->{$id}->{ndaa}, "\t",
$gb->{$id}->{Haau}, "\t",
$gb->{$id}->{mmw}, "\t",
$gb->{$id}->{gravy}, "\t",
$gb->{$id}->{aroma}, "\t",
$gb->{$id}->{acidic}, "\t",
$gb->{$id}->{basic}, "\t",
$gb->{$id}->{neutral}, "\t",
$gene, "\n");
}
if($output eq 'f'){
open(FILE,">>$filename");
unless($gb->{legend_aaui}){
print FILE
"Indices of amino acid usage (excluding formylmethionine)\n",
" Laa: length in amino acids\n",
" ndaa: number of distinct amino acids\n",
" Haau: entropy in amino acid usage\n",
" mmw: mean molecular weight\n",
" gravy: mean hydropathic indices of each amino acid\n",
"Laa,ndaa,Haau,mmw,gravy,aroma,acidic,basic,neutral,polar,gene\n";
$gb->{legend_aaui} = 1;
}
print FILE
$gb->{$id}->{Laa}, ",",
$gb->{$id}->{ndaa}, ",",
$gb->{$id}->{Haau}, ",",
$gb->{$id}->{mmw}, ",",
$gb->{$id}->{gravy}, ",",
$gb->{$id}->{aroma}, ",",
$gb->{$id}->{acidic}, ",",
$gb->{$id}->{basic}, ",",
$gb->{$id}->{neutral}, ",",
$gb->{$id}->{polar}, ",",
$gene, "\n";
}
close(FILE);
}
=head2 bui
bui ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &bui(pointer G instance);
Options:
-output output option (default: 'stdout')
-filename output filename (default: 'bui.csv')
-id ID of a group of genes or a single gene (default:'All')
-translate 1 when translates using regular codon table (default: 0)
-del_key regular expression to delete amino acid and codon keys
(default:'[^ACDEFGHIKLMNPQRSTVWYacgtU]')
-position codon position (default: '');
'' when assessing overall base usage of the gene
'1' when assessing base usage at 1st position of codons
'2' when assessing base usage at 2nd position of codons
'3' when assessing base usage at 3rd position of codons
Description:
Calculates indices of base usage (excluding start and stop codons),
and inputs in the G instance. e.g. G+C content at 3rd position of
synonymous codons will be accessible at $gb->{"FEATURE$i"}->{gcc3s}
Informations stored in the G instance:
$gb->{$id}->{"acgt$p"}: total number of bases (A+T+G+C)
$gb->{$id}->{"a_c$p"}: A content [A/(A+T+G+C)] $gb->{$id}->{"c_c$p"}: C content [C/(A+T+G+C)]
$gb->{$id}->{"g_c$p"}: G content [G/(A+T+G+C)] $gb->{$id}->{"t_c$p"}: T content [T/(A+T+G+C)]
$gb->{$id}->{"ryr$p"}: purine/pyrimidine ratio [(A+G)/(T+C)]
$gb->{$id}->{"gac$p"}: G+A content [(G+A)/(A+T+G+C)] $gb->{$id}->{"gcc$p"}: G+C content [(G+C)/(A+T+G+C)]
$gb->{$id}->{"gtc$p"}: G+T content [(G+T)/(A+T+G+C)] $gb->{$id}->{"gcs$p"}: GC skew [(G-C)/(G+C)]
$gb->{$id}->{"tas$p"}: TA skew [(T-A)/(T+A)] $gb->{$id}->{"Hbu$p"}: entropy of base usage $gb->{legend_bui} Requirements: none.
=cut
sub bui { &opt_default(output=>'stdout',filename=>'bui.csv',id=>'All',translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]',position=>''); my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $p = opt_val("position");
my $seq;
my $aaseq;
my $codon;
my $i;
if($id =~ /FEATURE/){ $seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); if($translate){
$i = 0;
while($i <= length($seq) - 3){
my $codon = substr($seq, $i, 3);
$aaseq .= ($codon =~ /[^acgt]/) ? '?' : $Table{$codon};
$i += 3;
}
}
else { $aaseq = substr($gb->{$id}->{translation}, 1); }
my $partseq;
$i = 0;
while($aaseq){
my $aa = substr($aaseq, 0, 1);
$partseq .= $p ? substr($seq, $i * 3 + $p - 1, 1) : substr($seq, $i * 3, 3) if($aa !~ /$del_key/);
substr($aaseq, 0, 1) = '';
$i ++;
}
$seq = $partseq;
}
else { my $cumseq;
foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3);
if($translate){
$i = 0;
while($i <= length($seq) - 3){
my $codon = substr($seq, $i, 3);
$aaseq .= ($codon =~ /[^acgt]/) ? '?' : $Table{$codon};
$i += 3;
}
}
else { $aaseq = substr($gb->{$cds}->{translation}, 1); }
my $partseq;
$i = 0;
while($aaseq){
my $aa = substr($aaseq, 0, 1);
$partseq .= $p ? substr($seq, $i * 3 + $p - 1, 1) : substr($seq, $i * 3, 3) if($aa !~ /$del_key/);
substr($aaseq, 0, 1) = '';
$i ++;
}
$cumseq .= $partseq;
}
$seq = $cumseq;
}
my ($a, $c, $g, $t, $Hbu);
if($seq){
$a = $seq =~ tr/a/a/;
$c = $seq =~ tr/c/c/;
$g = $seq =~ tr/g/g/;
$t = $seq =~ tr/t/t/;
$gb->{$id}->{"acgt$p"} = $a+$c+$g+$t;
$gb->{$id}->{"a_c$p"} = sprintf "%.4f", $a / ($a+$c+$g+$t); $gb->{$id}->{"c_c$p"} = sprintf "%.4f", $c / ($a+$c+$g+$t); $gb->{$id}->{"g_c$p"} = sprintf "%.4f", $g / ($a+$c+$g+$t); $gb->{$id}->{"t_c$p"} = sprintf "%.4f", $t / ($a+$c+$g+$t);
if($t+$c){ $gb->{$id}->{"ryr$p"} = sprintf "%.4f", ($a+$g) / ($t+$c); } else { &msg_error("t + c = 0 in $id\n"); } if($g+$a){ $gb->{$id}->{"gas$p"} = sprintf "%+.4f", ($g-$a)/($g+$a); }
else { &msg_error("g + a = 0 in $id\n"); }
if($c+$t){ $gb->{$id}->{"cts$p"} = sprintf "%+.4f", ($c-$t)/($c+$t); } else { &msg_error("c + t = 0 in $id\n"); } if($g+$c){ $gb->{$id}->{"gcs$p"} = sprintf "%+.4f", ($g-$c)/($g+$c); }
else { &msg_error("g + c = 0 in $id\n"); }
if($t+$a){ $gb->{$id}->{"tas$p"} = sprintf "%+.4f", ($t-$a)/($t+$a); } else { &msg_error("t + a = 0 in $id\n"); } if($g+$t){ $gb->{$id}->{"gts$p"} = sprintf "%+.4f", ($g-$t)/($g+$t); }
else { &msg_error("g + t = 0 in $id\n"); }
if($a+$c){ $gb->{$id}->{"acs$p"} = sprintf "%+.4f", ($a-$c)/($a+$c); } else { &msg_error("a + c = 0 in $id\n"); } foreach($a,$c,$g,$t){ $Hbu -= $_/($a+$c+$g+$t) * log($_/($a+$c+$g+$t)) / log(2) if($_); }
$gb->{$id}->{"Hbu$p"} = sprintf "%.4f", $Hbu;
my $gc = ($g+$c) / ($a+$c+$g+$t); my $gt = ($g+$t) / ($a+$c+$g+$t); $gb->{$id}->{"gac$p"} = sprintf "%.4f", ($g+$a) / ($a+$c+$g+$t); $gb->{$id}->{"gcc$p"} = sprintf "%.4f", $gc;
$gb->{$id}->{"gtc$p"} = sprintf "%.4f", $gt;
$gb->{$id}->{"Hgc$p"} = sprintf "%.4f", - ($gc * log($gc) + (1-$gc) * log(1-$gc)) / log(2) if(0 < $gc && $gc < 1); $gb->{$id}->{"Hgt$p"} = sprintf "%.4f", - ($gt * log($gt) + (1-$gt) * log(1-$gt)) / log(2) if(0 < $gt && $gt < 1); }
my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}";
if($output eq 'stdout'){
unless($gb->{legend_bui}){
msg_send("\n\n",
"Base usage indices (excluding start and stop codons)\n",
" ACGT: A$p + T$p + G$p + C$p\n",
" GAc: (G$p + A$p)/(A$p + T$p + G$p + C$p)\n",
" GCc: (G$p + C$p)/(A$p + T$p + G$p + C$p)\n",
" GTc: (G$p + T$p)/(A$p + T$p + G$p + C$p)\n",
" GCs: (G$p - C$p)/(G$p + C$p)\n",
" TAs: (T$p - A$p)/(T$p + A$p)\n",
" Hbu: entropy of base usage\n",
"-------------------------------------------------------------\n",
"ACGT\tGAc\tGCc\tGTc\tGCs\tTAs\tHbu\tgene\n",
"-------------------------------------------------------------\n");
$gb->{legend_bui} = 1;
}
msg_send($gb->{$id}->{"acgt$p"}, "\t",
$gb->{$id}->{"gac$p"}, "\t",
$gb->{$id}->{"gcc$p"}, "\t",
$gb->{$id}->{"gtc$p"}, "\t",
$gb->{$id}->{"gcs$p"}, "\t",
$gb->{$id}->{"tas$p"}, "\t",
$gb->{$id}->{"Hbu$p"}, "\t",
$gene, "\n");
}
if($output eq 'f'){
open(FILE,">>$filename");
unless($gb->{legend_bui}){
print FILE
"Base usage indices (excluding start and stop codons)\n",
" ACGT: A$p + T$p + G$p + C$p\n",
" GAc: (G$p + A$p)/(A$p + T$p + G$p + C$p)\n",
" GCc: (G$p + C$p)/(A$p + T$p + G$p + C$p)\n",
" GTc: (G$p + T$p)/(A$p + T$p + G$p + C$p)\n",
" GCs: (G$p - C$p)/(G$p + C$p)\n",
" TAs: (T$p - A$p)/(T$p + A$p)\n",
" Hbu: entropy of base usage\n",
"ACGT,GAc,GCc,GTc,GCs,TAs,Hbu,gene\n";
$gb->{legend_bui} = 1;
}
print FILE
$gb->{$id}->{"acgt$p"}, ",",
$gb->{$id}->{"gac$p"}, ",",
$gb->{$id}->{"gcc$p"}, ",",
$gb->{$id}->{"gtc$p"}, ",",
$gb->{$id}->{"gcs$p"}, ",",
$gb->{$id}->{"tas$p"}, ",",
$gb->{$id}->{"Hbu$p"}, ",",
$gene, "\n";
close(FILE);
}
}
=head2 dinuc
dinuc ver.20030401
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &dinuc(pointer G instance);
Options:
-output output option (default: 'stdout')
-filename output filename (default: 'dinuc.csv')
-id ID of a group of genes or a single gene (default:'All')
-position codon position or reading frame (default: '');
'' when assessing all codon positions
'12' when assessing the reading frame 1-2
'23' when assessing the reading frame 2-3
'31' when assessing the reading frame 3-1
Description:
Calculates dinucleotide usage (excluding start and stop codons),
and inputs in the G instance. e.g. CG dinucleotide usage at the
reading frame 1-2 will be accessible at $gb->{"FEATURE$i"}->{cg12}
Dinucleotide usage was computed as the ratio of observed (O) to
expected (E) dinucleotide frequencies: O/E Informations stored in the G instance: $gb->{legend_dinuc} References: Yew et al. (2004) Virus Genes 28:1,41-53. Requirements: none.
=cut
sub dinuc { &opt_default(output=>'stdout',filename=>'dinuc.csv',id=>'All',position=>''); my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $id = opt_val("id");
my $rf = opt_val("position");
my $p = ($rf) ? substr($rf, 0, 1) : 1;
my $q = ($rf) ? substr($rf, 1, 1) : 1;
my $end = ($rf =~ /12|23/) ? 0 : 1;
my %usg;
my $seq;
if($rf){
if($id =~ /FEATURE/){ $seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); for(my $i; $i * 3 <= length($seq) - 3 - $end; $i ++){
$usg{"mono$p"}->{substr($seq, $i * 3 + $p - 1, 1)} ++;
$usg{"mono$q"}->{substr($seq, $i * 3 + $q - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i * 3 + $p - 1, 2).$rf} ++;
}
}
else { foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3);
for(my $i; $i * 3 <= length($seq) - 3 - $end; $i ++){
$usg{"mono$p"}->{substr($seq, $i * 3 + $p - 1, 1)} ++;
$usg{"mono$q"}->{substr($seq, $i * 3 + $q - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i * 3 + $p - 1, 2).$rf} ++;
}
}
}
}
else {
if($id =~ /FEATURE/){ $seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); for(my $i; $i <= length($seq) - 1 - $end; $i ++){
$usg{mono}->{substr($seq, $i + $p - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i + $p - 1, 2)} ++;
}
}
else { foreach my $cds ($gb->cds()){
next if( $id ne 'All' && !($gb->{$cds}->{$id}) );
$seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3);
for(my $i; $i <= length($seq) - 1 - $end; $i ++){
$usg{mono}->{substr($seq, $i + $p - 1, 1)} ++;
$usg{diobs}->{substr($seq, $i + $p - 1, 2)} ++;
}
}
}
}
my $total;
foreach my $key (keys %usg){
foreach (keys %{$usg{$key}}){ $total += $usg{$key}->{$_}; }
foreach (keys %{$usg{$key}}){ $usg{$key}->{$_} = $usg{$key}->{$_} / $total; } $total = 0; }
my @keysAll = ("aa$rf","ac$rf","ag$rf","at$rf","ca$rf","cc$rf","cg$rf","ct$rf","ga$rf","gc$rf","gg$rf","gt$rf","ta$rf","tc$rf","tg$rf","tt$rf");
foreach (@keysAll){
if($rf){ $usg{diexp}->{$_} = $usg{"mono$p"}->{substr($_,0,1)} * $usg{"mono$q"}->{substr($_,1,1)}; }
else { $usg{diexp}->{$_} = $usg{mono}->{substr($_,0,1)} * $usg{mono}->{substr($_,1,1)}; }
}
my %dioe;
foreach (@keysAll){
if($usg{diexp}->{$_}){
$dioe{$_} = sprintf("%.3f", $usg{diobs}->{$_} / $usg{diexp}->{$_}); $gb->{$id}->{$_} = $dioe{$_}; }
}
$gb->{$id}->{"dinuc$rf"} =\% dioe;
my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}";
if($output =~ /stdout/){
unless($gb->{"legend_dinuc$p"}){
msg_send("\n\n");
foreach(@keysAll){ msg_send("$_\t"); }
msg_send("#\tgene");
$gb->{"legend_dinuc$p"} = 1;
}
msg_send("\n\n");
foreach(@keysAll){ msg_send("$dioe{$_}\t"); }
msg_send(scalar keys %dioe, "\t$gene");
}
if($output eq 'f'){
open(FILE,">>$filename");
unless($gb->{"legend_dinuc$p"}){
print FILE "\nkeys,";
foreach(@keysAll){ print FILE "$_,"; }
print FILE "gene,";
$gb->{"legend_dinuc$p"} = 1;
}
print FILE "\n$id,";
foreach(@keysAll){
print FILE (exists $dioe{$_}) ? "$dioe{$_}," : "0,";
}
close(FILE);
}
}
=head2 codon_pca
codon_pca ver.200503031
Author: Haruo Suzuki (haruo@g-language.org)
Usage: NULL = &codon_pca(pointer G instance);
Options:
-output output option (default: 'show')
-filename output filename (default: 'pca.csv')
-translate 1 when translates using regular codon table (default: 0)
-del_key regular expression to delete amino acid and codon keys
(default:'[^ACDEFGHIKLMNPQRSTVWYacgtU]')
-data data matrix of amino acid and codon usage (default:'R1')
-npc number of principal components (default:'4')
-parameter pointer ARRAY of list of gene parameters
['Laa','Haau','mmw','gravy','aroma','acidic','basic','neutral','polar',
"Hbu$p","a_c$p","c_c$p","g_c$p","t_c$p","ryr$p",
"gac$p","gcc$p","gtc$p","gcs$p","tas$p",
"aa$_","ac$_","ag$_","at$_","ca$_","cc$_","cg$_","ct$_",
"ga$_","gc$_","gg$_","gt$_","ta$_","tc$_","tg$_","tt$_",
'Hcu','Dcu','Dsyn','Hs','Hw','Ew','enc','cbi','icdi',
'fop','cai','pel']
Description:
Performs a principal component analysis (PCA) on codon usage and
analyzes correlations between the PCA axis scores and a number of
gene parameters related to amino acid usage and base usage.
Requirements:
sub aaui; sub bui; sub dinuc;
=cut
sub codon_pca {
&opt_default(output=>'show',filename=>'pca.csv',translate=>0,
del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]',data=>'R1',npc=>'4',
parameter=>['Laa','Hcu','Dcu','Dsyn','Hs','Hw','Ew','enc','cbi','icdi','fop','cai','pel','mmw','gravy','aroma','acidic','basic','neutral','polar','Haau']);
my @args = opt_get(@_);
my $gb = opt_as_gb(shift @args);
my $output = opt_val("output");
my $filename = opt_val("filename");
my $translate = opt_val("translate");
my $del_key = opt_val("del_key");
my $data = opt_val("data");
my $npc = opt_val("npc");
my $parameter = opt_val("parameter");
my %calc;
foreach(@$parameter){
if($_ =~ /mmw|gravy|aroma|acidic|basic|neutral|polar|Haau/){ $calc{aaui} ++; }
elsif($_ =~ /Hcu|Dcu|Dsyn/){ $calc{entropy_cu} ++; }
elsif($_ =~ /Hs|Hw|Ew/){ $calc{entropy_scu} ++; }
elsif($_ =~ /pel/){ $calc{phx} ++; }
elsif($_ =~ /enc|cbi|icdi|fop|cai/){ $calc{$_} ++; }
}
if($data =~ /AA/){ $data = 'A1'; }
elsif($data =~ /RF/){ $data = 'R2'; }
elsif($data =~ /RSCU/){ $data = 'R3'; }
my @pp = ('1','2','3');
foreach (@pp){ $parameter = [@$parameter, "gcc$_","gtc$_"]; }
$parameter = [@$parameter, 'Hgc3','Hgt3'];
&fop($gb, -output=>'n') if($calc{fop});
&cai($gb, -output=>'n', -w_output=>'n') if($calc{cai});
&phx($gb, -output=>'n') if($calc{phx});
my $in = 'ribosomal.*protein|elongation.*factor';
my $ex = 'modification|alanine|serine|transferase|synthase|acetylase|acetylating enzyme|methylase|precursor|regulat|initiates|RNA|DNA|transcription|selenocysteine-specific';
my $inRp = 'ribosomal.*protein';
my $exRp = 'modification|alanine|serine|transferase|synthase|acetylase|acetylating enzyme|methylase|precursor|regulat|initiates|RNA';
my $cds;
foreach $cds ('All','High','leading','lagging',$gb->cds()){ $gb->{$cds}->{High} = (($gb->{$cds}->{product} =~ /$in/i &&
$gb->{$cds}->{product} !~ /$ex/i ||
$gb->{$cds}->{note} =~ /$in/i &&
$gb->{$cds}->{note} !~ /$ex/i
) ? 1 : 0 );
$gb->{$cds}->{High} = (($gb->{$cds}->{product} =~ /$inRp/i &&
$gb->{$cds}->{product} !~ /$exRp/i ||
$gb->{$cds}->{note} =~ /$inRp/i &&
$gb->{$cds}->{note} !~ /$exRp/i
) ? 1 : 0 );
unless(exists $gb->{$cds}->{$data}){
&codon_compiler($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -data=>$data, -id=>$cds);
}
$gb->{$cds}->{Laa} = length($gb->{$cds}->{translation}) - 1;
&entropy_cu($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{entropy_cu});
&entropy_scu($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{entropy_scu});
$gb->{$cds}->{enc} = &enc($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{enc});
$gb->{$cds}->{cbi} = &cbi($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{cbi});
$gb->{$cds}->{icdi} = &icdi($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{icdi});
&aaui($gb, -output=>'n', -id=>$cds) if($calc{aaui});
foreach (@pp){ &bui($gb, -output=>'n', -position=>$_, -del_key=>$del_key, -id=>$cds); }
foreach(@$parameter){ $gb->{$cds}->{on} = 0 unless(exists $gb->{$cds}->{$_}); }
}
$parameter = [@$parameter, 'High'];
open(OUT, ">raw.csv");
my @keysAll = sort keys %{$gb->{All}->{$data}};
my @k_array;
foreach(@keysAll){ push(@k_array, $gb->{degeneracy}->{substr($_, 0, 1)}); }
my $k_join = join(',',@k_array);
undef $k_join if($data =~ /A[01]/);
my $nkey = scalar @keysAll;
foreach(@keysAll){ print OUT "$_,"; }
foreach(@$parameter){
if($_ !~ /High|Ew|Hgc|Hgt/){
my $key = $_;
$key =~ tr[a-z][A-Z];
$key =~ s/LAA/Length/;
$key =~ s/C1/1/;
$key =~ s/C2/2/;
$key =~ s/C3/3/;
print OUT "$key,";
}
else { print OUT "$_,"; }
}
print OUT "\n";
foreach $cds ('All','High','leading','lagging',$gb->cds()){ foreach(@keysAll){ print OUT (exists $gb->{$cds}->{$data}->{$_}) ? "$gb->{$cds}->{$data}->{$_}," : "0,"; }
foreach(@$parameter){ print OUT "$gb->{$cds}->{$_},"; }
print OUT "\n";
}
close(OUT);
my $rcmd= new Rcmd;
my @result = $rcmd->exec(
"nkey <- $nkey",
"npc <- $npc",
"output <-\" $output\"",
'dat <- read.csv("raw.csv")',
'dat <- dat[,-ncol(dat)]',
'cum <- dat[c(1,2,3,4),1:nkey]',
'dat <- dat[-c(1,2,3,4),]',
'usg <- as.matrix(dat[,1:nkey])',
'Ra <- as.dist(1 - cor(t(usg)))',
'mnRa <- mean(Ra); sdRa <- sd(Ra)',
'Ea <- dist(usg)',
'mnEa <- mean(Ea); sdEa <- sd(Ea)',
'usg.h <- usg[dat[,"High"] == 1,]',
'Rh <- as.dist(1 - cor(t(usg.h)))',
'mnRh <- mean(Rh); sdRh <- sd(Rh)',
'Eh <- dist(usg.h)',
'mnEh <- mean(Eh); sdEh <- sd(Eh)',
'usg.o <- usg[dat[,"High"] == 0,]',
'Ro <- as.dist(1 - cor(t(usg.o)))',
'Eo <- dist(usg.o)',
'mnRho <- (sum(Ra) - sum(Rh) - sum(Ro))/(length(Ra) - length(Rh) - length(Ro))',
'mnEho <- (sum(Ea) - sum(Eh) - sum(Eo))/(length(Ea) - length(Eh) - length(Eo))',
'Ew <- dat[, order(dimnames(dat)[[2]] != "Ew")[1]]',
'zzEw <- scale(Ew)',
'zEw <- mean(scale(Ew)[dat[,"High"]==1])',
'x <- "Hgc3"; y <- "Ew"',
'EwHgc3 <- cor(rank(dat[,x]), rank(dat[,y]))^2',
'x <- "Hgt3"; y <- "Ew"',
'EwHgt3 <- cor(rank(dat[,x]), rank(dat[,y]))^2',
'dat <- dat[, -order(dimnames(dat)[[2]] != "Ew")[1]]',
'dat <- dat[, -order(dimnames(dat)[[2]] != "Hgc3")[1]]',
'dat <- dat[, -order(dimnames(dat)[[2]] != "Hgt3")[1]]',
'num <- dat[,(nkey+1):(ncol(dat)-1)]',
"kk <- c($k_join)", 'omit <- c()',
'for(i in 1:ncol(usg)) if(sd(usg[,i]) == 0) omit <- c(omit, i)',
'if(length(omit)){ usg <- usg[,-omit]; kk <- kk[-omit] }',
'pca <- function(x, normalize = 0, npc = 3)',
'{',
'x <- as.matrix(x)',
'npc <- min(npc, ncol(x))',
'if(normalize) x <- scale(x)',
'eval <- (result <- eigen(var(x)))$values[1:npc]',
'evec <- result$vectors[, 1:npc]',
'contr <- eval / sum(result$values)',
'score <- x %*% evec',
'fl <- cor(x, score)',
'rownames(evec) <- rownames(fl) <- colnames(x)',
'names(contr) <- colnames(evec) <- colnames(fl) <- colnames(score) <- paste("PC", 1:npc)',
'list(contribution = contr, weight = evec, factor.loading = fl, score = score)',
'}',
'pc <- pca(usg, 0, npc)',
'gto <- apply(pc$s,2,var) > max(apply(usg,2,var))', 'ngto <- sum(gto)',
'pcz <- apply(pc$s, 2, scale)',
'for(y in 1:ngto){ if(cor(Ew, pcz[,y]) < 0) pcz[,y] <- -pcz[,y] }',
'zz <- c(); zz <- round(abs(apply(pcz[dat[,"High"]==1,], 2, mean)), digits=4)',
'rr <- cor(apply(num, 2, rank), apply(pcz[,1:npc], 2, rank))', 'if(0) rr <- rbind(pc$f,rr)', 'key <- dimnames(rr)[[1]][ apply(abs(rr), 2, order)[nrow(rr),] ]',
'if(output == "show"){',
'postscript("PCAplot.ps", horizontal=FALSE, width=10, height=10, pointsize=15)',
'par(mfrow=c(ceiling(sqrt(ngto)),ceiling(sqrt(ngto))))',
'for(y in 1:ngto){',
'x <- key[y]',
'r <- round(rr[x,y], digits=4)',
'if(sum(dat[,"High"]) > 1) main <- paste("z =", zz[y], "\nr =",r)',
'else main <- paste("r =",r)',
'plot(dat[,x],pcz[,y],xlab=x, ylab=paste("Z",y," (",round(pc$c[y]*100,digits=1),"%)"), type="n", main=main)',
'text(dat[dat[,"High"]==0,x],pcz[dat[,"High"]==0,y],label="+")',
'text(dat[dat[,"High"]==1,x],pcz[dat[,"High"]==1,y],label="o",col="red")',
'}',
'}',
'if(output == "f"){',
'out <- rbind(pc$c,pc$f)',
'dimnames(out)[[1]][1] <- c("contribution")',
'write.table(out, file = "pca.csv", sep = ",", quote = F)',
'}',
'efk <- c(); for(k in c(2,4,6)) for(y in 1:npc) efk <- c(efk, cor(pc$s[,y], (usg[,kk == k] %*% pc$w[kk == k,])[,y]))',
'efk <- matrix(efk, 3, npc, byrow = T)',
'pp <- c(); for(y in 1:npc) pp <- c(pp, max(sum(dat[order(-pc$s[,y]), "High"][1:50]), sum(dat[order(pc$s[,y]), "High"][1:50])))',
'out <- round(rbind(pc$c,efk^2,rr^2), digits=4)',
'dimnames(out)[[1]][1:4] <- c("contr","k2","k4","k6")',
'val <- round(apply(rr^2, 2, max), digits=4)',
'gpd <- key', 'gpd[val < 0.5] = "nd"; zc <- 2.33', 'res <- gfd <- gpd',
'gfd[zz > zc & gfd != "nd"] = paste(gfd[zz > zc & gfd != "nd"], "(Expression)")',
'gfd[zz > zc & gfd == "nd"] = "Expression"',
'res[zz > zc & res != "nd"] = "two"',
'res[zz > zc & res == "nd"] = "Expression"',
'gpd[!gto] = "ns"; gfd[!gto] = "ns"; res[!gto] = "ns"',
'usg2 <- usg[zzEw < -1.8, ]',
'pcz2 <- pcz[zzEw < -1.8, ]',
'usg2 <- usg2[apply(pcz2, 2, order)[1,], ]',
'cds <- dimnames(usg2)[[1]]',
'write.table(rbind(cds,gto,gpd,gfd,res,key,val,pp,zz,out), file = "haruo.csv", sep = ",", quote = F)',
'as.vector(c(EwHgc3,EwHgt3,mnRa,sdRa,mnEa,sdEa,mnRh,mnEh,mnRho,mnEho,zEw, pc$c,pc$s,rr))',
);
shift @result if($result[0] eq '');
$gb->{EwHgc3} = sprintf "%.4f", shift @result;
$gb->{EwHgt3} = sprintf "%.4f", shift @result;
$gb->{mnRa} = sprintf "%.4f", shift @result;
$gb->{sdRa} = sprintf "%.4f", shift @result;
$gb->{mnEa} = sprintf "%.4f", shift @result;
$gb->{sdEa} = sprintf "%.4f", shift @result;
$gb->{mnRh} = sprintf "%.4f", shift @result;
$gb->{mnEh} = sprintf "%.4f", shift @result;
$gb->{mnRho} = sprintf "%.4f", shift @result;
$gb->{mnEho} = sprintf "%.4f", shift @result;
$gb->{zEw} = sprintf "%.4f", shift @result;
for(1..$npc){
$gb->{"PC$_"}->{contribution} = sprintf "%.3f", shift @result;
}
for(1..$npc){
foreach $cds ($gb->cds()){
$gb->{$cds}->{"PC$_"} = sprintf "%+.3f", shift @result;
}
}
for(1..$npc){
foreach my $key (@$parameter){
next if($key =~ /High/);
$gb->{"PC$_"}->{$key} = sprintf "%+.3f", shift @result;
}
}
msg_gimv("PCAplot.ps") if($output =~ /g|show/);
if($output =~ /stdout|show/){
msg_send("\n\nPrincipal Component Analysis (PCA) of data $data\n");
msg_send("\nContribution of each principal component (PC)\n\t");
for(1..$npc){ msg_send("PC$_\t"); }
msg_send("\n\t");
for(1..$npc){ msg_send($gb->{"PC$_"}->{contribution}, "\t"); }
msg_send("\n\nCorrelations between the PC scores and other gene parameters\n\t");
for(1..$npc){ msg_send("PC$_\t"); }
foreach my $key (@$parameter){
next if($key =~ /High/);
msg_send("\n$key\t");
for(1..$npc){ msg_send($gb->{"PC$_"}->{$key}, "\t"); }
}
}
}
sub codon_counter{
&opt_default(CDSid=>"",filename=>'codon.csv',output=>'stdout');
my @args=opt_get(@_);
my $gb=opt_as_gb(shift @args);
my $key=opt_val("CDSid");
my $filename=opt_val("filename");
my $output=opt_val("output");
my %result;
my $seq;
my $i=1;
my $q;
if($key){
if(defined(%{$gb->{"$key"}})){
$seq=$gb->get_geneseq("$key");
for($q=0;$q<length($seq);$q+=3){
$result{substr($seq,$q,3)}++;
}
}
}
else{
foreach($gb->cds()){
$seq=$gb->get_geneseq("CDS$i");
for($q=0;$q<length($seq);$q+=3){
$result{substr($seq,$q,3)}++;
}
$i++;
}
}
if($output eq "f"){
_codon_amino_printer(\%result,-output=>"f",-filename=>"$filename");
}
if($output eq "stdout"){
_codon_amino_printer(\%result,-output=>"stdout");
}
return\% result;
}
sub amino_counter{
&opt_default(CDSid=>"",filename=>'amino.csv',output=>'stdout');
my @args=opt_get(@_);
my $gb= shift @args;
my $key=opt_val("CDSid");
my $filename=opt_val("filename");
my $output=opt_val("output");
my %amino;
my @tmp;
my $num;
if(length $key){
if(defined(%{$gb->{"$key"}})){
$num=$gb->{"$key"}->{feature};
@tmp=split(//,$gb->{"FEATURE$num"}->{translation});
foreach(@tmp){
$amino{$_}++;
}
}
}
else{
foreach my $cds ($gb->cds()){
@tmp=split(//,$gb->{$cds}->{translation});
foreach(@tmp){
$amino{$_}++;
}
}
}
if($output eq "f"){
_codon_amino_printer(\%amino,-output=>"f",-filename=>"$filename");
}
if($output eq "stdout"){
_codon_amino_printer(\%amino,-output=>"stdout");
}
return\% amino;
}
sub codon_amino_counter{
my $gb=shift;
my $key=shift;
my %result;
my $seq;
my $amino;
my $codon;
my $num;
my $q;
my @tmp;
if(defined(%{$gb->{"$key"}})){
$seq=lc($gb->get_geneseq("$key"));
$num=$gb->{"$key"}->{feature};
}
@tmp=split(//,$gb->{"FEATURE$num"}->{translation});
for($q=0;$q<length($seq);$q+=3){
$amino=shift @tmp;
$amino="/" if($amino eq "");
$codon=substr($seq,$q,3);
$result{$amino}{$codon}++;
}
return %result;
}
sub codon_usage{
&opt_default(CDSid=>"",filename=>'codon_usage.csv',output=>'show',version=>1);
my @args=opt_get(@_);
my $gb=shift @args;
my $key=opt_val("CDSid");
my $filename=opt_val("filename");
my $output=opt_val("output");
my $version = opt_val("version");
my $codon;
my $amino;
my %result;
my %usage;
if($key){
%usage=codon_amino_counter($gb,$key);
}
else{
foreach($gb->cds()){
%result=();
%result=codon_amino_counter($gb,$_);
foreach $amino (keys(%result)){
foreach $codon (keys(%{$result{$amino}})){
$usage{$amino}{$codon}+=$result{$amino}{$codon};
}
}
}
}
if($output eq "f"){
_codon_usage_printer(\%usage,-output=>"f",-filename=>"$filename");
}
if($output!~/[fn]/){
_codon_usage_printer(\%usage,-output=>"stdout");
}
if ($version == 2){
_codon_table2(\%usage,-output=>"$output") if($output=~/(g|show)/ && $key eq "");
}else{
_codon_table(\%usage,-output=>"$output") if($output=~/(g|show)/ && $key eq "");
}
return\% usage;
}
sub _codon_table{
&opt_default(output=>"show",filename=>"codon_table.png");
my @args=opt_get(@_);
my $result=shift @args;
my $filename=opt_val("filename");
my $output=opt_val("output");
my $x;
my $y;
my %amino;
my %data;
my %per;
my $amino_total;
my $codon;
my $amino;
my $v;
my $h;
my %exception;
my $CoDoN;
my %color;
my $im = new GD::Image(500,550);
my $white = $im->colorAllocate(255,255,255);
my $black = $im->colorAllocate(0,0,0);
my $red = $im->colorAllocate(255,0,0);
my $yellow = $im->colorAllocate(200,200,0);
my $green = $im->colorAllocate(0,150,0);
my $blue = $im->colorAllocate(0,0,255);
$color{D}=$yellow;
$color{E}=$yellow;
$color{R}=$red;
$color{K}=$red;
$color{H}=$red;
$color{N}=$blue;
$color{Q}=$blue;
$color{S}=$blue;
$color{T}=$blue;
$color{Y}=$blue;
$color{A}=$green;
$color{G}=$green;
$color{V}=$green;
$color{L}=$green;
$color{I}=$green;
$color{P}=$green;
$color{F}=$green;
$color{M}=$green;
$color{W}=$green;
$color{C}=$green;
$color{'/'}=$black;
foreach((10,50,450,490)){
$x=$_;
for($y=10;$y<450;$y++){
$im->setPixel($x,$y,$black);
}
}
foreach((150,250,350)){
$x=$_;
for($y=30;$y<450;$y++){
$im->setPixel($x,$y,$black);
}
}
$y=30;
for($x=50;$x<450;$x++){
$im->setPixel($x,$y,$black);
}
foreach((10,50,150,250,350,450)){
$y=$_;
for($x=10;$x<490;$x++){
$im->setPixel($x,$y,$black);
}
}
$im->string(gdSmallFont,15,25,"first",$red);
$im->string(gdSmallFont,233,15,"second",$green);
$im->string(gdSmallFont,455,25,"third",$blue);
$im->string(gdSmallFont,30,95,"T",$red);
$im->string(gdSmallFont,30,195,"C",$red);
$im->string(gdSmallFont,30,295,"A",$red);
$im->string(gdSmallFont,30,395,"G",$red);
$im->string(gdSmallFont,100,35,"T",$green);
$im->string(gdSmallFont,200,35,"C",$green);
$im->string(gdSmallFont,300,35,"A",$green);
$im->string(gdSmallFont,400,35,"G",$green);
$im->string(gdSmallFont,470,65,"T",$blue);
$im->string(gdSmallFont,470,85,"C",$blue);
$im->string(gdSmallFont,470,105,"A",$blue);
$im->string(gdSmallFont,470,125,"G",$blue);
$im->string(gdSmallFont,470,165,"T",$blue);
$im->string(gdSmallFont,470,185,"C",$blue);
$im->string(gdSmallFont,470,205,"A",$blue);
$im->string(gdSmallFont,470,225,"G",$blue);
$im->string(gdSmallFont,470,265,"T",$blue);
$im->string(gdSmallFont,470,285,"C",$blue);
$im->string(gdSmallFont,470,305,"A",$blue);
$im->string(gdSmallFont,470,325,"G",$blue);
$im->string(gdSmallFont,470,365,"T",$blue);
$im->string(gdSmallFont,470,385,"C",$blue);
$im->string(gdSmallFont,470,405,"A",$blue);
$im->string(gdSmallFont,470,425,"G",$blue);
foreach $amino (keys(%{$result})){
$amino_total=0;
foreach $codon (keys(%{$$result{$amino}})){
$amino_total+=$$result{$amino}{$codon};
}
foreach $codon (keys(%{$$result{$amino}})){
if($$result{$amino}{$codon} > $data{$codon}){
if($data{$codon}!=""){
$exception{$codon}{amino}=$amino{$codon};
$exception{$codon}{per}=$per{$codon};
}
$data{$codon}=$$result{$amino}{$codon};
$amino{$codon}=$amino;
$per{$codon}=sprintf("%.3f",$$result{$amino}{$codon}/$amino_total); }
else{
$exception{$codon}{amino}=$amino;
$exception{$codon}{per}=sprintf("%.3f",$$result{$amino}{$codon}/$amino_total); }
}
}
$im->string(gdSmallFont,60,65,"TTT $amino{ttt} $per{ttt}",$color{$amino{ttt}});
$im->string(gdSmallFont,60,85,"TTC $amino{ttc} $per{ttc}",$color{$amino{ttc}});
$im->string(gdSmallFont,60,105,"TTA $amino{tta} $per{tta}",$color{$amino{tta}});
$im->string(gdSmallFont,60,125,"TTG $amino{ttg} $per{ttg}",$color{$amino{ttg}});
$im->string(gdSmallFont,60,165,"CTT $amino{ctt} $per{ctt}",$color{$amino{ctt}});
$im->string(gdSmallFont,60,185,"CTC $amino{ctc} $per{ctc}",$color{$amino{ctc}});
$im->string(gdSmallFont,60,205,"CTA $amino{cta} $per{cta}",$color{$amino{cta}});
$im->string(gdSmallFont,60,225,"CTG $amino{ctg} $per{ctg}",$color{$amino{ctg}});
$im->string(gdSmallFont,60,265,"ATT $amino{att} $per{att}",$color{$amino{att}});
$im->string(gdSmallFont,60,285,"ATC $amino{atc} $per{atc}",$color{$amino{atc}});
$im->string(gdSmallFont,60,305,"ATA $amino{ata} $per{ata}",$color{$amino{ata}});
$im->string(gdSmallFont,60,325,"ATG $amino{atg} $per{atg}",$color{$amino{atg}});
$im->string(gdSmallFont,60,365,"GTT $amino{gtt} $per{gtt}",$color{$amino{gtt}});
$im->string(gdSmallFont,60,385,"GTC $amino{gtc} $per{gtc}",$color{$amino{gtc}});
$im->string(gdSmallFont,60,405,"GTA $amino{gta} $per{gta}",$color{$amino{gta}});
$im->string(gdSmallFont,60,425,"GTG $amino{gtg} $per{gtg}",$color{$amino{gtg}});
$im->string(gdSmallFont,160,65,"TCT $amino{tct} $per{tct}",$color{$amino{tct}});
$im->string(gdSmallFont,160,85,"TCC $amino{tcc} $per{tcc}",$color{$amino{tcc}});
$im->string(gdSmallFont,160,105,"TCA $amino{tca} $per{tca}",$color{$amino{tca}});
$im->string(gdSmallFont,160,125,"TCG $amino{tcg} $per{tcg}",$color{$amino{tcg}});
$im->string(gdSmallFont,160,165,"CCT $amino{cct} $per{cct}",$color{$amino{cct}});
$im->string(gdSmallFont,160,185,"CCC $amino{ccc} $per{ccc}",$color{$amino{ccc}});
$im->string(gdSmallFont,160,205,"CCA $amino{cca} $per{cca}",$color{$amino{cca}});
$im->string(gdSmallFont,160,225,"CCG $amino{ccg} $per{ccg}",$color{$amino{ccg}});
$im->string(gdSmallFont,160,265,"ACT $amino{act} $per{act}",$color{$amino{act}});
$im->string(gdSmallFont,160,285,"ACC $amino{acc} $per{acc}",$color{$amino{acc}});
$im->string(gdSmallFont,160,305,"ACA $amino{aca} $per{aca}",$color{$amino{aca}});
$im->string(gdSmallFont,160,325,"ACG $amino{acg} $per{acg}",$color{$amino{acg}});
$im->string(gdSmallFont,160,365,"GCT $amino{gct} $per{gct}",$color{$amino{gct}});
$im->string(gdSmallFont,160,385,"GCC $amino{gcc} $per{gcc}",$color{$amino{gcc}});
$im->string(gdSmallFont,160,405,"GCA $amino{gca} $per{gca}",$color{$amino{gca}});
$im->string(gdSmallFont,160,425,"GCG $amino{gcg} $per{gcg}",$color{$amino{gcg}});
$im->string(gdSmallFont,260,65,"TAT $amino{tat} $per{tat}",$color{$amino{tat}});
$im->string(gdSmallFont,260,85,"TAC $amino{tac} $per{tac}",$color{$amino{tac}});
$im->string(gdSmallFont,260,105,"TAA $amino{taa} $per{taa}",$color{$amino{taa}});
$im->string(gdSmallFont,260,125,"TAG $amino{tag} $per{tag}",$color{$amino{tag}});
$im->string(gdSmallFont,260,165,"CAT $amino{cat} $per{cat}",$color{$amino{cat}});
$im->string(gdSmallFont,260,185,"CAC $amino{cac} $per{cac}",$color{$amino{cac}});
$im->string(gdSmallFont,260,205,"CAA $amino{caa} $per{caa}",$color{$amino{caa}});
$im->string(gdSmallFont,260,225,"CAG $amino{cag} $per{cag}",$color{$amino{cag}});
$im->string(gdSmallFont,260,265,"AAT $amino{aat} $per{aat}",$color{$amino{aat}});
$im->string(gdSmallFont,260,285,"AAC $amino{aac} $per{aac}",$color{$amino{aac}});
$im->string(gdSmallFont,260,305,"AAA $amino{aaa} $per{aaa}",$color{$amino{aaa}});
$im->string(gdSmallFont,260,325,"AAG $amino{aag} $per{aag}",$color{$amino{aag}});
$im->string(gdSmallFont,260,365,"GAT $amino{gat} $per{gat}",$color{$amino{gat}});
$im->string(gdSmallFont,260,385,"GAC $amino{gac} $per{gac}",$color{$amino{gac}});
$im->string(gdSmallFont,260,405,"GAA $amino{gaa} $per{gaa}",$color{$amino{gaa}});
$im->string(gdSmallFont,260,425,"GAG $amino{gag} $per{gag}",$color{$amino{gag}});
$im->string(gdSmallFont,360,65,"TGT $amino{tgt} $per{tgt}",$color{$amino{tgt}});
$im->string(gdSmallFont,360,85,"TGC $amino{tgc} $per{tgc}",$color{$amino{tgc}});
$im->string(gdSmallFont,360,105,"TGA $amino{tga} $per{tga}",$color{$amino{tga}});
$im->string(gdSmallFont,360,125,"TGG $amino{tgg} $per{tgg}",$color{$amino{tgg}});
$im->string(gdSmallFont,360,165,"CGT $amino{cgt} $per{cgt}",$color{$amino{cgt}});
$im->string(gdSmallFont,360,185,"CGC $amino{cgc} $per{cgc}",$color{$amino{cgc}});
$im->string(gdSmallFont,360,205,"CGA $amino{cga} $per{cga}",$color{$amino{cga}});
$im->string(gdSmallFont,360,225,"CGG $amino{cgg} $per{cgg}",$color{$amino{cgg}});
$im->string(gdSmallFont,360,265,"AGT $amino{agt} $per{agt}",$color{$amino{agt}});
$im->string(gdSmallFont,360,285,"AGC $amino{agc} $per{agc}",$color{$amino{agc}});
$im->string(gdSmallFont,360,305,"AGA $amino{aga} $per{aga}",$color{$amino{aga}});
$im->string(gdSmallFont,360,325,"AGG $amino{agg} $per{agg}",$color{$amino{agg}});
$im->string(gdSmallFont,360,365,"GGT $amino{ggt} $per{ggt}",$color{$amino{ggt}});
$im->string(gdSmallFont,360,385,"GGC $amino{ggc} $per{ggc}",$color{$amino{ggc}});
$im->string(gdSmallFont,360,405,"GGA $amino{gga} $per{gga}",$color{$amino{gga}});
$im->string(gdSmallFont,360,425,"GGG $amino{ggg} $per{ggg}",$color{$amino{ggg}});
$im->string(gdSmallFont,15,465,"yellow minus charge",$yellow);
$im->string(gdSmallFont,165,465,"red plus charge",$red);
$im->string(gdSmallFont,285,465,"blue noncharge",$blue);
$im->string(gdSmallFont,400,465,"green nonpolar",$green);
$im->string(gdSmallFont,20,485,"exception",$black);
$v=485;
$h=100;
foreach(sort keys(%exception)){
$color{$exception{$_}{amino}}=$black if($color{$exception{$_}{amino}}=="");
$CoDoN=uc $_;
$im->string(gdSmallFont,$h,$v,"$CoDoN $exception{$_}{amino} $exception{$_}{per}",$color{$exception{$_}{amino}});
$v+=20;
if($v == 545){
$v=485;
$h+=100;
}
}
mkdir ("graph",0777);
open(OUT,'>graph/'."$filename");
binmode OUT;
print OUT $im->png;
close(OUT);
msg_gimv("graph/$filename") if($output eq "show");
}
sub _codon_table2{
&opt_default(output=>"show",filename=>"codon_table.html");
my @args = opt_get(@_);
my $result = shift @args;
my $filename = opt_val("filename");
my $output = opt_val("output");
my (%amino, %data, %per, $amino_total, $amino);
my (%exception, %color);
foreach my $tmp (qw(D E)){ $color{$tmp} = 'gold' }
foreach my $tmp (qw(R K H)){ $color{$tmp} = 'red' }
foreach my $tmp (qw(N Q S T Y)){ $color{$tmp} = 'blue' }
foreach my $tmp (qw(A G V L I P F M W C)){ $color{$tmp} = 'green' }
$color{'/'} = 'black';
foreach $amino (keys(%{$result})){
$amino_total = 0;
foreach my $codon (keys(%{$$result{$amino}})){
$amino_total += $$result{$amino}{$codon};
}
foreach my $codon (keys(%{$$result{$amino}})){
if ($$result{$amino}{$codon} > $data{$codon}){
if ($data{$codon} != ''){
$exception{$codon}{amino} = $amino{$codon};
$exception{$codon}{per} = $per{$codon};
}
$data{$codon} = $$result{$amino}{$codon};
$amino{$codon} = $amino;
$per{$codon} = sprintf("%.3f", $$result{$amino}{$codon} / $amino_total); }
else{
$exception{$codon}{amino} = $amino;
$exception{$codon}{per} = sprintf("%.3f",$$result{$amino}{$codon} / $amino_total); }
}
}
mkdir ('graph', 0777);
open(FILE, '>graph/' . $filename);
print FILE qq(
<HTML>
<HEAD><TITLE>Codon Table v.2</TITLE>
<style type="text/css">
<!--
FONT{font-size:8pt}
-->
</style>
</HEAD>
<BODY bgcolor="white">
<table border=0 cellpadding=0 cellspacing=0><tr><td align=center valign=top>
<table border rules=all cellspacing=0 cellpadding=6>
<tr><td align=center rowspan=2><font color="red">first</font></td>
<td colspan=4 align=center><font color="green">second</font></td>
<td align=center rowspan=2><font color="blue">third</font></td>
</tr><tr>
<td align=center><font color="green">T</font></td>
<td align=center><font color="green">C</font></td>
<td align=center><font color="green">A</font></td>
<td align=center><font color="green">G</font></td>
</tr>
);
foreach my $first (qw(T C A G)){
print FILE qq(<tr><td align=center><font color="red">$first</font></td>);
foreach my $second (qw(T C A G)){
print FILE qq(<td>);
foreach my $third (qw(T C A G)){
my $triplet = $first . $second . $third;
print FILE qq(<table border=0 cellpadding=4 width="100\%"><tr>);
print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}};
print FILE '">', $triplet, '</td>', "\n";
print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}};
print FILE '">', $amino{lc($triplet)}, '</td>', "\n";
print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}};
print FILE '">', $per{lc($triplet)}, '</td>', "\n";
}
print FILE qq(</tr></table></td>);
}
print FILE qq(
<td align=center>
<table border=0 cellpadding=4>
<tr><td><font color="blue">T</font></td></tr>
<tr><td><font color="blue">C</font></td></tr>
<tr><td><font color="blue">A</font></td></tr>
<tr><td><font color="blue">G</font></td></tr>
</table>
</td>
);
}
print FILE qq(
</table><BR>
<font color="gold" style="font-size:9pt">yellow minus charge</font>\&
nbsp;\ \ \ \
<font color="red" style="font-size:9pt">red plus charge</font>\&
nbsp;\ \ \ \
<font color="blue" style="font-size:9pt">blue noncharge</font>\&
nbsp;\ \ \ \
<font color="green" style="font-size:9pt">green nonpolar</font>\&
nbsp;\ \ \ \
<a href="http://prowl.rockefeller.edu/aainfo/contents.htm"
style="font-size:9pt" target="_blank">(learn more here)</a> <BR><BR> <div align=left>exceptions:</div>
<table border=0 cellpadding=10 cellspacing=0><tr>
);
my $i = 0;
foreach my $triplet (sort keys(%exception)){
$color{$exception{$triplet}{amino}} = 'black'
if($color{$exception{$triplet}{amino}} eq '');
print FILE '<td><font color="', $color{$exception{$triplet}{amino}};
print FILE '">', uc $triplet, ' ',
$exception{$triplet}{amino}, ' ';
print FILE $exception{$triplet}{per}, '</font></td>';
if ($i == 4){
print FILE '</tr><tr>';
$i = -1;
}
$i ++;
}
print FILE qq(
</tr></table>
<BR>
<form method="GET" action="http://www.kazusa.or.jp/codon/cgi-bin/spsearch.cgi">
Search <a href="http://www.kazusa.or.jp/codon/">Kazusa Codon Usage Database</a>:
<input size="20" name="species">
<input type=hidden name="c" value="i">
<input type=submit value="Submit">
</form>
);
print FILE "</td></tr></table></body></html>";
close(FILE);
msg_gimv("graph/$filename") if($output eq "show");
}
sub _codon_amino_printer{
&opt_default(output=>"stdout");
my @args=opt_get(@_);
my $result=shift @args;
my $printer=opt_val("output");
my $filename=opt_val("filename");
my $total;
if($printer eq "f"){
open(FILE,">>$filename");
foreach(sort keys(%$result)){
print FILE $_,",",$result->{$_},"\n";
$total+=$result->{$_};
}
print FILE "total,",$total,"\n";
print FILE "\n\n";
close(FILE);
}
else{
foreach(sort keys(%$result)){
&msg_send($_," -> ",$result->{$_},"\n");
$total+=$result->{$_};
}
&msg_send("total -> ",$total,"\n");
}
}
sub _codon_usage_printer{
&opt_default(output=>"stdout");
my @args=opt_get(@_);
my $result=shift @args;
my $printer=opt_val("output");
my $filename=opt_val("filename");
my $amino_total;
my $key;
my $key2;
my $total;
if($printer eq "f"){
open(FILE,">>$filename");
foreach $key (sort keys(%$result)){
$amino_total=0;
foreach $key2 (sort keys(%{$$result{$key}})){
$amino_total+=$$result{$key}{$key2};
}
foreach $key2 (sort keys(%{$$result{$key}})){
print FILE $key,",",$key2,",",$$result{$key}{$key2},",",sprintf("%.3f",$$result{$key}{$key2}/$amino_total),"\n"; $total+=$$result{$key}{$key2};
}
}
print FILE "total,$total\n";
print FILE "\n\n";
close(FILE);
}
else{
foreach $key (sort keys(%$result)){
$amino_total=0;
foreach $key2 (sort keys(%{$$result{$key}})){
$amino_total+=$$result{$key}{$key2};
}
foreach $key2 (sort keys(%{$$result{$key}})){
&msg_send($key," -> ",$key2," -> ",$$result{$key}{$key2}," ",sprintf("%.3f",$$result{$key}{$key2}/$amino_total),"\n"); $total+=$$result{$key}{$key2};
}
}
&msg_send("total -> $total\n");
}
}
sub DESTROY {
my $self = shift;
}
1;
__END__
=head1 NAME
G::Seq::Codon - Perl extension for blah blah blah
=head1 SYNOPSIS
use G::Seq::Codon;
blah blah blah
=head1 DESCRIPTION
Stub documentation for G::Seq::Codon was created by h2xs. It looks like the
author of the extension was negligent enough to leave the stub
unedited.
Blah blah blah.
=head1 AUTHOR
A. U. Thor, a.u.thor@a.galaxy.far.far.away
=head1 SEE ALSO
perl(1).
=cut} |