G::Seq Codon
Included librariesPackage variablesGeneral documentationMethods
Package variables
Privates (from "my" definitions)
$ndc;
$version = 3
$total;
$tna;
$naa1;
%degeneracy;
$value;
%usage = %count
@allkey = sort keys %{$gb->{All}->{$data}}
%tbox;
$tnc;
$ndaa;
%max;
$result;
$nltk;
Included modules
G::Messenger
G::Seq::AminoAcid
G::Seq::Primitive
G::Seq::Util
GD(1)
GD(2)
SelfLoader
SubOpt
Inherit
Exporter
Synopsis
No synopsis!
Description
No description!
Methods
_codon_usage_printer
No description
Code
_codon_usage_tableDescriptionCode
codon_amino_counter
No description
Code
codon_compilerDescriptionCode
encDescriptionCode
Methods description
_codon_usage_tablecode    nextTop
  Name: _codon_usage_table   -   construct and display the codon usage table

  Description:
   This program constructs and displays the codon usage table. 

  Usage: 
   NULL = &_codon_usage_table(pointer HASH);

  Options:
   -output    output option (default: 'show')
   -filename    output filename (default: 'codon_usage_table.png')

  Author: 
    Haruo Suzuki (haruo@g-language.org)
History: 20030401 minor change 20010721 first release (Author: Koya Mori)
codon_compilercodeprevnextTop
  Name: codon_compiler   -   calculate various kinds of amino acid and codon usage data

  Description:
   This program calculates various kinds of amino acid and codon usage data.
   Value of Met codon ATG will be accessible at $gb->{$id}->{$data}->{'Matg'};

  Usage: 
   pointer HASH = &codon_compiler(G instance);

  Options:
   -output      output option (default: 'show')
                'stdout' for standard output
                'f' for file output in directory "data", 
                'g' for graph output in directory "graph",
                'show' for graph output and display
   -filename    output filename (default: 'usage.csv')
   -id          feature ID or gene group ID (default: 'All')
   -translate   1 when translates using standard codon table (default: 0)
   -startcodon  1 when includes start codon (default: 0)
   -stopcodon   1 when includes stop codon (default: 0)
   -del_key     regular expression to delete key (i.e. amino acids and nucleotides)
                (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]')
   -data        kinds of codon usage data (default: 'R0')
                'A0': Absolute amino acid frequency ('AA')
                'A1': Relative amino acid frequency ('RAAU')
                'R0': Absolute codon frequency ('AF')
                'R1': Relative codon frequency in a complete sequence
                'R2': Relative codon frequency in each amino acid ('RF')
                'R3': Relative synonymous codon usage ('RSCU') 
                'R4': Relative adaptiveness; i.e., ratio to maximum of minor codon ('W')
                'R5': Maximum (1) or minor (0) codon

  Author: 
   Haruo Suzuki (haruo@g-language.org)
History: 20030401 initial posting Informations stored in the G instance: $gb->{$id}->{tna}: total number of amino acids $gb->{$id}->{tnc}: total number of codons $gb->{$id}->{ndc}: number of different codons $gb->{$id}->{ndaa}: number of different amino acids $gb->{$id}->{nltk}: number of amino acids with less than k residues $gb->{$id}->{naa1}: number of amino acids with one residue in the gene $gb->{degeneracy}->{*}: degree of codon degeneracy of amino acid '*' $gb->{"header_codon_compiler_$data"} $gb->{errormsg_codon_compiler}
enccodeprevnextTop
  Name: enc   -   calculate the effective number of codons (Nc)

  Description:
   This program calculates the effective number of codons (Nc).

  Usage:
    scalar = &enc(G instance);

  Options:
   -id          ID of a group of genes or a single gene (default: 'All')
   -translate   1 when translates using standard codon table (default: 0)
   -del_key     regular expression to delete key (i.e. amino acids and nucleotides)
                (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]')

  Author:
   Haruo Suzuki (haruo@g-language.org)
History: 20030401 initial posting References: Wright F (1990) Gene 87:23-29. Requirements: sub codon_compiler()
Methods code
_codon_usage_printerdescriptionprevnextTop
sub _codon_usage_printer {
    &opt_default(output=>"stdout");
    my @args=opt_get(@_);
    my $result=shift @args;
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $amino_total;
    my $key;
    my $key2;
    my $total;


    if($printer eq "f"){
	open(FILE,">>$filename");
	foreach $key (sort keys(%$result)){
	    $amino_total=0;
	    foreach $key2 (sort keys(%{$$result{$key}})){
		$amino_total+=$$result{$key}{$key2};
	    }
	    foreach $key2 (sort keys(%{$$result{$key}})){
		print FILE $key,",",$key2,",",$$result{$key}{$key2},",",sprintf("%.3f",$$result{$key}{$key2}/$amino_total),"\n";
$total+=$$result{$key}{$key2}; } } print FILE "total,$total\n"; print FILE "\n\n"; close(FILE); } else{ foreach $key (sort keys(%$result)){ $amino_total=0; foreach $key2 (sort keys(%{$$result{$key}})){ $amino_total+=$$result{$key}{$key2}; } foreach $key2 (sort keys(%{$$result{$key}})){ &msg_send($key," -> ",$key2," -> ",$$result{$key}{$key2}," ",sprintf("%.3f",$$result{$key}{$key2}/$amino_total),"\n");
$total+=$$result{$key}{$key2}; } } &msg_send("total -> $total\n"); }
}
_codon_usage_tabledescriptionprevnextTop
sub _codon_usage_table {
    &opt_default(output=>'show', filename=>'codon_usage_table.png');
    my @args = opt_get(@_);
    my $usage = shift @args;
    my $output = opt_val("output");
    my $filename = opt_val("filename");

    my $im = new GD::Image(500, 550);
    my $white = $im->colorAllocate(255, 255, 255);
    my $black = $im->colorAllocate(0, 0, 0);
    my $blue = $im->colorAllocate(0, 0, 255);
    my $red = $im->colorAllocate(255, 0, 0);
    my $yellow = $im->colorAllocate(200, 200, 0);
    my $green = $im->colorAllocate(0, 150, 0);

    my $x;
    my $y;
    foreach $x ((10,50,450,490)){ $im->line($x,10,$x,450,$black); }
    foreach $x ((150,250,350)){ $im->line($x,30,$x,450,$black); }
    $im->line(50,30,450,30,$black); 
    foreach $y ((10,50,150,250,350,450)){ $im->line(10,$y,490,$y,$black); }

    $im->string(gdSmallFont, 15, 25, "first", $black);
    $im->string(gdSmallFont, 233, 15, "second", $black);
    $im->string(gdSmallFont, 455, 25, "third", $black);

    $y = 95;
    foreach(qw(U C A G)){ $im->string(gdSmallFont,30,$y,$_,$black);$y += 100; }
    $x = 90;
    foreach(qw(U C A G)){ $im->string(gdSmallFont,$x,35,$_,$black);$x += 100; }
    $y = 65;
    foreach(1..4){ foreach(qw(U C A G)){ $im->string(gdSmallFont,470,$y,$_,$black);$y += 20; } $y += 20; }

    my %CodonTable = ('gca'=>'A','gcc'=>'A','gcg'=>'A','gct'=>'A',
		      'tgc'=>'C','tgt'=>'C',
		      'gac'=>'D','gat'=>'D',
		      'gaa'=>'E','gag'=>'E',
		      'ttc'=>'F','ttt'=>'F',
		      'gga'=>'G','ggc'=>'G','ggg'=>'G','ggt'=>'G',
		      'cac'=>'H','cat'=>'H',
		      'ata'=>'I','atc'=>'I','att'=>'I',
		      'aaa'=>'K','aag'=>'K',
		      'cta'=>'L','ctc'=>'L','ctg'=>'L','ctt'=>'L','tta'=>'L','ttg'=>'L',
		      'atg'=>'M',
		      'aac'=>'N','aat'=>'N',
		      'cca'=>'P','ccc'=>'P','ccg'=>'P','cct'=>'P',
		      'caa'=>'Q','cag'=>'Q',
		      'aga'=>'R','agg'=>'R','cga'=>'R','cgc'=>'R','cgg'=>'R','cgt'=>'R',
		      'agc'=>'S','agt'=>'S','tca'=>'S','tcc'=>'S','tcg'=>'S','tct'=>'S',
		      'aca'=>'T','acc'=>'T','acg'=>'T','act'=>'T', 
		      'gta'=>'V','gtc'=>'V','gtg'=>'V','gtt'=>'V', 
		      'tgg'=>'W',
		      'tac'=>'Y','tat'=>'Y',
		      'taa'=>'/','tag'=>'/','tga'=>'/');
    
    my %one2three = &G::Seq::AminoAcid::one2three();
    
    my ($codon, $p1st, $p2nd, $p3rd, $aa, $value, $color);
    foreach $codon (keys %CodonTable){
	$p1st = substr($codon, 0, 1);
	$y = 65 if($p1st eq 't');
	$y = 165 if($p1st eq 'c');
	$y = 265 if($p1st eq 'a');
	$y = 365 if($p1st eq 'g');
	$p2nd = substr($codon, 1, 1);
	$x = 60 if($p2nd eq 't');
	$x = 160 if($p2nd eq 'c');
	$x = 260 if($p2nd eq 'a');
	$x = 360 if($p2nd eq 'g');
	$p3rd = substr($codon, 2, 1);
	$y += 20 if($p3rd eq 'c');
	$y += 40 if($p3rd eq 'a');
	$y += 60 if($p3rd eq 'g');
	$aa = &translate($codon);
	$value = $usage->{$aa.$codon};
	$value = sprintf( (int($value) - $value) ? "%.3f" : "%d", $value );
	
	$codon = uc($codon);
	$codon =~ s/T/U/g;
	if($aa =~ /[DE]/){ $color = $blue; }
	elsif($aa =~ /[RKH]/){ $color = $red; }
	elsif($aa =~ /[NQSTY]/){ $color = $yellow; }
	elsif($aa =~ /[AGVLIPFMWC]/){ $color = $green; }
	else { $color = $black; } 
	$im->string(gdSmallFont,$x,$y,"$one2three{$aa} $codon $value",$color);
    }
    $im->string(gdSmallFont,20,465,"blue acidic",$blue);
    $im->string(gdSmallFont,100,465,"red basic",$red);
    $im->string(gdSmallFont,170,465,"yellow neutral & polar",$yellow);
    $im->string(gdSmallFont,315,465,"green neutral & hydrophobic",$green);
    
    $im->string(gdSmallFont,20,485,"exception",$black);
    my $h = 100;
    my $v = 485;
    foreach(sort keys %{$usage}){
	next if($_ =~ /Agca|Agcc|Agcg|Agct|Ctgc|Ctgt|Dgac|Dgat|Egaa|Egag|Fttc|Fttt|Ggga|Gggc|Gggg|Gggt|Hcac|Hcat|Iata|Iatc|Iatt|Kaaa|Kaag|Lcta|Lctc|Lctg|Lctt|Ltta|Lttg|Matg|Naac|Naat|Pcca|Pccc|Pccg|Pcct|Qcaa|Qcag|Raga|Ragg|Rcga|Rcgc|Rcgg|Rcgt|Sagc|Sagt|Stca|Stcc|Stcg|Stct|Taca|Tacc|Tacg|Tact|Vgta|Vgtc|Vgtg|Vgtt|Wtgg|Ytac|Ytat|\//);
	$codon = uc(substr($_, 1, 3));
	$codon =~ s/T/U/g;
	$value = sprintf((int($usage->{$_}) - $usage->{$_}) ? "%.3f" : "%d", $usage->{$_});
	$aa = substr($_, 0, 1);
	$im->string(gdSmallFont,$h,$v,"$one2three{$aa} $codon $value",$black);
	$v += 20;
	if($v == 545){ $h += 100; $v = 485; }
    }

    mkdir('graph', 0777);
    open(OUT, ">graph/$filename");
    binmode OUT;
    print OUT $im->png;
    close(OUT);
    
    msg_gimv("graph/$filename") if($output eq 'show');
}
codon_amino_counterdescriptionprevnextTop
sub codon_amino_counter {
    my $gb=shift;
    my $key=shift;
    my %result;
    my $seq;
    my $amino;
    my $codon;
    my $num;
    my $q;
    my @tmp;
    
    if(defined(%{$gb->{"$key"}})){
	$seq=lc($gb->get_geneseq("$key"));
	$num=$gb->{"$key"}->{feature};
    }
    
    @tmp=split(//,$gb->{"FEATURE$num"}->{translation});
    for($q=0;$q<length($seq);$q+=3){
	$amino=shift @tmp;
	$amino="/" if($amino eq "");
	$codon=substr($seq,$q,3);
	$result{$amino}{$codon}++;
    }
    
    return %result;
}
codon_compilerdescriptionprevnextTop
sub codon_compiler {
    &opt_default(output=>'show',filename=>'usage.csv',id=>'All',translate=>0,startcodon=>0,stopcodon=>0,
		 del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]',data=>'R0');
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $output = opt_val("output");
    my $filename = opt_val("filename");
    my $id = opt_val("id");
    my $data = opt_val("data");
    my $translate = opt_val("translate");
    my $startcodon = opt_val("startcodon");
    my $stopcodon = opt_val("stopcodon");
    my $del_key = opt_val("del_key");

    if($data eq 'AA'){ $data = 'A0'; }
    elsif($data eq 'RAAU'){ $data = 'A1'; }
    elsif($data eq 'AF'){ $data = 'R0'; }
    elsif($data eq 'RF'){ $data = 'R2'; }
    elsif($data eq 'RSCU'){ $data = 'R3'; }
    elsif($data eq 'W'){ $data = 'R4'; }
    if($data !~ /A[01]|[CR][0-5]/){ &msg_error("$data: undefined type of amino acid and codon usage.\n"); return; }
    if($data =~ /A[01]/){ $startcodon = $stopcodon = 0; }

    my $i;
    my $aaseq;
    my $aa;
    my $codon;
    my %count;
    my $omit;
    my $seq;

    if($id =~ /FEATURE/){ # for a single gene
$seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); # excluding start and stop codons
$seq = $gb->startcodon($id).$seq if($startcodon); $seq .= $gb->stopcodon($id) if($stopcodon); if($translate){ $aaseq = &translate($seq); } else { $aaseq = substr($gb->{$id}->{translation}, 1-$startcodon); } $i = 0; while($aaseq){ my $aa = substr($aaseq, 0, 1); if($data =~ /A/){ $count{$aa} ++; } if($data =~ /[CR]/){ $count{$aa.substr($seq, $i, 3)} ++; } substr($aaseq, 0, 1) = ''; $i += 3; } #$count{'/'.substr($seq, $i, 3)} ++ if($stopcodon);
} else { # for a group of genes
my $errormsg; foreach my $cds ($gb->cds()){ next if( $id ne 'All' && !($gb->{$cds}->{$id}) ); $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); $seq = $gb->startcodon($cds).$seq if($startcodon); $seq .= $gb->stopcodon($cds) if($stopcodon); # omitting irregular CDS
if($data =~ /C/ && $id eq 'All'){ if(length($seq) % 3){ unless($gb->{errormsg_codon_compiler}){ $errormsg ++; &msg_error("$gb->{$cds}->{locus_tag}: sequence isn't length 3n\n"); } $omit .= "$cds|"; next; } if($gb->{$cds}->{note} =~ /Protein sequence is in conflict with the conceptual translation$/){ unless($gb->{errormsg_codon_compiler}){ $errormsg ++; &msg_error("$gb->{$cds}->{locus_tag}:\t", $gb->{$cds}->{note}, "\n"); } $omit .= "$cds|"; next; } } # result of 'A0' is not necessarily equal to that of 'A0C0'
if($translate){ $aaseq = &translate($seq); } else { $aaseq = substr($gb->{$cds}->{translation}, 1-$startcodon); if($data =~ /C/ && $id eq 'All' && length($seq) / 3 != length($aaseq)+$stopcodon && !($gb->{errormsg_codon_compiler})){
$errormsg ++; &msg_error("$cds:\t", length($aaseq), " amino acids, but ", length($seq)/3, " sense codons\n"); } } $i = 0; while($aaseq){ my $aa = substr($aaseq, 0, 1); if($data =~ /A/){ $count{$aa} ++; } if($data =~ /[CR]/){ $count{$aa.substr($seq, $i, 3)} ++; } substr($aaseq, 0, 1) = ''; $i += 3; } #$count{'/'.substr($seq, $i, 3)} ++ if($stopcodon);
} $gb->{errormsg_codon_compiler} = 1 if($errormsg);
}
encdescriptionprevnextTop
sub enc {
    &opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]');
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $id = opt_val("id");
    my $translate = opt_val("translate");
    my $del_key = opt_val("del_key");
    
    my %tbox;     # total number of codons in each amino acid box
my $aa; # amino acid
my %relative; # relative use of each codon
my %F; # homozygosity (F) for each aa
my %cntaa; # Number of members (aa) for each SF type in the gene
my %numaa; # Number of members (aa) for each SF type in all genes
my %sumF; # sum of F value in the SF type
my %aveF; # average F value in the SF type
my $enc; # effective number of codons
my $count = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C0', -translate=>$translate, -del_key=>$del_key); foreach (keys %{$count}){ $tbox{substr($_, 0, 1)} += $count->{$_}; } foreach (keys %{$count}){ $aa = substr($_, 0, 1); if($tbox{$aa} - 1 == 0){ delete $tbox{$aa}; } else { $relative{$_} = $count->{$_} / $tbox{$aa}; }
}

foreach (keys
%relative){ $F{substr($_, 0, 1)} += $relative{$_} ** 2; }
foreach
$aa (keys %F){
$F{$aa} = ($tbox{$aa} * $F{$aa} - 1) / ($tbox{$aa} - 1); } foreach $aa (keys %F){ if($F{$aa} > 0.0000001){ $sumF{$gb->{degeneracy}->{$aa}} += $F{$aa}; $cntaa{$gb->{degeneracy}->{$aa}} ++; } } foreach (values %{$gb->{degeneracy}}){ $numaa{$_} ++; } foreach (keys %numaa){ if($_ == 1){ $aveF{$_} = 1; } elsif($cntaa{$_} && $sumF{$_}){ $aveF{$_} = $sumF{$_} / $cntaa{$_};
} elsif($_ == 3 && $sumF{2} && $sumF{4}){ $aveF{3} = ($sumF{2} / $cntaa{2} + $sumF{4} / $cntaa{4}) / 2;
} else { $enc = -1; } } unless($enc){ foreach (keys %numaa){ $enc += $numaa{$_} / $aveF{$_}; }}
#return (
$enc > 61) ? 61.00 : sprintf "%.2f", $enc;
if($enc < 0){ $enc = undef; } elsif($enc > 61){ $enc = 61; } else { sprintf "%.2f", $enc; } return $enc; } =head2 cbi Name: cbi - calculates the codon bias index (CBI) Description: This program calculates the codon bias index (CBI). Usage: scalar = &cbi(G instance); Options: -id ID of a group of genes or a single gene (default: 'All') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting References: Morton BR (1993) J.Mol.Evol. 37:273-280. Requirements: sub codon_compiler() =cut sub cbi { &opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]'); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $id = opt_val("id"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); my $aacu = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'A1C4', -translate=>$translate, -del_key=>$del_key); my %bias; # biases of individual amino acids
my $cbi; foreach (keys %{$gb->{All}->{A1C4}}){ my $aa = substr($_, 0, 1); $bias{$aa} += ($aacu->{$_} - 1) ** 2 / ($gb->{degeneracy}->{$aa} - 1) if($aacu->{$aa} && length($_) == 4 && $gb->{degeneracy}->{$aa} > 1);
} foreach (keys %bias){ $cbi += $aacu->{$_} * $bias{$_}; } return sprintf "%.4f", $cbi; } =head2 scs Name: scs - calculates the scaled chi-square Description: This program calculates the chi-square for deviation from uniform synonymous codon usage, scaled by gene length. Usage: scalar = &scs(G instance); Options: -id ID of a group of genes or a single gene (default: 'All') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') Author: Haruo Suzuki (haruo@g-language.org) History: 20090204 initial posting References: Shields DC, Sharp PM. (1987) Nucleic Acids Res. 15(19):8023-40. Requirements: sub codon_compiler() =cut sub scs { &opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]'); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $id = opt_val("id"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); my $observed = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C0', -translate=>$translate, -del_key=>$del_key); my %tbox; # total number of codons in each amino acid box
my $tnc; # total number of codons
my %degeneracy; foreach (keys %{$observed}){ $tbox{substr($_, 0, 1)} += $observed->{$_}; $tnc += $observed->{$_}; $degeneracy{substr($_, 0, 1)} ++; } my $scs; # scaled chi-square
foreach (sort keys %{$observed}){ my $aa = substr($_, 0, 1); #my $expected = $tbox{$aa} / $degeneracy{$aa};
my $expected = $tbox{$aa} / $gb->{degeneracy}->{$aa};
#print "\n$_\t$aa\t$degeneracy{$aa}\t$gb->{degeneracy}->{$aa}\t$tbox{$aa}\t$observed->{$_}\t$expected";
$scs += ($observed->{$_} - $expected)**2 / $expected;
} return sprintf "%.4f", $scs /= $tnc if($tnc);
} =head2 icdi Name: icdi - calculates the intrinsic codon deviation index (ICDI) Description: This program calculates the intrinsic codon deviation index (ICDI). Usage: scalar = &icdi(G instance); Options: -id ID of a group of genes or a single gene (default: 'All') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting References: Freire-Picos MA et al. (1994) Gene 139:43-49. Requirements: sub codon_compiler() =cut sub icdi { &opt_default(id=>'All', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]'); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $id = opt_val("id"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); my $rscu; # relative synonymous codon usage (RSCU) values
my %tbox; # total in each amino acid box
my %Sk; # Sk values of individual amino acids
my $icdi; # intrinsic codon deviation index (ICDI)
my $ndaa; # number of different amino acids
$rscu = &codon_compiler($gb, -output=>'n', -id=>$id, -data=>'C3', -translate=>$translate, -del_key=>$del_key); foreach (keys %{$gb->{All}->{C3}}){ $tbox{substr($_, 0, 1)} += $rscu->{$_}; } foreach (keys %{$gb->{All}->{C3}}){ my $aa = substr($_, 0, 1); $Sk{$aa} += ($rscu->{$_} - 1) ** 2 if($tbox{$aa} > 1); # exclude absent or non-degenerate amino acids
} #next unless(%Sk); # e.g. NSP1 in S_cerevisiae/X.gbk
foreach (keys %Sk){ my $k = $gb->{degeneracy}->{$_}; $icdi += $Sk{$_} / ($k * ($k - 1));
$ndaa ++; } return sprintf "%.4f", $icdi /= $ndaa if($ndaa);
} =head2 fop Name: fop - calculate the frequency of optimal codons (Fop) Description: This program calculates the frequency of optimal codons (Fop), and inputs in the G instance; i.e. Fop values will be accessible at $gb->{"FEATURE$i"}->{fop}; Usage: NULL = &fop(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'fop.csv') -binary pointer HASH of vectors of binary number, 1 or 0 refering to the optimal or nonoptimal codon. By default, four amino acids (Ile, Asn, Phe, and Tyr), where the optimal codons seem to be the same in all species, are used. -translate 1 when translates using standard codon table (default: 0) Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting References: Ikemura (1981) J.Mol.Biol. 151:389-409. Ikemura (1985) Mol.Biol.Evol. 2(1):13-34. =cut sub fop { &opt_default(output=>'stdout', filename=>'fop.csv', translate=>0); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $translate = opt_val("translate"); # S.cerevisiae
my %Sc = ('Lcta'=>0,'Lctc'=>0,'Lctg'=>0,'Lctt'=>0,'Ltta'=>0,'Lttg'=>1, 'Raga'=>1,'Ragg'=>0,'Rcga'=>0,'Rcgc'=>0,'Rcgg'=>0,'Rcgt'=>0, 'Sagc'=>0,'Sagt'=>0,'Stca'=>0,'Stcc'=>1,'Stcg'=>0,'Stct'=>1, 'Agca'=>0,'Agcc'=>1,'Agcg'=>0,'Agct'=>1, 'Ggga'=>0,'Gggc'=>1,'Gggg'=>0,'Gggt'=>1, 'Pcca'=>0,'Pccc'=>0,'Pccg'=>0,'Pcct'=>0, 'Taca'=>0,'Tacc'=>1,'Tacg'=>0,'Tact'=>1, 'Vgta'=>0,'Vgtc'=>1,'Vgtg'=>0,'Vgtt'=>1, 'Iata'=>0,'Iatc'=>1,'Iatt'=>1, 'Naac'=>1,'Naat'=>0, 'Dgac'=>1,'Dgat'=>0, 'Ctgc'=>0,'Ctgt'=>0, 'Qcaa'=>0,'Qcag'=>0, 'Egaa'=>1,'Egag'=>0, 'Hcac'=>1,'Hcat'=>0, 'Kaaa'=>0,'Kaag'=>1, 'Fttc'=>1,'Fttt'=>0, 'Ytac'=>1,'Ytat'=>0); # E.coli
my %Ec = ('Lcta'=>0,'Lctc'=>0,'Lctg'=>1,'Lctt'=>0,'Ltta'=>0,'Lttg'=>0, 'Raga'=>0,'Ragg'=>0,'Rcga'=>0,'Rcgc'=>1,'Rcgg'=>0,'Rcgt'=>1, 'Sagc'=>0,'Sagt'=>0,'Stca'=>0,'Stcc'=>0,'Stcg'=>0,'Stct'=>0, 'Agca'=>1,'Agcc'=>0,'Agcg'=>1,'Agct'=>1, 'Ggga'=>0,'Gggc'=>1,'Gggg'=>0,'Gggt'=>1, 'Pcca'=>0,'Pccc'=>0,'Pccg'=>1,'Pcct'=>0, 'Taca'=>0,'Tacc'=>1,'Tacg'=>0,'Tact'=>1, 'Vgta'=>1,'Vgtc'=>0,'Vgtg'=>1,'Vgtt'=>1, # Ikemura (1985)
'Vgta'=>1,'Vgtc'=>0,'Vgtg'=>0,'Vgtt'=>1, # Kanaya et al.(1999)
'Iata'=>0,'Iatc'=>1,'Iatt'=>0, 'Naac'=>1,'Naat'=>0, 'Dgac'=>1,'Dgat'=>0, 'Ctgc'=>0,'Ctgt'=>0, 'Qcaa'=>0,'Qcag'=>1, 'Egaa'=>1,'Egag'=>0, 'Hcac'=>1,'Hcat'=>0, 'Kaaa'=>1,'Kaag'=>0, 'Fttc'=>1,'Fttt'=>0, 'Ytac'=>1,'Ytat'=>0); my %four = ('Iatc'=>1,'Iatt'=>0, # 'Iata' is ignored
'Naac'=>1,'Naat'=>0, 'Fttc'=>1,'Fttt'=>0, 'Ytac'=>1,'Ytat'=>0); my $binary = opt_val("binary") ? opt_val("binary") :\% four; my %degeneracy; foreach (keys %{$binary}){ $degeneracy{substr($_, 0, 1)} ++; } if($output eq 'stdout'){ msg_send("\n\n", "optimal (1) and nonoptimal (0) codons\n", "-------------------------------------\n" ); foreach (sort keys %$binary){ msg_send("$_\t"); } print "\n"; foreach (sort keys %$binary){ msg_send("$binary->{$_}\t"); } msg_send("\n\n", "Frequency of optimal codons (Fop; ratio of optimal codons to synonymous codons)\n", " Laa: length in amino acids\n", " Ls: length in silent sites (number of synonymous codons)\n", "--------------------------------------------\n", "Laa\tLs\tstart\tstop\tFop\tgene\n", "--------------------------------------------\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); #open(FILE,">>$filename");
print FILE "Frequency of optimal codons (Fop; ratio of optimal codons to synonymous codons)\n", " Laa: length in amino acids\n", " Ls: length in silent sites (number of synonymous codons)\n", "Laa,Ls,start codon,stop codon,Fop,gene,product\n"; } foreach my $cds ($gb->cds()){ #my $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3);
my $seq = substr($gb->get_geneseq($cds), $gb->{$cds}->{codon_start} - 1 + 3, -3); my $aaseq; my $i; if($translate){ $aaseq = &translate($seq); } else { $aaseq = substr($gb->{$cds}->{translation}, 1); } my $Laa = length($aaseq); my $Ls; # length in silent sites
my $fop; # frequency of optimal codons
$i = 0; while($aaseq){ my $aa = substr($aaseq, 0, 1); substr($aaseq, 0, 1) = ''; my $codon = substr($seq, $i, 3); if($degeneracy{$aa} > 1 && exists $binary->{$aa.$codon}){ $Ls ++; $fop += $binary->{$aa.$codon}; } $i += 3; } $gb->{$cds}->{fop} = sprintf "%.4f", $fop / $Ls if($Ls);
$gb->{$cds}->{Ls} = $Ls; if($output eq 'stdout'){ msg_send("$Laa\t$Ls\t", $gb->startcodon($cds), "\t", $gb->stopcodon($cds), "\t", $gb->{$cds}->{fop}, "\t", $gb->{$cds}->{gene}, "\n"); } if($output eq 'f'){ print FILE "$Laa,$Ls,", $gb->startcodon($cds), ",", $gb->stopcodon($cds), ",", $gb->{$cds}->{fop}, ",", $gb->{$cds}->{gene}, ",", $gb->{$cds}->{product}, "\n"; } } close(FILE); } =head2 w_value Name: w_value - calculate the 'relative adaptiveness of each codon' (W) Description: This program calculates the 'relative adaptiveness of each codon' (W value) necessary for CAI analysis. Returned value is a pointer HASH of W values. Usage: pointer HASH_w_values = &w_value(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'w_value.csv') -include regular expression to include genes in a reference set a reference set in several studies are in-built 1: Nakamura and Tabata, 2: Sharp and Li, 3: Sakai et al. (default: 1.'ribosomal.*protein') -exclude regular expression to exclude genes from a reference set (default: '[Mm]itochondrial') Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 addition of -in & -out option 20010830 first release (Author: Kazuharu "Gaou" Arakawa) References: Sharp and Li (1987) Nucleic Acids Res. 15:1281-1295. Nakamura and Tabata (1997) Microb.Comp.Genomics 2:299-312. Sakai et al. (2001) J.Mol.Evol. 52:164-170. Requirements: none. =cut sub w_value { &opt_default(output=>'stdout', filename=>"w_value.csv", include=>'ribosomal.*protein', exclude=>'[Mm]itochondrial'); my @args=opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $include; my $exclude = opt_val("exclude"); #&msg_error("CAI bases its measure on reference codon usage, i.e. usually that of ribosomoal proteins, and therefore is not usually applicable to organisms other than prokaryotes.") if($gb->{HEADER} !~ /Archaea|Bacteria/);
if(opt_val("include") == 1){ $include = 'ribosomal.*protein'; } elsif(opt_val("include") == 2){ $include = 'ribosomal.*protein|outer membrane protein|^elongation factor'; } elsif(opt_val("include") == 3){ $include = 'ribosom.*protein|outer.*membrane|elongation|heat.*shock|RNA.*polymerase.*subunit'; } else{ $include = opt_val("include"); } if($output eq 'stdout'){ msg_send("\n", "Reference set of highly expressed genes\n", "start\tend\tgene\tproduct\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); print FILE "Reference set of highly expressed genes\n", "start,end,gene,product\n"; } my $aaseq; my $seq; my $aa; my $codon; my %count; my $i; my %max; my %w_val; my $cnt; foreach my $cds ($gb->cds()){ next if ($gb->{$cds}->{note} !~ /$include/i && $gb->{$cds}->{product} !~ /$include/i); next if ($gb->{$cds}->{product} =~ /$exclude/); $cnt ++; $aaseq = substr($gb->{$cds}->{translation}, 1); $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); while($aaseq){ $aa = substr($aaseq, 0, 1); substr($aaseq, 0, 1) = ''; $codon = substr($seq, $i, 3); $count{$aa.$codon} ++; $i += 3; } $i = 0; if($output eq 'stdout'){ msg_send($gb->{$cds}->{start}, "\t", $gb->{$cds}->{end}, "\t", $gb->{$cds}->{gene}, "\t", $gb->{$cds}->{product}, "\n"); } if($output eq 'f'){ print FILE $gb->{$cds}->{start}, ",", $gb->{$cds}->{end}, ",", $gb->{$cds}->{gene}, ",", $gb->{$cds}->{product}, "\n"; } } #die("\nCould not define w values. No reference proteins found.") unless($cnt);
&msg_error("\ncai: No reference proteins found (could not define W values).") unless($cnt); foreach (keys %count){ $aa = substr($_, 0, 1); $max{$aa} = $count{$_} if($count{$_} > $max{$aa}); } foreach (keys %count){ $w_val{$_} = sprintf "%.4f", $count{$_} / $max{substr($_, 0, 1)};
} if($output eq 'stdout'){ msg_send("\nRelative adaptiveness (W) of each codon\n", "aa\tcodon\tW value\n"); foreach (sort keys %count){ $aa = substr($_, 0, 1); $codon = substr($_, 1, 3); msg_send("$aa\t$codon\t$w_val{$_}\n"); } } if($output eq 'f'){ print FILE "\nRelative adaptiveness (W) of each codon\n", "amino acid,codon,W value\n"; foreach (sort keys %count){ $aa = substr($_, 0, 1); $codon = substr($_, 1, 3); print FILE "$aa,$codon,$w_val{$_}\n"; } close(FILE); } return\% w_val; } =head2 cai Name: cai - calculate codon adaptation index (CAI) for each gene Description: This program calculates codon adaptation index (CAI) for each gene, and inputs in the G instance. i.e. CAI values will be accessible at $gb->{"FEATURE$i"}->{cai}; Usage: NULL = &cai(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: "cai.csv") -translate 1 when translates using standard codon table (default: 0) -w_output output option for W value (default: 'stdout') -w_filename filename to output the W values (default: "w_value.csv") -w_values pointer HASH of W values used in CAI calculation -w_absent W value of codons absent from a reference set (default: "-") "-" when excludes such codons from the calculation a value of 0.5 was suggested by Sharp and Li (1987) Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 addition of -w_absent option 20010830 first release (Author: Kazuharu "Gaou" Arakawa) References: Sharp PM, Li WH. 1987 Nucleic Acids Res. 15(3):1281-95. Requirements: sub w_value() =cut sub cai { &opt_default(output=>'stdout', filename=>"cai.csv", translate=>0, w_output=>'stdout', w_filename=>"w_value.csv", w_absent=>"-"); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $translate = opt_val("translate"); my $w_output = opt_val("w_output"); my $w_fname = opt_val("w_filename"); my $w_abs = opt_val("w_absent"); my $w_val; $w_output = $output if($w_output eq 'stdout' && $output ne 'stdout'); if(opt_val("w_values")){ $w_val = opt_val("w_values"); }else{ $w_val = &w_value($gb, -output=>$w_output, -filename=>$w_fname); } if($output eq 'stdout'){ msg_send("\nCodon Adaptation Index (CAI) - geometric mean of W values\n", "start\tend\tcai\tgene\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); print FILE "start,end,cai,gene,product\n"; } foreach my $cds ($gb->cds()){ my $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); # excluding start and stop codons
my $aaseq; my $i; my $codon; if($translate){ $aaseq = &translate($seq); } else { $aaseq = substr($gb->{$cds}->{translation}, 1); } my $cai; my $L; for($i = 0; $i * 3 <= length($seq) - 3; $i ++){ my $aa = substr($aaseq, $i, 1); $codon = substr($seq, $i * 3, 3); if($w_val->{$aa.$codon} > 0){ $cai += log($w_val->{$aa.$codon}); } elsif($w_abs){ next if($w_abs eq '-'); $cai += log($w_abs); } else { $cai = 999; last; } $L ++; } $gb->{$cds}->{cai} = ($cai == 999) ? 0 : sprintf "%.4f", exp($cai/$L) if($L);
if($output eq 'stdout'){ msg_send($gb->{$cds}->{start}, "\t", $gb->{$cds}->{end}, "\t", $gb->{$cds}->{cai}, "\t", $gb->{$cds}->{gene}, "\n"); } if($output eq 'f'){ print FILE $gb->{$cds}->{start}, ",", $gb->{$cds}->{end}, ",", $gb->{$cds}->{cai}, ",", $gb->{$cds}->{gene}, ",", $gb->{$cds}->{product}, "\n"; } } close(FILE); } =head2 phx Name: phx - identify predicted highly expressed (PHX) genes Description: This program calculates codon usage differences between gene classes for identifying predicted highly expressed (PHX) genes. The "expression measure" of a gene, E(g), will be accessible at $gb->{"FEATURE$i"}->{E_g}. Usage: NULL = &phx(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'phx.csv') -usage pointer HASH of vectors of amino acid and codon usage of a set of highly expressed genes. By default, the ribosomal protein genes are used. -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting Informations stored in the G instance: $gb->{"FEATURE$i"}->{BgC}: B(g|C) $gb->{"FEATURE$i"}->{BgH}: B(g|H) $gb->{"FEATURE$i"}->{E_g}: E(g) [B(g|C) / B(g|H)]
$gb->{"FEATURE$i"}->{phx}: 1 if a gene is PHX [i.e. E(g) > 1.05]

References:
Karlin and Mrazek (2000) J.Bacteriol. 182(18):5238-5250.
Henry I, Sharp PM. 2007 Mol Biol Evol. 24(1):10-2.

Requirements: sub codon_compiler()

=cut

sub phx {
&opt_default(output=>'stdout', filename=>"phx.csv", usage=>"",
translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]');
my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); my $High = opt_val("usage"); my $cds; unless($High){ foreach $cds ($gb->cds()){ $gb->{$cds}->{High} = ( ($gb->{$cds}->{product} =~ /ribosomal.*protein/i) ? 1 : 0 ); } } my $cnt; foreach $cds ($gb->cds()){ $cnt += $gb->{$cds}->{High}; } if($cnt){ $High = &codon_compiler($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -data=>'A1C2', -id=>'High'); } else { &msg_error("\nphx: No reference proteins found."); $High = &codon_compiler($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -data=>'A1C2'); } if($output eq 'stdout'){ msg_send("\n\nPredicted highly expressed (PHX) genes\n", " B(g|C): codon usage difference relative to collection of all genes\n", " B(g|H): codon usage difference relative to a set of highly expressed genes\n", " E(g): predicted expression level [B(g|C) / B(g|H)]\n", " PHX = 1 if E(g) > 1.05\n", "start\tend\tB(g|C)\tB(g|H)\tE(g)\tPHX\tgene\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); print FILE "start,end,B(g|C),B(g|H),E(g),PHX,gene,product\n"; } foreach $cds ($gb->cds()){ my $aacu = &codon_compiler($gb, -output=>'n', -id=>$cds, -data=>'A1C2', -translate=>$translate, -del_key=>$del_key); # codon usage difference (cud) between gene classes
my %boxC; my %boxH; my $BgC; # cud relative to collection of all genes
my $BgH; # cud relative to a set of highly expressed genes
foreach (keys %{$gb->{All}->{A1C2}}){ if(length($_) == 4){ my $aa = substr($_, 0, 1); $boxC{$aa} += abs($aacu->{$_} - $gb->{All}->{A1C2}->{$_}); $boxH{$aa} += abs($aacu->{$_} - $High->{$_}); } } foreach (keys %boxC){ $BgC += $aacu->{$_} * $boxC{$_}; $BgH += $aacu->{$_} * $boxH{$_}; } if($BgH){ $gb->{$cds}->{BgC} = sprintf "%.4f", $BgC; $gb->{$cds}->{BgH} = sprintf "%.4f", $BgH; $gb->{$cds}->{E_g} = sprintf "%.4f", $BgC / $BgH;
$gb->{$cds}->{phx} = (($gb->{$cds}->{E_g} > 1.05) ? 1 : 0); } if($output eq 'stdout'){ msg_send($gb->{$cds}->{start}, "\t", $gb->{$cds}->{end}, "\t", $gb->{$cds}->{BgC}, "\t", $gb->{$cds}->{BgH}, "\t", $gb->{$cds}->{E_g}, "\t", $gb->{$cds}->{phx}, "\t", $gb->{$cds}->{gene}, "\n"); } if($output eq 'f'){ print FILE $gb->{$cds}->{start}, ",", $gb->{$cds}->{end}, ",", $gb->{$cds}->{BgC}, ",", $gb->{$cds}->{BgH}, ",", $gb->{$cds}->{E_g}, ",", $gb->{$cds}->{phx}, ",", $gb->{$cds}->{gene}, ",", $gb->{$cds}->{product}, "\n"; } } close(FILE); } =head2 aaui Name: aaui - calculates various indices of amino acid usage Description: This program calculates amino acid usage indices for proteins (excluding formylmethionine), and inputs in the G instance. Informations stored in the G instance are: $gb->{$id}->{ndaa}: number of different amino acids $gb->{$id}->{Haau}: entropy of amino acid usage $gb->{$id}->{mmw}: mean molecular weight $gb->{$id}->{gravy}: mean hydropathic indices of each amino acid $gb->{$id}->{aroma}: relative frequency of aromatic amino acids $gb->{$id}->{acidic}: relative frequency of acidic amino acids $gb->{$id}->{basic}: relative frequency of basic amino acids $gb->{$id}->{neutral}: relative frequency of neutral amino acids $gb->{$id}->{polar}: relative frequency of polar amino acids $gb->{header_aaui} Usage: NULL = &aaui(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'aaui.csv') -id ID of a group of genes or a single gene (default: 'All') Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting References: Lobry and Gautier (1994) Nucleic Acids Res. 22:3174-3180. Zavala A et al. (2002) J Mol Evol. 54(5):563-8. =cut sub aaui { &opt_default(output=>'stdout', filename=>'aaui.csv', id=>'All'); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $id = opt_val("id"); my $aaseq; if($id =~ /FEATURE/){ # for a single gene
$aaseq = substr($gb->{$id}->{translation}, 1); # excluding formylMet
} else { # for a group of genes
foreach my $cds ($gb->cds()){ next if( $id ne 'All' && !($gb->{$cds}->{$id}) ); $aaseq .= substr($gb->{$cds}->{translation}, 1); } } my $Laa = $aaseq =~ tr[ACDEFGHIKLMNPQRSTVWY][ACDEFGHIKLMNPQRSTVWY]; if($aaseq){ my %count; $count{A} = $aaseq =~ tr/A/A/; $count{C} = $aaseq =~ tr/C/C/; $count{D} = $aaseq =~ tr/D/D/; $count{E} = $aaseq =~ tr/E/E/; $count{F} = $aaseq =~ tr/F/F/; $count{G} = $aaseq =~ tr/G/G/; $count{H} = $aaseq =~ tr/H/H/; $count{I} = $aaseq =~ tr/I/I/; $count{K} = $aaseq =~ tr/K/K/; $count{L} = $aaseq =~ tr/L/L/; $count{M} = $aaseq =~ tr/M/M/; $count{N} = $aaseq =~ tr/N/N/; $count{P} = $aaseq =~ tr/P/P/; $count{Q} = $aaseq =~ tr/Q/Q/; $count{R} = $aaseq =~ tr/R/R/; $count{S} = $aaseq =~ tr/S/S/; $count{T} = $aaseq =~ tr/T/T/; $count{V} = $aaseq =~ tr/V/V/; $count{W} = $aaseq =~ tr/W/W/; $count{Y} = $aaseq =~ tr/Y/Y/; my ($ndaa, $Haau, $mmw, $hydro); foreach (qw(A C D E F G H I K L M N P Q R S T V W Y)){ next if($count{$_} == 0); $ndaa ++; $Haau += - $count{$_}/$Laa * log($count{$_}/$Laa) / log(2);
$mmw += $count{$_} * &G::Seq::AminoAcid::mass($_); $hydro += $count{$_} * &G::Seq::AminoAcid::hydropathy($_); } $gb->{$id}->{Laa} = $Laa; $gb->{$id}->{ndaa} = $ndaa; $gb->{$id}->{Haau} = sprintf "%.4f", $Haau; $gb->{$id}->{mmw} = sprintf "%.2f", $mmw / $Laa;
$gb->{$id}->{gravy} = sprintf "%+.4f", $hydro / $Laa;
$gb->{$id}->{aroma} = sprintf "%.4f", ($count{F} + $count{Y} + $count{W}) / $Laa;
$gb->{$id}->{CMHFYW} = sprintf "%.4f", ($count{C} + $count{M} + $count{H} + $count{F} + $count{Y} + $count{W}) / $Laa; # these six residues have in common the property of being the preferential targets of reactive oxygen species and can act as sinks for radical fluxes through electron tranfer between amino acids in the protein
$gb->{$id}->{acidic} = sprintf "%.4f", ($count{D} + $count{E}) / $Laa; $gb->{$id}->{basic} = sprintf "%.4f", ($count{R} + $count{K} + $count{H}) / $Laa;
$gb->{$id}->{neutral} = sprintf "%.4f", ($count{N} + $count{Q} + $count{S} + $count{T} + $count{Y}) / $Laa;
$gb->{$id}->{polar} = $gb->{$id}->{acidic} + $gb->{$id}->{basic} + $gb->{$id}->{neutral}; #$gb->{$id}->{polar} = sprintf "%.4f", ($count{D} + $count{E} + $count{R} + $count{K} + $count{H} + $count{N} + $count{Q} + $count{S} + $count{T} + $count{Y}) / $Laa;
#$gb->{$id}->{nonpolar} = sprintf "%.4f", ($count{A} + $count{G} + $count{V} + $count{L} + $count{I} + $count{P} + $count{F} + $count{M} + $count{W} + $count{C}) / $Laa;
} my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}"; if($output eq 'stdout'){ unless($gb->{header_aaui}){ msg_send("\n\n", "Indices of amino acid usage (excluding formylmethionine)\n", " Laa: length in amino acids\n", " ndaa: number of different amino acids\n", " Haau: entropy of amino acid usage\n", " mmw: mean molecular weight\n", " gravy: mean hydropathic indices of each amino acid\n", "----------------------------------------------------------------------------\n", "Laa\tndaa\tHaau\tmmw\tgravy\taroma\tacidic\tbasic\tneutral\tgene\n", "----------------------------------------------------------------------------\n"); $gb->{header_aaui} = 1; } msg_send( $gb->{$id}->{Laa}, "\t", $gb->{$id}->{ndaa}, "\t", $gb->{$id}->{Haau}, "\t", $gb->{$id}->{mmw}, "\t", $gb->{$id}->{gravy}, "\t", $gb->{$id}->{aroma}, "\t", $gb->{$id}->{acidic}, "\t", $gb->{$id}->{basic}, "\t", $gb->{$id}->{neutral}, "\t", $gene, "\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); #open(FILE,">>$filename");
unless($gb->{header_aaui}){ print FILE "Indices of amino acid usage (excluding formylmethionine)\n", " Laa: length in amino acids\n", " ndaa: number of different amino acids\n", " Haau: entropy in amino acid usage\n", " mmw: mean molecular weight\n", " gravy: mean hydropathic indices of each amino acid\n", "Laa,ndaa,Haau,mmw,gravy,aroma,acidic,basic,neutral,polar,gene\n"; $gb->{header_aaui} = 1; } print FILE $gb->{$id}->{Laa}, ",", $gb->{$id}->{ndaa}, ",", $gb->{$id}->{Haau}, ",", $gb->{$id}->{mmw}, ",", $gb->{$id}->{gravy}, ",", $gb->{$id}->{aroma}, ",", $gb->{$id}->{acidic}, ",", $gb->{$id}->{basic}, ",", $gb->{$id}->{neutral}, ",", $gb->{$id}->{polar}, ",", $gene, "\n"; } close(FILE); } =head2 bui Name: bui - calculates base usage indices for protein-coding sequences Description: This program calculates base usage indices for protein-coding sequences (excluding start and stop codons), and inputs in the G instance. Informations stored in the G instance are: $gb->{$id}->{"acgt$p"}: total number of bases (A+T+G+C) $gb->{$id}->{"a_c$p"}: A content [A/(A+T+G+C)]
$gb->{$id}->{"c_c$p"}: C content [C/(A+T+G+C)] $gb->{$id}->{"g_c$p"}: G content [G/(A+T+G+C)]
$gb->{$id}->{"t_c$p"}: T content [T/(A+T+G+C)] $gb->{$id}->{"ryr$p"}: purine/pyrimidine ratio [(A+G)/(T+C)] $gb->{$id}->{"gac$p"}: G+A content [(G+A)/(A+T+G+C)]
$gb->{$id}->{"gcc$p"}: G+C content [(G+C)/(A+T+G+C)] $gb->{$id}->{"gtc$p"}: G+T content [(G+T)/(A+T+G+C)]
$gb->{$id}->{"gcs$p"}: GC skew [(G-C)/(G+C)] $gb->{$id}->{"tas$p"}: TA skew [(T-A)/(T+A)]
$gb->{$id}->{"Hbu$p"}: entropy of base usage
$gb->{header_bui}

Usage:
NULL = &bui(G instance);
Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'bui.csv') -id ID of a group of genes or a single gene (default: 'All') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') -position codon position (default: ''); '' when assessing overall base usage of the gene '1' when assessing base usage at 1st position of codons '2' when assessing base usage at 2nd position of codons '3' when assessing base usage at 3rd position of codons Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting =cut sub bui { &opt_default(output=>'stdout',filename=>'bui.csv',id=>'All',translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]',position=>''); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $id = opt_val("id"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); my $p = opt_val("position"); my $seq; my $aaseq; my $codon; my $i; if($id =~ /FEATURE/){ # for a single gene
$seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); # excluding start and stop codons
if($translate){ $aaseq = &translate($seq); } else { $aaseq = substr($gb->{$id}->{translation}, 1); } my $partseq; $i = 0; while($aaseq){ my $aa = substr($aaseq, 0, 1); $partseq .= $p ? substr($seq, $i * 3 + $p - 1, 1) : substr($seq, $i * 3, 3) if($aa !~ /$del_key/); substr($aaseq, 0, 1) = ''; $i ++; } $seq = $partseq; } else { # for a group of genes
my $cumseq; foreach my $cds ($gb->cds()){ next if( $id ne 'All' && !($gb->{$cds}->{$id}) ); $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); if($translate){ $aaseq = &translate($seq); } else { $aaseq = substr($gb->{$cds}->{translation}, 1); } my $partseq; $i = 0; while($aaseq){ my $aa = substr($aaseq, 0, 1); $partseq .= $p ? substr($seq, $i * 3 + $p - 1, 1) : substr($seq, $i * 3, 3) if($aa !~ /$del_key/); substr($aaseq, 0, 1) = ''; $i ++; } #print length($cumseq), " ";
$cumseq .= $partseq; } $seq = $cumseq; } my ($a, $c, $g, $t, $Hbu); if($seq){ $a = $seq =~ tr/a/a/; $c = $seq =~ tr/c/c/; $g = $seq =~ tr/g/g/; $t = $seq =~ tr/t/t/; $gb->{$id}->{"acgt$p"} = $a+$c+$g+$t; $gb->{$id}->{"a_c$p"} = sprintf "%.4f", $a / ($a+$c+$g+$t);
$gb->{$id}->{"c_c$p"} = sprintf "%.4f", $c / ($a+$c+$g+$t);
$gb->{$id}->{"g_c$p"} = sprintf "%.4f", $g / ($a+$c+$g+$t);
$gb->{$id}->{"t_c$p"} = sprintf "%.4f", $t / ($a+$c+$g+$t);
if($t+$c){ $gb->{$id}->{"ryr$p"} = sprintf "%.4f", ($a+$g) / ($t+$c); }
else { &msg_error("t + c = 0 in $id\n"); }

if($g+$a){ $gb->{$id}->{"gas$p"} = sprintf "%+.4f", ($g-$a)/
($g+$a); } else { &msg_error("g + a = 0 in $id\n"); } if($c+$t){ $gb->{$id}->{"cts$p"} = sprintf "%+.4f", ($c-$t)/($c+$t); }
else { &msg_error("c + t = 0 in $id\n"); }

if($g+$c){ $gb->{$id}->{"gcs$p"} = sprintf "%+.4f", ($g-$c)/
($g+$c); } else { &msg_error("g + c = 0 in $id\n"); } if($t+$a){ $gb->{$id}->{"tas$p"} = sprintf "%+.4f", ($t-$a)/($t+$a); }
else { &msg_error("t + a = 0 in $id\n"); }

if($g+$t){ $gb->{$id}->{"gts$p"} = sprintf "%+.4f", ($g-$t)/
($g+$t); } else { &msg_error("g + t = 0 in $id\n"); } if($a+$c){ $gb->{$id}->{"acs$p"} = sprintf "%+.4f", ($a-$c)/($a+$c); }
else { &msg_error("a + c = 0 in $id\n"); }

foreach($a,$c,$g,$t){ $Hbu -= $_/
($a+$c+$g+$t) * log($_/($a+$c+$g+$t)) / log(2) if($_); } $gb->{$id}->{"Hbu$p"} = sprintf "%.4f", $Hbu; my $gc = ($g+$c) / ($a+$c+$g+$t);
my $gt = ($g+$t) / ($a+$c+$g+$t);
$gb->{$id}->{"gac$p"} = sprintf "%.4f", ($g+$a) / ($a+$c+$g+$t);
$gb->{$id}->{"gcc$p"} = sprintf "%.4f", $gc; $gb->{$id}->{"gtc$p"} = sprintf "%.4f", $gt; $gb->{$id}->{"Hgc$p"} = sprintf "%.4f", - ($gc * log($gc) + (1-$gc) * log(1-$gc)) / log(2) if(0 < $gc && $gc < 1);
$gb->{$id}->{"Hgt$p"} = sprintf "%.4f", - ($gt * log($gt) + (1-$gt) * log(1-$gt)) / log(2) if(0 < $gt && $gt < 1);
} my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}"; if($output eq 'stdout'){ unless($gb->{header_bui}){ msg_send("\n\n", "Base usage indices (excluding start and stop codons)\n", " ACGT: A$p + T$p + G$p + C$p\n", " GAc: (G$p + A$p)/(A$p + T$p + G$p + C$p)\n", " GCc: (G$p + C$p)/(A$p + T$p + G$p + C$p)\n", " GTc: (G$p + T$p)/(A$p + T$p + G$p + C$p)\n", " GCs: (G$p - C$p)/(G$p + C$p)\n", " TAs: (T$p - A$p)/(T$p + A$p)\n", " Hbu: entropy of base usage\n", "-------------------------------------------------------------\n", "start\tend\tACGT\tGAc\tGCc\tGTc\tGCs\tTAs\tHbu\tgene\n", "-------------------------------------------------------------\n"); $gb->{header_bui} = 1; } msg_send($gb->{$id}->{start}, "\t", $gb->{$id}->{end}, "\t", $gb->{$id}->{"acgt$p"}, "\t", $gb->{$id}->{"gac$p"}, "\t", $gb->{$id}->{"gcc$p"}, "\t", $gb->{$id}->{"gtc$p"}, "\t", $gb->{$id}->{"gcs$p"}, "\t", $gb->{$id}->{"tas$p"}, "\t", $gb->{$id}->{"Hbu$p"}, "\t", $gene, "\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); #open(FILE,">>$filename");
unless($gb->{header_bui}){ print FILE "start,end,ACGT,GAc,GCc,GTc,GCs,TAs,Hbu,gene\n"; $gb->{header_bui} = 1; } print FILE $gb->{$id}->{start}, ",", $gb->{$id}->{end}, ",", $gb->{$id}->{"acgt$p"}, ",", $gb->{$id}->{"gac$p"}, ",", $gb->{$id}->{"gcc$p"}, ",", $gb->{$id}->{"gtc$p"}, ",", $gb->{$id}->{"gcs$p"}, ",", $gb->{$id}->{"tas$p"}, ",", $gb->{$id}->{"Hbu$p"}, ",", $gene, "\n"; close(FILE); } } =head2 dinuc Name: dinuc - calculates dinucleotide usage Description: This program calculates dinucleotide usage indices for protein-coding sequences (excluding start and stop codons), and inputs in the G instance. e.g. CG dinucleotide usage at the reading frame 1-2 will be accessible at $gb->{"FEATURE$i"}->{cg12} Dinucleotide usage was computed as the ratio of observed (O) to expected (E) dinucleotide frequencies. Informations stored in the G instance are: $gb->{header_dinuc} Usage: NULL = &dinuc(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'dinuc.csv') -id ID of a group of genes or a single gene (default: 'All') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') -position codon position or reading frame (default: ''); '' when assessing all codon positions '12' when assessing the reading frame 1-2 '23' when assessing the reading frame 2-3 '31' when assessing the reading frame 3-1 Author: Haruo Suzuki (haruo@g-language.org) History: 20030401 initial posting References: Yew et al. (2004) Virus Genes 28:1,41-53. =cut sub dinuc { &opt_default(output=>'stdout',filename=>'dinuc.csv',id=>'All',position=>''); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $id = opt_val("id"); my $rf = opt_val("position"); #if($rf==1){ $rf=12; } elsif($rf==2){ $rf=23; } elsif($rf==3){ $rf=31; }
my $p = ($rf) ? substr($rf, 0, 1) : 1; my $q = ($rf) ? substr($rf, 1, 1) : 1; my $end = ($rf =~ /12|23/) ? 0 : 1; my %usg; my $seq; if($rf){ if($id =~ /FEATURE/){ # for a single gene
$seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); # excluding start and stop codons
for(my $i; $i * 3 <= length($seq) - 3 - $end; $i ++){ $usg{"mono$p"}->{substr($seq, $i * 3 + $p - 1, 1)} ++; $usg{"mono$q"}->{substr($seq, $i * 3 + $q - 1, 1)} ++; $usg{diobs}->{substr($seq, $i * 3 + $p - 1, 2).$rf} ++; } } else { # for a group of genes
foreach my $cds ($gb->cds()){ next if( $id ne 'All' && !($gb->{$cds}->{$id}) ); $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); for(my $i; $i * 3 <= length($seq) - 3 - $end; $i ++){ $usg{"mono$p"}->{substr($seq, $i * 3 + $p - 1, 1)} ++; $usg{"mono$q"}->{substr($seq, $i * 3 + $q - 1, 1)} ++; $usg{diobs}->{substr($seq, $i * 3 + $p - 1, 2).$rf} ++; } } } } else { if($id =~ /FEATURE/){ # for a single gene
$seq = substr(lc($gb->get_geneseq($id)), $gb->{$id}->{codon_start} - 1 + 3, -3); # excluding start and stop codons
for(my $i; $i <= length($seq) - 1 - $end; $i ++){ $usg{mono}->{substr($seq, $i + $p - 1, 1)} ++; $usg{diobs}->{substr($seq, $i + $p - 1, 2)} ++; } } else { # for a group of genes
foreach my $cds ($gb->cds()){ next if( $id ne 'All' && !($gb->{$cds}->{$id}) ); $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); for(my $i; $i <= length($seq) - 1 - $end; $i ++){ $usg{mono}->{substr($seq, $i + $p - 1, 1)} ++; $usg{diobs}->{substr($seq, $i + $p - 1, 2)} ++; } } } } my $total; foreach my $key (keys %usg){ foreach (keys %{$usg{$key}}){ $total += $usg{$key}->{$_}; } foreach (keys %{$usg{$key}}){ $usg{$key}->{$_} = $usg{$key}->{$_} / $total; }
$total = 0;
} my @word = ("aa$rf","ac$rf","ag$rf","at$rf","ca$rf","cc$rf","cg$rf","ct$rf","ga$rf","gc$rf","gg$rf","gt$rf","ta$rf","tc$rf","tg$rf","tt$rf"); foreach (@word){ if($rf){ $usg{diexp}->{$_} = $usg{"mono$p"}->{substr($_,0,1)} * $usg{"mono$q"}->{substr($_,1,1)}; } else { $usg{diexp}->{$_} = $usg{mono}->{substr($_,0,1)} * $usg{mono}->{substr($_,1,1)}; } } my %dioe; foreach (@word){ if($usg{diexp}->{$_}){ $dioe{$_} = sprintf("%.3f", $usg{diobs}->{$_} / $usg{diexp}->{$_});
$gb->{$id}->{$_} = $dioe{$_}; # input in the G instance
} } $gb->{$id}->{"dinuc$rf"} =\% dioe; # input in the G instance
my $gene = ($id =~ /FEATURE/) ? $gb->{$id}->{gene} : "{$id}"; if($output =~ /stdout/){ unless($gb->{"header_dinuc$p"}){ msg_send("\n\n"); foreach(@word){ msg_send("$_\t"); } msg_send("#\tgene"); $gb->{"header_dinuc$p"} = 1; } msg_send("\n\n"); foreach(@word){ msg_send("$dioe{$_}\t"); } msg_send(scalar keys %dioe, "\t$gene"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); #open(FILE,">>$filename");
unless($gb->{"header_dinuc$p"}){ print FILE "\nkeys,"; foreach(@word){ print FILE "$_,"; } print FILE "gene,"; $gb->{"header_dinuc$p"} = 1; } print FILE "\n$id,"; foreach(@word){ print FILE (exists $dioe{$_}) ? "$dioe{$_}," : "0,"; } print FILE $gb->{$id}->{gene}; close(FILE); } } =head2 Ew Name: Ew - calculate a measure of synonymous codon usage evenness (Ew) Description: This program calculates the 'weighted sum of relative entropy' (Ew) as a measure of synonymous codon usage evenness for each gene, and inputs in the G instance; i.e. Ew values will be accessible at $gb->{"FEATURE$i"}->{Ew}; Usage: NULL = &Ew(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'Ew.csv') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') Author: Haruo Suzuki (haruo@g-language.org) History: 20070701 initial posting References: Suzuki H et al. (2004) Gene. 23;335:19-23. Suzuki H et al. (2007) http://www.hindawi.com/GetArticle.aspx?doi=10.1155/2007/61374

Requirements: sub codon_compiler;
=cut sub Ew { &opt_default(output=>'stdout', filename=>'Ew.csv', translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]'); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); if($output eq 'stdout'){ msg_send("\n\nThe 'weighted sum of relative entropy' (Ew)\n", "Ew\tgene\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); print FILE "Ew,gene\n"; } foreach my $cds ($gb->cds()){ my %Hi; # entropy for each amino acid
my %Ei; # relative entropy for each amino acid
my $Ew; # weighted sum of relative entropy (Ew)
my $aacu = &codon_compiler($gb, -output=>'n', -data=>'A1C2', -id=>$cds, -translate=>$translate, -del_key=>$del_key); foreach (keys %{$aacu}){ if(length($_) == 4){ $Hi{substr($_, 0, 1)} += - $aacu->{$_} * log($aacu->{$_}) / log(2); }
}
foreach (keys
%Hi){ $Ei{$_} = $Hi{$_} / ( log($gb->{degeneracy}->{$_}) / log(2) ) if($gb->{degeneracy}->{$_} > 1); }
foreach (keys
%Hi){
if(
$Ei{$_} > 1){ $Ew = -0.1; }
else {
$Ew += $Ei{$_} * $aacu->{$_}; }
}
$gb->{$cds}->{Ew} = sprintf "%.4f", $Ew; # input in the G instance

if($output eq 'stdout'){
msg_send($gb->{$cds}->{Ew}, "\t",
$gb->{$cds}->{gene}, "\n");
} if($output eq 'f'){ print FILE $gb->{$cds}->{Ew}, ",", $gb->{$cds}->{gene}, "\n"; } } close(FILE); } =head2 P2 Name: P2 - calculate the P2 index of each gene Description: This program calculates the P2 index for each gene. This index describes the proportion of codons conforming to the intermediate strength of codon-anticodon interaction energy rule of Grosjean and Fiers: P2 = (WWC+SSU)/(WWY+SSY)
where W = A or U, S = C or G, and Y = C or U.
P2 values will be accessible at
$gb->{"FEATURE$i"}->{P2};
Usage: NULL = &P2(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'P2.csv') Author: Haruo Suzuki (haruo@g-language.org) History: 20070701 initial posting References: Gouy M, Gautier C. (1982) Nucleic Acids Res. 10(22):7055-74. Requirements: sub codon_compiler(); =cut sub P2 { &opt_default(output=>'stdout', filename=>'P2.csv'); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); if($output eq 'stdout'){ msg_send("\n\nThe P2 index\n", "P2\tgene\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(FILE,">>data/$filename"); print FILE "P2,gene\n"; } my ($WWC, $SST, $WWY, $SSY); foreach my $cds ($gb->cds()){ $WWC = $SST = $WWY = $SSY = 0; my $seq = substr(lc($gb->get_geneseq($cds)), $gb->{$cds}->{codon_start} - 1 + 3, -3); # excluding start and stop codons
for(my $i = 0; $i * 3 <= length($seq) - 3; $i ++){ my $codon = substr($seq, $i * 3, 3); $WWC ++ if($codon =~ /[at][at]c/); $SST ++ if($codon =~ /[cg][cg]t/); $WWY ++ if($codon =~ /[at][at][ct]/); $SSY ++ if($codon =~ /[cg][cg][ct]/); } $gb->{$cds}->{P2} = sprintf "%.4f", ($WWC+$SST)/($WWY+$SSY) if($WWY+$SSY); # input in the G instance

if(
$output eq 'stdout'){
msg_send(
$gb->{$cds}->{P2}, "\t",
$gb->{$cds}->{gene}, "\n");
} if($output eq 'f'){ print FILE $gb->{$cds}->{P2}, ",", $gb->{$cds}->{gene}, "\n"; } } close(FILE); } =head2 codon_mva Name: codon_mva - perform multivariate analyses of codon usage data Description: This program performs multivariate analyses on codon usage data, and analyzes correlations between the axes and various gene parameters. Usage: NULL = &codon_mva(G instance); Options: -output output option (default: 'show') 'stdout' for standard output, 'f' for file output in directory "data", 'g' for graph output in directory "graph", 'show' for graph output and display -filename output filename (default: 'codon_mva.csv') -translate 1 when translates using standard codon table (default: 0) -del_key regular expression to delete key (i.e. amino acids and nucleotides) (default: '[^ACDEFGHIKLMNPQRSTVWYacgtU]') -data data matrix of amino acid and codon usage (default: 'R4') -naxis number of axes (default: '4') -method multivariate analysis method (default: 'pca') 'pca' for principal component analysis 'coa' for correspondence analysis 'wca' for within-group correspondence analysis -parameter pointer ARRAY of list of gene parameters (default: ['Laa','mmw','gravy','aroma','P2','gcc3','gcs3'])) Author: Haruo Suzuki (haruo@g-language.org) History: 20070101 initial posting References: Suzuki et al. (2005) FEBS Lett. 579(28):6499-6504. Charif et al. (2005) Bioinformatics 21(4):545-547. Requirements: sub codon_compiler; sub aaui; sub bui; sub dinuc; =cut sub codon_mva { &opt_default(output=>'show',filename=>'codon_mva.csv',translate=>0, del_key=>'[^ACDEFGHIKLMNPQRSTVWYacgtU]',data=>'R4', naxis=>'4',method=>'pca',parameter=>['Laa','mmw','gravy','aroma','P2']); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $output = opt_val("output"); my $filename = opt_val("filename"); my $translate = opt_val("translate"); my $del_key = opt_val("del_key"); my $data = opt_val("data"); my $naxis = opt_val("naxis"); my $method = opt_val("method"); my $parameter = opt_val("parameter"); my %calc; foreach(@$parameter){ if($_ =~ /mmw|gravy|aroma|acidic|basic|neutral|polar|Haau/){ $calc{aaui} ++; } elsif($_ =~ /E_g/){ $calc{phx} ++; } elsif($_ =~ /enc|cbi|icdi|Ew|P2|fop|cai/){ $calc{$_} ++; } } if($data eq 'AA'){ $data = 'A0'; } elsif($data eq 'RAAU'){ $data = 'A1'; } elsif($data eq 'AF'){ $data = 'R0'; } elsif($data eq 'RF'){ $data = 'R2'; } elsif($data eq 'RSCU'){ $data = 'R3'; } elsif($data eq 'W'){ $data = 'R4'; } # my @pp = ('','1','2','3');
my @pp = ('3'); foreach (@pp){ $parameter = [@$parameter,"gcc$_","gtc$_"]; } &Ew($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key) if($calc{Ew}); &P2($gb, -output=>'n') if($calc{P2}); &fop($gb, -output=>'n') if($calc{fop}); &cai($gb, -output=>'n', -w_output=>'n') if($calc{cai}); &phx($gb, -output=>'n') if($calc{phx}); my $inHigh = 'ribosomal.*protein|elongation.*factor'; my $exHigh = 'modification|alanine|serine|transferase|synthase|acetylase|acetylating enzyme|methylase|precursor|regulat|initiates|RNA|DNA|transcription|selenocysteine-specific'; my $inRp = 'ribosomal.*protein'; my $exRp = 'modification|alanine|serine|transferase|synthase|acetylase|acetylating enzyme|methylase|precursor|regulat|initiates|RNA'; # 'ase|[cl]ation' # synth(et)?ase
my $cds; foreach $cds ($gb->cds()){ $gb->{$cds}->{High} = (($gb->{$cds}->{product} =~ /$inHigh/i && $gb->{$cds}->{product} !~ /$exHigh/i || $gb->{$cds}->{note} =~ /$inHigh/i && $gb->{$cds}->{note} !~ /$exHigh/i ) ? 1 : 0 ); $gb->{$cds}->{Rp} = (($gb->{$cds}->{product} =~ /$inRp/i && $gb->{$cds}->{product} !~ /$exRp/i || $gb->{$cds}->{note} =~ /$inRp/i && $gb->{$cds}->{note} !~ /$exRp/i ) ? 1 : 0 ); #my $tmp = $gb->{$cds}->{note}.$gb->{$cds}->{product};
my $tmp = $gb->{$cds}->{product}; $gb->{$cds}->{High} = ( ( $tmp =~ /$inHigh/i && $tmp !~ /$exHigh/i) ? 1 : 0); $gb->{$cds}->{Rp} = ( ( $tmp =~ /$inRp/i && $tmp !~ /$exRp/i) ? 1 : 0); $gb->{$cds}->{High} = $gb->{$cds}->{Rp}; &codon_compiler($gb, -output=>'n', -translate=>$translate, -del_key=>$del_key, -data=>$data, -id=>$cds) unless(exists $gb->{$cds}->{$data}); $gb->{$cds}->{Laa} = length($gb->{$cds}->{translation}) - 1; $gb->{$cds}->{enc} = &enc($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{enc}); $gb->{$cds}->{cbi} = &cbi($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{cbi}); $gb->{$cds}->{icdi} = &icdi($gb, -translate=>$translate, -del_key=>$del_key, -id=>$cds) if($calc{icdi}); &aaui($gb, -output=>'n', -id=>$cds) if($calc{aaui}); foreach (@pp){ &bui($gb, -output=>'n', -position=>$_, -del_key=>$del_key, -id=>$cds); } foreach(@$parameter){ $gb->{$cds}->{on} = 0 unless(exists $gb->{$cds}->{$_}); } } $parameter = [@$parameter, 'High']; open(OUT, ">dat$data.csv"); my @allkey = sort keys %{$gb->{All}->{$data}}; foreach(@allkey){ print OUT "$_,"; } foreach(@$parameter){ print OUT "$_,"; } print OUT "\n"; foreach $cds ($gb->cds()){ foreach(@allkey){ print OUT (exists $gb->{$cds}->{$data}->{$_}) ? "$gb->{$cds}->{$data}->{$_}," : "0,"; } foreach(@$parameter){ print OUT "$gb->{$cds}->{$_},"; } #foreach(@$parameter){ print OUT (exists $gb->{$cds}->{$_}) ? "$gb->{$cds}->{$_}," : "0,"; }
print OUT "\n"; } close(OUT); my $ndc = scalar @allkey; my $rcmd= new Rcmd; my @result = $rcmd->exec( qq//
output = "
$output"
naxis =
$naxis
ndc =
$ndc
method = "
$method"
dat = read.csv("dat
$data.csv")
dat = dat[,-ncol(dat)]
usage = as.matrix(dat[,1:ndc])
index = dat[,(ndc+1):(ncol(dat)-1)] # -c("High")
########## haruo
rda = as.dist(1 - cor(t(usage))); eda = dist(usage)
rdh = edh = NA
if(sum(dat[,"High"]) > 1){ usg.h = usage[dat[,"High"] == 1,]; rdh = as.dist(1 - cor(t(usg.h))); edh = dist(usg.h) }
# omits columns where SD is zero
omit = c()
for(i in 1:ncol(usage)) if(sd(usage[,i]) == 0) omit = c(omit, i)
if(length(omit)) usage = usage[,-omit]
# multivariate analysis
if(method == "coa"){ # correspondence analysis
library(ade4);
coa = dudi.coa(as.data.frame(usage), scannf = FALSE, nf = min(dim(usage)));
score = coa\
$li[,1:naxis]; # \$
contribution = (100*coa\$eig\/sum(coa\$eig))[1:naxis]; # \$ \/
} else if(method == "wca"){ # within-group correspondence analysis
library(ade4);
coa = dudi.coa(as.data.frame(t(usage)), scannf = FALSE, nf = min(dim(usage)));
facaa = as.factor(substring(rownames(coa\$tab),1,1)); # \$
coa = within(coa, fac = facaa, scannf = FALSE, nf=min(dim(usage)));
score = coa\$co[,1:naxis]; # \$
contribution = (100*coa\$eig\/sum(coa\$eig))[1:naxis]; # \$ \/
} else if(method == "pca"){ # principal component analysis
pr = prcomp(usage, center = T, scale = F);
score = pr\$x[,1:naxis]; # \$
contribution = 100*summary(pr)\$importance[2,][1:naxis]; # \$
factorloading = cor(usage, score)[,1:naxis];
}
#
z.score = apply(score, 2, scale)
zz = rep(NA, naxis); if(sum(dat[,"High"]) > 1) zz = round(abs(apply(z.score[dat[,"High"]==1,], 2, mean)), digits=4)
rr = cor(index, z.score, method="pearson") # pearson spearman
# gene parameter with highest r^2 value
key = rownames(rr)[ apply(rr^2,2,order)[nrow(rr),] ]
# output
if(output == "show"){
postscript("codon_mva.ps", horizontal=FALSE, width=10, height=10, pointsize=15)
par(mfrow=c(ceiling(sqrt(naxis)), ceiling(sqrt(naxis))))
for(y in 1:naxis){
x = key[y]
r = round(rr[x,y], digits=4)
if(sum(dat[,"High"]) > 1){
plot(dat[,x],z.score[,y],xlab=x, ylab=paste(colnames(z.score)[y]," (",round(contribution[y],digits=1),"% of variance)"), type="n", main=paste("z =", zz[y], "\nr =",r))
text(dat[dat[,"High"]==0,x],z.score[dat[,"High"]==0,y],label="+")
text(dat[dat[,"High"]==1,x],z.score[dat[,"High"]==1,y],label="o",col="red")
} else plot(dat[,x],z.score[,y],xlab=x, ylab=paste(colnames(z.score)[y]," (",round(contribution[y],digits=1),"%) of variance"), main=paste("r =",r))
}
} else if(output == "f"){
out = round(rbind(contribution, factorloading, rr), digits=4);
write.csv(out, file = "$filename", quote = F);
}
########## haruo
out = round(rbind(contribution,abs(rr)), digits=4)
val = round(apply(abs(rr), 2, max), digits=4)
gpd = key # gene parameter detected
gpd[val < 0.7] = "nd"; zc = qnorm(1-0.01)
gfd = gpd # gene feature detected (gpd + Replication + Expression)
if(sum(dat[,"High"]) > 1) gfd[zz > zc & gfd != "nd"] = paste(gfd[zz > zc & gfd != "nd"], "(Expression)")
gfd[zz > zc & gfd == "nd"] = "Expression"
res = gfd
res[substr(res,nchar(res),nchar(res)) == ")"] = "two"
write.csv(rbind(gpd,gfd,res,out,key,val,zz), file = "haruo$data.csv", quote = F)
# returned value
as.vector(c(mean(rda),mean(eda),mean(rdh),mean(edh), contribution, z.score, rr))
/
);
shift @result if($result[0] eq ''); $gb->{rdam} = sprintf "%.4f", shift @result; $gb->{edam} = sprintf "%.4f", shift @result; $gb->{rdhm} = sprintf "%.4f", shift @result; $gb->{edhm} = sprintf "%.4f", shift @result; for(1..$naxis){ $gb->{"Axis$_"}->{contribution} = sprintf "%.3f", shift @result; } for(1..$naxis){ foreach $cds ($gb->cds()){ $gb->{$cds}->{"Axis$_"} = sprintf "%+.3f", shift @result; } } for(1..$naxis){ foreach my $key (@$parameter){ next if($key =~ /High/); $gb->{"Axis$_"}->{$key} = sprintf "%+.3f", shift @result; } } msg_gimv("codon_mva.ps") if($output =~ /g|show/); if($output =~ /stdout|show/){ msg_send("\n\nMultivariate analysis of codon usage data $data\n"); msg_send("\nContribution of each axis\n\t"); for(1..$naxis){ msg_send("Axis$_\t"); } msg_send("\n\t"); for(1..$naxis){ msg_send($gb->{"Axis$_"}->{contribution}, "\t"); } msg_send("\n\nCorrelations between each axis and each gene parameter\n\t"); for(1..$naxis){ msg_send("Axis$_\t"); } foreach my $key (@$parameter){ next if($key =~ /High/); msg_send("\n$key\t"); for(1..$naxis){ msg_send($gb->{"Axis$_"}->{$key}, "\t"); } } } } #__END__
=head2 codon_counter Name: codon_counter - count codons Description: This program calculates codon counts (absolute codon frequencies). Usage: (pointer HASH) = &codon_counter(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'codon.csv') -CDSid CDS ID number (default: '') Author: Koya Mori (mory@g-language.org) History: 20091203 deprecated 20010721 initial posting =cut sub codon_counter { warn('This method is deprecated. Use &codon_compiler(@_, -data=>'C0') instead.'); return &codon_compiler(@_, -data=>'C0'); } =head2 amino_counter Name: amino_counter - count amino acids Description: This program calculates absolute amino acid frequencies. Usage: (pointer HASH) = &amino_counter(G instance); Options: -output output option (default: 'stdout') 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'amino.csv') -CDSid CDS ID number (default: '') Author: Koya Mori (mory@g-language.org) History: 20091203 deprecated 20010721 initial posting =cut sub amino_counter { warn('This method is deprecated. Use &codon_compiler(@_, -data=>'A0') instead.'); return &codon_compiler(@_, -data=>'A0'); } =head2 codon_usage Name: codon_usage - calculates codon usage Description: This program calculates codon usage. Usage: (pointer HASH) = &codon_usage(G instance); (pointer HASH) = &codon_usage('tRNAscan-SE_output_file'); Options: -output output option (default: 'show') 'g' for graph, 'show' for showing the graph, 'stdout' for standard output, 'f' for file output in directory "data" -filename output filename (default: 'codon_usage.png') -CDSid CDS ID number (default: '') Author: Koya Mori (mory@g-language.org) History: 20090908-01 now accepts tRNAscan-SE output as first argument 20090312-01 fixed bug in filename option 20010721-01 initial posting =cut #sub codon_usage {
# warn('This method is deprecated. Use &codon_compiler(@_, -data=>'C2') instead.');
# return &codon_compiler(@_, -data=>'C2');
#}
sub codon_usage{ &opt_default(CDSid=>"",filename=>'codon_usage.png',output=>'show',version=>1); my @args=opt_get(@_); my $gb=shift @args; my $key=opt_val("CDSid"); my $filename=opt_val("filename"); my $output=opt_val("output"); my $version = opt_val("version"); my $codon; my $amino; my %result; my %usage; my %option; if(-e $gb){ require G::Seq::AminoAcid; foreach my $line (readFile($gb, 1)){ my @tmp = split(/\s+/, $line); next unless($tmp[1] =~ /^\d+$/); my $code = G::Seq::AminoAcid::three2one($tmp[4]); $code = 'Pseudo' unless length($code); $usage{$code}{G::Seq::Primitive::complement(lc($tmp[5]))} ++; } $usage{'/'}{taa} = 0.0001; $usage{'/'}{tga} = 0.0001; $usage{'/'}{tag} = 0.0001; $option{'-display'} = 'number'; }elsif(length $key){ %usage=codon_amino_counter($gb,$key); }else{ foreach($gb->cds()){ %result=(); %result=codon_amino_counter($gb,$_); foreach $amino (keys(%result)){ foreach $codon (keys(%{$result{$amino}})){ $usage{$amino}{$codon}+=$result{$amino}{$codon}; } } } } if($output eq "f"){ _codon_usage_printer(\%usage,-output=>"f",-filename=>"$filename"); } if($output!~/[fn]/){ _codon_usage_printer(\%usage,-output=>"stdout"); } if ($version == 2){ _codon_table2(\%usage,-output=>$output, -filename=>$filename) if($output=~/(g|show)/ && $key eq ""); }else{ _codon_table(\%usage,-output=>$output, -filename=>$filename, %option) if($output=~/(g|show)/ && $key eq ""); } return\% usage; } sub _codon_table{ &opt_default(output=>"show",filename=>"codon_table.png",display=>"percent"); my @args=opt_get(@_); my $result=shift @args; my $filename=opt_val("filename"); my $output=opt_val("output"); my $display=opt_val("display"); my ($x, $y, %amino, %data, %per, $amino_total, $codon, $amino, $v, $h); my %exception; my $CoDoN; my %color; my $im = new GD::Image(500,550); my $white = $im->colorAllocate(255,255,255); my $black = $im->colorAllocate(0,0,0); my $red = $im->colorAllocate(255,0,0); my $yellow = $im->colorAllocate(200,200,0); my $green = $im->colorAllocate(0,150,0); my $blue = $im->colorAllocate(0,0,255); $color{$_} = $yellow for (qw/D E/); $color{$_} = $red for (qw/R K H/); $color{$_} = $blue for (qw/N Q S T Y/); $color{$_} = $green for (qw/A G V L I P F M W C/); $color{'/'}=$black; foreach((10,50,450,490)){ $x=$_; for($y=10;$y<450;$y++){ $im->setPixel($x,$y,$black); } } foreach((150,250,350)){ $x=$_; for($y=30;$y<450;$y++){ $im->setPixel($x,$y,$black); } } $y=30; for($x=50;$x<450;$x++){ $im->setPixel($x,$y,$black); } foreach((10,50,150,250,350,450)){ $y=$_; for($x=10;$x<490;$x++){ $im->setPixel($x,$y,$black); } } $im->string(gdSmallFont,15,25,"first",$red); $im->string(gdSmallFont,233,15,"second",$green); $im->string(gdSmallFont,455,25,"third",$blue); $im->string(gdSmallFont,30,95,"T",$red); $im->string(gdSmallFont,30,195,"C",$red); $im->string(gdSmallFont,30,295,"A",$red); $im->string(gdSmallFont,30,395,"G",$red); $im->string(gdSmallFont,100,35,"T",$green); $im->string(gdSmallFont,200,35,"C",$green); $im->string(gdSmallFont,300,35,"A",$green); $im->string(gdSmallFont,400,35,"G",$green); $im->string(gdSmallFont,470,65,"T",$blue); $im->string(gdSmallFont,470,85,"C",$blue); $im->string(gdSmallFont,470,105,"A",$blue); $im->string(gdSmallFont,470,125,"G",$blue); $im->string(gdSmallFont,470,165,"T",$blue); $im->string(gdSmallFont,470,185,"C",$blue); $im->string(gdSmallFont,470,205,"A",$blue); $im->string(gdSmallFont,470,225,"G",$blue); $im->string(gdSmallFont,470,265,"T",$blue); $im->string(gdSmallFont,470,285,"C",$blue); $im->string(gdSmallFont,470,305,"A",$blue); $im->string(gdSmallFont,470,325,"G",$blue); $im->string(gdSmallFont,470,365,"T",$blue); $im->string(gdSmallFont,470,385,"C",$blue); $im->string(gdSmallFont,470,405,"A",$blue); $im->string(gdSmallFont,470,425,"G",$blue); foreach $amino (keys(%{$result})){ $amino_total=0; foreach $codon (keys(%{$$result{$amino}})){ $amino_total+=$$result{$amino}{$codon}; } foreach $codon (keys(%{$$result{$amino}})){ if($$result{$amino}{$codon} > $data{$codon}){ if($data{$codon}!=""){ $exception{$codon}{amino}=$amino{$codon}; $exception{$codon}{per}=$per{$codon}; } $data{$codon}=$$result{$amino}{$codon}; $amino{$codon}=$amino; if($display eq 'percent'){ $per{$codon}=sprintf("%.3f",$$result{$amino}{$codon}/$amino_total);
}else{ $per{$codon}=sprintf("%s",$$result{$amino}{$codon}); } } else{ $exception{$codon}{amino}=$amino; if($display eq 'percent'){ $exception{$codon}{per}=sprintf("%.3f",$$result{$amino}{$codon}/$amino_total);
}else{ $exception{$codon}{per}=sprintf("%s",$$result{$amino}{$codon}); } } } } $im->string(gdSmallFont,60,65,"TTT $amino{ttt} $per{ttt}",$color{$amino{ttt}}); $im->string(gdSmallFont,60,85,"TTC $amino{ttc} $per{ttc}",$color{$amino{ttc}}); $im->string(gdSmallFont,60,105,"TTA $amino{tta} $per{tta}",$color{$amino{tta}}); $im->string(gdSmallFont,60,125,"TTG $amino{ttg} $per{ttg}",$color{$amino{ttg}}); $im->string(gdSmallFont,60,165,"CTT $amino{ctt} $per{ctt}",$color{$amino{ctt}}); $im->string(gdSmallFont,60,185,"CTC $amino{ctc} $per{ctc}",$color{$amino{ctc}}); $im->string(gdSmallFont,60,205,"CTA $amino{cta} $per{cta}",$color{$amino{cta}}); $im->string(gdSmallFont,60,225,"CTG $amino{ctg} $per{ctg}",$color{$amino{ctg}}); $im->string(gdSmallFont,60,265,"ATT $amino{att} $per{att}",$color{$amino{att}}); $im->string(gdSmallFont,60,285,"ATC $amino{atc} $per{atc}",$color{$amino{atc}}); $im->string(gdSmallFont,60,305,"ATA $amino{ata} $per{ata}",$color{$amino{ata}}); $im->string(gdSmallFont,60,325,"ATG $amino{atg} $per{atg}",$color{$amino{atg}}); $im->string(gdSmallFont,60,365,"GTT $amino{gtt} $per{gtt}",$color{$amino{gtt}}); $im->string(gdSmallFont,60,385,"GTC $amino{gtc} $per{gtc}",$color{$amino{gtc}}); $im->string(gdSmallFont,60,405,"GTA $amino{gta} $per{gta}",$color{$amino{gta}}); $im->string(gdSmallFont,60,425,"GTG $amino{gtg} $per{gtg}",$color{$amino{gtg}}); $im->string(gdSmallFont,160,65,"TCT $amino{tct} $per{tct}",$color{$amino{tct}}); $im->string(gdSmallFont,160,85,"TCC $amino{tcc} $per{tcc}",$color{$amino{tcc}}); $im->string(gdSmallFont,160,105,"TCA $amino{tca} $per{tca}",$color{$amino{tca}}); $im->string(gdSmallFont,160,125,"TCG $amino{tcg} $per{tcg}",$color{$amino{tcg}}); $im->string(gdSmallFont,160,165,"CCT $amino{cct} $per{cct}",$color{$amino{cct}}); $im->string(gdSmallFont,160,185,"CCC $amino{ccc} $per{ccc}",$color{$amino{ccc}}); $im->string(gdSmallFont,160,205,"CCA $amino{cca} $per{cca}",$color{$amino{cca}}); $im->string(gdSmallFont,160,225,"CCG $amino{ccg} $per{ccg}",$color{$amino{ccg}}); $im->string(gdSmallFont,160,265,"ACT $amino{act} $per{act}",$color{$amino{act}}); $im->string(gdSmallFont,160,285,"ACC $amino{acc} $per{acc}",$color{$amino{acc}}); $im->string(gdSmallFont,160,305,"ACA $amino{aca} $per{aca}",$color{$amino{aca}}); $im->string(gdSmallFont,160,325,"ACG $amino{acg} $per{acg}",$color{$amino{acg}}); $im->string(gdSmallFont,160,365,"GCT $amino{gct} $per{gct}",$color{$amino{gct}}); $im->string(gdSmallFont,160,385,"GCC $amino{gcc} $per{gcc}",$color{$amino{gcc}}); $im->string(gdSmallFont,160,405,"GCA $amino{gca} $per{gca}",$color{$amino{gca}}); $im->string(gdSmallFont,160,425,"GCG $amino{gcg} $per{gcg}",$color{$amino{gcg}}); $im->string(gdSmallFont,260,65,"TAT $amino{tat} $per{tat}",$color{$amino{tat}}); $im->string(gdSmallFont,260,85,"TAC $amino{tac} $per{tac}",$color{$amino{tac}}); $im->string(gdSmallFont,260,105,"TAA $amino{taa} $per{taa}",$color{$amino{taa}}); $im->string(gdSmallFont,260,125,"TAG $amino{tag} $per{tag}",$color{$amino{tag}}); $im->string(gdSmallFont,260,165,"CAT $amino{cat} $per{cat}",$color{$amino{cat}}); $im->string(gdSmallFont,260,185,"CAC $amino{cac} $per{cac}",$color{$amino{cac}}); $im->string(gdSmallFont,260,205,"CAA $amino{caa} $per{caa}",$color{$amino{caa}}); $im->string(gdSmallFont,260,225,"CAG $amino{cag} $per{cag}",$color{$amino{cag}}); $im->string(gdSmallFont,260,265,"AAT $amino{aat} $per{aat}",$color{$amino{aat}}); $im->string(gdSmallFont,260,285,"AAC $amino{aac} $per{aac}",$color{$amino{aac}}); $im->string(gdSmallFont,260,305,"AAA $amino{aaa} $per{aaa}",$color{$amino{aaa}}); $im->string(gdSmallFont,260,325,"AAG $amino{aag} $per{aag}",$color{$amino{aag}}); $im->string(gdSmallFont,260,365,"GAT $amino{gat} $per{gat}",$color{$amino{gat}}); $im->string(gdSmallFont,260,385,"GAC $amino{gac} $per{gac}",$color{$amino{gac}}); $im->string(gdSmallFont,260,405,"GAA $amino{gaa} $per{gaa}",$color{$amino{gaa}}); $im->string(gdSmallFont,260,425,"GAG $amino{gag} $per{gag}",$color{$amino{gag}}); $im->string(gdSmallFont,360,65,"TGT $amino{tgt} $per{tgt}",$color{$amino{tgt}}); $im->string(gdSmallFont,360,85,"TGC $amino{tgc} $per{tgc}",$color{$amino{tgc}}); $im->string(gdSmallFont,360,105,"TGA $amino{tga} $per{tga}",$color{$amino{tga}}); $im->string(gdSmallFont,360,125,"TGG $amino{tgg} $per{tgg}",$color{$amino{tgg}}); $im->string(gdSmallFont,360,165,"CGT $amino{cgt} $per{cgt}",$color{$amino{cgt}}); $im->string(gdSmallFont,360,185,"CGC $amino{cgc} $per{cgc}",$color{$amino{cgc}}); $im->string(gdSmallFont,360,205,"CGA $amino{cga} $per{cga}",$color{$amino{cga}}); $im->string(gdSmallFont,360,225,"CGG $amino{cgg} $per{cgg}",$color{$amino{cgg}}); $im->string(gdSmallFont,360,265,"AGT $amino{agt} $per{agt}",$color{$amino{agt}}); $im->string(gdSmallFont,360,285,"AGC $amino{agc} $per{agc}",$color{$amino{agc}}); $im->string(gdSmallFont,360,305,"AGA $amino{aga} $per{aga}",$color{$amino{aga}}); $im->string(gdSmallFont,360,325,"AGG $amino{agg} $per{agg}",$color{$amino{agg}}); $im->string(gdSmallFont,360,365,"GGT $amino{ggt} $per{ggt}",$color{$amino{ggt}}); $im->string(gdSmallFont,360,385,"GGC $amino{ggc} $per{ggc}",$color{$amino{ggc}}); $im->string(gdSmallFont,360,405,"GGA $amino{gga} $per{gga}",$color{$amino{gga}}); $im->string(gdSmallFont,360,425,"GGG $amino{ggg} $per{ggg}",$color{$amino{ggg}}); $im->string(gdSmallFont,15,465,"yellow minus charge",$yellow); $im->string(gdSmallFont,165,465,"red plus charge",$red); $im->string(gdSmallFont,285,465,"blue noncharge",$blue); $im->string(gdSmallFont,400,465,"green nonpolar",$green); $im->string(gdSmallFont,20,485,"exception",$black); $v=485; $h=100; foreach(sort keys(%exception)){ $color{$exception{$_}{amino}}=$black if($color{$exception{$_}{amino}}==""); $CoDoN=uc $_; $im->string(gdSmallFont,$h,$v,"$CoDoN $exception{$_}{amino} $exception{$_}{per}",$color{$exception{$_}{amino}}); $v+=20; if($v == 545){ $v=485; $h+=100; } } mkdir ("graph",0777); open(OUT,'>graph/'."$filename"); binmode OUT; print OUT $im->png; close(OUT); msg_gimv("graph/$filename") if($output eq "show"); } #_codon_table2 ver.20020902-01
#scripting by Kazuharu Gaou Arakawa(gaou@g-language.org)
#Inspire interface for the codon_usage()
sub _codon_table2{ &opt_default(output=>"show",filename=>"codon_table.html"); my @args = opt_get(@_); my $result = shift @args; my $filename = opt_val("filename"); my $output = opt_val("output"); my (%amino, %data, %per, $amino_total, $amino); my (%exception, %color); foreach my $tmp (qw(D E)){ $color{$tmp} = 'gold' } foreach my $tmp (qw(R K H)){ $color{$tmp} = 'red' } foreach my $tmp (qw(N Q S T Y)){ $color{$tmp} = 'blue' } foreach my $tmp (qw(A G V L I P F M W C)){ $color{$tmp} = 'green' } $color{'/'} = 'black'; foreach $amino (keys(%{$result})){ $amino_total = 0; foreach my $codon (keys(%{$$result{$amino}})){ $amino_total += $$result{$amino}{$codon}; } foreach my $codon (keys(%{$$result{$amino}})){ if ($$result{$amino}{$codon} > $data{$codon}){ if ($data{$codon} != ''){ $exception{$codon}{amino} = $amino{$codon}; $exception{$codon}{per} = $per{$codon}; } $data{$codon} = $$result{$amino}{$codon}; $amino{$codon} = $amino; $per{$codon} = sprintf("%.3f", $$result{$amino}{$codon} / $amino_total);
} else{ $exception{$codon}{amino} = $amino; $exception{$codon}{per} = sprintf("%.3f",$$result{$amino}{$codon} / $amino_total);
} } } mkdir ('graph', 0777); open(FILE, '>graph/' . $filename); print FILE qq( <HTML> <HEAD><TITLE>Codon Table v.2</TITLE> <style type="text/css"> <!-- FONT{font-size:8pt} --> </style> </HEAD> <BODY bgcolor="white"> <table border=0 cellpadding=0 cellspacing=0><tr><td align=center valign=top> <table border rules=all cellspacing=0 cellpadding=6> <tr><td align=center rowspan=2><font color="red">first</font></td> <td colspan=4 align=center><font color="green">second</font></td> <td align=center rowspan=2><font color="blue">third</font></td> </tr><tr> <td align=center><font color="green">T</font></td> <td align=center><font color="green">C</font></td> <td align=center><font color="green">A</font></td> <td align=center><font color="green">G</font></td> </tr> ); foreach my $first (qw(T C A G)){ print FILE qq(<tr><td align=center><font color="red">$first</font></td>); foreach my $second (qw(T C A G)){ print FILE qq(<td>); foreach my $third (qw(T C A G)){ my $triplet = $first . $second . $third; print FILE qq(<table border=0 cellpadding=4 width="100\%"><tr>); print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}}; print FILE '">', $triplet, '</td>', "\n"; print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}}; print FILE '">', $amino{lc($triplet)}, '</td>', "\n"; print FILE '<td align=center><font color="', $color{$amino{lc($triplet)}}; print FILE '">', $per{lc($triplet)}, '</td>', "\n"; } print FILE qq(</tr></table></td>); } print FILE qq( <td align=center> <table border=0 cellpadding=4> <tr><td><font color="blue">T</font></td></tr> <tr><td><font color="blue">C</font></td></tr> <tr><td><font color="blue">A</font></td></tr> <tr><td><font color="blue">G</font></td></tr> </table> </td> ); } print FILE qq( </table><BR> <font color="gold" style="font-size:9pt">yellow minus charge</font>\& nbsp;\&nbsp;\&nbsp;\&nbsp;\&nbsp; <font color="red" style="font-size:9pt">red plus charge</font>\& nbsp;\&nbsp;\&nbsp;\&nbsp;\&nbsp; <font color="blue" style="font-size:9pt">blue noncharge</font>\& nbsp;\&nbsp;\&nbsp;\&nbsp;\&nbsp; <font color="green" style="font-size:9pt">green nonpolar</font>\& nbsp;\&nbsp;\&nbsp;\&nbsp;\&nbsp; <a href="http://prowl.rockefeller.edu/aainfo/contents.htm" style="font-size:9pt" target="_blank">(learn more here)</a>
<BR><BR>
<div align=left>exceptions:</div
> <table border=0 cellpadding=10 cellspacing=0><tr> ); my $i = 0; foreach my $triplet (sort keys(%exception)){ $color{$exception{$triplet}{amino}} = 'black' if($color{$exception{$triplet}{amino}} eq ''); print FILE '<td><font color="', $color{$exception{$triplet}{amino}}; print FILE '">', uc $triplet, '&nbsp;&nbsp;&nbsp;', $exception{$triplet}{amino}, '&nbsp;&nbsp;&nbsp;'; print FILE $exception{$triplet}{per}, '</font></td>'; if ($i == 4){ print FILE '</tr><tr>'; $i = -1; } $i ++; } print FILE qq( </tr></table> <BR> <form method="GET" action="http://www.kazusa.or.jp/codon/cgi-bin/spsearch.cgi"> Search <a href="http://www.kazusa.or.jp/codon/">Kazusa Codon Usage Database</a>: <input size="20" name="species"> <input type=hidden name="c" value="i"> <input type=submit value="Submit"> </form> ); print FILE "</td></tr></table></body></html>"; close(FILE); msg_gimv("graph/$filename") if($output eq "show"); } 1;
}
General documentation
No general documentation available.