G::Seq Util
SummaryPackage variablesDescriptionGeneral documentationMethods
Summary
  G::Seq::Util - Miscellaneous analysis methods related to sequence analysis.
Package variables
Privates (from "my" definitions)
%COG_fcode = ( J=>"Translation, ribosomal structure and biogenesis", K=>"Transcription", L=>"DNA replication, recombination and repair", D=>"Cell division and chromosome partitioning", O=>"Posttranslational modification, protein turnover, chaperones", M=>"Cell envelope biogenesis, outer membrane", N=>"Cell motility and secretion", P=>"Inorganic ion transport and metabolism", T=>"Signal transduction mechanisms", C=>"Energy production and conservation", G=>"Carbohydrate transport and metabolism", E=>"Amino acid transport and metabolism", F=>"Nucleotide transport and metabolism", H=>"Coenzyme metabolism", I=>"Lipid metabolism", Q=>"Secondary metabolites biosynthesis, transport and catabolism", R=>"General function prediction only", S=>"Function unknown", '-'=>"Non COG" )
%COG_fcolor = ( J=>"plum", K=>"fuchsia", L=>"pink", D=>"lightgreen", O=>"green", M=>"khaki", N=>"greenyellow", P=>"darkkhaki", T=>"cyan", C=>"blue", G=>"mediumturquoise", E=>"lightskyblue", F=>"mediumpurple", H=>"aqua", I=>"blueviolet", Q=>"lightskyblue", R=>"gainsboro", S=>"darkgrey", '-'=>"aliceblue" )
Included modules
Cwd
G::Messenger
G::Seq::Primitive
G::Tools::Graph
SelfLoader
SubOpt
Synopsis
No synopsis!
Description
    This class is part of the G-language Genome Analysis Environment
    and collects miscellaneous sequence analysis methods.
Methods
atcgcon
No description
Code
cds_echo
No description
Code
find_king_of_gene
No description
Code
maskseq
No description
Code
molecular_weightDescriptionCode
oligomer_translation
No description
Code
pasteseq
No description
Code
print_gene_function_list
No description
Code
seqinfoDescriptionCode
view_cdsDescriptionCode
Methods description
molecular_weightcode    nextTop
  Name: molecular_weight   -   calculates the molecular weight of given nucleotide sequence

  Description:
    This method calculates the molecular weight of the given
    nucleotide sequence, taking into account the hydrogen bonds 
    between molecules. The molecular weights used in this method
    are as follows:
       A: 313.15
       T: 304.19
       G: 329.19
       C: 289.13
       N: 308.915

 Usage: 
    $weight = molecular_weight($genome);

 Options:
    -strand      "single" or "double" strand DNA molecule (default:single)

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20011029-01 initial posting
seqinfocodeprevnextTop
  Name: seqinfo   -   prints out basic nucleotide sequence statistics

  Description:
    This method prints out basic compositional statistics of the 
    given nucleotide sequence, in a format similar to the one printed
    right after calling new G().

 Usage: 
    seqinfo($genome)

 Options:
   none

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20020207-01 initial posting
view_cdscodeprevnextTop
  Name: view_cds   -   displays a graph of nucleotide contents around start and stop codons

  Description:
    This method creates a graph showing the average A,T,G,C contents
    around start/stop codons. This is useful to view consensus around
    start/stop codons and to find characteristic pattern in CDS. 
    
  Usage : 
    view_cds(G instance);

  Options:
    -length    length in bases to show around start/stop codons
               (default: 100)
    -gap       gap shown in graph in between start/stop codon neighbors
               (default: 3)
    -filename  outfile name   (default: view_cds.png for graph, 
               view_cds.csv for file)
    -output    "f" for file, "g" for graph, "show" to display graph. 
               (default: "show")

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20070707-01 moved to G::Seq::Util from G::Seq::GCskew
         20010906-01 initial posting
Methods code
atcgcondescriptionprevnextTop
# atcgcon - report the base composition summed over all CDS features.
#
# Arguments:
#   first positional argument : genome handle, resolved through opt_as_gb()
#   -output   : "stdout" (default) sends the report through msg_send();
#               "f" writes the same report to a file
#   -filename : file written when -output is "f" (default: cds_info.csv)
#
# Returns a reference to a hash with the keys a, t, g, c and total.
# Only lower-case a/t/g/c are counted; "total" is the summed CDS length.
sub atcgcon {
    opt_default(output=>"stdout", filename=>"cds_info.csv");
    my @args = opt_get(@_);

    my $gb       = opt_as_gb(shift @args);
    my $output   = opt_val("output");
    my $filename = opt_val("filename");

    # Pre-seed the counters so a genome without any CDS reports zeros
    # instead of undefined values.
    my %hash = (a => 0, t => 0, g => 0, c => 0, total => 0);

    # Features are looked up as FEATURE1, FEATURE2, ... with a counter that
    # advances once per element returned by feature().
    my $num = 1;
    foreach ($gb->feature()) {
        if ($gb->{"FEATURE$num"}->{type} eq 'CDS') {
            my $start = $gb->{"FEATURE$num"}->{start};
            my $end   = $gb->{"FEATURE$num"}->{end};
            my $seq   = $gb->getseq($start - 1, $end - 1);

            # tr/x// with an empty replacement counts without modifying.
            $hash{a} += ($seq =~ tr/a//);
            $hash{t} += ($seq =~ tr/t//);
            $hash{g} += ($seq =~ tr/g//);
            $hash{c} += ($seq =~ tr/c//);
            $hash{total} += length($seq);
        }
        $num++;
    }

    # Percentage of the total; 0 when nothing was counted, which avoids an
    # "Illegal division by zero" on genomes without CDS features.
    my $percent = sub {
        my ($count) = @_;
        return $hash{total} ? 100.0 * $count / $hash{total} : 0;
    };

    # "%%" is the literal percent sign in a sprintf format.
    my @report = (
        sprintf("total:\t%10d base\n", $hash{total}),
        (map { sprintf("%s:\t%10d / %2.2f%%\n", $_, $hash{$_}, $percent->($hash{$_})) }
             qw(a t c g)),
        sprintf("GC content:\t%.2f%%\n", $percent->($hash{c} + $hash{g})),
    );

    if ($output eq "stdout") {
        msg_send($_) foreach @report;
    }

    if ($output eq "f") {
        # A failed write is reported, but the counts are still returned.
        if (open(my $fh, '>', $filename)) {
            print {$fh} @report;
            close($fh) or warn "atcgcon: cannot close '$filename': $!\n";
        }
        else {
            warn "atcgcon: cannot write '$filename': $!\n";
        }
    }

    return \%hash;
}
cds_echodescriptionprevnextTop
# cds_echo - send the coordinates of every CDS feature through msg_send().
#
# Takes a genome handle (resolved through opt_as_gb()). For each feature
# whose type is 'CDS' one "start..end" line is emitted; features on the
# complement strand are written as "end..start". Features with any other
# direction value are skipped.
sub cds_echo {
    my $gb = opt_as_gb(shift);

    # Features are addressed as FEATURE1 .. FEATUREn, one index per element
    # returned by feature().
    my @features = $gb->feature();

    for my $index (1 .. scalar @features) {
        my $key = "FEATURE$index";
        next unless $gb->{$key}->{type} eq 'CDS';

        my $direction = $gb->{$key}->{direction};
        my ($from, $to);

        if ($direction eq 'direct') {
            ($from, $to) = ($gb->{$key}->{start}, $gb->{$key}->{end});
        }
        elsif ($direction eq 'complement') {
            ($from, $to) = ($gb->{$key}->{end}, $gb->{$key}->{start});
        }
        else {
            next;
        }

        msg_send(sprintf("%d..%d\n", $from, $to));
    }
}
find_king_of_genedescriptionprevnextTop
# find_king_of_gene - joke routine: fetches a picture with wget into
# /tmp/afro.jpg, hands it to msg_gimv(), and returns a fixed message.
#
# The single argument is accepted but never used.
sub find_king_of_gene {
    my ($nuc) = @_;

    # Quiet download (-q) of the picture to a fixed location under /tmp.
    system("wget http://www.stagnightout.com/pics/what-to-wear/21280.jpg -O /tmp/afro.jpg -q");

    msg_gimv('/tmp/afro.jpg');

    return 'you have just found the king of genes.' . "\n";
}
maskseqdescriptionprevnextTop
# maskseq - overwrite part of a genome's sequence with 'n' characters.
#
# Arguments:
#   first positional argument : genome handle, resolved through opt_as_gb()
#   -pattern : regular expression; when given, only the matches inside the
#              window are masked (default: "" = mask the whole window)
#   -start   : 1-based first position of the window (default: 1)
#   -end     : 1-based last position of the window (default: sequence end)
#
# The sequence stored in the handle ({SEQ}) is modified in place: spaces,
# newlines, digits and square brackets are stripped and the rest is
# lower-cased before masking.
#
# Returns a reference to the modified sequence string.
sub maskseq {
    opt_default(pattern=>"", start=>1, end=>"");
    my @args = opt_get(@_);

    my $gb    = opt_as_gb(shift @args);
    my $seq   = \$gb->{SEQ};
    my $start = opt_val("start");
    my $end   = opt_val("end");
    my $pat   = opt_val("pattern");

    # tr/// has no character classes: "[0-9]" removes '[', ']' and the
    # digits 0-9 in addition to spaces and newlines.
    $$seq =~ tr/ \n[0-9]//d;
    $$seq =~ tr/A-Z/a-z/;
    $end = length($$seq) if ($end eq "");

    my $offset = $start - 1;
    my $span   = $end - $start + 1;
    my $masked = substr($$seq, $offset, $span);

    if ($pat) {
        # Replace each match with as many 'n' as the match is long, so the
        # sequence keeps its length even when the pattern is a regular
        # expression whose matches differ in length from the pattern text.
        $masked =~ s/$pat/'n' x ($+[0] - $-[0])/ge;
    }
    else {
        # No pattern: every letter in the window becomes 'n'.
        $masked =~ tr/a-zA-Z/n/;
    }

    substr($$seq, $offset, $span) = $masked;

    return $seq;
}
molecular_weightdescriptionprevnextTop
# molecular_weight - sum per-base molecular weights over a genome sequence.
#
# Arguments:
#   first positional argument : genome handle, resolved through opt_as_gb()
#   -strand : "single" (default) or "double"
#
# Every character of {SEQ} that is not a lower-case a, t, g or c is weighed
# as 'n'. Both the single- and double-strand totals are sent through
# msg_send(); the double-strand figure is simply twice the single-strand one.
#
# Returns the single-strand weight, or twice that when -strand is "double".
sub molecular_weight {
    opt_default(strand=>"single");
    my ($genome) = opt_get(@_);
    my $gb       = opt_as_gb($genome);
    my $strand   = opt_val("strand");

    my %mw = (
        "a" => 313.15,
        "t" => 304.19,
        "g" => 329.19,
        "c" => "289.13",
        "n" => 308.915,
    );

    # Add the weights base by base, in sequence order.
    my $weight  = 0;
    my $seq_len = length($gb->{SEQ});
    for my $pos (0 .. $seq_len - 1) {
        my $base = substr($gb->{SEQ}, $pos, 1);
        $weight += exists $mw{$base} ? $mw{$base} : $mw{"n"};
    }

    my $double = $weight * 2;

    msg_send(sprintf("  Molecular Weight of Nucleotides:\n"));
    msg_send(sprintf("    single strand:  %12d\n", $weight));
    msg_send(sprintf("    double strand:  %12d\n\n\n", $double));

    return $strand eq "double" ? $weight * 2 : $weight;
}
oligomer_translationdescriptionprevnextTop
# oligomer_translation - translate a short nucleotide string codon by codon.
#
# Arguments (after opt_get() processing):
#   $seq   : lower-case nucleotide string
#   $frame : reading frame; values above 3 select the complement of $seq
#            (via G::Seq::Util::complement) and are reduced by 3
#
# The result is a '-'-separated string: the bases in front of the first
# codon of the frame (if any), then one amino-acid letter per full codon
# ('/' for stop codons), then any leftover bases that do not fill a codon.
sub oligomer_translation {
    my ($seq, $frame) = opt_get(@_);
    my $len = length($seq);

    if ($frame > 3) {
        $seq = G::Seq::Util::complement($seq);
        $frame -= 3;
    }

    # Standard genetic code, grouped by residue; '/' marks a stop codon.
    my %codons_of = (
        'A' => [qw(gca gcc gcg gct)],
        'C' => [qw(tgc tgt)],
        'D' => [qw(gac gat)],
        'E' => [qw(gaa gag)],
        'F' => [qw(ttc ttt)],
        'G' => [qw(gga ggc ggg ggt)],
        'H' => [qw(cac cat)],
        'I' => [qw(ata atc att)],
        'K' => [qw(aaa aag)],
        'L' => [qw(tta ttg cta ctc ctg ctt)],
        'M' => [qw(atg)],
        'N' => [qw(aac aat)],
        'P' => [qw(cca ccc ccg cct)],
        'Q' => [qw(caa cag)],
        'R' => [qw(aga agg cga cgc cgg cgt)],
        'S' => [qw(agc agt tca tcc tcg tct)],
        'T' => [qw(aca acc acg act)],
        'V' => [qw(gta gtc gtg gtt)],
        'W' => [qw(tgg)],
        'Y' => [qw(tac tat)],
        '/' => [qw(taa tag tga)],
    );
    my %CodonTable;
    foreach my $residue (keys %codons_of) {
        $CodonTable{$_} = $residue foreach @{ $codons_of{$residue} };
    }

    my $translated = '';
    my $pos = 0;

    while ($pos < $len) {
        # Bases in front of the reading frame are copied through verbatim.
        if ($pos < $frame - 1) {
            $translated .= substr($seq, $pos, $frame - 1) . '-';
            $pos += $frame - 1;
            next;
        }

        # Fewer than three bases left: append them untranslated and stop.
        if ($pos + 3 > $len) {
            $translated .= substr($seq, $pos);
            last;
        }

        $translated .= $CodonTable{substr($seq, $pos, 3)};
        $pos += 3;

        # Separator after every codon except one that ends the sequence.
        $translated .= '-' unless ($pos >= $len);
    }

    return $translated;
}
pasteseqdescriptionprevnextTop
# pasteseq - insert a nucleotide string into a genome's sequence.
#
# Arguments (positional):
#   1. genome handle, resolved through opt_as_gb()
#   2. the sequence to insert: either a reference to a scalar (the
#      historical calling convention; the referenced string is lower-cased
#      in place) or a plain string
#   3. 1-based position in front of which the string is inserted
#
# The sequence stored in the handle ({SEQ}) is modified in place: spaces,
# newlines, digits and square brackets are stripped and the rest is
# lower-cased before the insertion.
#
# Returns a reference to the modified sequence string.
sub pasteseq {
    opt_default();
    # The result is unused, but the call is kept in case opt_get() records
    # option state — NOTE(review): confirm against SubOpt.
    my @args = opt_get(@_);

    my $gb    = opt_as_gb(shift);
    my $seq   = \$gb->{SEQ};
    my $paste = shift @_;
    my $pos   = shift @_;

    # tr/// has no character classes: "[0-9]" removes '[', ']' and digits.
    $$seq =~ tr/ \n[0-9]//d;
    $$seq =~ tr/A-Z/a-z/;

    my $insert;
    if (ref($paste)) {
        # Reference given: keep the in-place lower-casing callers may rely on.
        $$paste =~ tr/A-Z/a-z/;
        $insert = $$paste;
    }
    else {
        # Plain string given: dereferencing it would die under strict refs
        # (or silently read a package variable), so work on a copy instead.
        $insert = defined $paste ? $paste : '';
        $insert =~ tr/A-Z/a-z/;
    }

    # Zero-length substr assignment inserts without overwriting anything.
    substr($$seq, $pos - 1, 0) = $insert;

    return $seq;
}
print_gene_function_listdescriptionprevnextTop
# print_gene_function_list - count occurrences of an oligomer per gene
# function and print two tables through msg_send().
#
# Arguments (positional):
#   1. genome handle, resolved through opt_as_gb()
#   2. oligomer to search for (default: 'gctggtgg')
#
# For every CDS (CDS1, CDS2, ...) the function label is the text in front
# of the first ';' of the owning feature's {function} field. Occurrences of
# the oligomer and of complement() of the oligomer inside the CDS sequence
# are counted per label, overlapping hits included. The first table lists
# hits per label, the second the number of CDS per label (labels without a
# lower-case letter are left out of the second table).
#
# Always returns 1.
sub print_gene_function_list {
    my $gb  = opt_as_gb(shift);
    my $seq = shift;
    $seq = 'gctggtgg' unless ($seq);
    my $revseq = complement($seq);

    my %chi;    # hits per function label
    my %cds;    # CDS count per function label

    # Walk CDS1, CDS2, ... until the first missing or empty entry.
    # (defined(%hash) is a fatal error on Perl 5.22 and later, so existence
    # and non-emptiness are tested directly.)
    for (my $i = 1; $gb->{"CDS$i"} && %{ $gb->{"CDS$i"} }; $i++) {
        my $id = $gb->{"CDS$i"}->{feature};
        my ($function) = split(/;/, $gb->{"FEATURE$id"}->{function}, 2);
        $function = '' unless defined $function;

        my $cdsseq = $gb->get_cdsseq("CDS$i");
        $cds{$function}++;

        foreach my $motif ($seq, $revseq) {
            # Advance by one past each hit so overlapping hits are counted.
            my $hit = -1;
            while (0 <= ($hit = index($cdsseq, $motif, $hit + 1))) {
                $chi{$function}++;
            }
        }
    }

    msg_send("=== $seq ===\n");
    my $tot = 0;
    foreach my $key (sort keys %chi) {
        msg_send(sprintf("%20s: %8d\n", $key, $chi{$key}));
        $tot += $chi{$key};
    }
    msg_send("total: $tot\n\n");

    msg_send("=== CDS ===\n");
    $tot = 0;
    foreach my $key (sort keys %cds) {
        next if ($key !~ /[a-z]/);
        msg_send(sprintf("%20s: %8d\n", $key, $cds{$key}));
        $tot += $cds{$key};
    }
    msg_send("total: $tot\n");

    return 1;
}
seqinfodescriptionprevnextTop
# seqinfo - send basic composition statistics of a sequence to msg_send().
#
# Arguments:
#   first positional argument : genome handle, resolved through opt_as_gb()
#
# Reports the sequence length, the counts and percentages of lower-case
# a, t, g, c, the remainder ("Others"), and the AT and GC content.
# An empty sequence reports 0.00 for every percentage.
#
# Returns the list (a count, t count, g count, c count).
sub seqinfo {
    my @args = opt_get(@_);
    my $this = opt_as_gb(shift @args);

    my $length = defined $this->{SEQ} ? length($this->{SEQ}) : 0;

    # tr/x// with an empty replacement counts without modifying. The names
    # avoid a lexical $a, which would shadow the sort variable.
    my $count_a = ($this->{SEQ} =~ tr/a//);
    my $count_t = ($this->{SEQ} =~ tr/t//);
    my $count_g = ($this->{SEQ} =~ tr/g//);
    my $count_c = ($this->{SEQ} =~ tr/c//);
    my $others  = $length - $count_a - $count_t - $count_g - $count_c;

    # Share of the whole sequence; guarded so an empty sequence does not
    # raise "Illegal division by zero".
    my $percent = sub {
        my ($count) = @_;
        return $length ? $count / $length * 100 : 0;
    };

    # "%%" is the literal percent sign in a sprintf format.
    my $msg = '';
    $msg .= sprintf "\n  Length of Sequence : %9d\n", $length;
    $msg .= sprintf "           A Content : %9d (%.2f%%)\n",
        $count_a, $percent->($count_a);
    $msg .= sprintf " T Content : %9d (%.2f%%)\n", $count_t, $percent->($count_t);
    $msg .= sprintf " G Content : %9d (%.2f%%)\n", $count_g, $percent->($count_g);
    $msg .= sprintf " C Content : %9d (%.2f%%)\n", $count_c, $percent->($count_c);
    $msg .= sprintf " Others : %9d (%.2f%%)\n", $others, $percent->($others);
    $msg .= sprintf " AT Content : %.2f%%\n", $percent->($count_a + $count_t);
    $msg .= sprintf " GC Content : %.2f%%\n\n", $percent->($count_g + $count_c);

    msg_send($msg);

    return ($count_a, $count_t, $count_g, $count_c);
}
view_cdsdescriptionprevnextTop
# view_cds - average A/T/G/C content around start and stop codons.
#
# Arguments:
#   first positional argument : genome handle, resolved through opt_as_gb()
#   -length   : bases shown on each side of a start/stop codon (default 100)
#   -gap      : empty columns between the start and stop regions (default 3)
#   -filename : output name (default view_cds.png; view_cds.csv is used
#               when -output is "f" and the name was left at its default)
#   -output   : "g" draws a graph, "show" draws and displays it (default),
#               "f" writes a CSV file under data/
#   -application : accepted for compatibility; not used by this routine
#
# For every CDS returned by cds() the bases before, at and after the start
# codon fill the first region and those around the stop codon fill the
# second. Each column holds the percentage of CDS carrying that base there.
#
# Returns 1; returns nothing if the CSV file cannot be opened.
sub view_cds {
    opt_default(length=>100, filename=>"view_cds.png",
                gap=>3, output=>"show", application=>"gimv");
    my @args = opt_get(@_);
    my $gb   = opt_as_gb(shift @args);

    my $length   = opt_val("length");
    my $filename = opt_val("filename");
    my $output   = opt_val("output");
    my $gap      = opt_val("gap");

    $filename = "view_cds.csv"
        if ($output eq "f" && $filename eq "view_cds.png");

    # One region = $length bases + 3 codon bases + $length bases.
    my $region = 2 * $length + 3;
    my $width  = 2 * $region + $gap;    # same as $length * 4 + 6 + $gap

    my %content = map { $_ => [ (0) x $width ] } qw(a t g c);

    # The divisor is the number of CDS actually iterated, so it can never
    # be zero inside the loop and always matches the data being averaged.
    my @cds_list = $gb->cds();
    my $numcds   = scalar @cds_list;

    foreach my $cds (@cds_list) {
        my $share = 100 / $numcds;

        my $around_start = $gb->before_startcodon($cds, $length)
                         . $gb->startcodon($cds)
                         . $gb->after_startcodon($cds, $length);
        my $around_stop  = $gb->before_stopcodon($cds, $length)
                         . $gb->stopcodon($cds)
                         . $gb->after_stopcodon($cds, $length);

        # The stop region always begins right after the first region and
        # the gap, independent of the length of the extracted string.
        foreach my $part ([$around_start, 0], [$around_stop, $region + $gap]) {
            my ($seq, $offset) = @$part;
            for my $i (0 .. length($seq) - 1) {
                my $base = substr($seq, $i, 1);
                next unless exists $content{$base};
                $content{$base}[$offset + $i] += $share;
            }
        }
    }

    my @pos = (1 .. $width);

    if ($output eq "g" || $output eq "show") {
        _UniMultiGrapher(
            \@pos, -x => "position", -y => "percentage",
            $content{a}, -x1 => "A",
            $content{t}, -x2 => "T",
            $content{g}, -x3 => "G",
            $content{c}, -x4 => "C",
            -filename => $filename,
            -title    => "Base Contents Around Start/Stop Codons"
        );
        msg_gimv("graph/$filename") if ($output eq "show");
    }
    elsif ($output eq "f") {
        my $path = 'data/' . $filename;
        my $out;
        unless (open($out, '>', $path)) {
            warn "view_cds: cannot write '$path': $!\n";
            return;
        }
        print {$out} "position,A,T,G,C\n";
        for my $i (0 .. $width - 1) {
            printf {$out} "%d,%3.2f,%3.2f,%3.2f,%3.2f\n", $i + 1,
                $content{a}[$i], $content{t}[$i],
                $content{g}[$i], $content{c}[$i];
        }
        close($out) or warn "view_cds: cannot close '$path': $!\n";
    }

    return 1;
}
General documentation
No general documentation available.