Util documentation.

G::Seq Util

Summary

  G::Seq::Util - Miscellaneous analysis methods related to sequence analysis.

Package variables

Privates (from "my" definitions)

%COG_fcode = ( J=>"Translation, ribosomal structure and biogenesis", K=>"Transcription", L=>"DNA replication, recombination and repair", D=>"Cell division and chromosome partitioning", O=>"Posttranslational modification, protein turnover, chaperones", M=>"Cell envelope biogenesis, outer membrane", N=>"Cell motility and secretion", P=>"Inorganic ion transport and metabolism", T=>"Signal transduction mechanisms", C=>"Energy production and conservation", G=>"Carbohydrate transport and metabolism", E=>"Amino acid transport and metabolism", F=>"Nucleotide transport and metabolism", H=>"Coenzyme metabolism", I=>"Lipid metabolism", Q=>"Secondary metabolites biosynthesis, transport and catabolism", R=>"General function prediction only", S=>"Function unknown", '-'=>"Non COG" )

%COG_fcolor = ( J=>"plum", K=>"fuchsia", L=>"pink", D=>"lightgreen", O=>"green", M=>"khaki", N=>"greenyellow", P=>"darkkhaki", T=>"cyan", C=>"blue", G=>"mediumturquoise", E=>"lightskyblue", F=>"mediumpurple", H=>"aqua", I=>"blueviolet", Q=>"lightskyblue", R=>"gainsboro", S=>"darkgrey", '-'=>"aliceblue" )

Included modules

Cwd

SelfLoader

Synopsis

No synopsis!

Description

    This class is a part of G-language Genome Analysis Environment, 
    collecting miscellaneous sequence analysis methods.

Methods

atcgcon	No description	Code
cds_echo	No description	Code
find_king_of_gene	No description	Code
maskseq	No description	Code
molecular_weight	Description	Code
oligomer_translation	No description	Code
pasteseq	No description	Code
print_gene_function_list	No description	Code
seqinfo	Description	Code
view_cds	Description	Code

Methods description

molecular_weight

code

Top

  Name: molecular_weight   -   calculates the molecular weight of given nucleotide sequence

  Description:
    This method calculates the molecular weight of the given
    nucleotide sequence, taking into account the hydrogen bonds
    between molecules. The molecular weights used in this method
    are as follows:
       A: 313.15
       T: 304.19
       G: 329.19
       C: 289.13
       N: 308.915

 Usage: 
    molecular_weight($genome)

 Options:
    -strand      "single" or "double" strand DNA molecule (default: single)

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20011029-01 initial posting

seqinfo

code

Top

  Name: seqinfo   -   prints out basic nucleotide sequence statistics

  Description:
    This method prints out basic compositional statistics of the 
    given nucleotide sequence, in a format similar to the one printed
    right after calling new G().

 Usage: 
    seqinfo($genome)

 Options:
   none

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20020207-01 initial posting

view_cds

code

Top

  Name: view_cds   -   displays a graph of nucleotide contents around start and stop codons

  Description:
    This method creates a graph showing the average A, T, G and C contents
    around start/stop codons. This is useful for viewing the consensus around
    start/stop codons and for finding characteristic patterns in CDSs.
    
  Usage : 
    view_cds(G instance);

  Options:
    -length    length in bases to show around start/stop codons
               (default: 100)
    -gap       gap shown in graph in between start/stop codon neighbors
               (default: 3)
    -filename  outfile name   (default: view_cds.png for graph, 
               view_cds.csv for file)
    -output    "f" for file, "g" for graph, "show" to display graph. 
               (default: "show")

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070707-01 moved to G::Seq::Util from G::Seq::GCskew
    20010906-01 initial posting

Methods code

atcgcon

description

Top

# atcgcon - tally the a/t/c/g composition of every CDS feature.
#
# Arguments:
#   first positional argument - genome data, resolved through opt_as_gb()
#   output   option - "stdout" (default) sends the report through msg_send(),
#                     "f" writes it to a file; other values print nothing
#   filename option - report file used when output is "f"
#                     (default: cds_info.csv)
#
# Returns a reference to a hash with the keys a, t, g, c and total (the
# number of bases in all CDS features).  Only lowercase a/t/g/c are counted.
sub atcgcon {
    opt_default(output => "stdout", filename => "cds_info.csv");
    my @args = opt_get(@_);

    my $gb       = opt_as_gb(shift @args);
    my $output   = opt_val("output");
    my $filename = opt_val("filename");

    # Start from zero so a genome without CDS features still yields a
    # well-defined report instead of undefined counts.
    my %hash = (a => 0, t => 0, g => 0, c => 0, total => 0);

    # Features are looked up as FEATURE1, FEATURE2, ...; the list returned
    # by feature() only determines how many of them are visited.
    my $num = 0;
    foreach ($gb->feature()) {
        $num++;
        my $feature = $gb->{"FEATURE$num"};
        next unless $feature->{type} eq 'CDS';

        my $seq = $gb->getseq($feature->{start} - 1, $feature->{end} - 1);
        $hash{a}     += ($seq =~ tr/a//);
        $hash{t}     += ($seq =~ tr/t//);
        $hash{g}     += ($seq =~ tr/g//);
        $hash{c}     += ($seq =~ tr/c//);
        $hash{total} += length($seq);
    }

    # Percentages are reported as 0 when there is nothing to divide by.
    my $total   = $hash{total};
    my $percent = sub { return $total ? 100.0 * $_[0] / $total : 0 };

    # Build the report once; "%%" is the portable way to print a percent sign.
    my @report = (sprintf("total:\t%10d base\n", $total));
    foreach my $base (qw(a t c g)) {
        push @report, sprintf("%s:\t%10d / %2.2f%%\n",
                              $base, $hash{$base}, $percent->($hash{$base}));
    }
    push @report, sprintf("GC content:\t%.2f%%\n",
                          $percent->($hash{c} + $hash{g}));

    if ($output eq "stdout") {
        msg_send($_) foreach @report;
    }
    elsif ($output eq "f") {
        # Three-argument open with a lexical handle; a failure is reported
        # but the counts are still returned to the caller.
        if (open(my $fh, '>', $filename)) {
            print {$fh} @report;
            close($fh) or warn "atcgcon: error closing '$filename': $!\n";
        }
        else {
            warn "atcgcon: cannot write '$filename': $!\n";
        }
    }

    return \%hash;
}

cds_echo

description

Top

# cds_echo - send the coordinates of every CDS feature through msg_send(),
# one "from..to" line per feature.
#
# The single argument is resolved through opt_as_gb().  CDS features on the
# 'direct' strand are printed as start..end, those marked 'complement' as
# end..start; CDS features with any other direction are not printed.
sub cds_echo {
    my $gb = opt_as_gb(shift);

    # Features are looked up by their running number (FEATURE1, FEATURE2, ...).
    my $index = 0;
    for ($gb->feature()) {
        $index++;
        my $name = "FEATURE$index";
        next unless $gb->{$name}->{type} eq 'CDS';

        my @range;
        if ($gb->{$name}->{direction} eq 'direct') {
            @range = ($gb->{$name}->{start}, $gb->{$name}->{end});
        }
        elsif ($gb->{$name}->{direction} eq 'complement') {
            @range = ($gb->{$name}->{end}, $gb->{$name}->{start});
        }
        next unless @range;

        &msg_send(sprintf("%d..%d\n", @range));
    }
}

find_king_of_gene

description

Top

# find_king_of_gene - easter egg: fetches a picture with wget, shows it with
# msg_gimv() and returns a fixed message string.
#
# Any argument is ignored.  The picture is only displayed when the download
# succeeded; the message is returned either way.
#
# NOTE(review): this downloads from a third-party site over plain HTTP into a
# fixed path under /tmp - confirm that this is still wanted.
sub find_king_of_gene {
    my $gene  = 'you have just found the king of genes.' . "\n";
    my $url   = 'http://www.stagnightout.com/pics/what-to-wear/21280.jpg';
    my $image = '/tmp/afro.jpg';

    # List-form system() runs wget directly, without going through a shell.
    my $status = system('wget', $url, '-O', $image, '-q');

    # wget -O creates the output file even when the transfer fails, so check
    # both the exit status and that something was actually written.
    if ($status == 0 && -s $image) {
        msg_gimv($image);
    }
    else {
        warn "find_king_of_gene: could not download $url\n";
    }

    return $gene;
}

maskseq

description

Top

# maskseq - overwrite part of the genome sequence with "n", in place.
#
# Arguments:
#   first positional argument - genome data, resolved through opt_as_gb()
#   pattern option - regular expression; every match inside the region is
#                    replaced by as many "n" as the match is long.  When
#                    empty (default) every letter of the region is masked.
#   start option   - first position of the region, 1-based (default: 1)
#   end option     - last position of the region (default: sequence end)
#
# The stored sequence is first stripped of blanks, newlines and digits and
# converted to lowercase.  Returns a reference to the stored sequence.
sub maskseq {
    opt_default(pattern => "", start => 1, end => "");
    my @args = opt_get(@_);

    my $gb    = opt_as_gb(shift @args);
    my $seq   = \$gb->{SEQ};
    my $start = opt_val("start");
    my $end   = opt_val("end");
    my $pat   = opt_val("pattern");

    # tr/// has no character classes: the brackets are literal, so "[" and
    # "]" are deleted together with blanks, newlines and digits.
    $$seq =~ tr/ \n[0-9]//d;
    $$seq =~ tr/A-Z/a-z/;

    # Keep the region inside the sequence.  A start below 1 would otherwise
    # turn into a negative substr offset and mask bases counted from the end.
    my $seqlen = length($$seq);
    $start = 1       if (!defined($start) || $start eq "" || $start < 1);
    $end   = $seqlen if (!defined($end)   || $end   eq "" || $end > $seqlen);
    return $seq if ($start > $end);

    my $offset = $start - 1;
    my $span   = $end - $start + 1;
    my $masked = substr($$seq, $offset, $span);

    if (defined($pat) && $pat ne "") {
        # Replace each match by "n" x its own length so the sequence length
        # (and with it every feature coordinate) is preserved even for
        # variable-length patterns.  @- and @+ give the match boundaries
        # without adding a capture group that would renumber the caller's
        # own groups and backreferences.
        $masked =~ s/$pat/'n' x ($+[0] - $-[0])/ge;
    }
    else {
        $masked =~ tr/a-zA-Z/n/;
    }

    substr($$seq, $offset, $span, $masked);

    return $seq;
}

molecular_weight

description

Top

# molecular_weight - molecular weight of a nucleotide sequence.
#
# Arguments:
#   first positional argument - genome data, resolved through opt_as_gb()
#   strand option - "single" (default) or "double"
#
# Per-base weights: a 313.15, t 304.19, g 329.19, c 289.13.  Every other
# character is weighted as n = 308.915, the mean of the four; only lowercase
# a/t/g/c are recognised.
#
# Both the single- and double-strand weights are sent through msg_send();
# the one selected by the strand option is returned.
sub molecular_weight {
    opt_default(strand => "single");
    my @args   = opt_get(@_);
    my $gb     = opt_as_gb(shift @args);
    my $strand = opt_val("strand");

    my %mw      = (a => 313.15, t => 304.19, g => 329.19, c => 289.13,
                   n => 308.915);
    my %partner = (a => 't', t => 'a', g => 'c', c => 'g', n => 'n');

    my $seq = defined $gb->{SEQ} ? $gb->{SEQ} : '';

    # Count the bases in one pass each instead of calling substr() several
    # times per position.
    my %count;
    $count{a} = ($seq =~ tr/a//);
    $count{t} = ($seq =~ tr/t//);
    $count{g} = ($seq =~ tr/g//);
    $count{c} = ($seq =~ tr/c//);
    $count{n} = length($seq)
              - $count{a} - $count{t} - $count{g} - $count{c};

    # The second strand of a duplex is the complement of the first, so its
    # weight is computed from the partner bases rather than by doubling the
    # weight of the given strand.
    my $weight   = 0;
    my $opposite = 0;
    foreach my $base (qw(a t g c n)) {
        $weight   += $count{$base} * $mw{$base};
        $opposite += $count{$base} * $mw{ $partner{$base} };
    }
    my $double = $weight + $opposite;

    msg_send(sprintf "  Molecular Weight of Nucleotides:\n");
    msg_send(sprintf "    single strand:  %12d\n", $weight);
    msg_send(sprintf "    double strand:  %12d\n\n\n", $double);

    return $strand eq "double" ? $double : $weight;
}

oligomer_translation

description

Top

# oligomer_translation - translate a short nucleotide sequence in a given
# reading frame and return the result as a "-"-separated string.
#
# Arguments (positional, after opt_get): the sequence and the frame number.
# A frame above 3 switches to G::Seq::Util::complement() of the sequence
# and is reduced by 3.
#
# The returned string consists of the bases preceding the first codon
# (frames 2 and 3), one letter per complete codon ("/" for stop codons),
# and any bases left over at the end, e.g. "atggcata" in frame 1 gives
# "M-A-ta".  Codons are looked up in lowercase.
sub oligomer_translation {
    my @args = opt_get(@_);
    my ($seq, $frame) = @args;
    my $len = length($seq);

    if ($frame > 3) {
        $seq = G::Seq::Util::complement($seq);
        $frame -= 3;
    }

    # Standard genetic code, listed per amino acid and then inverted into
    # a codon => letter lookup table.
    my %codons_of = (
        'A' => [qw(gca gcc gcg gct)],
        'C' => [qw(tgc tgt)],
        'D' => [qw(gac gat)],
        'E' => [qw(gaa gag)],
        'F' => [qw(ttc ttt)],
        'G' => [qw(gga ggc ggg ggt)],
        'H' => [qw(cac cat)],
        'I' => [qw(ata atc att)],
        'K' => [qw(aaa aag)],
        'L' => [qw(cta ctc ctg ctt tta ttg)],
        'M' => [qw(atg)],
        'N' => [qw(aac aat)],
        'P' => [qw(cca ccc ccg cct)],
        'Q' => [qw(caa cag)],
        'R' => [qw(aga agg cga cgc cgg cgt)],
        'S' => [qw(agc agt tca tcc tcg tct)],
        'T' => [qw(aca acc acg act)],
        'V' => [qw(gta gtc gtg gtt)],
        'W' => [qw(tgg)],
        'Y' => [qw(tac tat)],
        '/' => [qw(taa tag tga)],
    );
    my %CodonTable;
    foreach my $letter (keys %codons_of) {
        $CodonTable{$_} = $letter foreach @{ $codons_of{$letter} };
    }

    # Bases in front of the first complete codon; they are always followed
    # by a separator, even when nothing else follows.
    my $lead   = $frame - 1;
    my $prefix = '';
    my $pos    = 0;
    if ($lead > 0 && $len > 0) {
        $prefix = substr($seq, 0, $lead) . '-';
        $pos    = $lead;
    }

    # Complete codons, then whatever is left over at the end.
    my @pieces;
    while ($pos + 3 <= $len) {
        push @pieces, $CodonTable{ substr($seq, $pos, 3) };
        $pos += 3;
    }
    push @pieces, substr($seq, $pos) if ($pos < $len);

    return $prefix . join('-', @pieces);
}

pasteseq

description

Top

# pasteseq - insert a sequence into the genome sequence, in place.
#
# Arguments (positional):
#   genome data, resolved through opt_as_gb()
#   the sequence to insert, either as a reference to a string or as a
#     plain string
#   the 1-based position in front of which the sequence is inserted;
#     length + 1 appends
#
# The stored sequence is first stripped of blanks, newlines and digits and
# converted to lowercase; the inserted text is lowercased as well.  The
# caller's own string is left untouched.  Dies when the position lies
# outside the sequence.  Returns a reference to the stored sequence.
sub pasteseq {
    opt_default();
    my @args = opt_get(@_);

    # Take the positional arguments from opt_get(), like the other subs of
    # this module do.
    my $gb    = opt_as_gb(shift @args);
    my $paste = shift @args;
    my $pos   = shift @args;
    my $seq   = \$gb->{SEQ};

    # tr/// has no character classes: the brackets are literal, so "[" and
    # "]" are deleted together with blanks, newlines and digits.
    $$seq =~ tr/ \n[0-9]//d;
    $$seq =~ tr/A-Z/a-z/;

    # Work on a private, lowercased copy of the text to insert.
    my $insert = ref($paste) ? $$paste : $paste;
    $insert = '' unless defined $insert;
    $insert =~ tr/A-Z/a-z/;

    # A position below 1 would become a negative substr offset and insert
    # relative to the end of the sequence; reject it explicitly.
    my $last = length($$seq) + 1;
    if (!defined($pos) || $pos < 1 || $pos > $last) {
        die "pasteseq: position must be between 1 and $last\n";
    }

    substr($$seq, $pos - 1, 0, $insert);

    return $seq;
}

print_gene_function_list

description

Top

# print_gene_function_list - count occurrences of an oligomer in the coding
# sequences, grouped by gene function, and report them through msg_send().
#
# Arguments (positional):
#   genome data, resolved through opt_as_gb()
#   the oligomer to search for (default: 'gctggtgg')
#
# For CDS1, CDS2, ... the function label is the part of the feature's
# "function" value in front of the first ";".  Two tables are printed: the
# number of hits of the oligomer and of complement() of it per function,
# and the number of CDSs per function (labels without a lowercase letter
# are left out of the second table).  Returns 1.
sub print_gene_function_list {
    my $gb  = opt_as_gb(shift);
    my $seq = shift;
    $seq = 'gctggtgg' unless ($seq);
    my $revseq = complement($seq);
    my %chi;
    my %cds;

    # Walk CDS1, CDS2, ... until the first missing or empty entry.
    # defined(%hash) is a fatal error since Perl 5.22, so test the hash
    # itself; "|| {}" avoids dereferencing a missing entry.
    my $i = 1;
    while (%{ $gb->{"CDS$i"} || {} }) {
        my $id = $gb->{"CDS$i"}->{feature};
        my ($function) = split(/;/, $gb->{"FEATURE$id"}->{function}, 2);
        $function = '' unless defined $function;
        my $cdsseq = $gb->get_cdsseq("CDS$i");
        $cds{$function}++;

        # Count every (possibly overlapping) hit of both motifs.
        foreach my $motif ($seq, $revseq) {
            my $from = -1;
            while (0 <= ($from = index($cdsseq, $motif, $from + 1))) {
                $chi{$function}++;
            }
        }

        $i++;
    }

    msg_send("=== $seq ===\n");
    my $tot = 0;
    foreach my $key (sort keys %chi) {
        msg_send(sprintf("%20s: %8d\n", $key, $chi{$key}));
        $tot += $chi{$key};
    }
    msg_send("total: $tot\n\n");

    msg_send("=== CDS ===\n");
    $tot = 0;
    foreach my $key (sort keys %cds) {
        next if ($key !~ /[a-z]/);
        msg_send(sprintf("%20s: %8d\n", $key, $cds{$key}));
        $tot += $cds{$key};
    }
    msg_send("total: $tot\n");

    return 1;
}

seqinfo

description

Top

# seqinfo - report basic composition statistics of the genome sequence.
#
# The first positional argument is resolved through opt_as_gb().  The
# sequence length, the counts and percentages of a, t, g, c and of all
# other characters, and the AT and GC contents are sent through msg_send().
# Only lowercase a/t/g/c are counted.
#
# Returns the list (a count, t count, g count, c count).
sub seqinfo {
    my @args = opt_get(@_);
    my $this = opt_as_gb(shift @args);
    my $length = defined $this->{SEQ} ? length($this->{SEQ}) : 0;

    # Named counters instead of "my $a", which would shadow the package
    # variable used by sort().
    my %count = (a => 0, t => 0, g => 0, c => 0);
    if ($length) {
        $count{a} = ($this->{SEQ} =~ tr/a//);
        $count{t} = ($this->{SEQ} =~ tr/t//);
        $count{g} = ($this->{SEQ} =~ tr/g//);
        $count{c} = ($this->{SEQ} =~ tr/c//);
    }
    my $others = $length - $count{a} - $count{t} - $count{g} - $count{c};

    # An empty sequence is reported as 0% rather than dividing by zero.
    my $percent = sub { return $length ? $_[0] / $length * 100 : 0 };

    # "%%" is the portable way to print a literal percent sign.
    my $msg = sprintf "\n  Length of Sequence : %9d\n", $length;
    foreach my $base (qw(a t g c)) {
        $msg .= sprintf "           %s Content : %9d (%.2f%%)\n",
            uc($base), $count{$base}, $percent->($count{$base});
    }
    $msg .= sprintf "              Others : %9d (%.2f%%)\n",
        $others, $percent->($others);
    $msg .= sprintf "          AT Content :    %.2f%%\n",
        $percent->($count{a} + $count{t});
    $msg .= sprintf "          GC Content :    %.2f%%\n\n",
        $percent->($count{g} + $count{c});

    msg_send($msg);

    return ($count{a}, $count{t}, $count{g}, $count{c});
}

view_cds

description

Top

# view_cds - average a/t/g/c content (in percent) at each position around
# the start and stop codons of all CDSs, as a graph or a CSV file.
#
# Arguments:
#   first positional argument - genome data, resolved through opt_as_gb()
#   length option   - bases taken on each side of a codon (default: 100)
#   gap option      - empty columns between the start-codon panel and the
#                     stop-codon panel (default: 3)
#   filename option - output file name (default: view_cds.png; view_cds.csv
#                     when output is "f" and the default was not changed)
#   output option   - "g" draws the graph, "show" (default) draws it and
#                     displays graph/<filename> with msg_gimv(), "f" writes
#                     data/<filename> as CSV
#
# The layout is: <length> bases, start codon, <length> bases, gap,
# <length> bases, stop codon, <length> bases; codons are taken to be 3
# bases long.  Only lowercase a/t/g/c are counted.
sub view_cds {
    opt_default(length => 100, filename => "view_cds.png",
                gap => 3, output => "show", application => "gimv");
    my @args = opt_get(@_);
    my $gb   = opt_as_gb(shift @args);

    my $length   = opt_val("length");
    my $filename = opt_val("filename");
    my $output   = opt_val("output");
    my $gap      = opt_val("gap");

    $filename = "view_cds.csv"
        if ($output eq "f" && $filename eq "view_cds.png");

    # Average over exactly the CDSs that are tallied below; this also
    # avoids defined(%hash), which is a fatal error since Perl 5.22.
    my @cds_list = $gb->cds();
    my $numcds   = scalar(@cds_list);
    my $share    = $numcds ? 100 / $numcds : 0;

    my $panel = 2 * $length + 3;      # upstream + codon + downstream
    my $width = 2 * $panel + $gap;    # both panels plus the gap

    my %content = map { $_ => [ (0) x $width ] } qw(a t g c);

    # Add one CDS's share to every position of $seq, shifted by $offset.
    my $tally = sub {
        my ($seq, $offset) = @_;
        for (my $i = 0; $i < length($seq); $i++) {
            my $base = substr($seq, $i, 1);
            $content{$base}->[$offset + $i] += $share
                if (exists $content{$base});
        }
    };

    foreach my $cds (@cds_list) {
        my $seq;
        $seq  = $gb->before_startcodon($cds, $length);
        $seq .= $gb->startcodon($cds);
        $seq .= $gb->after_startcodon($cds, $length);
        $tally->($seq, 0);

        $seq  = $gb->before_stopcodon($cds, $length);
        $seq .= $gb->stopcodon($cds);
        $seq .= $gb->after_stopcodon($cds, $length);
        # The stop panel always starts at the same column, whatever the
        # length of the sequence that was actually fetched.
        $tally->($seq, $panel + $gap);
    }

    my @pos = (1 .. $width);

    if ($output eq "g" || $output eq "show") {
        _UniMultiGrapher(\@pos, -x => "position", -y => "percentage",
                         $content{a}, -x1 => "A", $content{t}, -x2 => "T",
                         $content{g}, -x3 => "G", $content{c}, -x4 => "C",
                         -filename => $filename,
                         -title => "Base Contents Around Start/Stop Codons"
                         );
        msg_gimv("graph/$filename") if ($output eq "show");
    }
    elsif ($output eq "f") {
        # Three-argument open with a lexical handle; failures are reported
        # instead of silently producing no file.
        if (open(my $out, '>', 'data/' . $filename)) {
            print {$out} "position,A,T,G,C\n";
            for (my $i = 0; $i < $width; $i++) {
                printf {$out} "%d,%3.2f,%3.2f,%3.2f,%3.2f\n", $i + 1,
                    $content{a}->[$i], $content{t}->[$i],
                    $content{g}->[$i], $content{c}->[$i];
            }
            close($out) or warn "view_cds: error closing 'data/$filename': $!\n";
        }
        else {
            warn "view_cds: cannot write 'data/$filename': $!\n";
        }
    }
}

General documentation

No general documentation available.