G::Seq PatSearch
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Summary
  G::Seq::PatSearch - component of G-language Genome Analysis Environment
Package variables
No package variables defined.
Included modules
Carp
File::Temp
G::Messenger
G::Seq::Primitive
G::Tools::GMap
G::Tools::Graph
G::Tools::Statistics
Rcmd
SelfLoader
SubOpt
Inherit
Exporter
Synopsis
Description
    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to pattern searches
    for oligonucleotides.
Methods
markovDescriptionCode
signatureDescriptionCode
Methods description
markovcode    nextTop
  Name: markov   -   calculate O/E values of oligonucleotides with Markov statistics

  Description:
    This program calculates the Markov probability for all oligomers of specified length.
    Returned value is a reference to a hash with keys as oligomer 
    sequences, and valus as the O/E values at (length - 2) order Markov.
    
  Usage:
    $ref_hash = &markov(pointer G instance);  

  Options:
   -length    length of oligomer to analyze (default:6)
   -mincount  minimum number of oligomer count to report (default:10)
   -filename  output filename (default:'markov.csv')
   -output    "f" for file output, "stdout" for STDOUT output

  Author: 
   Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20010830-01 initial posting
signaturecodeprevnextTop
  Name: signature   -   calculate oligonucleotide usage (genomic signature)

  Description:
   This program calculates short oligonucleotide usage (genomic signature),
   defined as the ratio of observed (O) to expected (E) oligonucleotide frequencies.
   O/E value of dinucleotide CG will be accessible at 
   $gb->{signature}->{'cg'}.
    
  Usage: 
   NULL = &signature(G instance);

  Options:
   -output     output option (default: 'stdout')
               'stdout' for standard output,
               'f' for file output
   -filename   output filename (default: 'signature.csv')
   -wordlength word length (default: 2)
   -bothstrand use direct (0) or both (1) strand (default: 1)
   -oe         use observed (0) or O/E (1) value (default: 1)
   -seq        sequence (default: '')
               '' when using whole genome sequence; i.e. -seq=>$gb->{SEQ}
   -memo       memo (default: '')
   -header     include (1) or exclude (0) variable names (default: 1)

  Author: 
   Haruo Suzuki (haruo@g-language.org)
History: 20120617 added -header option 20110222 added -bothstrand, -oe, -seq, and -memo options 20101010 initial posting References: Campbell A et al. (1999) Proc Natl Acad Sci U S A. 96(16):9184-9. Karlin S. (2001) Trends Microbiol. 9(7):335-43. Requirements: none.
Methods code
markovdescriptionprevnextTop
sub markov {
    &opt_default(length=>6, mincount=>10, filename=>"markov.csv",output=>"stdout");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $filename = opt_val("filename");
    my @aSortedTable = ( );
    my $iTotalNucs = 0;
    my @ahNucsTable = ( );
    my %oe;

    for (my $iCounter = 0; $iCounter <= 32; $iCounter++) {
	$ahNucsTable[$iCounter] = { };
    }

    my $rhTmp;
    foreach $rhTmp (@ahNucsTable) { undef %$rhTmp; }
    $iTotalNucs = 0;

    my($nucs, $char);
    $nucs = '';
    foreach $char (split(//, $gb->{SEQ})) {
	$iTotalNucs++;
	$nucs .= $char;
	if (opt_val("length") < $iTotalNucs) {
	    substr($nucs, 0, 1) = '';
	}
	;# Now $nucs contains tail of sequence.
my $iLoopEnd = opt_val("length"); if ($iTotalNucs < $iLoopEnd) { $iLoopEnd = $iTotalNucs; } my $iLen; for ($iLen = 1; $iLen <= $iLoopEnd; $iLen++) { $ahNucsTable[$iLen - 1]->{substr($nucs, -$iLen, $iLen)}++; } } { my @aTmpTable1 = ( ); my @aTmpTable2 = ( ); my @aTmpTable3 = ( ); my $sKey; foreach $sKey (keys(%{$ahNucsTable[opt_val("length") - 1]})) { my $iTmp = $ahNucsTable[opt_val("length") - 1]->{$sKey}; if (opt_val("mincount") <= $iTmp) { my $sTmp = sprintf("%08d %s", $iTmp, $sKey); if ($iTmp == 1) { push(@aTmpTable1, $sTmp); } elsif ($iTmp == 2) { push(@aTmpTable2, $sTmp); } else { push(@aTmpTable3, $sTmp); } } } @aSortedTable = sort {$b cmp $a;} @aTmpTable3; push(@aSortedTable, @aTmpTable2); push(@aSortedTable, @aTmpTable1); } if (opt_val("output") eq "f"){ mkdir ('data', 0777); open(TABLEFILE, '>data/' . $filename) || die; print TABLEFILE "oligomer,O-value,E-value,"; my $i; for ($i = 1; $i <= opt_val("length") - 2; $i ++){ printf TABLEFILE "%d degree Markov,", $i; } print TABLEFILE "O/E value\n"; } foreach my $sRecord (@aSortedTable) { my($iOVal, $sKey) = split(' ', $sRecord); my $klen = length($sKey); $iOVal =~ s/^0+//; my ($order, $iEVal); if (opt_val("output") eq "f"){ printf TABLEFILE "%s,%d,", $sKey, $iOVal; }elsif(opt_val("output") eq "stdout"){ &msg_send(sprintf("%s %5d", $sKey, $iOVal)); } if (opt_val("length") == 1){ if (opt_val("output") eq "f"){ printf TABLEFILE "\n"; }elsif(opt_val("output") eq "stdout"){ &msg_send("\n"); } }else{ for ($order = 0; $order <= opt_val("length") - 2; $order++) { my $numerator = $iTotalNucs + 1 - $klen; my $denominator = 1.0; my $offset; for ($offset = 0; $offset <= $klen - $order - 1; $offset++) { my $key = substr($sKey, $offset, $order + 1); my $len = length($key); $numerator *= $ahNucsTable[$len - 1]->{$key} /
(
$iTotalNucs + 1 - $len);
} if (1 <= $order) { for ($offset = 1; $offset <= $klen - $order - 1; $offset++) { my $key = substr($sKey, $offset, $order); my $len = length($key); $denominator *= $ahNucsTable[$len - 1]->{$key} /
(
$iTotalNucs + 1 - $len);
} } else { $denominator = 1.0; } if ($denominator <= 0.0) { $iEVal = 0.0; } else { $iEVal = $numerator / $denominator;
} if (opt_val("output") eq "f"){ printf TABLEFILE "%d,", $iEVal if (opt_val("output") eq "f"); }elsif(opt_val("output") eq "stdout"){ &msg_send(sprintf(" %8d", $iEVal)); } } if (opt_val("output") eq "f"){ printf TABLEFILE "%.4f\n", $iOVal/$iEVal;
}elsif(opt_val("output") eq "stdout"){ &msg_send(sprintf(" %3.4f\n", $iOVal/$iEVal));
} $oe{$sKey} = $iOVal/$iEVal;
} } close(TABLEFILE) if (opt_val("output") eq "f"); return\% oe;
}
signaturedescriptionprevnextTop
sub signature {
    &opt_default(output=>'stdout', filename=>'signature.csv', wordlength=>2, bothstrand=>1, oe=>1, seq=>'', memo=>'', header=>1);
    my @args       = opt_get(@_);
    my $gb         = opt_as_gb(shift @args);
    my $output     = opt_val("output");
    my $filename   = opt_val("filename");
    my $wordlength = opt_val("wordlength");
    my $bothstrand = opt_val("bothstrand");
    my $oe         = opt_val("oe");
    my $seq        = opt_val("seq");
    my $memo       = opt_val("memo");
    my $header     = opt_val("header");

    my ($total, %mono, %obs, %exp, %val);

    $seq = $gb->{SEQ} unless($seq);
    $seq .= ' '.complement($seq) if($bothstrand);

    $total = $seq =~ tr/acgt/acgt/;
    unless($total){
	carp("No regular nucleotide found in this sequence.");
	return;
    }
    $mono{a} = $seq =~ tr/a/a/;
    $mono{c} = $seq =~ tr/c/c/;
    $mono{g} = $seq =~ tr/g/g/;
    $mono{t} = $seq =~ tr/t/t/;
    foreach (keys %mono){ $mono{$_} =  $mono{$_} / $total; }

for(my
$i = 0; $i < length($seq) - $wordlength + 1; $i ++){
$obs{substr($seq, $i, $wordlength)} ++;
} $total = 0; foreach (keys %obs){ delete $obs{$_} if($_ =~ /[^acgt]/); $total += $obs{$_}; } foreach (keys %obs){ $exp{$_} = 1; for(my $i = 0; $i < $wordlength; $i ++){ $exp{$_} *= $mono{substr($_,$i,1)}; } if($oe){ $val{$_} = sprintf("%.3f", ($obs{$_}/$total)/$exp{$_}); } else { $val{$_} = sprintf("%.3f", $obs{$_}/$total); }
}

if(0){
print "mono=",
%mono, "\n";
print "mono.total = [$total]\n"; print "obs=", %obs, "\n"; print "oligo.total = [$total]\n"; } my @allkey = qw(a c g t); for(1..($wordlength-1)){ my @tmp = (); for my $base (qw(a c g t)){ foreach my $key (@allkey){ push(@tmp, $key.$base); } } @allkey = @tmp; } @allkey = sort @allkey; #####
if(0){ my %hashslice; @hashslice{@allkey} = undef; foreach my $key (sort keys %hashslice){ my $key2 = reverse($key); $key2 =~ tr [atgc][tacg];
if($key ne $key2 && exists $hashslice{$key}){ $hashslice{"$key/$key2"} = undef; delete $hashslice{$key}; delete $hashslice{$key2}; $val{"$key/$key2"} = sprintf("%.3f", ($val{$key} + $val{$key2})/2 );
delete $val{$key}; delete $val{$key2}; } } @allkey = sort keys %hashslice; } #####
if($output =~ /stdout/){ foreach(@allkey){ msg_send("$_\t"); } msg_send("memo\n"); foreach(@allkey){ msg_send("$val{$_}\t"); } msg_send("$memo\n"); } if($output eq 'f'){ mkdir ("data", 0777); open(OUT, ">>data/$filename"); if($header){ print OUT join(',', @allkey), ",memo\n"; } my @tmp; foreach (@allkey){ push(@tmp, $val{$_} || 0); } print OUT join(',', @tmp), ",$memo\n"; close(OUT); } $gb->{signature} =\% val; # input in the G instance
return\% val; } =head2 signature_dist Name: signature_dist - calculate genomic signature difference Description: This program computes and returns the dissimilarity (distance) and similarity (P-value) in genomic signature between plasmid and chromosome. Usage: ($distance, $p_value) = &signature_dist($gb_plasmid, $gb_chromosome); Options: -output output option (default: 'stdout') -wordlength word length (default: 2) -window size (bp) of sliding window by which sequences are sampled (default: 5000) -step step size (bp) of sliding window (default: 5000) -method the distance measure to be used. (default: 'mahalanobis') This must be one of "mahalanobis", "delta", "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski". Examples: &signature_dist($gb_plasmid, $gb_chromosome, -wordlength=>2, -window=>50000, -step=>50000, -method=>'delta'); calculates the average absolute dinucleotide relative abundance difference (delta-distance) between a plasmid and non-overlapping 50-kb chromosomal segments (Campbell A et al., 1999). Author: Haruo Suzuki (haruo@g-language.org) History: 20120617 use File::Temp; 20101010 initial posting References: Campbell A et al. (1999) Proc Natl Acad Sci U S A. 96(16):9184-9. Suzuki H et al. (2008) Nucleic Acids Res. 36(22):e147. Suzuki H et al. (2010) J Bacteriol. 192(22):6045-55. Requirements: signature() =cut sub signature_dist { &opt_default(output=>'stdout', wordlength=>2, window=>5000, step=>5000, method=>'mahalanobis'); my @args = opt_get(@_); my $gb1 = shift @args; my $gb2 = shift @args; unless(defined($gb1) && defined($gb2)){ msg_error("At least two different genomes should be provided as follows. &signature_dist(\$gb_plasmid,\$ gb_chromosome);\n"); return; } my $query = opt_as_gb($gb1); my $sbjct = opt_as_gb($gb2); my $output = opt_val("output"); my $wordlength = opt_val("wordlength"); my $window = opt_val("window"); my $step = opt_val("step"); my $method = opt_val("method"); my $fh_query = File::Temp->new; my $fname_query = $fh_query->filename; my $fh_sbjct = File::Temp->new; my $fname_sbjct = $fh_sbjct->filename; my $val = &signature($query, -output=>"n", -filename=>$fname_query, -wordlength=>$wordlength); my @allkey = sort keys %$val; print $fh_query join(',',@allkey),"\n"; my @tmp; foreach (@allkey){ push(@tmp, $$val{$_} || 0); } print $fh_query join(',', @tmp), "\n"; print $fh_sbjct join(',',@allkey),"\n"; for(my $i = 1; $i <= length($sbjct->{SEQ}) - $window; $i += $step){ my $start = $i; my $end = $i + $window - 1; my $seq = $sbjct->get_gbkseq($start, $end); my $val = &signature($sbjct, -output=>"n", -filename=>$fname_sbjct, -wordlength=>$wordlength, -seq=>$seq); my @tmp; foreach (@allkey){ push(@tmp, $$val{$_} || 0); } print $fh_sbjct join(',', @tmp), "\n"; } my $rcmd = new Rcmd; my @result = $rcmd->exec( qq|| method = "$method" X = read.csv("$fname_query") Y = read.csv("$fname_sbjct") if(method == "mahalanobis"){ library(MASS); dX = try(mahalanobis(X, apply(Y,2,mean), ginv(var(Y)), inverted=TRUE), silent=TRUE); dY = try(mahalanobis(Y, apply(Y,2,mean), ginv(var(Y)), inverted=TRUE), silent=TRUE); } else if(method == "delta"){ dX = dist(rbind(apply(Y,2,mean),X), method="manhattan") * ncol(Y)^-1 * 1000; dY = dist(rbind(apply(Y,2,mean),Y), method="manhattan")[1:nrow(Y)] * ncol(Y)^-1 * 1000; } else { dX = dist(rbind(apply(Y,2,mean),X), method=method); dY = dist(rbind(apply(Y,2,mean),Y), method=method)[1:nrow(Y)]; } dX = ifelse(is.numeric(dX), dX, NA) D = dX P = ifelse(is.numeric(dY), mean(dY > dX), NA) as.vector(c(D, P)) | ); #print "@result \n";
my $distance = sprintf "%.4f", shift @result; my $p_value = sprintf "%.4f", shift @result; &msg_send("\n$wordlength-mer genomic signature difference:\n $method distance = $distance\n p-value = $p_value\n") if ($output eq 'stdout'); return ($distance, $p_value); } =head2 palindrome Name: palindrome - searches palindrome sequences Description: Searches palindrome sequences Usage: palindrome(sequence); Options: -shortest shortest palindrome to search (default:4) -loop longest stem loop to allow (default: 0) -gtmatch if 1, allows g-t match (default: 0) -output "f" for file output Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20090312-01 fixed bug in default option 20010829-01 initial posting =cut sub palindrome { &opt_default(gtmatch=>0, loop=>0, shortest=>4, output=>"stdout", filename=>"palindrome.csv"); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $length = int(opt_val("shortest") / 2);
my $output = opt_val("output"); my $filename = opt_val("filename"); my %palindrome; my $i = $length - 1; my ($len, $j, $k, $stem); if (opt_val("output") eq "f"){ open(OUT, '>' . $filename) || &msg_error("G::Seq::PatSearch::palindrome() $! $filename"); print OUT "Length, start, end, sequence\n"; } while($i <= length($gb->{SEQ}) - 1 - $length - opt_val("loop")){ $stem = opt_val("loop"); while($stem >= 0){ $j = $i; $k = $stem + 1 + $i; $len = 0; last if ($k > length($gb->{SEQ}) - 1); while(&baseParingTest(substr($gb->{SEQ}, $j, 1), substr($gb->{SEQ}, $k, 1), &opt_val("gtmatch")) ) { $j --; $k ++; last if ($j < 0 || $k > length($gb->{SEQ}) - 1); $len += 2; } if ($len >= opt_val("shortest")){ &msg_send(sprintf("Length: %2d Position: %7d %7d Sequence: %s %s %s\n", $len, $j + 1, $k - 2, substr($gb->{SEQ}, $j + 1, $len/2),
substr(
$gb->{SEQ}, $j + 1 + $len/2, $stem), substr($gb->{SEQ}, $j + 1 + $len/2 + $stem, $len/2))) if ($output eq 'stdout'); if ($output eq "f"){ printf OUT "%d,%d,%d,%s %s %s\n", $len, $j + 1, $k - 2, substr($gb->{SEQ}, $j + 1, $len/2),
substr(
$gb->{SEQ}, $j + 1 + $len/2, $stem), substr($gb->{SEQ}, $j + 1 + $len/2 + $stem, $len/2); } $palindrome{$j + 1} = sprintf("%s %s %s", substr($gb->{SEQ}, $j + 1, $len/2),
substr(
$gb->{SEQ}, $j + 1 + $len/2, $stem), substr($gb->{SEQ}, $j + 1 + $len/2 + $stem, $len/2) ); } $stem --; } $i ++; } close(OUT) if ($output eq "f"); return\% palindrome; } =head2 find_dif Name: find_dif - finds dif sequence (chromosome partitioning site recognized by XerCD) Description: Finds dif sequence (chromosome partitioning site recognized by XerCD) in both strands. dif is a 28bp sequence element recognized by XerCD located near the replication terminus used for chromosome dimer resolution by recombination. For E. coli, 5'-GGTGCGCATAATGTATATTATGTTAAAT-3', (Blakely and Sherratt, 1994) for Proteobacteria, 5'-RNTKCGCATAATGTATATTATGTTAAAT-3', (Hendrickson and Lawrence, 2007) for B. subtilis, 5'-ACTTCCTAGAATATATATTATGTAAACT-3', (Sciochetti et al., 2001) for Firmicute, 5'-ACTKYSTAKAATRTATATTATGTWAACT-3', (Hendrickson and Lawrence, 2007) for Actinobacteria, 5'-TTSRCCGATAATVNACATTATGTCAAGT-3'. (Hendrickson and Lawrence, 2007) Usage: @position = find_dif($genome) Options: -type ecoli for E.coli dif (default) proteobacteria, bsub, firmicute, actinobacteria, for corresponding dif sequences. -output stdout to print data (default: stdout) References: 1. Hendrickson H, Lawrence JG (2007) "Mutational bias suggests that replication termination occurs near the dif site, not at Ter sites", Mol Microbiol. 64(1):42-56. 2. Blakely G, Sherratt D (1994) "Determinants of selectivity in Xer site-specific recombination" Genes Dev. 10:762-773. 3. Sciochetti SA, Piggot PJ, Blakely GW (2001) "Identification and characterization of the dif site from Bacillus subtilis." J Bacteriol. 183:1058-1068. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20081124-01 fixed a bug for handling -type option 20071028-01 fixed a bug that did not properly search the complement 20071023-01 updated to include dif consensus of other organisms 20060711-01 initial posting =cut sub find_dif { opt_default(-type=>'ecoli'); my @argv = opt_get(@_); my $type = lc opt_val('type'); my $dif = 'ggtgcgcataatgtatattatgttaaat'; $dif = 'rntkcgcataatgtatattatgttaaat' if ($type eq 'proteobacteria'); $dif = 'acttcctagaatatatattatgtaaact' if ($type eq 'bsub'); $dif = 'actkystakaatrtatattatgtwaact' if ($type eq 'firmicute'); $dif = 'ttsrccgataatvnacattatgtccagt' if ($type eq 'actinobacteria'); return find_pattern(@_, $dif); } =head2 find_ter Name: find_ter - finds ter sequence (replication termination site) Description: Finds ter sequence (replication termination site, recognized by Ter protein) in both strands. For E. coli, 5'-AGNATGTTGTAAYKAA-3', (Coskun-Ari and Hill, 1997) for B. subtilis, 5'-KMACTAANWNNWCTATGTACYAAATNTTC-3', (Wake, 1997) Note that E.coli Ter consensus allows substitutions at bases 1, 4, and 16, that are NOT considered in this method. Usage: @position = find_ter($sequence) Options: -type ecoli for E.coli ter (default) bsub, for corresponding ter sequence. -output stdout to print data (default: stdout) References: 1. Hendrickson H, Lawrence JG (2007) "Mutational bias suggests that replication termination occurs near the dif site, not at Ter sites", Mol Microbiol. 64(1):42-56 2. Coskun-Ari, FF, Hill TM (1997) "Sequence-specific interactions in the Tus-Ter complex and the effect of base pair substitutions on arrest of DNA replication in Escherichia coli", J Biol Chem. 272:26448-26456. 3. Wake RG (1997) "Replication fork arrest and termination of chromosome replication in Bacillus subtilis", FEMS Microbiol Lett. 153:24-56. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20071028-01 fixed a bug that did not properly search the complement 20071022-01 initial posting =cut sub find_ter { opt_default(-type=>'ecoli'); my @argv = opt_get(@_); my $type = lc opt_val('ecoli'); my $ter = 'agnatgttgtaaykaa'; $ter = 'kmactaanwnnwctatgtacyaaatnttc' if ($type eq 'bsub'); return find_pattern(@_, $ter); } =head2 find_dnaAbox Name: find_dnaAbox - finds dnaA box in both strands Description: Finds dnaA box(TT A/T TNCACA) in both strands.

Usage:
@positions = find_dnaAbox($genome)

Options:
-output stdout to print data (default: stdout)

References:
1. Schaper S, Messer W (1995) "Interaction of the initiator protein DnaA of
Escherichia coli with its DNA target", J Biol Chem, 270(29):17622-17626

Author:
Kazuharu Arakawa (gaou
@sfc.keio.ac.jp)

History:
20071028-01 fixed a bug that did not properly search the complement
20071022-01 updated the code to use oligomer_search()
20021125-01 initial posting

=cut

sub find_dnaAbox {
return find_pattern(
@_, "ttwtncaca");
} =head2 find_iteron Name: find_iteron - finds iteron in both strands Description: Finds iteron (TGAGGG G/A C/T) in both strands. Usage: @positions = find_iteron($genome) Options: -output stdout to print data (default: stdout) References: 1. Haines AS, Akhtar P, Stephens ER, Jones K, Thomas CM, Perkins CD, Williams JR, Day MJ, Fry JC (2006) "Plasmids from freshwater environments capable of IncQ retrotransfer are diverse and include pQKH54, a new IncP-1 subgroup archetype.", Microbiology, 152(Pt 9):2689-2701 Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20071107-01 initial posting =cut sub find_iteron { return find_pattern(@_, "tgaggry"); } =head2 find_pattern Name: find_pattern - finds oligomer pattern in both strands Description: Finds given oligomer pattern specified in degenerate nucleotide code in both strands. This method serves as the basic function for other find_* methods. Usage: @positions = find_pattern($genome, "pattern") Options: -output stdout to print data (default: stdout) Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20071107-01 initial posting =cut sub find_pattern { opt_default('output'=>'stdout'); my @argv = opt_get(@_); my $gb = opt_as_gb(shift @argv); my $pattern = shift @argv; my $output = opt_val("output"); my %data = (oligomer_search($gb, $pattern, -return=>"both"), oligomer_search($gb, complement($pattern), -return=>"both")); if($output eq 'stdout'){ foreach my $pos (sort {$a<=>$b}keys %data){ msg_send(sprintf "%d %s\n", $pos, $data{$pos}); } } return sort keys %data; } =head2 oligomer_counter Name: oligomer_counter - counts the number of given oligomers in a sequence Description: Counts the number of oligomers in a sequence (by windows optionally). Oligomer can be specified using degenerate nucleotide alphabet, or by regular expressions. Usage: $count = oligomer_counter($genome, $oligomer); or %octamers = oligomer_counter($genome, -length=>8); Options: -window int window size. If specified, seeks oligomer in specified windows Method returns an array of numbers at each windows If not specified, seeks oligomer in the genome Method returns the number of oligomers -output "f" for file output, "g" for graph output, "show" to display the graph. Only available when -window option is specified -length If specified, returns a hash containing number counts for all n-mers. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) -based on atg7.wind + gcwind [rsaito] History: 20071107-01 added -length option 20071022-01 oligomer can be now degenerate nucleotide code or regular expressions 20010829-01 initial posting =cut sub oligomer_counter { opt_default("window"=>0); my @args = opt_get(@_); my $gb = opt_as_gb(shift @args); my $seq = shift @args; my $window = opt_val("window"); my $output = opt_val("output"); my $length = opt_val("length"); $window = length($gb->{SEQ}) if($window <= 0); if (opt_val("window")){ die("Error in oligomer_counter: oligomer not specified.") unless(length($seq)); open(OUT, '>oligo_count.csv') || &msg_error($!) if ($output eq "f"); my (@wincount, @winnum); for (my $i = 0; $i <= length($gb->{SEQ}) - $window; $i += $window){ my $partial = substr($gb->{SEQ}, $i, $window); my $count = 0; if (length($seq) == 1 && $seq =~ /[atgc]/){ $count = $partial =~ tr/a/a/ if ($seq eq 'a'); $count = $partial =~ tr/t/t/ if ($seq eq 't'); $count = $partial =~ tr/g/g/ if ($seq eq 'g'); $count = $partial =~ tr/c/c/ if ($seq eq 'c'); }else{ $count = scalar oligomer_search($partial, $seq); } push (@wincount, $count); push (@winnum, $i); printf OUT "%d,%d\n", $i, $count if ($output eq "f"); } close(OUT) if ($output eq 'f'); if ($output eq 'g' || $output eq 'show'){ grapher(\@winnum,\@ wincount, -x=>'window(bp)', -y=>'number of oligomer', -title=>'oligomer by window', -outfile=>'oligo_count.png', -output=>$output ); } return (@wincount); }elsif($length){ my %oligo; if($length == 1){ $oligo{'a'} = $gb->{SEQ} =~ tr/a/a/; $oligo{'t'} = $gb->{SEQ} =~ tr/t/t/; $oligo{'g'} = $gb->{SEQ} =~ tr/g/g/; $oligo{'c'} = $gb->{SEQ} =~ tr/c/c/; }else{ for(my $i = 0; $i <= length($gb->{SEQ}) - $length; $i ++){ $oligo{substr($gb->{SEQ}, $i, $length)} ++; } } return %oligo; }else{ die("Error in oligomer_counter: oligomer not specified.") unless(length($seq)); return scalar oligomer_search($gb, $seq); } } =head2 oligomer_search Name: oligomer_search - searches oligomers in given sequence Description: Searches for the given oligomer in given sequence. Oligomer can be specified using degenerate nucleotide alphabet, or by regular expressions. Performance is optimized for fast searching. This method changes the returning value according to the given options. Usage: @positions = oligomer_search($genome, $oligomer); @oligomers = oligomer_search($genome, $oligomer, -return=>"oligo"); %positions_to_oligomers = oligomer_search($genome, $oligomer, -return=>"both"); ($number_direct, $number_complement, $number_total, $ratio_direct) = oligomer_search($genome, $oligomer, -return=>"distribution"); $oligomer can be degenerate nucleotide alphabet or regular expressions. ex: "grtggngg" (degenerate code), or "g[ag]tgg[a-z]gg" (regular expression) Options: -return "position" to return list of positions where oligomers are found (default), "oligo" to return list of oligomers found ordered by positions, "both" to return a hash with positions as keys and oligomers as values, "distribution" to return four values (see above) about the distribution of given oligomer. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20071022-01 initial posting =cut sub oligomer_search{ opt_default(return=>"position"); my @argv = opt_get(@_); my $gb = opt_as_gb(shift @argv); my $oligo = lc(shift @argv); my $return = opt_val("return"); if($oligo !~ /[^atgc]/ && $return eq 'position'){ my $start = 0; my @result; while(0 <= ($start = index($gb->{SEQ}, $oligo, $start + 1))){ push(@result, $start); } return @result; }elsif($return eq 'distribution'){ my $direct = scalar oligomer_search($gb, $oligo); my $comp = scalar oligomer_search($gb, complement $oligo); return ($direct, $comp, $direct + $comp, $direct/($direct + $comp));
} unless($oligo =~ /[^a-z]/){ $oligo =~ s/r/[ag]/g; $oligo =~ s/k/[gt]/g; $oligo =~ s/s/[gc]/g; $oligo =~ s/y/[ct]/g; $oligo =~ s/m/[ac]/g; $oligo =~ s/w/[at]/g; $oligo =~ s/b/[gct]/g; $oligo =~ s/h/[act]/g; $oligo =~ s/n/[a-z]/g; $oligo =~ s/d/[agt]/g; $oligo =~ s/v/[acg]/g; } my @result; { no strict 'refs'; while($gb->{SEQ} =~ m/($oligo)/g){ if($return eq 'oligo'){ push(@result, ${1}); }elsif($return eq 'both'){ push(@result, $-[1], ${1}); }else{ push(@result, $-[1]); } } } return @result; } =head2 baseParingTest Name: baseParingTest - checks if the two bases forms a pair Description: Base pairing check. 1 if the two bases pair, and 0 if they do not pair. G-T match is also considered when third argument is given. Usage: boolean $match = match_test(char $first, char $second, boolean $gtmatch); Options: none Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20010829-01 initial posting =cut sub baseParingTest { my $first = lc(shift); my $second = lc(shift); my $gtmatch = shift; die("First two arguments must be single base (i.e. a, t, g, or c).\n") unless(length($first) == 1 && length($second) == 1); if ($first eq 'a' && $second eq 't' || $first eq 't' && $second eq 'a' || $first eq 'g' && $second eq 'c' || $first eq 'c' && $second eq 'g' || $first eq 't' && $second eq 'g' && $gtmatch || $first eq 'g' && $second eq 't' && $gtmatch ) { return 1; }else{ return 0; } } =head2 nucleotide_periodicity Name: nucleotide_periodicity - checks the periodicity of certain oligonucleotides Description: Checks the periodicity of certain nucleotide (best known with AA dinucleotide) Usage: array data = nucleotide_periodicity(sequence); Options: -nucleotide nucleotide to search (default:aa) -window window size to seek periodicity (default:50) -filename output filename (default:nucleotide_periodicity.png) -output "g" for graph file output only, "show" for graph file output and display. (default: show) ToDo: data output Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20090312-01 minor bug fix 20070206-01 initial posting =cut sub nucleotide_periodicity { opt_default("nucleotide"=>"aa", "window"=>50, "filename"=>"nucleotide_periodicity.png", "output"=>"show"); my @argv = opt_get(@_); my $gb = opt_as_gb(shift @argv); my $nuc = opt_val("nucleotide"); my $window = opt_val("window"); my $filename = opt_val("filename"); my $output = opt_val("output"); my @data = (); $data[$_] = 0 for (0..($window - 1)); my $start = -1; while(0 <= ($start = index($gb->{SEQ}, $nuc, $start + 1))){ my $innerPos = -1; my $localSeq = substr($gb->{SEQ}, $start + length($nuc), $window); while(0 <= ($innerPos = index($localSeq, $nuc, $innerPos + 1))){ $data[$innerPos]++; } } if($output eq 'f'){ $filename = 'nucleotide_periodicity.csv' if ($filename eq 'nucleotide_periodicity.png'); mkdir ("data", 0777); open(OUT, ">>data/$filename"); print OUT "window,value\n"; for my $win (0..($window-1)){ print OUT $win, ',', $data[$win], "\n"; } close(OUT); }else{ _UniMultiGrapher([0..($window - 1)],\@ data, -filename=>$filename, -title=>"Periodicity of $nuc", -x=>"position", -y=>"count"); msg_gimv("graph/$filename") if ($output eq 'show'); } return @data; } =head2 kmer_table Name: kmer_table - create an image showing all k-mer abundance within a sequence Description: This program creates an image showing the abundance of all k-mers (oligonucleotides of length k) in a given sequence. For example, for tetramers (k=4), resulting image is composed of 4^4 = 256 boxes, each representing an oligomer. Oligomer name and abundance is written within these boxes, and abundance is also visualized with the box color, from white (none) to black (highly frequent). This k-mer table is alternatively known as the FCGR (frequency matrices extracted from Chaos Game Representation). Position of the oligomers can be recursively located as follows: For each letter in an oligomer, a box is subdivided into four quadrants, where A is upper left, T is lower right, G is upper right, and C is lower left. Therefore, oligomer ATGC is in the A = upper left quadrant T = lower right within the above quadrant G = upper right within the above quadrant C = lower left within the above quadrant With Google Maps representation using -gmap=>1 option, oligomers can be searched incrementally from the top search box. More detailed documentation is available at http://www.g-language.org/wiki/cgr Usage: null = kmer_table($sequence); Options: -k length of oligomer (default: 6) -filename filename for output image (default: kmer.png) -level zoom level for Google Maps representation (default: 5) -gmap set to 1 to use Google Maps API to create zoomable image (default: 0) -output "g" for image output, "show" for image output and display, "gmap" for zoomable image using Google Maps API (default: show) References: 1. Arakawa K, Oshita K, Tomita M (2009) "A web server for interactive and zoomable Chaos Game Representation images", Source Code for Biol Med., 4:6. 2. Almeida JS, Carrico JA, Maretzek A, Noble PA, Fletcher M (2001) "Analysis of genomic sequences by Chaos Game Representation." Bioinformatics. 17(5):429-437. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20090716-01 initial posting =cut sub kmer_table { require GD; opt_default('output'=>'show', 'k'=>6, 'filename'=>'kmer.png', 'gmap'=>0, 'level'=>5); my @argv = opt_get(@_); my $gb = opt_as_gb(shift @argv); my $filename = opt_val('filename'); my $output = opt_val('output'); my $oligo_num = opt_val('k'); my $gmap = opt_val('gmap'); my $level = opt_val('level'); my $apikey = opt_val('apikey'); $output = 'gmap' if($gmap); $gmap = 1 if($output eq 'gmap'); my $font; $font->{a} = [ [2,1], [1,2], [3,2], [1,3],[2,3],[3,3], [1,4], [3,4] ]; $font->{t} = [ [1,1],[2,1],[3,1], [2,2], [2,3], [2,4], ]; $font->{g} = [ [1,1],[2,1],[3,1], [1,2], [1,3], [3,3], [1,4],[2,4],[3,4] ]; $font->{c} = [ [2,1],[3,1], [1,2], [1,3], [2,4],[3,4] ]; $font->{k} = [ [1,1], [3,1], [1,2],[2,2], [1,3], [3,3], [1,4], [3,4] ]; $font->{m} = [
[1,1], [5,1],
[1,2],[2,2], [4,2],[5,2],
[1,3], [3,3], [5,3],
[1,4], [5,4]
];
$font->{null} = [ [1,1],[2,1],[3,1], [1,2],[2,2],[3,2], [1,3],[2,3],[3,3], [1,4],[2,4],[3,4] ]; $font->{0} = [ [1,1],[2,1],[3,1], [1,2], [3,2], [1,3], [3,3], [1,4], [3,4], [1,5],[2,5],[3,5] ]; $font->{1} = [ [1,1],[2,1], [2,2], [2,3], [2,4], [1,5],[2,5],[3,5] ]; $font->{2} = [ [1,1],[2,1],[3,1], [3,2], [1,3],[2,3],[3,3], [1,4], [1,5],[2,5],[3,5] ]; $font->{3} = [ [1,1],[2,1],[3,1], [3,2], [1,3],[2,3],[3,3], [3,4], [1,5],[2,5],[3,5] ]; $font->{4} = [ [1,1], [3,1], [1,2], [3,2], [1,3],[2,3],[3,3], [3,4], [3,5] ]; $font->{5} = [ [1,1],[2,1],[3,1], [1,2], [1,3],[2,3],[3,3], [3,4], [1,5],[2,5],[3,5] ]; $font->{6} = [ [1,1],[2,1],[3,1], [1,2], [1,3],[2,3],[3,3], [1,4], [3,4], [1,5],[2,5],[3,5] ]; $font->{7} = [ [1,1],[2,1],[3,1], [1,2], [3,2], [3,3], [3,4], [3,5] ]; $font->{8} = [ [1,1],[2,1],[3,1], [1,2], [3,2], [1,3],[2,3],[3,3], [1,4], [3,4], [1,5],[2,5],[3,5] ]; $font->{9} = [ [1,1],[2,1],[3,1], [1,2], [3,2], [1,3],[2,3],[3,3], [3,4], [1,5],[2,5],[3,5] ]; my $separate = $oligo_num * 4; $separate = 32 if($separate < 32); my %colorTable = oligomer_counter($gb,-length=>$oligo_num); for (my $i=0;$i<4**$oligo_num;$i++) { my $num = $i; my @seq = (); for (1 .. $oligo_num) { my $tmp = int($num/(4**($oligo_num - $_)));
$num = $num%(4**($oligo_num - $_)); if ($tmp == 0) { $seq[$_] = 'a'; } elsif ($tmp == 1) { $seq[$_] = 't'; } elsif ($tmp == 2) { $seq[$_] = 'g'; } elsif ($tmp == 3) { $seq[$_] = 'c'; } else { $seq[$_] = $tmp; } } my $seq = join('',@seq); if ($colorTable{$seq} eq '') { $colorTable{$seq} = 0; } } my $max = max(values %colorTable); my $wide = 2 ** $oligo_num * $separate; my $img = GD::Image->new($wide, $wide); my @color; for (0..25) { my $val = $_ * 10 + 5; $color[$_] = $img->colorAllocate($val, $val, $val); } $img->fill($wide - 1, $wide - 1, $color[25]); for my $seq (keys(%colorTable)) { my $up_down = $seq; $up_down =~ tr/acgt/0011/; my $r_l = $seq; $r_l =~ tr/agct/0011/; my $x = oct("0b" . $up_down) * $separate; my $y = oct("0b" . $r_l) * $separate; my $contrast = int((255 - 255 * ($colorTable{$seq} / $max))/10); $contrast = 25 if ($colorTable{$seq} == 0); my $color = $color[$contrast]; $img->rectangle($x, $y, $x + $separate-1, $y + $separate-1, $color); $img->fill($x+1, $y+1, $color); my $base_color; if ($contrast < 14){# $max/2 < $colorTable{$seq}) {
$base_color = $color[20]; } else { $base_color = $color[7]; } my $i = 0; for my $tmp (split(//, $seq)){ $img->setPixel($x + $font->{$tmp}[$_][0] + $i * 4, $y + 3 + $font->{$tmp}[$_][1], $base_color) for(0..scalar(@{$font->{$tmp}})-1); $i ++; } my $num = $colorTable{$seq}; if ($oligo_num <= 6 && length($num) - 8 > 5 || $oligo_num > 4 && length($num) - $oligo_num > 5){ substr($num, -9) = 'g'; }elsif($oligo_num <= 6 && length($num) - 8 > 2 || $oligo_num > 4 && length($num) - $oligo_num > 2){ substr($num, -6) = 'm'; }elsif($oligo_num <= 6 && length($num) - 8 > 0 || $oligo_num > 4 && length($num) - $oligo_num > 0){ substr($num, -3) = 'k'; } $i = 0; for my $tmp (split(//, $num)){ $img->setPixel($x + $font->{$tmp}[$_][0] + $i * 4, $y + 10 + $font->{$tmp}[$_][1], $base_color) for(0..scalar(@{$font->{$tmp}})-1); $i ++; } } binmode STDOUT; open FILE,">graph/$filename"; print FILE $img->png; close FILE; msg_gimv("graph/$filename") if ($output eq 'show'); if($gmap){ generateGMap("graph/$filename", -cgr=>1, -level=>$level, -apikey=>$apikey); } } =head2 cgr Name: cgr - create a Chaos Game Representation of a given sequence Description: This program creates a Chaos Game Representation (CGR) image of a given sequence. CGR is generated by the following procedure: 1. Start from position (0,0) or the origin of two dimensional coordinate. Four nucleotides are located at the four corners: A: (-1, 1) upper left T: (1, -1) lower right G: (1, 1) upper right C: (-1, -1) lower left 2. For each nucleotide, move and mark the new location which is halfway between the current location and the nucleotide. For example, if the first letter is A, position is moved from (0,0) to midpoint between (-1, 1) and (0,0), which is (-0.5, 0.5). 3. Repeat this procedure for all nucleotides. CGR is a generalized scale-independent Markov probability table for the sequence, and oligomer tables (see document for "kmer_table" function) can be deduced from CGR image. With Google Maps representation using -gmap=>1 option, oligomers can be searched incrementally from the top search box. More detailed documentation is available at http://www.g-language.org/wiki/cgr Usage: null = cgr($sequence); Options: -width width of image (default: 1024) -filename filename for output image (default: cgr.png) -level zoom level for Google Maps representation (default: 5) -gmap set to 1 to use Google Maps API to create zoomable image (default: 0) -output "g" for image output, "show" for image output and display, "gmap" for zoomable image using Google Maps API (default: show) References: 1. Arakawa K, Oshita K, Tomita M (2009) "A web server for interactive and zoomable Chaos Game Representation images", Source Code for Biol Med., 4:6. 2. Almeida JS, Carrico JA, Maretzek A, Noble PA, Fletcher M (2001) "Analysis of genomic sequences by Chaos Game Representation." Bioinformatics. 17(5):429-437. Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp) History: 20090716-01 initial posting =cut sub cgr { require GD; opt_default('output'=>'show', 'width'=>1024, 'filename'=>'cgr.png', 'gmap'=>0, 'level'=>5); my @argv = opt_get(@_); my $gb = opt_as_gb(shift @argv); my $output = opt_val('output'); my $filename = opt_val('filename'); my $wide = opt_val('width'); my $gmap = opt_val('gmap'); my $level = opt_val('level'); my $apikey = opt_val('apikey'); $output = 'gmap' if($gmap); $gmap = 1 if($output eq 'gmap'); $wide = 2 ** ($level - 1) * 256 if($gmap); my $adjust = int(sqrt($wide)) - 8; $adjust = 1 if($adjust < 1); my @pixel; for my $x (0..$wide-1) { for my $y (0..$wide-1) { $pixel[$x][$y] = 0; } } my $img = GD::Image->new($wide,$wide); my @color; for (0..10){ my $val = $_ * 25 + 5; $color[$_] = $img->colorAllocate($val, $val, $val); } my @pos = (int(($wide-1)/2), int(($wide-1)/2)); my $max = 0; for my $base (reverse(split(//, lc($gb->{SEQ})))){ if ($base eq 'a') { $pos[0] = int($pos[0]/2);
$pos[1] = int($pos[1]/2);
} elsif ($base eq 't') { $pos[0] = int(($pos[0]+$wide-1)/2);
$pos[1] = int(($pos[1]+$wide-1)/2);
} elsif ($base eq 'g') { $pos[0] = int(($pos[0]+$wide-1)/2);
$pos[1] = int($pos[1]/2);
} elsif ($base eq 'c') { $pos[0] = int($pos[0]/2);
$pos[1] = int(($pos[1]+$wide-1)/2);
} $pixel[$pos[0]][$pos[1]]++; $max = $pixel[$pos[0]][$pos[1]] if ($max < $pixel[$pos[0]][$pos[1]]); } for my $x (0..$wide-1) { for my $y (0..$wide-1) { my $colorcode = int(10-$pixel[$x][$y]/$max*$adjust*10);
$colorcode = 0 if ($colorcode < 0); $img->setPixel($x, $y, $color[$colorcode]); } } binmode STDOUT; open FILE,">graph/$filename"; print FILE $img->png; close FILE; msg_gimv("graph/$filename") if ($output eq 'show'); if($gmap){ generateGMap("graph/$filename", -cgr=>1, -level=>$level, -apikey=>$apikey); } } 1;
}
General documentation
AUTHORTop
    
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)