G::Seq PatSearch
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Summary
  G::Seq::PatSearch - component of G-language Genome Analysis Environment
Package variables
No package variables defined.
Included modules
G::Messenger
G::Seq::Primitive
G::Tools::Graph
SelfLoader
SubOpt
Inherit
Exporter
Synopsis
Description
    This class is a part of the G-language Genome Analysis Environment,
    collecting sequence analysis methods related to sequence pattern searching.
Methods
baseParingTest
No description
Code
find_difDescriptionCode
find_dnaAboxDescriptionCode
find_seqDescriptionCode
nucleotide_periodicityDescriptionCode
oligomer_counterDescriptionCode
palindromeDescriptionCode
Methods description
find_difcode    nextTop
  Description:
    Finds E.coli dif sequence (ggtgcgcataatgtatattatgttaaat) in both strands.
    dif is a 28bp sequence element recognized by XerCD located near the replication
    terminus used for chromosome dimer resolution by recombination.
    
  Usage: 
    (array @position) = find_dif(sequence)
    
  Options:
    none
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

History:
20060711-01 initial posting
find_dnaAboxcodeprevnextTop
  Description:
    Finds dnaA box(TT A/T TNCACA) in both strands.
    
  Usage: 
    (array @position) = find_dnaAbox(sequence)
    
  Options:
    none
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

History:
20021125-01 initial posting
find_seqcodeprevnextTop
  Description:
    Counts an oligomer and its complement.

  Usage:
    (int $direct, int $comp, int $total) = find_seq(sequence, string $oligo);

 Options:
    none

  Author:
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

History:
20010326-01 initial posting
nucleotide_periodicitycodeprevnextTop
  Description:
    Checks the periodicity of a given nucleotide or oligonucleotide (best known for the AA dinucleotide).
    
  Usage: 
    array data = nucleotide_periodicity(sequence);
    
  Options:
    -nucleotide    nucleotide to search (default:aa)
    -window        window size to seek periodicity (default:50)
    -filename      output filename (default:aa_frequency.png)
    -output        "g" for graph file output only,
                   "show" for graph file output and display.
                   (default: show)
    
  ToDo:
    data output

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

History:
20070206-01 initial posting
oligomer_countercodeprevnextTop
  Description:
    Counts the number of oligomers in a sequence (by windows optionally)

  Usage: 
    (array @count || int $count) = oligomer_counter(sequence, string $oligomer);

 Options:
    -window      int window size.
                 If specified, counts the oligomer within each window and
                 returns an array holding one count per window.
                 If not specified, counts the oligomer over the whole genome
                 and returns that single number.
    -output      "f" for file output, "g" for graph output
                 Only available when -window option is specified

  Author: 
    Kazuharu Arakawa
    -based on atg7.wind + gcwind [rsaito]

  History:
    20010829-01 initial posting
palindromecodeprevnextTop
  Description:
    Searches palindrome sequences

 Usage: 
    palindrome(sequence); 

 Options:
    -shortest shortest palindrome to search (default:4)
    -loop     longest stem loop to allow (default: 0)
    -gtmatch  if 1, allows g-t match (default: 0)
    -output   "f" for file output
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20010829-01 initial posting
Methods code
baseParingTestdescriptionprevnextTop
# Reports whether two single bases can pair with each other.
#
# Arguments (positional):
#   1. first base  - one character; upper/lower case is ignored
#   2. second base - one character; upper/lower case is ignored
#   3. gtmatch     - when true, the g-t / t-g pair also counts as a match
#
# Returns 1 for a-t, t-a, g-c and c-g (plus g-t and t-g when gtmatch is
# true), and 0 for every other combination.
# Dies when either of the first two arguments is not exactly one character.
sub baseParingTest {
    my ($base1, $base2, $allow_gt) = @_;
    $base1 = lc($base1);
    $base2 = lc($base2);

    unless (length($base1) == 1 && length($base2) == 1) {
        die("First two arguments must be single base (i.e. a, t, g, or c).\n");
    }

    # Both inputs are exactly one character here, so the two-letter
    # concatenation identifies the ordered pair unambiguously.
    my $pair = $base1 . $base2;

    return 1 if $pair eq 'at' || $pair eq 'ta' || $pair eq 'gc' || $pair eq 'cg';
    return 1 if $allow_gt && ($pair eq 'tg' || $pair eq 'gt');
    return 0;
}
find_difdescriptionprevnextTop
# Locates every occurrence of the 28 bp dif element in the sequence, first
# as written and then as returned by complement().
#
# Arguments: a sequence / genome argument accepted by opt_get() and
#            opt_as_gb(); the searched text is its {SEQ} entry.
# Returns:   list of 0-based start offsets; all hits of the literal dif
#            string come first, followed by all hits of complement(dif).
#
# NOTE(review): complement() is defined outside this block; it is presumed
# to return the reverse complement so that hits on the opposite strand are
# found - confirm against its definition. The search is case-sensitive and
# the dif string is lower case.
sub find_dif {
    my @argv = opt_get(@_);
    my $gb   = opt_as_gb(shift @argv);
    my $dif  = "ggtgcgcataatgtatattatgttaaat";
    my @pos  = ();

    # complement($dif) never changes, so it is evaluated once here rather
    # than on every pass of the scan loop.
    for my $pattern ($dif, complement($dif)) {
        # Start from -1 so that the first index() call begins at offset 0.
        my $start = -1;
        while (0 <= ($start = index($gb->{SEQ}, $pattern, $start + 1))) {
            push(@pos, $start);
        }
    }

    return @pos;
}
find_dnaAboxdescriptionprevnextTop
# Scans the sequence for 9-mers matching tt[at]t.caca or tgtg.a[at]aa.
#
# Arguments: a sequence / genome argument accepted by opt_get() and
#            opt_as_gb(); the scanned text is its {SEQ} entry.
# Returns:   list of 0-based start offsets of every matching 9-mer, in
#            ascending order.
# Side effect: each hit is also reported through msg_send() as
#              "<offset> <matched 9-mer>".
sub find_dnaAbox {
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my @hits = ();

    # Last offset at which a full 9-mer still fits into the sequence.
    my $final_offset = length($gb->{SEQ}) - 9;

    for my $offset (0 .. $final_offset) {
        my $nonamer = substr($gb->{SEQ}, $offset, 9);
        my $matched;

        if ($nonamer =~ /(tt[at]t.caca)/) {
            $matched = $1;
        }
        elsif ($nonamer =~ /(tgtg.a[at]aa)/) {
            $matched = $1;
        }
        next unless defined $matched;

        push(@hits, $offset);
        msg_send(sprintf "%d %s\n", $offset, $matched);
    }

    return @hits;
}
find_seqdescriptionprevnextTop
# Counts the occurrences of an oligomer and of complement(oligomer) in a
# sequence.
#
# Arguments (positional):
#   1. sequence / genome argument accepted by opt_as_gb(); the searched
#      text is its {SEQ} entry
#   2. oligomer string to look for
#   3. optional: "f" appends a short report to oligomer_count.rst
#
# Returns: ($direct, $comp, $direct + $comp), i.e. the number of hits of
#          the oligomer, of complement(oligomer), and their sum.
#          Overlapping hits are counted. An undefined or empty oligomer
#          yields (0, 0, 0).
#
# NOTE(review): complement() is defined outside this block; it is presumed
# to return the reverse complement - confirm against its definition.
sub find_seq {
    my $gb         = opt_as_gb(shift);
    my $ref_Genome = \$gb->{SEQ};
    my $sSeq       = shift;
    my $printer    = shift;

    # index() with an empty needle never returns -1, so the scan below
    # would never terminate; there is nothing to count in that case.
    return (0, 0, 0) unless (defined $sSeq && length($sSeq));

    my $sSeq2 = complement($sSeq);

    # Counts (possibly overlapping) hits of one pattern in the genome.
    # The scan starts from -1 so that the first index() call begins at
    # offset 0 and a hit at the very start of the sequence is counted.
    my $count_hits = sub {
        my ($pattern) = @_;
        my $hits = 0;
        my $at   = -1;
        while (0 <= ($at = index($$ref_Genome, $pattern, $at + 1))) {
            $hits++;
        }
        return $hits;
    };

    my $direct = $count_hits->($sSeq);
    my $comp   = $count_hits->($sSeq2);
    my $total  = $direct + $comp;

    if (defined $printer && $printer eq "f") {
        if (open(my $fh, '>>', 'oligomer_count.rst')) {
            print {$fh} '--- find_sequence_result ---', "\n";
            # The total is written as a number, not as "direct+comp".
            print {$fh} "$sSeq: $direct\n$sSeq2: $comp\nTotal: $total\n\n";
            close($fh)
                or msg_error("G::Seq::PatSearch::find_seq() $! oligomer_count.rst");
        }
        else {
            # Report the failure but still return the counts.
            msg_error("G::Seq::PatSearch::find_seq() $! oligomer_count.rst");
        }
    }

    return ($direct, $comp, $total);
}
nucleotide_periodicitydescriptionprevnextTop
# Builds a histogram of the distances at which a nucleotide string recurs
# downstream of each of its own occurrences, and plots it.
#
# Arguments: a sequence / genome argument accepted by opt_get() and
#            opt_as_gb(), followed by options read through opt_val():
#   nucleotide - string to search for          (default "aa")
#   window     - number of histogram bins and length of the downstream
#                stretch examined after each hit (default 50)
#   filename   - graph file name passed to _UniMultiGrapher()
#                (default "aa_frequency.png")
#   output     - "show" additionally calls msg_gimv("graph/<filename>")
#                (default "show")
#
# Returns: the histogram as a list; element N counts how often the string
#          was found again N characters after the end of a previous hit.
sub nucleotide_periodicity {
    opt_default("nucleotide"=>"aa", "window"=>50, "filename"=>"aa_frequency.png", "output"=>"show");
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);

    my $motif = opt_val("nucleotide");
    my $span  = opt_val("window");
    my $graph = opt_val("filename");
    my $mode  = opt_val("output");

    # One zero-initialised bin per position in the window.
    my @histogram = map { 0 } 0 .. ($span - 1);
    my $motif_len = length($motif);

    for (my $hit = index($gb->{SEQ}, $motif, 0);
         $hit >= 0;
         $hit = index($gb->{SEQ}, $motif, $hit + 1))
    {
        # Stretch of sequence immediately following this hit.
        my $downstream = substr($gb->{SEQ}, $hit + $motif_len, $span);

        for (my $offset = index($downstream, $motif, 0);
             $offset >= 0;
             $offset = index($downstream, $motif, $offset + 1))
        {
            $histogram[$offset]++;
        }
    }

    _UniMultiGrapher([0..($span - 1)], \@histogram, -filename=>$graph);
    msg_gimv("graph/$graph") if ($mode eq 'show');

    return @histogram;
}
oligomer_counterdescriptionprevnextTop
# Counts occurrences of an oligomer, either over the whole sequence or in
# consecutive non-overlapping windows.
#
# Arguments: a sequence / genome argument accepted by opt_get() and
#            opt_as_gb(), then the oligomer string, then options read
#            through opt_val():
#   window - window size (default 0). A true value selects windowed mode;
#            a negative value uses the whole sequence length as the window.
#   output - windowed mode only: "f" writes "offset,count" lines to
#            oligo_count.csv, "g" draws oligo_count.png through
#            _UniMultiGrapher().
#
# Returns: in windowed mode, the list of counts, one per complete window
#          (a trailing partial window is skipped); otherwise the single
#          count for the whole sequence. Overlapping hits are counted.
#          Hits that span a window boundary are not counted in either
#          window. With an undefined or empty oligomer an error is sent
#          through msg_error() and 0 / the empty list is returned.
sub oligomer_counter {
    opt_default("window"=>0);
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $seq = shift @args;
    my $window_opt = opt_val("window");

    # "output" has no default, so normalise undef to avoid comparing undef.
    my $output = opt_val("output");
    $output = '' unless defined $output;

    # index() with an empty needle never returns -1, which would make the
    # scan below spin forever.
    unless (defined $seq && length($seq)) {
        msg_error("G::Seq::PatSearch::oligomer_counter() no oligomer given\n");
        return $window_opt ? () : 0;
    }

    # Counts the oligomer in the string referenced by the argument.
    # Single bases use tr/// counting; longer oligomers are scanned with
    # index(), starting at offset 0 so a hit at the very beginning of the
    # text is included (both paths now agree on that).
    my $count_in = sub {
        my ($text_ref) = @_;
        return ($$text_ref =~ tr/a//) if ($seq eq 'a');
        return ($$text_ref =~ tr/t//) if ($seq eq 't');
        return ($$text_ref =~ tr/g//) if ($seq eq 'g');
        return ($$text_ref =~ tr/c//) if ($seq eq 'c');

        my $hits = 0;
        my $at   = -1;
        while (0 <= ($at = index($$text_ref, $seq, $at + 1))) {
            $hits++;
        }
        return $hits;
    };

    # Whole-sequence mode.
    return $count_in->(\$gb->{SEQ}) unless ($window_opt);

    # Windowed mode.
    my $genome_length = length($gb->{SEQ});
    my $window = ($window_opt > 0) ? $window_opt : $genome_length;

    my $csv;
    if ($output eq "f") {
        unless (open($csv, '>', 'oligo_count.csv')) {
            msg_error("G::Seq::PatSearch::oligomer_counter() $! oligo_count.csv");
            undef $csv;    # skip file output, still return the counts
        }
    }

    my @wincount = ();
    my @winnum   = ();

    # Visit complete windows only; the $window > 0 guard covers an empty
    # sequence combined with a negative window option.
    for (my $offset = 0;
         $window > 0 && $offset + $window <= $genome_length;
         $offset += $window)
    {
        my $partial = substr($gb->{SEQ}, $offset, $window);
        my $count   = $count_in->(\$partial);

        push(@wincount, $count);
        push(@winnum, $offset);

        # printf, not print: the format string must be applied.
        printf {$csv} "%d,%d\n", $offset, $count if ($csv);
    }

    close($csv) if ($csv);

    if ($output eq "g") {
        _UniMultiGrapher(\@winnum, \@wincount,
                         -x=>'window(bp)',
                         -y=>'number of oligomer',
                         -title=>'oligomer by window',
                         -outfile=>'oligo_count.png'
                         );
    }

    return (@wincount);
}
palindromedescriptionprevnextTop
# Searches the sequence for palindromes (inverted repeats), optionally
# separated by a loop, by growing base pairs outwards from each centre.
#
# Arguments: a sequence / genome argument accepted by opt_get() and
#            opt_as_gb(), followed by options read through opt_val():
#   shortest - minimum number of paired bases to report   (default 4)
#   loop     - longest loop between the two arms; every loop length from
#              this value down to 0 is tried              (default 0)
#   gtmatch  - passed to baseParingTest() as its third argument
#                                                         (default 0)
#   output   - "stdout" reports each hit through msg_send(); "f" writes
#              CSV lines to the file named by the filename option
#                                                         (default "stdout")
#   filename - CSV file name                  (default "palindrome.csv")
#
# Returns: reference to a hash keyed by the 0-based start offset of each
#          hit, with the value "<left arm> <loop> <right arm>". A later
#          hit with the same start offset replaces an earlier one.
sub palindrome {
    opt_default(gtmatch=>0, loop=>0, shortest=>4, -output=>"stdout", -filename=>"palindrome.csv");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    # Options and the sequence length do not change during the scan, so
    # they are read once up front.
    my $shortest = opt_val("shortest");
    my $max_loop = opt_val("loop");
    my $gtmatch  = opt_val("gtmatch");
    my $output   = opt_val("output");
    my $filename = opt_val("filename");
    my $length   = int($shortest / 2);
    my $last_idx = length($gb->{SEQ}) - 1;
    my %palindrome;

    my $out;
    if ($output eq "f") {
        if (open($out, '>', $filename)) {
            print {$out} "Length, start, end, sequence\n";
        }
        else {
            msg_error("G::Seq::PatSearch::palindrome() $! $filename");
            undef $out;    # skip file output, still return the hits
        }
    }

    # $i is the index of the last base of the left arm. It is clamped at 0
    # because a negative index would make substr() count from the end of
    # the sequence.
    my $i = $length - 1;
    $i = 0 if ($i < 0);

    while ($i <= $last_idx - $length - $max_loop) {
        for (my $stem = $max_loop; $stem >= 0; $stem--) {
            my $j   = $i;                # left partner, moves towards 0
            my $k   = $stem + 1 + $i;    # right partner, moves towards the end
            my $len = 0;                 # number of paired bases found
            last if ($k > $last_idx);

            while (baseParingTest(substr($gb->{SEQ}, $j, 1),
                                  substr($gb->{SEQ}, $k, 1),
                                  $gtmatch))
            {
                # Count the pair before testing the boundaries, so a pair
                # that sits on the first or last base is not lost.
                $len += 2;
                $j--;
                $k++;
                last if ($j < 0 || $k > $last_idx);
            }

            next if ($len < $shortest);

            my $start     = $j + 1;
            my $arm       = $len / 2;
            my $left_arm  = substr($gb->{SEQ}, $start, $arm);
            my $loop_seq  = substr($gb->{SEQ}, $start + $arm, $stem);
            my $right_arm = substr($gb->{SEQ}, $start + $arm + $stem, $arm);

            # NOTE(review): the end column is kept as $k - 2, as before;
            # the last paired base is at index $k - 1, so this looks one
            # short - confirm the intended coordinate convention.
            msg_send(sprintf("Length: %2d Position: %7d %7d Sequence: %s %s %s\n",
                             $len, $start, $k - 2,
                             $left_arm, $loop_seq, $right_arm))
                if ($output eq 'stdout');

            if ($out) {
                printf {$out} "%d,%d,%d,%s %s %s\n",
                    $len, $start, $k - 2,
                    $left_arm, $loop_seq, $right_arm;
            }

            $palindrome{$start} = sprintf("%s %s %s", $left_arm, $loop_seq, $right_arm);
        }
        $i++;
    }

    close($out) if ($out);

    return \%palindrome;
}
General documentation
AUTHORTop
    
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
BaseParingTestTop
  Description:
    Base pairing check
    
  Usage: 
    boolean $match = baseParingTest(char $first, char $second, boolean $gtmatch);
    
  Options:
    none
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

History:
20010829-01 initial posting