G::Seq Align
SummaryIncluded librariesPackage variablesDescriptionGeneral documentationMethods
Summary
  G::Seq::Align - Analysis methods related to sequence alignment
Package variables
No package variables defined.
Included modules
G::Messenger
SelfLoader
SubOpt
Inherit
Exporter
Synopsis
No synopsis!
Description
    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to sequence alignment.
Methods
alignmentDescriptionCode
diffseqDescriptionCode
Methods description
alignmentcode    nextTop
 Name: alignment   -   aligns two sequences with Viterbi algorithm

 Description:
    This method aligns two sequences using dynamic programming with 
    Viterbi algorithm. 

 Usage: 
    \%data = alignment($seq1, $seq2);

 Options:
    -Gap       gap penalty score (default: -5)
    -Match     match score       (default: 10)
    -Miss      mismatch penalty  (default: -7)
    -output    'stdout' for printing, 'f' for file output (default:'stdout')
    -filename  filename for output option 'f' (default: 'alignment.txt')
 
 Author: 
     Koya Mori
     Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20090320-01 code optimization 20010915-01 initial posting
diffseqcodeprevnextTop
 Name: diffseq   -   extracts differences of two sequences

 Description:
    This method identifies the differences (i.e. mutations or SNPs) in two
    sequences using dynamic programming with Viterbi algorithm. 

 Usage: 
    \@data = diffseq($seq1, $seq2);

 Options:
    -Gap       gap penalty score (default: -5)
    -Match     match score       (default: 10)
    -Miss      mismatch penalty  (default: -7)
    -output    'stdout' for printing, 'f' for file output (default:'stdout')
    -filename  filename for output option 'f' (default: diffseq.csv)
    -mode      'align' for two sequences in input, any other to use the 
               data structure returned by alignment() (default: 'align');
 
 Author: 
     Koya Mori
     Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20090320-01 code optimization 20010915-01 initial posting
Methods code
alignmentdescriptionprevnextTop
sub alignment {
    opt_inputType('seq', 'seq');
    opt_default(Gap=>-5, Match=>10, Miss=>-7, output=>'stdout', filename=>'alignment.txt');
    my @args = opt_get(@_);

    my $gb   = opt_as_gb(shift @args);
    my $seq1 =\$ gb->{SEQ};
       $gb   = opt_as_gb(shift @args);
    my $seq2 =\$ gb->{SEQ};
    my %opt  = opt_val();
    my %align;
    my ($H, $V, $D, $NDIR) = (0, 1, 2, 3);
    my @score_path_matrix;

    $$seq1 =~ tr/ \n\t[0-9]//d;
    $$seq2 =~ tr/ \n\t[0-9]//d;
    my $i = length($$seq1);
    my $j = length($$seq2); 

    local *find_max_score = sub{
	my ($i, $j, $seq1, $seq2, $Gap_Penalty, $Match, $Miss) = @_;
	my ($p, $max_score, $max_dir, @score_tmp, @max_score, %tmp_hash);
	my ($H, $V, $D, $NDIR) = (0, 1, 2, 3);
	
	if($score_path_matrix[$i][$j]{score}){
	    return $score_path_matrix[$i][$j]{score};
	}else{
	    if($i == 0){
		$max_score = 0;
		$max_dir = $H;
	    }elsif($j == 0){
		$max_score = 0;
		$max_dir = $V;
	    }else{
		$score_tmp[$V] = find_max_score($i - 1, $j, $seq1, $seq2, $Gap_Penalty, $Match, $Miss) + $Gap_Penalty;
		$score_tmp[$H] = find_max_score($i, $j - 1, $seq1, $seq2, $Gap_Penalty, $Match, $Miss) + $Gap_Penalty;
		if(substr($$seq1, $i - 1, 1) eq substr($$seq2, $j - 1, 1)){
		    $score_tmp[$D] = find_max_score($i - 1, $j - 1, $seq1, $seq2, $Gap_Penalty, $Match, $Miss) + $Match;
		}else{
		    $score_tmp[$D] = find_max_score($i - 1, $j - 1, $seq1, $seq2, $Gap_Penalty, $Match, $Miss) + $Miss;
		}
		
		%tmp_hash=();
		foreach(@score_tmp){
		    $tmp_hash{$_} = $p;
		    $p++;
		} 
		@max_score = sort {$b <=> $a} @score_tmp;
		$max_dir   = $tmp_hash{$max_score[0]};
		$max_score = $score_tmp[$max_dir];
	    }
	    
	    $score_path_matrix[$i][$j]{score} = $max_score;
	    $score_path_matrix[$i][$j]{direction} = $max_dir;
	    
	    return $max_score;
	}
    };
    
    $align{score} = find_max_score($i, $j, $seq1, $seq2, $opt{'Gap'}, $opt{'Match'}, $opt{'Miss'});
    
    while($i > 0 | $j > 0){
	if($score_path_matrix[$i][$j]{direction} == $H){
	    $align{seq1} = '-' . $align{seq1};
	    $align{seq2} = substr($$seq2, $j - 1, 1) . $align{seq2};
	    $align{hit}  = '-' . $align{hit};
	    $j--;
	}
	if($score_path_matrix[$i][$j]{direction} == $V){
	    $align{seq1} = substr($$seq1, $i - 1, 1) . $align{seq1};
	    $align{seq2} = '-' . $align{seq2};
	    $align{hit}  = '-' . $align{hit};
	    $i--;
	}
	if($score_path_matrix[$i][$j]{direction} == $D){
	    my $tmp1 = substr($$seq1, $i - 1, 1);
	    my $tmp2 = substr($$seq2, $j - 1, 1);
	    $align{seq1} = $tmp1 . $align{seq1};
	    $align{seq2} = $tmp2 . $align{seq2};
	    if($tmp1 eq $tmp2){
		$align{hit} = '+' . $align{hit};
	    }else{
		$align{hit} = '-' . $align{hit};
	    }
	    $j--;
	    $i--;
	}
    }
    
    my $hit_num  = $align{hit}  =~ tr/+/+/;
    my $miss_num = $align{hit}  =~ tr/-/-/;
    my $gap_num1 = $align{seq1} =~ tr/-/-/;
    my $gap_num2 = $align{seq2} =~ tr/-/-/;

    my $result;
    $result .= sprintf("Seq1: %d bp   Seq2: %d bp\n", length($$seq1), length($$seq2));
    $result .= sprintf("Score: %d   Match: %d   Miss: %d\n", $align{score}, $hit_num, $miss_num);
    $result .= sprintf("Gap in seq1: %d   Gap in seq2: %d\n", $gap_num1, $gap_num2);
    $result .= sprintf("Match score: %d   Miss score: %d   Gap Penalty: %d\n\n", $opt{'Match'}, $opt{'Miss'}, $opt{'Gap'});
    
    for(my $q = 0; $q * 60 < length($align{seq1}) || $q * 60 < length($align{seq2}); $q++){
	$result .= substr($align{seq1}, $q * 60, 60) .  "\n";
	$result .= substr($align{seq2}, $q * 60, 60) . "\n";
	$result .= substr($align{hit},  $q * 60, 60) . "\n\n";
    }

    if($opt{'output'} eq 'stdout'){
	msg_send($result);
    }elsif($opt{'output'} =~ /f/){
	msg_datafile($result, $opt{'filename'});
    }
    
    return\% align;
}
diffseqdescriptionprevnextTop
sub diffseq {
    opt_inputType('seq', 'seq');
    opt_default(mode=>'align', Gap=>-5, Match=>10, Miss=>-7, output=>'stdout', filename=>'diffseq.csv');
    my @args = opt_get(@_);
    my %opt  = opt_val();

    my ($align, $seq1, $seq2);
    if($opt{'mode'} eq "align"){
	my $gb = opt_as_gb(shift @args);
	$seq1 =\$ gb->{SEQ};
	$gb = opt_as_gb(shift @args);
	$seq2 =\$ gb->{SEQ};
	$align = alignment($seq1, $seq2, -Gap=>$opt{'Gap'}, -Match=>$opt{'Match'}, -Miss=>$opt{'Miss'}, -output=>'NULL');
    }else{
	$align = shift @args;
    }

    my ($i, @diff, @diff_pos);
    while(0 <= ($i = index($$align{hit}, '-', $i + 1))){
	push(@diff_pos, $i);
    }
    
    foreach(@diff_pos){
	my $s1 = substr($$align{seq1}, $_, 1);
	my $s2 = substr($$align{seq2}, $_, 1);
	my $tmp_gap = substr($$align{seq1}, 0, $_ + 1) =~ tr/-/-/;
	my $s1_pos  = $_ - $tmp_gap + 1;
	   $tmp_gap = substr($$align{seq2}, 0, $_ + 1) =~ tr/-/-/;
	my $s2_pos  = $_ - $tmp_gap + 1;
	my $pos = $_ + 1;
	push(@diff, join(',', 'insertion',    $pos, 'seq1', $s1_pos, 'seq2', $s2_pos, $s1, $s2)) if($s1 eq '-');
	push(@diff, join(',', 'deletion',     $pos, 'seq1', $s1_pos, 'seq2', $s2_pos, $s1, $s2)) if($s2 eq '-');
	push(@diff, join(',', 'transversion', $pos, 'seq1', $s1_pos, 'seq2', $s2_pos, $s1, $s2)) if($s1 ne '-' && $s2 ne '-');
    }

    if($opt{'output'} eq "stdout"){
	foreach(@diff){
	    my @line = split(/,/, $_, 8);
	    msg_send(sprintf("%s\t%5d:  seq1\(%d\)\-\>seq2\(%d\)  %s\-\>  %s\n", $line[0], $line[1], $line[3], $line[5], $line[6], $line[7]));
	}
    }elsif($opt{'output'} =~ /f/){
	msg_datafile(join("\n", @diff), $opt{'filename'});
    }
    return\@ diff;
}
General documentation
No general documentation available.