Clustering documentation.

Name: hclust - Hierarchical clustering analysis for given arrays Description: Hierarchical clustering analysis methods for given arrays. The Ward method uses ward.D2 by default. Installation of the amap library for the R language is required. Run R as a super user - sudo R - and type the following: install.packages('amap') Usage: hclust(\@array1_of_values, \@array2_of_values, ...); or hclust(\@array1_of_values, \@array2_of_values, ..., -label => \@grouping_label); Options: -output output toggle option (default: show) "g" to generate graph without displaying. -filename output filename of the clustering graph (default: hclust.pdf) -method the agglomeration method to be used (default: ward). 'ward', 'single', 'complate', 'average', 'centroid', 'median' or 'mcquitty' -distmethod the distance measure method (default: correlation) 'euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'kendall', 'spearman', 'pearson' (not centered Pearson), 'abspearson' (Absolute Pearson), 'correlation' (Centered Pearson) or 'abscorrelation' (Absolute correlation). This option is based on the 'Dist' method in the 'amap' library in R. -label labels or names of the data series Author: Kazuki Oshita (cory@g-language.org)
History: 20130321-01 complete rewrite by cory, exported by default 20070612-01 converted to Rcmd::Clustering 20030904-01 initial posting (G::Tools::RCluster)

Name: kmeans() - clustering with K-means method Description: Clustering with the K-means method using the R language. The number of cluster centers can be given by the -centers option (default: 5) and the number of iterations is given by -iter.max (default: 10). The returned value corresponds to result$cluster of kmeans() in R (a vector of cluster numbers to which each point is allocated). Usage: @cluster = $rcmd->kmeans(\@array1, \@array2, \@array3, ..., -label=>\@label); Arrays correspond to the columns (data series), and labels for each of these arrays can be given by -label option. Options: -label labels or names of the data series. -centers number of cluster centers (default: 5) -iter.max number of iterations (default: 10) -filename output filename of the graph (default: kmeans.pdf) -output output toggle option (default: show) "g" to generate graph without displaying. -sampledata use sample data (default: 0) Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20070612-01 converted to Rcmd::Clustering 20030904-01 initial posting (G::Tools::RCluster)

Name: som() - clustering using Self-Organizing Map Description: Clustering with Self-Organizing Map (SOM) using the R language. Installation of the GeneSOM library for the R language is required. Run R as a super user - sudo R - and type the following: install.packages('som') Returns a two-dimensional array corresponding to the result$visual of som() in R's GeneSOM library. Usage: @result = som(\@array1, \@array2, \@array3, ..., -label=>\@label); Arrays correspond to the columns (data series), and labels for each of these arrays can be given by -label option. Options: -label labels or names of the data series. -xdim x-dimension of the map (default: 3) -ydim y-dimension of the map (default: 3) -filename output filename of the graph (default: som.pdf) -output output toggle option (default: show) "g" to generate graph without displaying. -sampledata use sample data (default: 0) Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20141110-01 exported by default 20070612-01 converted to Rcmd::Clustering 20030904-01 initial posting (G::Tools::RCluster)

# hclust - hierarchical clustering of the given data series through R.
#
# Every positional argument is an array reference holding one data series. Each
# series is sent to R as array0, array1, ... and becomes one column of the data
# frame handed to Dist() (amap library) and then to R's hclust(). The resulting
# plot is written to ./graph/<filename>.
#
# Options (handled by opt_default / opt_get / opt_val):
#   -output      'show' (default) passes the generated graph to msg_gimv;
#                any other value only generates the file
#   -label       array reference used as row names of the data frame
#   -filename    name of the PDF written under ./graph/ (default: hclust.pdf)
#   -method      'ward' (default, sent to R as 'ward.D2'), 'single',
#                'complete', 'average', 'centroid', 'median' or 'mcquitty';
#                the historical spelling 'complate' is accepted as an alias
#                of 'complete'; unknown values fall back to 'ward'
#   -distmethod  distance measure passed to Dist() (default: correlation);
#                unknown values fall back to 'pearson'
#
# Returns an empty string.
sub hclust {

    opt_default(output => 'show', label => [], filename => 'hclust.pdf', method => 'ward', distmethod => 'correlation');

    my @args       = opt_get(@_);
    my $output     = opt_val('output');
    my @label      = @{opt_val('label')};
    my $filename   = opt_val('filename');
    my $method     = opt_val('method');
    my $distmethod = opt_val('distmethod');

    # Map the user-facing method name to the name sent to R. An exact hash
    # lookup replaces the former regex match, so the option value is never
    # interpreted as a pattern.
    my %r_method_for = (
        ward     => 'ward.D2',
        single   => 'single',
        complete => 'complete',
        complate => 'complete',    # misspelling kept for backward compatibility
        average  => 'average',
        centroid => 'centroid',
        median   => 'median',
        mcquitty => 'mcquitty',
    );
    $method = (defined $method && exists $r_method_for{$method})
            ? $r_method_for{$method}
            : $r_method_for{ward};

    my %is_dist_method = map { $_ => 1 } (
        'euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'pearson',
        'abspearson', 'correlation', 'abscorrelation', 'spearman', 'kendall',
    );
    # NOTE(review): the fallback is 'pearson' although the default option value
    # is 'correlation' -- kept as it was; confirm which one is intended.
    unless (defined $distmethod && $is_dist_method{$distmethod}) {
        $distmethod = 'pearson';
    }

    # Without any data series there is nothing to send to R.
    unless (@args) {
        warn "hclust: no data arrays given\n";
        return '';
    }

    my $rcmd = Rcmd->new();

    # Send every series to R under a generated name.
    my @R_names;
    for my $i (0 .. $#args) {
        my $R_name = 'array'.$i;
        push @R_names, $R_name;
        $rcmd->array($R_name, @{$args[$i]});
    }

    # Row count of the placeholder column, taken from the option-stripped
    # argument list (@args) rather than from the raw @_.
    my $n_rows = scalar @{$args[0]};

    # 'label' list object (if required)
    my $has_label = scalar @label;
    $rcmd->sarray('label', @label) if $has_label;

    # NOTE(review): the placeholder column below has one row per element of the
    # first series, while the columns assigned afterwards are filtered by CMP;
    # confirm the behaviour on the R side when some rows are incomplete.
    my @R_commands = ('CMP <- complete.cases('.join(', ', @R_names).')');
    if ($has_label) {
        push @R_commands, 'd.table <- data.frame('.$R_names[0].'=1:'.$n_rows.', row.names=label)';
    }
    else {
        push @R_commands, 'd.table <- data.frame('.$R_names[0].'=1:'.$n_rows.')';
    }

    for my $key (@R_names) {
        push @R_commands, $key.' <- '.$key.'[CMP]';
        push @R_commands, 'd.table$'.$key.' <- '.$key;
    }

    $rcmd->exec(
                @R_commands,

                'library("amap")',
                "pdf('./graph/".$filename."')",
                'hc <- hclust(Dist(d.table, method="'.$distmethod.'"), method="'.$method.'")',
                'plot(hc)',
               );

    msg_gimv('graph/'.$filename) if $output eq 'show';

    return '';

}

# kmeans - K-means clustering of the given data series through R.
#
# Positional arguments are array references (one per data series); they are
# loaded into the R object 'rclust' by set_clust_data unless -sampledata is
# true, in which case sample_data_for_clustering supplies the data.
#
# Options: -centers (default 5), -iter.max (default 10), -label,
#          -filename (default kmeans.pdf), -output (default show),
#          -sampledata (default 0).
#
# Returns whatever $rcmd->exec returns for 'rclust.kmeans$cluster'.
sub kmeans {

    my $rcmd = Rcmd->new();
    $rcmd->set_mode('tmp');

    opt_default("iter.max"=>10, filename=>"kmeans.pdf", output=>"show", sampledata=>0, centers=>5);
    my @series     = opt_get(@_);
    my $centers    = opt_val("centers");
    my $iter_max   = opt_val("iter.max");
    my $output     = opt_val("output");
    my $filename   = opt_val("filename");
    my $use_sample = opt_val("sampledata");
    my $label      = opt_val("label") || '';

    # Load the data to be clustered into R.
    if (!$use_sample) {
        $rcmd->set_clust_data(@series, -label=>$label);
    }
    else {
        $rcmd->sample_data_for_clustering();
    }

    # Run the clustering and fetch the cluster assignment.
    my @clusters = $rcmd->exec(
        'require(stats)',
        'rclust.kmeans<-kmeans(rclust,'.$centers.','.$iter_max.')',
        'rclust.kmeans$cluster',
    );

    # The graph is produced for both "g" (generate only) and "show".
    my $wants_graph = ($output =~ /g/ || $output =~ /show/);
    if ($wants_graph) {
        $rcmd->exec(
            "pdf('./graph/$filename')",
            'plot(rclust,col=rclust.kmeans$cluster)',
            'points(rclust.kmeans$centers, col=1:'.$centers.',pch=8)',
        );

        if ($output =~ /show/) {
            msg_gimv("graph/$filename");
        }
    }

    $rcmd->set_mode();
    return @clusters;

}

# set_clust_data - load the data series into the R object 'rclust'.
#
# Called as a method on an Rcmd object. The first array reference initialises
# 'rclust'; every further one is sent as 'tmp' and appended with rbind(). When
# a -label value with non-zero length is given, it is dereferenced as an array
# and installed as the first dimnames component of 'rclust'.
sub set_clust_data {

    my $rcmd = shift;

    my @series = opt_get(@_);
    my $label  = opt_val("label") || '';

    for my $idx (0 .. $#series) {
        my $values = $series[$idx];

        if ($idx == 0) {
            $rcmd->array('rclust', @$values);
            next;
        }

        $rcmd->array('tmp', @$values);
        $rcmd->exec('rclust <- rbind(rclust, tmp)');
    }

    if (length $label) {
        $rcmd->sarray('label', @$label);
        $rcmd->exec('dimnames(rclust) <- list(label, NULL)');
    }

}

# som - Self-Organizing Map clustering of the given data series through R.
#
# Positional arguments are array references (one per data series); they are
# loaded into the R object 'rclust' by set_clust_data unless -sampledata is
# true, in which case sample_data_for_clustering supplies the data.
#
# Options: -xdim / -ydim (default 3 each), -topo (default 'hexa'),
#          -neigh (default 'gaussian'), -label, -filename (default som.pdf),
#          -output (default show), -sampledata (default 0).
#
# Returns a list of [x, y, qerror] array references parsed from the R log
# (the rows printed for 'rclust.som$visual'). Dies if the log cannot be opened.
sub som {

    my $rcmd = Rcmd->new();
    $rcmd->set_mode('tmp');

    opt_default(filename=>"som.pdf", output=>"show", xdim=>3, ydim=>3, sampledata=>0, topo=>'hexa', neigh=>'gaussian');
    my @args       = opt_get(@_);
    my $xdim       = opt_val("xdim");
    my $ydim       = opt_val("ydim");
    my $filename   = opt_val("filename");
    my $output     = opt_val("output");
    my $sampledata = opt_val("sampledata");
    my $label      = opt_val("label") || '';
    my $topo       = opt_val('topo');
    my $neigh      = opt_val('neigh');

    if ($sampledata) {
        $rcmd->sample_data_for_clustering();
    }
    else {
        $rcmd->set_clust_data(@args, -label=>$label);
    }

    # -topo and -neigh are forwarded to R; they used to be read but replaced by
    # hard-coded values. The defaults reproduce the former command exactly.
    $rcmd->exec(
        'require(som)',
        "rclust.som<-som(rclust, $xdim, $ydim, topo='$topo', neigh='$neigh')",
        'rclust.som$visual'
    );

    # Parse the printed table from the R log: everything after the first line
    # containing "qerror" is scanned, and each line starting with a digit is
    # split into row index (discarded), x, y and qerror.
    my @result;
    my $log_path = $rcmd->{log};
    open(my $log_fh, '<', $log_path) or die "Cannot open R log '$log_path': $!";

    my $in_table = 0;
    while (my $line = <$log_fh>) {
        unless ($in_table) {
            $in_table = 1 if $line =~ /qerror/;
            next;
        }

        chomp $line;
        next unless $line =~ /^\d/;

        my (undef, $x, $y, $qerror) = split(/\s+/, $line, 4);
        push(@result, [$x, $y, $qerror]);
    }
    close($log_fh);

    # The graph is produced for both "g" (generate only) and "show".
    if ($output =~ /g/ || $output =~ /show/) {
        $rcmd->exec(
            "pdf('./graph/".$filename."')",
            'plot(rclust.som)'
        );

        msg_gimv("graph/$filename") if ($output =~ /show/);
    }

    $rcmd->set_mode();
    return @result;

}

hclust	Description	Code
kmeans	Description	Code
sample_data_for_clustering	No description	Code
set_clust_data	No description	Code
som	Description	Code