Rcmd Summary
SummaryIncluded librariesPackage variablesDescriptionGeneral documentationMethods
Summary
  Rcmd::Summary - Summary statistics of data via interfaces to R language.
Package variables
No package variables defined.
Included modules
G::Messenger
G::Tools::Statistics
Rcmd::Handler
SubOpt
Inherit
Exporter
Synopsis
No synopsis!
Description
 Provides statistical summary about a given data, through interface method
 and numerous internal methods.
Methods
bivariate_n2c
No description
Code
bivariate_n2n
No description
Code
multivariateDescriptionCode
summaryDescriptionCode
univariate
No description
Code
Methods description
multivariatecode    nextTop
  Name: multivariate   -   shows statistic summary for multi-arraies

  Descriptions:
    Shows statistic summary for bivariate arrays like JMP style.
    This method shows scatter matrix and scatter plot graph.

  Usage:
    multivariate(\@array1_of_values, \@array2_of_values, ...);

  Options:
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -filename     output filename of the scatter plot (default: scatter_plot.pdf)
   -method       "pearson", "spearman", or "kendall" (default:pearson)
                 Method used to calculate the correlation coefficient.
   -cor          show correlation in graph (default: r)
                 'r2' or 'r^2' to show R^2 (coefficient of determination).
   -label        optional array reference with labels for each of the arrays

   Author:
     Kazuki Oshita (cory@g-language.org)
History: 20141113-01 merging to G-language GAE and overhaul 20130321-01 initial posting
summarycodeprevnextTop
  Name: summary   -   shows statistical summary for given data or data set

  Descriptions:
    This method shows statistical summary for given data or data set. 

    For a single array or array reference:
      Data is nominal: Shows boxplot of categories 
      Data is continuous: Shows histogram of distribution
    For two arrays references:
      Both is nominal: Shows cross table with Cramer's V
      Either is nominal: Shows boxplot of categories and ANOVA
      Both is continuous: Shows correlation matrix
    For more array references:
      All data is continuous: Shows correlation matrix

  Usage:
    summary(@array)
       or
    summary(\@array, \@array, ...);

  Options 
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -filename     output filename of the bar plot graph (default: *.pdf)

   Additional Options (continuous v.s. continuous):
   -method       "pearson", "spearman", or "kendall" (default:pearson)
                 Method used to calculate the correlation coefficient.
   -cor          show correlation in graph (default: r)
                 'r2' or 'r^2' to show R^2 (coefficient of determination).
   -label        optional array referece of labels for the given arrays

   Author:
     Kazuki Oshita (cory@g-language.org)
Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
History: 20141114-01 merged to G-language GAE with overhaul 20130321-01 initial posting
Methods code
bivariate_n2cdescriptionprevnextTop
sub bivariate_n2c {
    &opt_default(output => 'show', filename => 'bivariate.pdf');

    my @args     = opt_get(@_);
    my $output   = opt_val('output');
    my $filename = opt_val('filename');

    my (@nominal, @continuous);
    if (_is_nominal(\@{$args[0]})) {
	@nominal=    @{shift @args};
	@continuous= @{shift @args};
    } else {
	@continuous= @{shift @args};
	@nominal=    @{shift @args};
    }

    my (@vector, @names, %table);
    # table of nominal array and continuous array
for my $i (0 .. ($#continuous < $#nominal ? $#continuous : $#nominal)) { push @{$table{$nominal[$i]}}, $continuous[$i]; } print "[Summary]\n Label\tNumber\tMean\tSD\n"; for my $name (sort {$a cmp $b} keys %table) { my @items = @{$table{$name}}; my $num = $#items + 1; my $mean = sprintf("%.3f", mean(@items)); my $sd = sprintf("%.5f", standard_deviation(@items)); print "$name\t$num\t$mean\t$sd\n"; push @vector, 'c('.join(' ,', @items).')'; push @names, $name; } print "\n"; my $rcmd= Rcmd->new(); $rcmd->array('continuous', @continuous); $rcmd->sarray('nominal', @nominal); $rcmd->exec( 'anova(aov(continuous ~ factor(nominal)))', "pdf('./graph/".$filename."')", 'boxplot('.join(', ', @vector).', names=c(\''.join('\',\' ', @names).'\'))', ); msg_gimv('graph/'.$filename) if $output eq 'show';
}
bivariate_n2ndescriptionprevnextTop
sub bivariate_n2n {
    my @args     = opt_get(@_);

    my @array1= @{shift @args};
    my @array2= @{shift @args};

    my $rcmd= Rcmd->new();
    $rcmd->sarray('array1', @array1);
    $rcmd->sarray('array2', @array2);
    
    my $cramer= $rcmd->exec(
			    'cat("[Cross Table]\n")',
			    # complete case about array1 and array2
'safe.case <- complete.cases(array1, array2)', 'array1 <- array1[safe.case]', 'array2 <- array2[safe.case]', # make cross table
'cross.table <- table(array1, array2)', # re-format cross table to readable report
'ftable(cross.table)', ## calculate Cramer's V
# expected value
'ex <- outer(rowSums(cross.table), colSums(cross.table)) / sum(cross.table)', # chi square
'chisq <- sum((cross.table - ex)^2 / ex)', # phi coefficient
'phi <- sqrt(chisq / sum(cross.table))', # cramer's V
'phi / sqrt(min(nrow(cross.table), ncol(cross.table)) - 1)' ); print "\nCramer\'s V : $cramer\n";
}
multivariatedescriptionprevnextTop
sub multivariate {
    &opt_default(output => 'show', label=>[], method => 'pearson', cor => 'r', filename => 'scatter_plot.pdf');

    my @args     = opt_get(@_);
    my $output   = opt_val('output');
    my $filename = opt_val('filename');
    my $label    = opt_val('label');
    my $method   = opt_val('method'); # [pearson] pearson, kendall, spearman
my $show_r = opt_val('cor'); # show R value or R^2 value in plot graph
# Method for the correlation coefficient.
$method= 'pearson' if ($method ne 'kendall' && $method ne 'spearman'); # $show_r is 'R' or 'R^2'
$show_r= 'r' if (lc($show_r) ne 'r^2' && lc($show_r) ne 'r2'); my $rcmd= Rcmd->new(); # create data table
my (%data_table, @R_names); for my $i (0 .. $#args) { my $R_name = scalar(@$label) > 1 ? $$label[$i] : 'array'.$i; push @R_names, $R_name; $data_table{$R_name}= $_[$i]; $rcmd->array($R_name, @{$args[$i]}); } my @R_commands= ( 'CMP <- complete.cases('.join(', ', @R_names).')', 'd.table <- data.frame('.$R_names[0].'=1:'.($#{$data_table{$R_names[0]}}+1).')', ); for my $key (@R_names) { push @R_commands, $key.' <- '.$key.'[CMP]'; push @R_commands, 'd.table$'.$key.' <- '.$key; } $rcmd->exec( @R_commands, "pdf('./graph/".$filename."')", # show R (or R^2) in graph
'low.panel <- function (x, y, ...) {', (' text(mean(par("usr")[1:2]), mean(par("usr")[3:4]), paste("r = ", signif(cor(x, y), 2)), cex = 2)')x!! ($show_r eq 'r'), (' text(mean(par("usr")[1:2]), mean(par("usr")[3:4]), paste("r^2 = ", signif(cor(x, y)^2, 2)), cex = 2)')x!! (lc($show_r) eq 'r^2' || lc($show_r) eq 'r2'), '}', 'panel.hist <- function(x, ...){', 'usr <- par("usr"); on.exit(par(usr))', ' par(usr = c(usr[1:2], 0, 1.5) )', ' h <- hist(x, plot = FALSE)', 'breaks <- h$breaks; nB <- length(breaks)', ' y <- h$counts; y <- y/max(y)', 'rect(breaks[-nB], 0, breaks[-1], y, col="gray", ...)', '}', # draw scatter plot
'pairs(d.table, upper.panel=low.panel, diag.panel=panel.hist)', # cor
'cat("[Correlation matrix]\n")', "cor(d.table, method='".$method."')", ); # show scatter plot in X11
msg_gimv('graph/'.$filename) if $output eq 'show'; return '';
}
summarydescriptionprevnextTop
sub summary {
    my @args = opt_get(@_);

    if(scalar(@args) == 1 && ref $args[0] eq 'ARRAY'){
	univariate(@_);
    }elsif(ref $args[0] ne 'ARRAY'){
	univariate([@_]);
    }else{
	# array 1 and 2 are continuous scale (0) or nominal scale (1)
my $a1_is_nominal= _is_nominal($args[0]); my $a2_is_nominal= _is_nominal($args[1]); if ($a1_is_nominal * $a2_is_nominal == 1) { # nominal scale vs nominal scale
bivariate_n2n(@_); } elsif ($a1_is_nominal == $a2_is_nominal) { # continuous scale vs continuous scale
multivariate(@_); } else { # nominal scale vs continuous scale
bivariate_n2c(@_); } } return 1;
}
univariatedescriptionprevnextTop
sub univariate {
    &opt_default(output => 'show', filename => 'distribution.pdf');

    my @args      = opt_get(@_);
    my @array     = @{+shift @args};
    my $output    = opt_val('output');
    my $filename  = opt_val('filename');

    my $N        = scalar(@array);
    printf("[Summary]\n");
    printf("N:              %d\n", $N);

    if (_is_nominal(opt_get(@_))) {
	my %count;
	$count{$_}++ for (@array);

	printf("categories:     %d\n", scalar(keys(%count)));
	# sort given array by descending order
my @sorted_label= sort { $count{$a} <=> $count{$b} } keys %count; my $rcmd= Rcmd->new(); $rcmd->array('summary', map { $count{$_} } @sorted_label); $rcmd->exec( # labelling
'label <- c("'.join('", "', @sorted_label).'")'."\n". # draw barplot graph
"pdf('./graph/".$filename."')\n". "barplot(summary, names.arg=label)" ); }else{ my $mean = mean(@array); my $sd = standard_deviation(@array); # http://www.jmp.com/support/help/Statistical_Details_for_the_Distribution_Platform.shtml
my $skewness; $skewness += ((($array[$_] - $mean)/$sd) ** 3) * $N / (($N-1)*($N-2)) for (0..$#array); my $kurtosis; $kurtosis += ((($array[$_] - $mean)/$sd) ** 4) for (0..$#array);
$kurtosis *= ($N * ($N + 1))/(($N-1)*($N-2)*($N-3));
$kurtosis -= (3*(($N-1)**2))/(($N-2)*($N-3));
my @sorted = sort { $a <=> $b } @array; # summary about basic statistics
printf("Mean: %.3f\n", $mean); printf("Median: %.3f\n", median(@array)); print 'Lower 95%', sprintf(" Mean: %.3f\n", mean(@sorted[0..int($N * 0.95)])); print 'Upper 95%', sprintf(" Mean: %.3f\n", mean(@sorted[int($N*0.05)..$#array])); printf("SD: %.3f\n", $sd); printf("Kurtosis: %.3f\n", $kurtosis); printf("Skewness: %.3f\n", $skewness); my $rcmd= Rcmd->new(); $rcmd->array('array', @array); $rcmd->exec( 'pdf("./graph/'.$filename.'")', 'par(mfrow=c(1,2))', 'hist(array, main="", xlab="")', 'boxplot(array)', ); } # show graph image (when -output => show)
msg_gimv('graph/'.$filename) if $output eq 'show';
}
General documentation
AUTHORTop
Kazuharu Arakawa, gaou@sfc.keio.ac.jp