sub bivariate_n2c
{ &opt_default(output => 'show', filename => 'bivariate.pdf');
my @args = opt_get(@_);
my $output = opt_val('output');
my $filename = opt_val('filename');
my (@nominal, @continuous);
if (_is_nominal(\@{$args[0]})) {
@nominal= @{shift @args};
@continuous= @{shift @args};
} else {
@continuous= @{shift @args};
@nominal= @{shift @args};
}
my (@vector, @names, %table);
for my $i (0 .. ($#continuous < $#nominal ? $#continuous : $#nominal)) {
push @{$table{$nominal[$i]}}, $continuous[$i];
}
print "[Summary]\n Label\tNumber\tMean\tSD\n";
for my $name (sort {$a cmp $b} keys %table) {
my @items = @{$table{$name}};
my $num = $#items + 1;
my $mean = sprintf("%.3f", mean(@items));
my $sd = sprintf("%.5f", standard_deviation(@items));
print "$name\t$num\t$mean\t$sd\n";
push @vector, 'c('.join(' ,', @items).')';
push @names, $name;
}
print "\n";
my $rcmd= Rcmd->new();
$rcmd->array('continuous', @continuous);
$rcmd->sarray('nominal', @nominal);
$rcmd->exec(
'anova(aov(continuous ~ factor(nominal)))',
"pdf('./graph/".$filename."')",
'boxplot('.join(', ', @vector).', names=c(\''.join('\',\' ', @names).'\'))',
);
msg_gimv('graph/'.$filename) if $output eq 'show';} |
sub bivariate_n2n
{ my @args = opt_get(@_);
my @array1= @{shift @args};
my @array2= @{shift @args};
my $rcmd= Rcmd->new();
$rcmd->sarray('array1', @array1);
$rcmd->sarray('array2', @array2);
my $cramer= $rcmd->exec(
'cat("[Cross Table]\n")',
'safe.case <- complete.cases(array1, array2)',
'array1 <- array1[safe.case]',
'array2 <- array2[safe.case]',
'cross.table <- table(array1, array2)',
'ftable(cross.table)',
'ex <- outer(rowSums(cross.table), colSums(cross.table)) / sum(cross.table)',
'chisq <- sum((cross.table - ex)^2 / ex)',
'phi <- sqrt(chisq / sum(cross.table))',
'phi / sqrt(min(nrow(cross.table), ncol(cross.table)) - 1)'
);
print "\nCramer\'s V : $cramer\n";} |
sub multivariate
{ &opt_default(output => 'show', label=>[], method => 'pearson', cor => 'r', filename => 'scatter_plot.pdf');
my @args = opt_get(@_);
my $output = opt_val('output');
my $filename = opt_val('filename');
my $label = opt_val('label');
my $method = opt_val('method'); my $show_r = opt_val('cor');
$method= 'pearson' if ($method ne 'kendall' && $method ne 'spearman');
$show_r= 'r' if (lc($show_r) ne 'r^2' && lc($show_r) ne 'r2');
my $rcmd= Rcmd->new();
my (%data_table, @R_names);
for my $i (0 .. $#args) {
my $R_name = scalar(@$label) > 1 ? $$label[$i] : 'array'.$i;
push @R_names, $R_name;
$data_table{$R_name}= $_[$i];
$rcmd->array($R_name, @{$args[$i]});
}
my @R_commands= (
'CMP <- complete.cases('.join(', ', @R_names).')',
'd.table <- data.frame('.$R_names[0].'=1:'.($#{$data_table{$R_names[0]}}+1).')',
);
for my $key (@R_names) {
push @R_commands, $key.' <- '.$key.'[CMP]';
push @R_commands, 'd.table$'.$key.' <- '.$key;
}
$rcmd->exec(
@R_commands,
"pdf('./graph/".$filename."')",
'low.panel <- function (x, y, ...) {',
(' text(mean(par("usr")[1:2]), mean(par("usr")[3:4]), paste("r = ", signif(cor(x, y), 2)), cex = 2)')x!! ($show_r eq 'r'),
(' text(mean(par("usr")[1:2]), mean(par("usr")[3:4]), paste("r^2 = ", signif(cor(x, y)^2, 2)), cex = 2)')x!! (lc($show_r) eq 'r^2' || lc($show_r) eq 'r2'),
'}',
'panel.hist <- function(x, ...){',
'usr <- par("usr"); on.exit(par(usr))',
' par(usr = c(usr[1:2], 0, 1.5) )',
' h <- hist(x, plot = FALSE)',
'breaks <- h$breaks; nB <- length(breaks)',
' y <- h$counts; y <- y/max(y)',
'rect(breaks[-nB], 0, breaks[-1], y, col="gray", ...)',
'}',
'pairs(d.table, upper.panel=low.panel, diag.panel=panel.hist)',
'cat("[Correlation matrix]\n")',
"cor(d.table, method='".$method."')",
);
msg_gimv('graph/'.$filename) if $output eq 'show';
return '';} |
sub univariate
{ &opt_default(output => 'show', filename => 'distribution.pdf');
my @args = opt_get(@_);
my @array = @{+shift @args};
my $output = opt_val('output');
my $filename = opt_val('filename');
my $N = scalar(@array);
printf("[Summary]\n");
printf("N: %d\n", $N);
if (_is_nominal(opt_get(@_))) {
my %count;
$count{$_}++ for (@array);
printf("categories: %d\n", scalar(keys(%count)));
my @sorted_label= sort { $count{$a} <=> $count{$b} } keys %count;
my $rcmd= Rcmd->new();
$rcmd->array('summary', map { $count{$_} } @sorted_label);
$rcmd->exec(
'label <- c("'.join('", "', @sorted_label).'")'."\n".
"pdf('./graph/".$filename."')\n".
"barplot(summary, names.arg=label)"
);
}else{
my $mean = mean(@array);
my $sd = standard_deviation(@array);
my $skewness;
$skewness += ((($array[$_] - $mean)/$sd) ** 3) * $N / (($N-1)*($N-2)) for (0..$#array);
my $kurtosis;
$kurtosis += ((($array[$_] - $mean)/$sd) ** 4) for (0..$#array); $kurtosis *= ($N * ($N + 1))/(($N-1)*($N-2)*($N-3)); $kurtosis -= (3*(($N-1)**2))/(($N-2)*($N-3));
my @sorted = sort { $a <=> $b } @array;
printf("Mean: %.3f\n", $mean);
printf("Median: %.3f\n", median(@array));
print 'Lower 95%', sprintf(" Mean: %.3f\n", mean(@sorted[0..int($N * 0.95)]));
print 'Upper 95%', sprintf(" Mean: %.3f\n", mean(@sorted[int($N*0.05)..$#array]));
printf("SD: %.3f\n", $sd);
printf("Kurtosis: %.3f\n", $kurtosis);
printf("Skewness: %.3f\n", $skewness);
my $rcmd= Rcmd->new();
$rcmd->array('array', @array);
$rcmd->exec(
'pdf("./graph/'.$filename.'")',
'par(mfrow=c(1,2))',
'hist(array, main="", xlab="")',
'boxplot(array)',
);
}
msg_gimv('graph/'.$filename) if $output eq 'show';} |