在数据分析和科学研究中,t检验是最常用的统计方法之一。作为一名长期从事数据分析工作的程序员,我经常需要在C语言环境中实现各种统计检验。今天我要分享的是如何在C语言中实现独立样本t检验和配对样本t检验的完整方案。
这个实现包含了从随机数生成到统计计算的全套流程,特别适合需要在嵌入式系统或高性能计算环境中进行统计分析的开发者。与Python或R等高级语言相比,C语言实现的t检验虽然代码量更大,但执行效率更高,且不依赖外部库,非常适合集成到现有系统中。
c复制double random_normal(double mu, double sigma) {
static double U;
static double V;
static int phase = 0;
double Z;
if (phase == 0) {
U = (rand() + 1.0) / (RAND_MAX + 1.0);
V = (rand() + 1.0) / (RAND_MAX + 1.0);
Z = sqrt(-2.0 * log(U)) * sin(2.0 * M_PI * V);
} else {
Z = sqrt(-2.0 * log(U)) * cos(2.0 * M_PI * V);
}
phase = 1 - phase;
return Z * sigma + mu;
}
Box-Muller变换是将均匀分布随机数转换为正态分布随机数的经典算法。这个实现有几个关键点需要注意:
提示:在实际应用中,如果需要高质量随机数,建议使用更现代的随机数生成器如Mersenne Twister替代标准库的rand()函数。
c复制double incbeta(double a, double b, double x) {
static const double epsilon = 1.0e-30;
static const double condition = 1.0e-8;
if (x < 0.0 || x > 1.0) return INFINITY;
if (x > (a + 1.0) / (a + b + 2.0)) return 1.0 - incbeta(b, a, 1.0 - x);
double f = 1.0;
double c = 1.0;
double d = 0.0;
const double lbeta_ab = lgamma(a) + lgamma(b) - lgamma(a + b);
const double front = exp(a * log(x) + b * log(1.0 - x) - lbeta_ab) / a;
for (int i = 0; i <= 200; ++i) {
int m = i / 2;
double numerator;
if (i == 0) numerator = 1.0;
else if (i % 2 == 0) numerator = (m * (b - m) * x) / ((a + 2.0 * m - 1.0) * (a + 2.0 * m));
else numerator = - ((a + m) * (a + b + m) * x) / ((a + 2.0 * m) * (a + 2.0 * m + 1));
d = 1.0 + numerator * d;
if (fabs(d) < epsilon) d = epsilon;
d = 1.0 / d;
c = 1.0 + numerator / c;
if (fabs(c) < epsilon) c = epsilon;
double cd = c * d;
f = f * cd;
if (fabs(1.0 - cd) < condition) return front * (f - 1.0);
}
return INFINITY;
}
不完全Beta函数是计算t分布累积分布函数的基础。这个实现采用了连分式展开法,具有以下特点:
c复制double ttest_independent(double *group1, double *group2, int n1, int n2, int equal_var) {
// 计算均值和方差
double sum1 = 0.0, sum2 = 0.0;
for (int i = 0; i < n1; ++i) sum1 += group1[i];
for (int i = 0; i < n2; ++i) sum2 += group2[i];
double mean1 = sum1 / n1;
double mean2 = sum2 / n2;
double var1 = 0.0, var2 = 0.0;
for (int i = 0; i < n1; ++i) var1 += (group1[i] - mean1) * (group1[i] - mean1);
for (int i = 0; i < n2; ++i) var2 += (group2[i] - mean2) * (group2[i] - mean2);
var1 /= (n1 - 1);
var2 /= (n2 - 1);
// 计算t统计量和自由度
double t, df;
if (equal_var) {
df = n1 + n2 - 2.0;
double pooled_var = ((n1 - 1.0) * var1 + (n2 - 1.0) * var2) / df;
t = (mean1 - mean2) / sqrt(pooled_var * (1.0 / n1 + 1.0 / n2));
} else {
double satterthwaite1 = var1 / n1;
double satterthwaite2 = var2 / n2;
t = (mean1 - mean2) / sqrt(satterthwaite1 + satterthwaite2);
double numerator = (satterthwaite1 + satterthwaite2) * (satterthwaite1 + satterthwaite2);
double denominator = (satterthwaite1 * satterthwaite1) / (n1 - 1.0) + (satterthwaite2 * satterthwaite2) / (n2 - 1.0);
df = numerator / denominator;
}
// 计算p值
double cdf = cdf_student_t(t, df);
return 2.0 * (cdf > 0.5 ? 1.0 - cdf : cdf);
}
独立样本t检验支持两种变体:
关键参数说明:
equal_var:为1表示假设两组方差相等,为0表示使用Welch校正c复制double ttest_paired(double *state1, double *state2, int n) {
if (n < 2) return NAN;
// 计算差值统计量
double sum_diff = 0.0;
double sum_square_diff = 0.0;
for (int i = 0; i < n; ++i) {
double diff = state1[i] - state2[i];
sum_diff += diff;
sum_square_diff += diff * diff;
}
// 计算t统计量
double mean_diff = sum_diff / n;
double var_diff = (sum_square_diff - (sum_diff * sum_diff / n)) / (n - 1);
double std_error = sqrt(var_diff / n);
double t = mean_diff / std_error;
// 计算p值
double cdf = cdf_student_t(t, n - 1.0);
return 2.0 * (cdf > 0.5 ? 1.0 - cdf : cdf);
}
配对t检验的实现相对简单,核心是计算前后测量的差值,然后对差值进行单样本t检验。注意点:
c复制int n1 = 6, n2 = 4;
double mu1 = 75.0, mu2 = 82.0;
double sigma1 = 8.0, sigma2 = 8.0;
double category1[n1], category2[n2];
for (int i = 0; i < n1; ++i) category1[i] = random_normal(mu1, sigma1);
for (int i = 0; i < n2; ++i) category2[i] = random_normal(mu2, sigma2);
double p = ttest_independent(category1, category2, n1, n2, sigma1 == sigma2);
printf("Independent Samples T-test: p = %.12lf\n", p);
c复制double before[] = {22,20,19,24,25,25,28,22,30,27,24,18,16,19,19,28,24,25,25,23};
double after[] = {24,22,19,22,28,26,28,24,30,29,25,20,17,18,18,28,26,27,27,24};
int n = sizeof(after)/sizeof(after[0]);
double p = ttest_paired(before, after, n);
printf("Paired Sample T-test: p = %.12lf\n", p);
c复制// Welford方法计算方差
double mean = 0.0, M2 = 0.0;
for(int i=0; i<n; i++) {
double delta = data[i] - mean;
mean += delta / (i+1);
M2 += delta * (data[i] - mean);
}
double variance = M2 / (n-1);
如果需要处理大量数据,可以考虑将计算过程并行化:
注意:在实际应用中,建议对p值进行截断处理,避免显示0.000000或1.000000这样的极端值,可以设置为如"<0.0001"这样的格式。
这个基础实现可以进一步扩展:
我在实际项目中使用这套代码进行实时数据分析,处理速度比Python实现快10倍以上,特别适合嵌入式系统和实时数据处理场景。