From 3512adc4250455ed056cf1d793180911c780eb9d Mon Sep 17 00:00:00 2001 From: Vecna Date: Tue, 21 May 2024 17:16:26 -0400 Subject: [PATCH] Stage 2: Model as two distributions, handle 0 standard deviation --- src/analysis.rs | 77 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/src/analysis.rs b/src/analysis.rs index baf485c..d9de29f 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -216,6 +216,28 @@ impl NormalAnalyzer { } } + fn mean(data: &[u32]) -> f64 { + let mut sum = 0.0; + for count in data { + sum += *count as f64; + } + sum / data.len() as f64 + } + + fn std_dev(data: &[u32], mean: f64) -> f64 { + let mut sum = 0.0; + for count in data { + sum += (*count as f64 - mean).powi(2); + } + (sum / data.len() as f64).sqrt() + } + + fn mean_and_std_dev(data: &[u32]) -> (f64, f64) { + let mean = Self::mean(data); + let std = Self::std_dev(data, mean); + (mean, std) + } + // Returns the mean vector, vector of individual standard deviations, and // covariance matrix. 
If the standard deviation for a variable is 0 and/or // the covariance matrix is not positive definite, add some noise to the @@ -318,7 +340,7 @@ impl Analyzer for NormalAnalyzer { || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today) } - /// Evaluate invite-only bridge based on last 30 days + /// Evaluate invite-only bridge based on historical data fn stage_two( &self, confidence: f64, @@ -332,30 +354,35 @@ impl Analyzer for NormalAnalyzer { let alpha = 1.0 - confidence; - let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]); - let bridge_ips_mean = mean_vec[0]; - let negative_reports_mean = mean_vec[1]; - let bridge_ips_sd = sd_vec[0]; - let negative_reports_sd = sd_vec[1]; + let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips); + let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports); - /* - let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); - let pdf = mvn.pdf(&DVector::from_vec(vec![ - bridge_ips_today as f64, - negative_reports_today as f64, - ])); - */ + // Model each variable with a normal distribution. + let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd); + let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd); - // Model each variable in isolation. We use 1 - the CDF for - // negative reports because more negative reports is worse. - let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap(); - let bip_cdf = bip_normal.cdf(bridge_ips_today as f64); - let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap(); - let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64); + // If we have 0 standard deviation, we need another way to + // evaluate each variable + let bip_test = if bridge_ips_sd > 0.0 { + bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha + } else { + // Consider the bridge blocked if its usage dropped by more + // than 1 bin. 
(Note that the mean is the exact value + // because we had no deviation.) + (bridge_ips_today as f64) < bridge_ips_mean - 8.0 + }; + let nr_test = if negative_reports_sd > 0.0 { + // We use CCDF because more negative reports is worse. + (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha + } else { + // Consider the bridge blocked if negative reports increase by + // more than 1 after a long static period. (Note that the + // mean is the exact value because we had no deviation.) + (negative_reports_today as f64) > negative_reports_mean + 1.0 + }; - // For now, just look at each variable in isolation - // TODO: How do we do a multivariate normal CDF? - bip_cdf < alpha || nr_cdf < alpha + // Return true if any test concluded the bridge is blocked + bip_test || nr_test } /// Evaluate invite-only bridge with lv3+ users submitting positive reports @@ -393,17 +420,17 @@ impl Analyzer for NormalAnalyzer { ])); */ - // Model each variable in isolation. We use 1 - the CDF for + // Model each variable in isolation. We use the CCDF for // negative reports because more negative reports is worse. let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap(); let bip_cdf = bip_normal.cdf(bridge_ips_today as f64); let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap(); - let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64); + let nr_ccdf = 1.0 - nr_normal.cdf(negative_reports_today as f64); let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap(); let pr_cdf = pr_normal.cdf(positive_reports_today as f64); // For now, just look at each variable in isolation // TODO: How do we do a multivariate normal CDF? - bip_cdf < alpha || nr_cdf < alpha || pr_cdf < alpha + bip_cdf < alpha || nr_ccdf < alpha || pr_cdf < alpha } }