diff --git a/src/analysis.rs b/src/analysis.rs index d9de29f..9a84de0 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -1,10 +1,9 @@ use crate::{BridgeInfo, BridgeInfoType}; use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL; -use nalgebra::{Cholesky, DMatrix, DVector}; -use rand::Rng; -use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal}; +use nalgebra::DVector; +use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal}; use std::{ - cmp::{max, min}, + cmp::min, collections::{BTreeMap, HashSet}, }; @@ -234,39 +233,21 @@ impl NormalAnalyzer { fn mean_and_std_dev(data: &[u32]) -> (f64, f64) { let mean = Self::mean(data); - let std = Self::std_dev(data, mean); - (mean, std) + let std_dev = Self::std_dev(data, mean); + (mean, std_dev) } - // Returns the mean vector, vector of individual standard deviations, and - // covariance matrix. If the standard deviation for a variable is 0 and/or - // the covariance matrix is not positive definite, add some noise to the - // data and recompute. - fn stats(data: &[&[u32]]) -> (Vec, Vec, Vec) { + // Returns the mean vector and covariance matrix + fn stats(data: &[&[u32]]) -> (Vec, Vec) { let n = data.len(); - // Compute mean and standard deviation vectors - let (mean_vec, sd_vec) = { + // Compute mean vector + let mean_vec = { let mut mean_vec = Vec::::new(); - let mut sd_vec = Vec::::new(); for var in data { - // Compute mean - let mut sum = 0.0; - for count in *var { - sum += *count as f64; - } - let mean = sum / var.len() as f64; - - // Compute standard deviation - let mut sum = 0.0; - for count in *var { - sum += (*count as f64 - mean).powi(2); - } - let sd = (sum / var.len() as f64).sqrt(); - mean_vec.push(mean); - sd_vec.push(sd); + mean_vec.push(Self::mean(var)); } - (mean_vec, sd_vec) + mean_vec }; // Compute covariance matrix @@ -296,33 +277,7 @@ impl NormalAnalyzer { cov_mat }; - // If any standard deviation is 0 or the covariance matrix is not - // positive definite, add some noise and recompute. - let mut recompute = false; - for sd in &sd_vec { - if *sd <= 0.0 { - recompute = true; - } - } - if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() { - recompute = true; - } - - if !recompute { - (mean_vec, sd_vec, cov_mat) - } else { - // Add random noise and recompute - let mut new_data = vec![vec![0; data[0].len()]; n]; - let mut rng = rand::thread_rng(); - for i in 0..n { - for j in 0..data[i].len() { - // Add 1 to some randomly selected values - new_data[i][j] = data[i][j] + rng.gen_range(0..=1); - } - } - // Compute stats on modified data - Self::stats(&new_data.iter().map(Vec::as_slice).collect::>()) - } + (mean_vec, cov_mat) } } @@ -357,7 +312,7 @@ impl Analyzer for NormalAnalyzer { let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips); let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports); - // Model each variable with a normal distribution. + // Model negative reports separately let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd); let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd); @@ -402,35 +357,50 @@ impl Analyzer for NormalAnalyzer { let alpha = 1.0 - confidence; - let (mean_vec, sd_vec, cov_mat) = - Self::stats(&[bridge_ips, negative_reports, positive_reports]); - let bridge_ips_mean = mean_vec[0]; - let negative_reports_mean = mean_vec[1]; - let positive_reports_mean = mean_vec[2]; - let bridge_ips_sd = sd_vec[0]; - let negative_reports_sd = sd_vec[1]; - let positive_reports_sd = sd_vec[2]; + // Model bridge IPs and positive reports with multivariate + // normal distribution + let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]); + let mvn = MultivariateNormal::new(mean_vec, cov_mat); - /* - let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); - let pdf = mvn.pdf(&DVector::from_vec(vec![ - bridge_ips_today as f64, - negative_reports_today as f64, - positive_reports_today as f64, - ])); - */ + // Model negative reports separately + let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports); + let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd); - // Model each variable in isolation. We use the CCDF for - // negative reports because more negative reports is worse. - let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap(); - let bip_cdf = bip_normal.cdf(bridge_ips_today as f64); - let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap(); - let nr_ccdf = 1.0 - nr_normal.cdf(negative_reports_today as f64); - let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap(); - let pr_cdf = pr_normal.cdf(positive_reports_today as f64); + // If we have 0 standard deviation or a covariance matrix that + // is not positive definite, we need another way to evaluate + // each variable + let positive_test = if mvn.is_ok() { + let mvn = mvn.unwrap(); - // For now, just look at each variable in isolation - // TODO: How do we do a multivariate normal CDF? - bip_cdf < alpha || nr_ccdf < alpha || pr_cdf < alpha + // Estimate the CDF by integrating the PDF by hand with step + // size 1 + let mut cdf = 0.0; + for bip in 0..bridge_ips_today { + for pr in 0..positive_reports_today { + cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64])); + } + } + cdf < alpha + } else { + // Ignore positive reports and compute as in stage 2 + self.stage_two( + confidence, + bridge_ips, + bridge_ips_today, + negative_reports, + negative_reports_today, + ) + }; + let nr_test = if negative_reports_sd > 0.0 { + // We use CCDF because more negative reports is worse. + (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha + } else { + // Consider the bridge blocked negative reports increase by + // more than 1 after a long static period. (Note that the + // mean is the exact value because we had no deviation.) + (negative_reports_today as f64) > negative_reports_mean + 1.0 + }; + + positive_test || nr_test } }