Use estimated multivariate CDF when we have positive reports

This commit is contained in:
Vecna 2024-05-21 19:30:09 -04:00
parent 3512adc425
commit d9aa616d77
1 changed files with 55 additions and 85 deletions

View File

@ -1,10 +1,9 @@
use crate::{BridgeInfo, BridgeInfoType}; use crate::{BridgeInfo, BridgeInfoType};
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL; use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
use nalgebra::{Cholesky, DMatrix, DVector}; use nalgebra::DVector;
use rand::Rng; use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal};
use std::{ use std::{
cmp::{max, min}, cmp::min,
collections::{BTreeMap, HashSet}, collections::{BTreeMap, HashSet},
}; };
@ -234,39 +233,21 @@ impl NormalAnalyzer {
fn mean_and_std_dev(data: &[u32]) -> (f64, f64) { fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
let mean = Self::mean(data); let mean = Self::mean(data);
let std = Self::std_dev(data, mean); let std_dev = Self::std_dev(data, mean);
(mean, std) (mean, std_dev)
} }
// Returns the mean vector, vector of individual standard deviations, and // Returns the mean vector and covariance matrix
// covariance matrix. If the standard deviation for a variable is 0 and/or fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {
// the covariance matrix is not positive definite, add some noise to the
// data and recompute.
fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
let n = data.len(); let n = data.len();
// Compute mean and standard deviation vectors // Compute mean vector
let (mean_vec, sd_vec) = { let mean_vec = {
let mut mean_vec = Vec::<f64>::new(); let mut mean_vec = Vec::<f64>::new();
let mut sd_vec = Vec::<f64>::new();
for var in data { for var in data {
// Compute mean mean_vec.push(Self::mean(var));
let mut sum = 0.0;
for count in *var {
sum += *count as f64;
}
let mean = sum / var.len() as f64;
// Compute standard deviation
let mut sum = 0.0;
for count in *var {
sum += (*count as f64 - mean).powi(2);
}
let sd = (sum / var.len() as f64).sqrt();
mean_vec.push(mean);
sd_vec.push(sd);
} }
(mean_vec, sd_vec) mean_vec
}; };
// Compute covariance matrix // Compute covariance matrix
@ -296,33 +277,7 @@ impl NormalAnalyzer {
cov_mat cov_mat
}; };
// If any standard deviation is 0 or the covariance matrix is not (mean_vec, cov_mat)
// positive definite, add some noise and recompute.
let mut recompute = false;
for sd in &sd_vec {
if *sd <= 0.0 {
recompute = true;
}
}
if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
recompute = true;
}
if !recompute {
(mean_vec, sd_vec, cov_mat)
} else {
// Add random noise and recompute
let mut new_data = vec![vec![0; data[0].len()]; n];
let mut rng = rand::thread_rng();
for i in 0..n {
for j in 0..data[i].len() {
// Add 1 to some randomly selected values
new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
}
}
// Compute stats on modified data
Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
}
} }
} }
@ -357,7 +312,7 @@ impl Analyzer for NormalAnalyzer {
let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips); let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);
let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports); let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
// Model each variable with a normal distribution. // Model negative reports separately
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd); let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd); let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
@ -402,35 +357,50 @@ impl Analyzer for NormalAnalyzer {
let alpha = 1.0 - confidence; let alpha = 1.0 - confidence;
let (mean_vec, sd_vec, cov_mat) = // Model bridge IPs and positive reports with multivariate
Self::stats(&[bridge_ips, negative_reports, positive_reports]); // normal distribution
let bridge_ips_mean = mean_vec[0]; let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);
let negative_reports_mean = mean_vec[1]; let mvn = MultivariateNormal::new(mean_vec, cov_mat);
let positive_reports_mean = mean_vec[2];
let bridge_ips_sd = sd_vec[0];
let negative_reports_sd = sd_vec[1];
let positive_reports_sd = sd_vec[2];
/* // Model negative reports separately
let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
let pdf = mvn.pdf(&DVector::from_vec(vec![ let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
bridge_ips_today as f64,
negative_reports_today as f64,
positive_reports_today as f64,
]));
*/
// Model each variable in isolation. We use the CCDF for // If we have 0 standard deviation or a covariance matrix that
// negative reports because more negative reports is worse. // is not positive definite, we need another way to evaluate
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap(); // each variable
let bip_cdf = bip_normal.cdf(bridge_ips_today as f64); let positive_test = if mvn.is_ok() {
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap(); let mvn = mvn.unwrap();
let nr_ccdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
// For now, just look at each variable in isolation // Estimate the CDF by integrating the PDF by hand with step
// TODO: How do we do a multivariate normal CDF? // size 1
bip_cdf < alpha || nr_ccdf < alpha || pr_cdf < alpha let mut cdf = 0.0;
for bip in 0..bridge_ips_today {
for pr in 0..positive_reports_today {
cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
}
}
cdf < alpha
} else {
// Ignore positive reports and compute as in stage 2
self.stage_two(
confidence,
bridge_ips,
bridge_ips_today,
negative_reports,
negative_reports_today,
)
};
let nr_test = if negative_reports_sd > 0.0 {
// We use CCDF because more negative reports is worse.
(1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
} else {
// Consider the bridge blocked negative reports increase by
// more than 1 after a long static period. (Note that the
// mean is the exact value because we had no deviation.)
(negative_reports_today as f64) > negative_reports_mean + 1.0
};
positive_test || nr_test
} }
} }