Use estimated multivariate CDF when we have positive reports

This commit is contained in:
Vecna 2024-05-21 19:30:09 -04:00
parent 3512adc425
commit d9aa616d77
1 changed files with 55 additions and 85 deletions

View File

@ -1,10 +1,9 @@
use crate::{BridgeInfo, BridgeInfoType};
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
use nalgebra::{Cholesky, DMatrix, DVector};
use rand::Rng;
use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal};
use nalgebra::DVector;
use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
use std::{
cmp::{max, min},
cmp::min,
collections::{BTreeMap, HashSet},
};
@ -234,39 +233,21 @@ impl NormalAnalyzer {
fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
let mean = Self::mean(data);
let std = Self::std_dev(data, mean);
(mean, std)
let std_dev = Self::std_dev(data, mean);
(mean, std_dev)
}
// Returns the mean vector, vector of individual standard deviations, and
// covariance matrix. If the standard deviation for a variable is 0 and/or
// the covariance matrix is not positive definite, add some noise to the
// data and recompute.
fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
// Returns the mean vector and covariance matrix
fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {
let n = data.len();
// Compute mean and standard deviation vectors
let (mean_vec, sd_vec) = {
// Compute mean vector
let mean_vec = {
let mut mean_vec = Vec::<f64>::new();
let mut sd_vec = Vec::<f64>::new();
for var in data {
// Compute mean
let mut sum = 0.0;
for count in *var {
sum += *count as f64;
}
let mean = sum / var.len() as f64;
// Compute standard deviation
let mut sum = 0.0;
for count in *var {
sum += (*count as f64 - mean).powi(2);
}
let sd = (sum / var.len() as f64).sqrt();
mean_vec.push(mean);
sd_vec.push(sd);
mean_vec.push(Self::mean(var));
}
(mean_vec, sd_vec)
mean_vec
};
// Compute covariance matrix
@ -296,33 +277,7 @@ impl NormalAnalyzer {
cov_mat
};
// If any standard deviation is 0 or the covariance matrix is not
// positive definite, add some noise and recompute.
let mut recompute = false;
for sd in &sd_vec {
if *sd <= 0.0 {
recompute = true;
}
}
if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
recompute = true;
}
if !recompute {
(mean_vec, sd_vec, cov_mat)
} else {
// Add random noise and recompute
let mut new_data = vec![vec![0; data[0].len()]; n];
let mut rng = rand::thread_rng();
for i in 0..n {
for j in 0..data[i].len() {
// Add 1 to some randomly selected values
new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
}
}
// Compute stats on modified data
Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
}
(mean_vec, cov_mat)
}
}
@ -357,7 +312,7 @@ impl Analyzer for NormalAnalyzer {
let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);
let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
// Model each variable with a normal distribution.
// Model negative reports separately
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
@ -402,35 +357,50 @@ impl Analyzer for NormalAnalyzer {
let alpha = 1.0 - confidence;
let (mean_vec, sd_vec, cov_mat) =
Self::stats(&[bridge_ips, negative_reports, positive_reports]);
let bridge_ips_mean = mean_vec[0];
let negative_reports_mean = mean_vec[1];
let positive_reports_mean = mean_vec[2];
let bridge_ips_sd = sd_vec[0];
let negative_reports_sd = sd_vec[1];
let positive_reports_sd = sd_vec[2];
// Model bridge IPs and positive reports with multivariate
// normal distribution
let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);
let mvn = MultivariateNormal::new(mean_vec, cov_mat);
/*
let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
let pdf = mvn.pdf(&DVector::from_vec(vec![
bridge_ips_today as f64,
negative_reports_today as f64,
positive_reports_today as f64,
]));
*/
// Model negative reports separately
let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
// Model each variable in isolation. We use the CCDF for
// negative reports because more negative reports is worse.
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
let nr_ccdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
// If we have 0 standard deviation or a covariance matrix that
// is not positive definite, we need another way to evaluate
// each variable
let positive_test = if mvn.is_ok() {
let mvn = mvn.unwrap();
// For now, just look at each variable in isolation
// TODO: How do we do a multivariate normal CDF?
bip_cdf < alpha || nr_ccdf < alpha || pr_cdf < alpha
// Estimate the CDF by integrating the PDF by hand with step
// size 1
let mut cdf = 0.0;
for bip in 0..bridge_ips_today {
for pr in 0..positive_reports_today {
cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
}
}
cdf < alpha
} else {
// Ignore positive reports and compute as in stage 2
self.stage_two(
confidence,
bridge_ips,
bridge_ips_today,
negative_reports,
negative_reports_today,
)
};
let nr_test = if negative_reports_sd > 0.0 {
// We use CCDF because more negative reports is worse.
(1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
} else {
// Consider the bridge blocked negative reports increase by
// more than 1 after a long static period. (Note that the
// mean is the exact value because we had no deviation.)
(negative_reports_today as f64) > negative_reports_mean + 1.0
};
positive_test || nr_test
}
}