Compare commits
No commits in common. "d9aa616d77cdac62df06ecfe47933e2e3dd1d860" and "97d4622cd472007a71e7142716992f02743689e5" have entirely different histories.
d9aa616d77
...
97d4622cd4
205
src/analysis.rs
205
src/analysis.rs
|
@ -1,9 +1,10 @@
|
||||||
use crate::{BridgeInfo, BridgeInfoType};
|
use crate::{BridgeInfo, BridgeInfoType};
|
||||||
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
|
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
|
||||||
use nalgebra::DVector;
|
use nalgebra::{Cholesky, DMatrix, DVector};
|
||||||
use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
|
use rand::Rng;
|
||||||
|
use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal};
|
||||||
use std::{
|
use std::{
|
||||||
cmp::min,
|
cmp::{max, min},
|
||||||
collections::{BTreeMap, HashSet},
|
collections::{BTreeMap, HashSet},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -215,39 +216,35 @@ impl NormalAnalyzer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn mean(data: &[u32]) -> f64 {
|
// Returns the mean vector, vector of individual standard deviations, and
|
||||||
let mut sum = 0.0;
|
// covariance matrix. If the standard deviation for a variable is 0 and/or
|
||||||
for count in data {
|
// the covariance matrix is not positive definite, add some noise to the
|
||||||
sum += *count as f64;
|
// data and recompute.
|
||||||
}
|
fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
|
||||||
sum / data.len() as f64
|
|
||||||
}
|
|
||||||
|
|
||||||
fn std_dev(data: &[u32], mean: f64) -> f64 {
|
|
||||||
let mut sum = 0.0;
|
|
||||||
for count in data {
|
|
||||||
sum += (*count as f64 - mean).powi(2);
|
|
||||||
}
|
|
||||||
(sum / data.len() as f64).sqrt()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
|
|
||||||
let mean = Self::mean(data);
|
|
||||||
let std_dev = Self::std_dev(data, mean);
|
|
||||||
(mean, std_dev)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the mean vector and covariance matrix
|
|
||||||
fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {
|
|
||||||
let n = data.len();
|
let n = data.len();
|
||||||
|
|
||||||
// Compute mean vector
|
// Compute mean and standard deviation vectors
|
||||||
let mean_vec = {
|
let (mean_vec, sd_vec) = {
|
||||||
let mut mean_vec = Vec::<f64>::new();
|
let mut mean_vec = Vec::<f64>::new();
|
||||||
|
let mut sd_vec = Vec::<f64>::new();
|
||||||
for var in data {
|
for var in data {
|
||||||
mean_vec.push(Self::mean(var));
|
// Compute mean
|
||||||
|
let mut sum = 0.0;
|
||||||
|
for count in *var {
|
||||||
|
sum += *count as f64;
|
||||||
}
|
}
|
||||||
mean_vec
|
let mean = sum / var.len() as f64;
|
||||||
|
|
||||||
|
// Compute standard deviation
|
||||||
|
let mut sum = 0.0;
|
||||||
|
for count in *var {
|
||||||
|
sum += (*count as f64 - mean).powi(2);
|
||||||
|
}
|
||||||
|
let sd = (sum / var.len() as f64).sqrt();
|
||||||
|
mean_vec.push(mean);
|
||||||
|
sd_vec.push(sd);
|
||||||
|
}
|
||||||
|
(mean_vec, sd_vec)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Compute covariance matrix
|
// Compute covariance matrix
|
||||||
|
@ -277,7 +274,33 @@ impl NormalAnalyzer {
|
||||||
cov_mat
|
cov_mat
|
||||||
};
|
};
|
||||||
|
|
||||||
(mean_vec, cov_mat)
|
// If any standard deviation is 0 or the covariance matrix is not
|
||||||
|
// positive definite, add some noise and recompute.
|
||||||
|
let mut recompute = false;
|
||||||
|
for sd in &sd_vec {
|
||||||
|
if *sd <= 0.0 {
|
||||||
|
recompute = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
|
||||||
|
recompute = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if !recompute {
|
||||||
|
(mean_vec, sd_vec, cov_mat)
|
||||||
|
} else {
|
||||||
|
// Add random noise and recompute
|
||||||
|
let mut new_data = vec![vec![0; data[0].len()]; n];
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..data[i].len() {
|
||||||
|
// Add 1 to some randomly selected values
|
||||||
|
new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Compute stats on modified data
|
||||||
|
Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -295,7 +318,7 @@ impl Analyzer for NormalAnalyzer {
|
||||||
|| f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
|
|| f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Evaluate invite-only bridge based on historical data
|
/// Evaluate invite-only bridge based on last 30 days
|
||||||
fn stage_two(
|
fn stage_two(
|
||||||
&self,
|
&self,
|
||||||
confidence: f64,
|
confidence: f64,
|
||||||
|
@ -309,35 +332,30 @@ impl Analyzer for NormalAnalyzer {
|
||||||
|
|
||||||
let alpha = 1.0 - confidence;
|
let alpha = 1.0 - confidence;
|
||||||
|
|
||||||
let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);
|
let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]);
|
||||||
let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
|
let bridge_ips_mean = mean_vec[0];
|
||||||
|
let negative_reports_mean = mean_vec[1];
|
||||||
|
let bridge_ips_sd = sd_vec[0];
|
||||||
|
let negative_reports_sd = sd_vec[1];
|
||||||
|
|
||||||
// Model negative reports separately
|
/*
|
||||||
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
|
let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
|
||||||
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
|
let pdf = mvn.pdf(&DVector::from_vec(vec![
|
||||||
|
bridge_ips_today as f64,
|
||||||
|
negative_reports_today as f64,
|
||||||
|
]));
|
||||||
|
*/
|
||||||
|
|
||||||
// If we have 0 standard deviation, we need another way to
|
// Model each variable in isolation. We use 1 - the CDF for
|
||||||
// evaluate each variable
|
// negative reports because more negative reports is worse.
|
||||||
let bip_test = if bridge_ips_sd > 0.0 {
|
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
|
||||||
bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha
|
let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
|
||||||
} else {
|
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
|
||||||
// Consider the bridge blocked if its usage dropped by more
|
let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
|
||||||
// than 1 bin. (Note that the mean is the exact value
|
|
||||||
// because we had no deviation.)
|
|
||||||
(bridge_ips_today as f64) < bridge_ips_mean - 8.0
|
|
||||||
};
|
|
||||||
let nr_test = if negative_reports_sd > 0.0 {
|
|
||||||
// We use CCDF because more negative reports is worse.
|
|
||||||
(1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
|
|
||||||
} else {
|
|
||||||
// Consider the bridge blocked negative reports increase by
|
|
||||||
// more than 1 after a long static period. (Note that the
|
|
||||||
// mean is the exact value because we had no deviation.)
|
|
||||||
(negative_reports_today as f64) > negative_reports_mean + 1.0
|
|
||||||
};
|
|
||||||
|
|
||||||
// Return true if any test concluded the bridge is blocked
|
// For now, just look at each variable in isolation
|
||||||
bip_test || nr_test
|
// TODO: How do we do a multivariate normal CDF?
|
||||||
|
bip_cdf < alpha || nr_cdf < alpha
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Evaluate invite-only bridge with lv3+ users submitting positive reports
|
/// Evaluate invite-only bridge with lv3+ users submitting positive reports
|
||||||
|
@ -357,50 +375,35 @@ impl Analyzer for NormalAnalyzer {
|
||||||
|
|
||||||
let alpha = 1.0 - confidence;
|
let alpha = 1.0 - confidence;
|
||||||
|
|
||||||
// Model bridge IPs and positive reports with multivariate
|
let (mean_vec, sd_vec, cov_mat) =
|
||||||
// normal distribution
|
Self::stats(&[bridge_ips, negative_reports, positive_reports]);
|
||||||
let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);
|
let bridge_ips_mean = mean_vec[0];
|
||||||
let mvn = MultivariateNormal::new(mean_vec, cov_mat);
|
let negative_reports_mean = mean_vec[1];
|
||||||
|
let positive_reports_mean = mean_vec[2];
|
||||||
|
let bridge_ips_sd = sd_vec[0];
|
||||||
|
let negative_reports_sd = sd_vec[1];
|
||||||
|
let positive_reports_sd = sd_vec[2];
|
||||||
|
|
||||||
// Model negative reports separately
|
/*
|
||||||
let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
|
let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
|
||||||
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
|
let pdf = mvn.pdf(&DVector::from_vec(vec![
|
||||||
|
bridge_ips_today as f64,
|
||||||
|
negative_reports_today as f64,
|
||||||
|
positive_reports_today as f64,
|
||||||
|
]));
|
||||||
|
*/
|
||||||
|
|
||||||
// If we have 0 standard deviation or a covariance matrix that
|
// Model each variable in isolation. We use 1 - the CDF for
|
||||||
// is not positive definite, we need another way to evaluate
|
// negative reports because more negative reports is worse.
|
||||||
// each variable
|
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
|
||||||
let positive_test = if mvn.is_ok() {
|
let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
|
||||||
let mvn = mvn.unwrap();
|
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
|
||||||
|
let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
|
||||||
|
let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
|
||||||
|
let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
|
||||||
|
|
||||||
// Estimate the CDF by integrating the PDF by hand with step
|
// For now, just look at each variable in isolation
|
||||||
// size 1
|
// TODO: How do we do a multivariate normal CDF?
|
||||||
let mut cdf = 0.0;
|
bip_cdf < alpha || nr_cdf < alpha || pr_cdf < alpha
|
||||||
for bip in 0..bridge_ips_today {
|
|
||||||
for pr in 0..positive_reports_today {
|
|
||||||
cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cdf < alpha
|
|
||||||
} else {
|
|
||||||
// Ignore positive reports and compute as in stage 2
|
|
||||||
self.stage_two(
|
|
||||||
confidence,
|
|
||||||
bridge_ips,
|
|
||||||
bridge_ips_today,
|
|
||||||
negative_reports,
|
|
||||||
negative_reports_today,
|
|
||||||
)
|
|
||||||
};
|
|
||||||
let nr_test = if negative_reports_sd > 0.0 {
|
|
||||||
// We use CCDF because more negative reports is worse.
|
|
||||||
(1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
|
|
||||||
} else {
|
|
||||||
// Consider the bridge blocked negative reports increase by
|
|
||||||
// more than 1 after a long static period. (Note that the
|
|
||||||
// mean is the exact value because we had no deviation.)
|
|
||||||
(negative_reports_today as f64) > negative_reports_mean + 1.0
|
|
||||||
};
|
|
||||||
|
|
||||||
positive_test || nr_test
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue