1 changed file with 104 additions and 101 deletions
--- a/src/analysis.rs
+++ b/src/analysis.rs
@@ -1,9 +1,10 @@
 use crate::{BridgeInfo, BridgeInfoType};
 use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
-use nalgebra::DVector;
+use nalgebra::{Cholesky, DMatrix, DVector};
-use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
+use rand::Rng;
 use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal};
 use std::{
-    cmp::min,
+    cmp::{max, min},
    collections::{BTreeMap, HashSet},
 };
@@ -215,39 +216,35 @@ impl NormalAnalyzer {
        }
    }
-    fn mean(data: &[u32]) -> f64 {
+    // Returns the mean vector, vector of individual standard deviations, and
-        let mut sum = 0.0;
+    // covariance matrix. If the standard deviation for a variable is 0 and/or
-        for count in data {
+    // the covariance matrix is not positive definite, add some noise to the
-            sum += *count as f64;
+    // data and recompute.
-        }
+    fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
        sum / data.len() as f64
    }
    fn std_dev(data: &[u32], mean: f64) -> f64 {
        let mut sum = 0.0;
        for count in data {
            sum += (*count as f64 - mean).powi(2);
        }
        (sum / data.len() as f64).sqrt()
    }
    fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
        let mean = Self::mean(data);
        let std_dev = Self::std_dev(data, mean);
        (mean, std_dev)
    }
    // Returns the mean vector and covariance matrix
    fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {
        let n = data.len();
-        // Compute mean vector
+        // Compute mean and standard deviation vectors
-        let mean_vec = {
+        let (mean_vec, sd_vec) = {
            let mut mean_vec = Vec::<f64>::new();
            let mut sd_vec = Vec::<f64>::new();
            for var in data {
-                mean_vec.push(Self::mean(var));
+                // Compute mean
                let mut sum = 0.0;
                for count in *var {
                    sum += *count as f64;
                }
-            mean_vec
+                let mean = sum / var.len() as f64;
                // Compute standard deviation
                let mut sum = 0.0;
                for count in *var {
                    sum += (*count as f64 - mean).powi(2);
                }
                let sd = (sum / var.len() as f64).sqrt();
                mean_vec.push(mean);
                sd_vec.push(sd);
            }
            (mean_vec, sd_vec)
        };
        // Compute covariance matrix
@@ -277,7 +274,33 @@ impl NormalAnalyzer {
            cov_mat
        };
-        (mean_vec, cov_mat)
+        // If any standard deviation is 0 or the covariance matrix is not
        // positive definite, add some noise and recompute.
        let mut recompute = false;
        for sd in &sd_vec {
            if *sd <= 0.0 {
                recompute = true;
            }
        }
        if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
            recompute = true;
        }
        if !recompute {
            (mean_vec, sd_vec, cov_mat)
        } else {
            // Add random noise and recompute
            let mut new_data = vec![vec![0; data[0].len()]; n];
            let mut rng = rand::thread_rng();
            for i in 0..n {
                for j in 0..data[i].len() {
                    // Add 1 to some randomly selected values
                    new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
                }
            }
            // Compute stats on modified data
            Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
        }
    }
 }
@@ -295,7 +318,7 @@ impl Analyzer for NormalAnalyzer {
            || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
    }
-    /// Evaluate invite-only bridge based on historical data
+    /// Evaluate invite-only bridge based on last 30 days
    fn stage_two(
        &self,
        confidence: f64,
@@ -309,35 +332,30 @@ impl Analyzer for NormalAnalyzer {
        let alpha = 1.0 - confidence;
-        let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);
+        let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]);
-        let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
+        let bridge_ips_mean = mean_vec[0];
        let negative_reports_mean = mean_vec[1];
        let bridge_ips_sd = sd_vec[0];
        let negative_reports_sd = sd_vec[1];
-        // Model negative reports separately
+        /*
-        let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
+                let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
-        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
+                let pdf = mvn.pdf(&DVector::from_vec(vec![
                    bridge_ips_today as f64,
                    negative_reports_today as f64,
                ]));
        */
-        // If we have 0 standard deviation, we need another way to
+        // Model each variable in isolation. We use 1 - the CDF for
-        // evaluate each variable
+        // negative reports because more negative reports is worse.
-        let bip_test = if bridge_ips_sd > 0.0 {
+        let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
-            bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha
+        let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
-        } else {
+        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
-            // Consider the bridge blocked if its usage dropped by more
+        let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
            // than 1 bin. (Note that the mean is the exact value
            // because we had no deviation.)
            (bridge_ips_today as f64) < bridge_ips_mean - 8.0
        };
        let nr_test = if negative_reports_sd > 0.0 {
            // We use CCDF because more negative reports is worse.
            (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
        } else {
            // Consider the bridge blocked negative reports increase by
            // more than 1 after a long static period. (Note that the
            // mean is the exact value because we had no deviation.)
            (negative_reports_today as f64) > negative_reports_mean + 1.0
        };
-        // Return true if any test concluded the bridge is blocked
+        // For now, just look at each variable in isolation
-        bip_test || nr_test
+        // TODO: How do we do a multivariate normal CDF?
        bip_cdf < alpha || nr_cdf < alpha
    }
    /// Evaluate invite-only bridge with lv3+ users submitting positive reports
@@ -357,50 +375,35 @@ impl Analyzer for NormalAnalyzer {
        let alpha = 1.0 - confidence;
-        // Model bridge IPs and positive reports with multivariate
+        let (mean_vec, sd_vec, cov_mat) =
-        // normal distribution
+            Self::stats(&[bridge_ips, negative_reports, positive_reports]);
-        let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);
+        let bridge_ips_mean = mean_vec[0];
-        let mvn = MultivariateNormal::new(mean_vec, cov_mat);
+        let negative_reports_mean = mean_vec[1];
        let positive_reports_mean = mean_vec[2];
        let bridge_ips_sd = sd_vec[0];
        let negative_reports_sd = sd_vec[1];
        let positive_reports_sd = sd_vec[2];
-        // Model negative reports separately
+        /*
-        let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
+                let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
-        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
+                let pdf = mvn.pdf(&DVector::from_vec(vec![
                    bridge_ips_today as f64,
                    negative_reports_today as f64,
                    positive_reports_today as f64,
                ]));
        */
-        // If we have 0 standard deviation or a covariance matrix that
+        // Model each variable in isolation. We use 1 - the CDF for
-        // is not positive definite, we need another way to evaluate
+        // negative reports because more negative reports is worse.
-        // each variable
+        let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
-        let positive_test = if mvn.is_ok() {
+        let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
-            let mvn = mvn.unwrap();
+        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
        let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
        let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
        let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
-            // Estimate the CDF by integrating the PDF by hand with step
+        // For now, just look at each variable in isolation
-            // size 1
+        // TODO: How do we do a multivariate normal CDF?
-            let mut cdf = 0.0;
+        bip_cdf < alpha || nr_cdf < alpha || pr_cdf < alpha
            for bip in 0..bridge_ips_today {
                for pr in 0..positive_reports_today {
                    cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
                }
            }
            cdf < alpha
        } else {
            // Ignore positive reports and compute as in stage 2
            self.stage_two(
                confidence,
                bridge_ips,
                bridge_ips_today,
                negative_reports,
                negative_reports_today,
            )
        };
        let nr_test = if negative_reports_sd > 0.0 {
            // We use CCDF because more negative reports is worse.
            (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
        } else {
            // Consider the bridge blocked negative reports increase by
            // more than 1 after a long static period. (Note that the
            // mean is the exact value because we had no deviation.)
            (negative_reports_today as f64) > negative_reports_mean + 1.0
        };
        positive_test || nr_test
    }
 }