troll-patrol/src/analysis.rs

use crate::{BridgeInfo, BridgeInfoType};
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
use nalgebra::DVector;
use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
use std::{
    cmp::min,
    collections::{BTreeMap, HashSet},
};

/// Strategy for predicting whether a bridge is blocked in one country.
///
/// `blocked_in` calls exactly one of the three stages per country,
/// depending on how long the bridge has been seen there and on whether
/// enough positive-report history exists. In every stage:
///
/// * `confidence` is the value the caller handed to `blocked_in`,
///   forwarded unchanged;
/// * the slice arguments are per-day historical counts, oldest day first,
///   all of the same length and not including today;
/// * the `*_today` arguments are today's counts (0 when no data exists).
pub trait Analyzer {
    /// Evaluate open-entry bridge. Returns true if blocked, false otherwise.
    ///
    /// `blocked_in` also uses this stage when fewer than
    /// `min_historical_days` days of history are available.
    fn stage_one(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Evaluate invite-only bridge without positive reports. Return true if
    /// blocked, false otherwise.
    ///
    /// `blocked_in` uses this stage until `min_historical_days` days have
    /// passed since the first positive report.
    fn stage_two(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Evaluate invite-only bridge with positive reports. Return true if
    /// blocked, false otherwise.
    ///
    /// In addition to the stage-two inputs, this stage receives the
    /// positive-report history and today's positive-report count.
    fn stage_three(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool;
}

/// Predicts the countries in which a bridge is blocked as of `date`.
///
/// Countries already marked `blocked` in `bridge_info` are always included
/// (bridges are assumed never to become unblocked). Every other country is
/// evaluated with one stage of `analyzer`, chosen as follows:
///
/// * stage one if the bridge has been seen in that country for fewer than
///   `UNTRUSTED_INTERVAL` days or fewer than `min_historical_days` days;
/// * stage two if no positive report has been seen yet, or the first one
///   is less than `min_historical_days` days old;
/// * stage three otherwise.
///
/// The historical series handed to the analyzer cover at most
/// `max_historical_days` days, oldest first; days or counters without data
/// count as 0. `confidence` is passed to the analyzer unchanged.
///
/// Returns the set of country codes where the bridge is believed to be
/// blocked.
pub fn blocked_in(
    analyzer: &dyn Analyzer,
    bridge_info: &BridgeInfo,
    confidence: f64,
    date: u32,
    min_historical_days: u32,
    max_historical_days: u32,
) -> HashSet<String> {
    // Reads one counter from an optional day record. A missing day and a
    // missing counter both count as 0.
    let count = |day: Option<&BTreeMap<BridgeInfoType, u32>>, kind: BridgeInfoType| -> u32 {
        day.and_then(|counts| counts.get(&kind))
            .copied()
            .unwrap_or(0)
    };

    let today = date;
    let mut blocked_in = HashSet::<String>::new();

    for (country, info) in &bridge_info.info_by_country {
        if info.blocked {
            // Assume bridges never become unblocked
            blocked_in.insert(country.to_string());
            continue;
        }

        // Saturate instead of underflowing if the bridge was first seen
        // after `date`; such a bridge is treated as brand new.
        let age = today.saturating_sub(info.first_seen);

        // Get today's values
        // TODO: Evaluate on yesterday if we don't have data for today?
        let today_info = info.info_by_day.get(&today);
        let bridge_ips_today = count(today_info, BridgeInfoType::BridgeIps);
        let negative_reports_today = count(today_info, BridgeInfoType::NegativeReports);
        let positive_reports_today = count(today_info, BridgeInfoType::PositiveReports);

        // Get time series for last num_days
        let num_days = min(age, max_historical_days);
        // Cannot underflow: num_days <= age <= today.
        let window_start = today - num_days;
        let capacity = num_days as usize;
        let mut bridge_ips = Vec::with_capacity(capacity);
        let mut negative_reports = Vec::with_capacity(capacity);
        let mut positive_reports = Vec::with_capacity(capacity);

        for offset in 0..num_days {
            // NOTE(review): the window covers `today - num_days - 1` through
            // `today - 2`, i.e. it skips yesterday. The offset is kept
            // exactly as before; confirm that it is intended.
            // `checked_sub` keeps day 0 from underflowing; a day that would
            // precede day 0 simply has no data.
            let day_info = (window_start + offset)
                .checked_sub(1)
                .and_then(|day| info.info_by_day.get(&day));
            bridge_ips.push(count(day_info, BridgeInfoType::BridgeIps));
            negative_reports.push(count(day_info, BridgeInfoType::NegativeReports));
            positive_reports.push(count(day_info, BridgeInfoType::PositiveReports));
        }

        // Evaluate using appropriate stage based on age of the bridge
        let is_blocked = if age < UNTRUSTED_INTERVAL || age < min_historical_days {
            // open-entry bridge and/or not enough days of historical data
            // for stages 2 and 3
            analyzer.stage_one(
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
            )
        } else {
            match info.first_pr {
                // invite-only bridge that has min_historical_days or more
                // of historical data since the first positive report
                Some(first_pr) if today >= first_pr.saturating_add(min_historical_days) => {
                    analyzer.stage_three(
                        confidence,
                        &bridge_ips,
                        bridge_ips_today,
                        &negative_reports,
                        negative_reports_today,
                        &positive_reports,
                        positive_reports_today,
                    )
                }
                // invite-only bridge without min_historical_days of
                // historical data on positive reports
                _ => analyzer.stage_two(
                    confidence,
                    &bridge_ips,
                    bridge_ips_today,
                    &negative_reports,
                    negative_reports_today,
                ),
            }
        };

        if is_blocked {
            blocked_in.insert(country.to_string());
        }
    }

    blocked_in
}

// Analyzer implementations

/// Dummy example that never thinks bridges are blocked
///
/// Every stage ignores all of its inputs and returns `false`.
pub struct ExampleAnalyzer {}

impl Analyzer for ExampleAnalyzer {
    /// Always reports "not blocked"; all arguments are ignored.
    fn stage_one(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Always reports "not blocked"; all arguments are ignored.
    fn stage_two(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Always reports "not blocked"; all arguments are ignored.
    fn stage_three(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
        _positive_reports: &[u32],
        _positive_reports_today: u32,
    ) -> bool {
        false
    }
}

/// Analyzer that models per-day counts with (multivariate) normal
/// distributions.
pub struct NormalAnalyzer {
    max_threshold: u32,
    scaling_factor: f64,
}

impl NormalAnalyzer {
    /// Builds an analyzer from the two tuning parameters it stores.
    pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
        Self {
            max_threshold,
            scaling_factor,
        }
    }

    /// Arithmetic mean of `data`; NaN when `data` is empty.
    fn mean(data: &[u32]) -> f64 {
        let total = data
            .iter()
            .fold(0.0_f64, |acc, &count| acc + f64::from(count));
        total / data.len() as f64
    }

    /// Population standard deviation of `data` around the supplied `mean`
    /// (the sum of squared deviations is divided by `n`, not `n - 1`).
    fn std_dev(data: &[u32], mean: f64) -> f64 {
        let squared_deviations = data
            .iter()
            .map(|&count| (f64::from(count) - mean).powi(2))
            .fold(0.0_f64, |acc, squared| acc + squared);
        (squared_deviations / data.len() as f64).sqrt()
    }

    /// Convenience wrapper returning `(mean, standard deviation)` of `data`.
    fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
        let average = Self::mean(data);
        (average, Self::std_dev(data, average))
    }

    /// Returns the mean vector and the covariance matrix of the given
    /// variables. The matrix is flattened row by row, so it has
    /// `data.len() * data.len()` entries, and uses the sample (`n - 1`)
    /// normalisation.
    ///
    /// Panics if the variables do not all have the same length.
    fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {
        let mean_vec: Vec<f64> = data.iter().map(|series| Self::mean(series)).collect();

        // Every entry is computed independently; the symmetric entries are
        // not reused.
        let mut cov_mat = Vec::with_capacity(data.len() * data.len());
        for (row, row_mean) in data.iter().zip(&mean_vec) {
            for (col, col_mean) in data.iter().zip(&mean_vec) {
                assert_eq!(row.len(), col.len());

                let cross_sum = row
                    .iter()
                    .zip(col.iter())
                    .fold(0.0_f64, |acc, (&x, &y)| {
                        acc + (f64::from(x) - row_mean) * (f64::from(y) - col_mean)
                    });
                cov_mat.push(cross_sum / (row.len() - 1) as f64);
            }
        }

        (mean_vec, cov_mat)
    }
}

impl NormalAnalyzer {
    /// Returns true if today's negative-report count is anomalously high
    /// compared with its history, at significance level `alpha`.
    fn negative_reports_anomalous(
        alpha: f64,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        let (mean, sd) = Self::mean_and_std_dev(negative_reports);
        let today = f64::from(negative_reports_today);

        // Only use CCDF test if today's numbers are worse than average
        if today > mean {
            if sd > 0.0 {
                let normal = Normal::new(mean, sd)
                    .expect("mean is finite and standard deviation is positive");
                // We use CCDF because more negative reports is worse.
                (1.0 - normal.cdf(today)) < alpha
            } else {
                // If the standard deviation is 0, we need another option.
                // Consider the bridge blocked if negative reports increase
                // by more than 1 after a long static period. (Note that the
                // mean is the exact value because we had no deviation.)
                today > mean + 1.0
            }
        } else {
            false
        }
    }

    /// Returns true if today's bridge IP count is anomalously low compared
    /// with its history, at significance level `alpha`.
    fn bridge_ips_anomalous(alpha: f64, bridge_ips: &[u32], bridge_ips_today: u32) -> bool {
        let (mean, sd) = Self::mean_and_std_dev(bridge_ips);
        let today = f64::from(bridge_ips_today);

        // Only use CDF test if today's numbers are worse than average
        if today < mean {
            if sd > 0.0 {
                let normal = Normal::new(mean, sd)
                    .expect("mean is finite and standard deviation is positive");
                normal.cdf(today) < alpha
            } else {
                // If the standard deviation is 0, we need another option.
                // Consider the bridge blocked if its usage dropped by more
                // than 1 bin (8; presumably bridge IP counts are reported in
                // multiples of 8 -- TODO confirm). (Note that the mean is
                // the exact value because we had no deviation.)
                today < mean - 8.0
            }
        } else {
            false
        }
    }
}

impl Analyzer for NormalAnalyzer {
    /// Evaluate open-entry bridge based on only today's data
    ///
    /// The bridge is considered blocked if today's negative reports exceed
    /// either the absolute `max_threshold` or `scaling_factor` times
    /// today's bridge IP count. Historical data and `confidence` are unused.
    fn stage_one(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        bridge_ips_today: u32,
        _negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        let over_absolute_limit = negative_reports_today > self.max_threshold;
        let over_relative_limit =
            f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today);
        over_absolute_limit || over_relative_limit
    }

    /// Evaluate invite-only bridge based on historical data
    ///
    /// Negative reports and bridge IPs are modelled as two independent
    /// normal distributions. The bridge is considered blocked if today's
    /// negative reports are improbably high or today's bridge IP count is
    /// improbably low, where "improbable" means a tail probability below
    /// `1 - confidence`.
    ///
    /// Panics if fewer than `UNTRUSTED_INTERVAL` days of history are given
    /// or if the two series differ in length.
    fn stage_two(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
        assert_eq!(bridge_ips.len(), negative_reports.len());

        let alpha = 1.0 - confidence;

        // Evaluate based on negative reports first, then on bridge stats.
        Self::negative_reports_anomalous(alpha, negative_reports, negative_reports_today)
            || Self::bridge_ips_anomalous(alpha, bridge_ips, bridge_ips_today)
    }

    /// Evaluate invite-only bridge with lv3+ users submitting positive reports
    ///
    /// Negative reports are tested as in stage two. Bridge IPs and positive
    /// reports are then modelled jointly as a bivariate normal distribution
    /// whose CDF at today's values is estimated by summing the PDF over an
    /// integer grid.
    ///
    /// Panics if fewer than `UNTRUSTED_INTERVAL` days of history are given
    /// or if the series differ in length.
    fn stage_three(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool {
        assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
        assert_eq!(bridge_ips.len(), negative_reports.len());
        assert_eq!(bridge_ips.len(), positive_reports.len());

        let alpha = 1.0 - confidence;

        // Evaluate based on negative reports. It is better to compute
        // negative reports test first because the positive test may be
        // expensive.
        if Self::negative_reports_anomalous(alpha, negative_reports, negative_reports_today) {
            return true;
        }

        // Evaluate based on bridge stats and positive reports.
        let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);

        // Only use CDF test if today's numbers are worse than average
        let below_average = f64::from(bridge_ips_today) < mean_vec[0]
            || f64::from(positive_reports_today) < mean_vec[1];
        if !below_average {
            return false;
        }

        match MultivariateNormal::new(mean_vec, cov_mat) {
            Ok(mvn) => {
                // Estimate the CDF by integrating the PDF by hand with step
                // size 1.
                // NOTE(review): if either of today's counts is 0 the grid is
                // empty, the estimate stays 0, and the bridge is reported
                // blocked whenever alpha > 0 -- confirm this is intended.
                let mut cdf = 0.0;
                'grid: for bip in 0..bridge_ips_today {
                    for pr in 0..positive_reports_today {
                        cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
                        // PDF terms are non-negative, so once the running
                        // sum reaches alpha the verdict cannot change; stop
                        // instead of evaluating the rest of the grid.
                        if cdf >= alpha {
                            break 'grid;
                        }
                    }
                }
                cdf < alpha
            }
            // If we have 0 standard deviation or a covariance matrix that
            // is not positive definite, we need another way to evaluate
            // each variable. Ignore positive reports and compute as in
            // stage 2. The negative-report test already came back false
            // above, so only the bridge IP test remains.
            Err(_) => Self::bridge_ips_anomalous(alpha, bridge_ips, bridge_ips_today),
        }
    }
}
Pass date for analysis, compute age correctly 2024-04-15 15:21:35 -04:00			`use crate::{BridgeInfo, BridgeInfoType};`
Remove unused import 2024-04-27 14:32:26 -04:00			`use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`use nalgebra::DVector;`
			`use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`use std::{`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`cmp::min,`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`collections::{BTreeMap, HashSet},`
			`};`

			`/// Provides a function for predicting which countries block this bridge`
			`pub trait Analyzer {`
			`/// Evaluate open-entry bridge. Returns true if blocked, false otherwise.`
			`fn stage_one(`
			`&self,`
			`confidence: f64,`
			`bridge_ips: &[u32],`
			`bridge_ips_today: u32,`
			`negative_reports: &[u32],`
			`negative_reports_today: u32,`
			`) -> bool;`

			`/// Evaluate invite-only bridge without positive reports. Return true if`
			`/// blocked, false otherwise.`
			`fn stage_two(`
			`&self,`
			`confidence: f64,`
			`bridge_ips: &[u32],`
			`bridge_ips_today: u32,`
			`negative_reports: &[u32],`
			`negative_reports_today: u32,`
			`) -> bool;`

			`/// Evaluate invite-only bridge with positive reports. Return true if`
			`/// blocked, false otherwise.`
			`fn stage_three(`
			`&self,`
			`confidence: f64,`
			`bridge_ips: &[u32],`
			`bridge_ips_today: u32,`
			`negative_reports: &[u32],`
			`negative_reports_today: u32,`
			`positive_reports: &[u32],`
			`positive_reports_today: u32,`
			`) -> bool;`
			`}`

			`/// Accepts an analyzer, information about a bridge, and a confidence value.`
			`/// Returns a set of country codes where the bridge is believed to be blocked.`
			`pub fn blocked_in(`
			`analyzer: &dyn Analyzer,`
			`bridge_info: &BridgeInfo,`
			`confidence: f64,`
Pass date for analysis, compute age correctly 2024-04-15 15:21:35 -04:00			`date: u32,`
Only re-evaluate new data, analyze based on user-defined interval 2024-05-20 20:38:06 -04:00			`min_historical_days: u32,`
			`max_historical_days: u32,`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`) -> HashSet<String> {`
			`let mut blocked_in = HashSet::<String>::new();`
Pass date for analysis, compute age correctly 2024-04-15 15:21:35 -04:00			`let today = date;`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`for (country, info) in &bridge_info.info_by_country {`
Start stage 3 based on actual observation of a positive report Also track how much historical data we have on a per-country basis 2024-04-27 13:20:05 -04:00			`let age = today - info.first_seen;`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`if info.blocked {`
			`// Assume bridges never become unblocked`
			`blocked_in.insert(country.to_string());`
			`} else {`
			`// Get today's values`
			`let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();`
			`// TODO: Evaluate on yesterday if we don't have data for today?`
			`let today_info = match info.info_by_day.get(&today) {`
			`Some(v) => v,`
			`None => &new_map_binding,`
			`};`
			`let bridge_ips_today = match today_info.get(&BridgeInfoType::BridgeIps) {`
Don't scale bridge IPs in analysis 2024-04-26 12:58:03 -04:00			`Some(&v) => v,`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`None => 0,`
			`};`
			`let negative_reports_today = match today_info.get(&BridgeInfoType::NegativeReports) {`
Improve analysis We seem to get better results when we scale bridge IPs down to multiples of 1 instead of 8 and only mark bridges as blocked if they differ in the 'bad' direction by at least one standard deviation from the mean. 2024-04-18 22:27:57 -04:00			`Some(&v) => v,`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`None => 0,`
			`};`
			`let positive_reports_today = match today_info.get(&BridgeInfoType::PositiveReports) {`
Improve analysis We seem to get better results when we scale bridge IPs down to multiples of 1 instead of 8 and only mark bridges as blocked if they differ in the 'bad' direction by at least one standard deviation from the mean. 2024-04-18 22:27:57 -04:00			`Some(&v) => v,`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`None => 0,`
			`};`

Only re-evaluate new data, analyze based on user-defined interval 2024-05-20 20:38:06 -04:00			`let num_days = min(age, max_historical_days);`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00
			`// Get time series for last num_days`
			`let mut bridge_ips = vec![0; num_days as usize];`
			`let mut negative_reports = vec![0; num_days as usize];`
			`let mut positive_reports = vec![0; num_days as usize];`

			`for i in 0..num_days {`
			`let date = today - num_days + i - 1;`
			`let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();`
			`let day_info = match info.info_by_day.get(&date) {`
			`Some(v) => v,`
			`None => &new_map_binding,`
			`};`
			`bridge_ips[i as usize] = match day_info.get(&BridgeInfoType::BridgeIps) {`
Don't scale bridge IPs in analysis 2024-04-26 12:58:03 -04:00			`Some(&v) => v,`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`None => 0,`
			`};`
			`negative_reports[i as usize] = match day_info.get(&BridgeInfoType::NegativeReports)`
			`{`
			`Some(&v) => v,`
			`None => 0,`
			`};`
			`positive_reports[i as usize] = match day_info.get(&BridgeInfoType::PositiveReports)`
			`{`
			`Some(&v) => v,`
			`None => 0,`
			`};`
			`}`

			`// Evaluate using appropriate stage based on age of the bridge`
Only re-evaluate new data, analyze based on user-defined interval 2024-05-20 20:38:06 -04:00			`if age < UNTRUSTED_INTERVAL \|\| age < min_historical_days {`
			`// open-entry bridge and/or not enough days of`
			`// historical days for stages 2 and 3`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`if analyzer.stage_one(`
			`confidence,`
			`&bridge_ips,`
			`bridge_ips_today,`
			`&negative_reports,`
			`negative_reports_today,`
			`) {`
			`blocked_in.insert(country.to_string());`
			`}`
Only re-evaluate new data, analyze based on user-defined interval 2024-05-20 20:38:06 -04:00			`} else if info.first_pr.is_none()`
			`\|\| today < info.first_pr.unwrap() + min_historical_days`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`{`
Only re-evaluate new data, analyze based on user-defined interval 2024-05-20 20:38:06 -04:00			`// invite-only bridge without min_historical_days of`
			`// historical data on positive reports`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`if analyzer.stage_two(`
			`confidence,`
			`&bridge_ips,`
			`bridge_ips_today,`
			`&negative_reports,`
			`negative_reports_today,`
			`) {`
			`blocked_in.insert(country.to_string());`
			`}`
			`} else {`
Only re-evaluate new data, analyze based on user-defined interval 2024-05-20 20:38:06 -04:00			`// invite-only bridge that has min_historical_days or`
			`// more of historical data since the first positive report`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`if analyzer.stage_three(`
			`confidence,`
			`&bridge_ips,`
			`bridge_ips_today,`
			`&negative_reports,`
			`negative_reports_today,`
			`&positive_reports,`
			`positive_reports_today,`
			`) {`
			`blocked_in.insert(country.to_string());`
			`}`
			`}`
			`}`
			`}`
			`blocked_in`
			`}`

			`// Analyzer implementations`

			`/// Dummy example that never thinks bridges are blocked`
			`pub struct ExampleAnalyzer {}`

			`impl Analyzer for ExampleAnalyzer {`
			`fn stage_one(`
			`&self,`
			`_confidence: f64,`
			`_bridge_ips: &[u32],`
			`_bridge_ips_today: u32,`
			`_negative_reports: &[u32],`
			`_negative_reports_today: u32,`
			`) -> bool {`
			`false`
			`}`

			`fn stage_two(`
			`&self,`
			`_confidence: f64,`
			`_bridge_ips: &[u32],`
			`_bridge_ips_today: u32,`
			`_negative_reports: &[u32],`
			`_negative_reports_today: u32,`
			`) -> bool {`
			`false`
			`}`

			`fn stage_three(`
			`&self,`
			`_confidence: f64,`
			`_bridge_ips: &[u32],`
			`_bridge_ips_today: u32,`
			`_negative_reports: &[u32],`
			`_negative_reports_today: u32,`
			`_positive_reports: &[u32],`
			`_positive_reports_today: u32,`
			`) -> bool {`
			`false`
			`}`
			`}`

			`/// Model data as multivariate normal distribution`
			`pub struct NormalAnalyzer {`
			`max_threshold: u32,`
			`scaling_factor: f64,`
			`}`

			`impl NormalAnalyzer {`
			`pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {`
			`Self {`
			`max_threshold,`
			`scaling_factor,`
			`}`
			`}`

Stage 2: Model as two distributions, handle 0 standard deviation 2024-05-21 17:16:26 -04:00			`fn mean(data: &[u32]) -> f64 {`
			`let mut sum = 0.0;`
			`for count in data {`
			`sum += *count as f64;`
			`}`
			`sum / data.len() as f64`
			`}`

			`fn std_dev(data: &[u32], mean: f64) -> f64 {`
			`let mut sum = 0.0;`
			`for count in data {`
			`sum += (*count as f64 - mean).powi(2);`
			`}`
			`(sum / data.len() as f64).sqrt()`
			`}`

			`fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {`
			`let mean = Self::mean(data);`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`let std_dev = Self::std_dev(data, mean);`
			`(mean, std_dev)`
Stage 2: Model as two distributions, handle 0 standard deviation 2024-05-21 17:16:26 -04:00			`}`

Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`// Returns the mean vector and covariance matrix`
			`fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`let n = data.len();`

Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`// Compute mean vector`
			`let mean_vec = {`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`let mut mean_vec = Vec::<f64>::new();`
			`for var in data {`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`mean_vec.push(Self::mean(var));`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`}`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`mean_vec`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`};`

			`// Compute covariance matrix`
			`let cov_mat = {`
			`let mut cov_mat = Vec::<f64>::new();`
			`// We don't need to recompute Syx, but we currently do`
			`for i in 0..n {`
			`for j in 0..n {`
			`cov_mat.push({`
			`let var1 = data[i];`
			`let var1_mean = mean_vec[i];`

			`let var2 = data[j];`
			`let var2_mean = mean_vec[j];`

			`assert_eq!(var1.len(), var2.len());`

			`let mut sum = 0.0;`
			`for index in 0..var1.len() {`
			`sum +=`
			`(var1[index] as f64 - var1_mean) * (var2[index] as f64 - var2_mean);`
			`}`
Don't scale bridge IPs in analysis 2024-04-26 12:58:03 -04:00			`sum / (var1.len() - 1) as f64`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`});`
			`}`
			`}`
			`cov_mat`
			`};`

Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`(mean_vec, cov_mat)`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`}`
			`}`

			`impl Analyzer for NormalAnalyzer {`
			`/// Evaluate open-entry bridge based on only today's data`
			`fn stage_one(`
			`&self,`
			`_confidence: f64,`
Mark unused variable bridge_ips in stage one analysis 2024-04-18 22:42:27 -04:00			`_bridge_ips: &[u32],`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`bridge_ips_today: u32,`
			`_negative_reports: &[u32],`
			`negative_reports_today: u32,`
			`) -> bool {`
			`negative_reports_today > self.max_threshold`
cargo-fmt 2024-04-26 13:11:32 -04:00			`\|\| f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`}`

Stage 2: Model as two distributions, handle 0 standard deviation 2024-05-21 17:16:26 -04:00			`/// Evaluate invite-only bridge based on historical data`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`fn stage_two(`
			`&self,`
			`confidence: f64,`
			`bridge_ips: &[u32],`
			`bridge_ips_today: u32,`
			`negative_reports: &[u32],`
			`negative_reports_today: u32,`
			`) -> bool {`
			`assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);`
			`assert_eq!(bridge_ips.len(), negative_reports.len());`

Improve analysis We seem to get better results when we scale bridge IPs down to multiples of 1 instead of 8 and only mark bridges as blocked if they differ in the 'bad' direction by at least one standard deviation from the mean. 2024-04-18 22:27:57 -04:00			`let alpha = 1.0 - confidence;`

Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`// Evaluate based on negative reports`
Stage 2: Model as two distributions, handle 0 standard deviation 2024-05-21 17:16:26 -04:00			`let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);`
Only use CDF tests if today's numbers are worse than average 2024-05-27 17:47:02 -04:00
			`// Only use CCDF test if today's numbers are worse than average`
			`if (negative_reports_today as f64) > negative_reports_mean {`
			`let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);`
			`if negative_reports_sd > 0.0 {`
			`// We use CCDF because more negative reports is worse.`
			`if (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha {`
			`return true;`
			`}`
			`} else {`
			`// If the standard deviation is 0, we need another option.`
			`// Consider the bridge blocked negative reports increase by`
			`// more than 1 after a long static period. (Note that the`
			`// mean is the exact value because we had no deviation.)`
			`if (negative_reports_today as f64) > negative_reports_mean + 1.0 {`
			`return true;`
			`}`
Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`}`
			`}`

			`// Evaluate based on bridge stats`
			`let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);`
Only use CDF tests if today's numbers are worse than average 2024-05-27 17:47:02 -04:00
			`// Only use CDF test if today's numbers are worse than average`
			`if (bridge_ips_today as f64) < bridge_ips_mean {`
			`let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);`
			`if bridge_ips_sd > 0.0 {`
			`if bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha {`
			`return true;`
			`}`
			`} else {`
			`// If the standard deviation is 0, we need another option.`
			`// Consider the bridge blocked if its usage dropped by more`
			`// than 1 bin. (Note that the mean is the exact value`
			`// because we had no deviation.)`
			`if (bridge_ips_today as f64) < bridge_ips_mean - 8.0 {`
			`return true;`
			`}`
Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`}`
			`}`
Use CDF, not PDF with artificial 'blocked' data TODO: Figure out proper multivariate CDF 2024-05-20 20:54:28 -04:00
Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`// If none of the tests concluded that the bridge is blocked,`
			`// return false`
			`false`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`}`

			`/// Evaluate invite-only bridge with lv3+ users submitting positive reports`
			`fn stage_three(`
			`&self,`
			`confidence: f64,`
			`bridge_ips: &[u32],`
			`bridge_ips_today: u32,`
			`negative_reports: &[u32],`
			`negative_reports_today: u32,`
			`positive_reports: &[u32],`
			`positive_reports_today: u32,`
			`) -> bool {`
			`assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);`
			`assert_eq!(bridge_ips.len(), negative_reports.len());`
			`assert_eq!(bridge_ips.len(), positive_reports.len());`

Improve analysis We seem to get better results when we scale bridge IPs down to multiples of 1 instead of 8 and only mark bridges as blocked if they differ in the 'bad' direction by at least one standard deviation from the mean. 2024-04-18 22:27:57 -04:00			`let alpha = 1.0 - confidence;`

Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`// Evaluate based on negative reports. It is better to compute`
			`// negative reports test first because the positive test may be`
			`// expensive.`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);`
Only use CDF tests if today's numbers are worse than average 2024-05-27 17:47:02 -04:00
			`// Only use CCDF test if today's numbers are worse than average`
			`if (negative_reports_today as f64) > negative_reports_mean {`
			`let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);`
			`if negative_reports_sd > 0.0 {`
			`// We use CCDF because more negative reports is worse.`
			`if (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha {`
			`return true;`
			`}`
			`} else {`
			`// Consider the bridge blocked negative reports increase by`
			`// more than 1 after a long static period. (Note that the`
			`// mean is the exact value because we had no deviation.)`
			`if (negative_reports_today as f64) > negative_reports_mean + 1.0 {`
			`return true;`
			`}`
Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`}`
			`}`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00
Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`// Evaluate based on bridge stats and positive reports.`
			`let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);`
Only use CDF tests if today's numbers are worse than average 2024-05-27 17:47:02 -04:00
			`// Only use CDF test if today's numbers are worse than average`
			`if (bridge_ips_today as f64) < mean_vec[0] \|\| (positive_reports_today as f64) < mean_vec[1]`
			`{`
			`let mvn = MultivariateNormal::new(mean_vec, cov_mat);`
			`if mvn.is_ok() {`
			`let mvn = mvn.unwrap();`

			`// Estimate the CDF by integrating the PDF by hand with step`
			`// size 1`
			`let mut cdf = 0.0;`
			`for bip in 0..bridge_ips_today {`
			`for pr in 0..positive_reports_today {`
			`cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));`
			`}`
			`}`
			`if cdf < alpha {`
			`return true;`
			`}`
			`} else {`
			`// If we have 0 standard deviation or a covariance matrix`
			`// that is not positive definite, we need another way to`
			`// evaluate each variable. Ignore positive reports and`
			`// compute as in stage 2`
			`if self.stage_two(`
			`confidence,`
			`bridge_ips,`
			`bridge_ips_today,`
			`negative_reports,`
			`negative_reports_today,`
			`) {`
			`return true;`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00			`}`
			`}`
Only use CDF tests if today's numbers are worse than average 2024-05-27 17:47:02 -04:00			`}`
Use estimated multivariate CDF when we have positive reports 2024-05-21 19:30:09 -04:00
Refactor analysis for efficiency 2024-05-22 16:04:52 -04:00			`// If none of the tests concluded that the bridge is blocked,`
			`// return false`
			`false`
Refactor Analyzer trait 2024-04-15 13:19:56 -04:00			`}`
			`}`