troll-patrol/src/analysis.rs

use crate::{BridgeInfo, BridgeInfoType};
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
use nalgebra::DVector;
use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
use statrs::statistics::Statistics;
use std::{
    cmp::min,
    collections::{BTreeMap, HashSet},
};

#[cfg(feature = "simulation")]
use crate::get_date;

/// Provides a function for predicting which countries block this bridge
///
/// [`blocked_in`] calls exactly one of the three stages per country (except
/// in simulation builds, where `stage_two` may additionally be called for
/// logging after `stage_three` returned `false`). The arguments shared by all
/// stages, as supplied by [`blocked_in`], are:
///
/// * `age` - number of days between the bridge first being seen in the
///   country and the evaluated date.
/// * `confidence` - the confidence value passed to [`blocked_in`], forwarded
///   unchanged.
/// * `bridge_ips`, `negative_reports` (and `positive_reports` for stage
///   three) - historical per-day values in chronological order. A day only
///   contributes an entry if bridge-ips data exists for it, so the slices
///   always have equal length but may be shorter than the number of days
///   covered.
/// * `*_today` - the values for the evaluated day, or for the previous day
///   if the evaluated day has no bridge-ips data.
pub trait Analyzer {
    /// Evaluate open-entry bridge. Returns true if blocked, false otherwise.
    ///
    /// Used by [`blocked_in`] while `age` is below `UNTRUSTED_INTERVAL` or
    /// below the caller's `min_historical_days`.
    fn stage_one(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Evaluate invite-only bridge without positive reports. Return true if
    /// blocked, false otherwise.
    ///
    /// Used by [`blocked_in`] once the bridge is old enough for stage one to
    /// no longer apply, but either no positive report has been recorded yet
    /// or fewer than `min_historical_days` days have passed since the first
    /// one.
    fn stage_two(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Evaluate invite-only bridge with positive reports. Return true if
    /// blocked, false otherwise.
    ///
    /// Used by [`blocked_in`] when at least `min_historical_days` days have
    /// passed since the first positive report.
    fn stage_three(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool;
}

/// Accepts an analyzer, information about a bridge, and a confidence value.
/// Returns a set of country codes where the bridge is believed to be blocked.
///
/// For each country in `bridge_info.info_by_country`:
///
/// * A country already marked as blocked is always reported as blocked.
/// * Otherwise "today's" values are taken from `date`, or from the day before
///   if `date` has no bridge-ips entry. If neither day has bridge-ips data
///   the bridge is assumed to be down (not blocked) and the country is
///   skipped.
/// * A history of up to `min(age, max_historical_days)` days is collected
///   (days without bridge-ips data are left out) and handed to the stage of
///   `analyzer` that matches the bridge's age and positive-report history.
///
/// # Arguments
///
/// * `analyzer` - the [`Analyzer`] whose stages decide whether the bridge is
///   blocked.
/// * `bridge_info` - per-country, per-day statistics for one bridge.
/// * `confidence` - forwarded unchanged to the analyzer.
/// * `date` - the day to evaluate, in the same units as the keys of
///   `info_by_day` and as `first_seen`.
/// * `min_historical_days` - minimum bridge age (and minimum time since the
///   first positive report) before stages two and three are used.
/// * `max_historical_days` - upper bound on the number of history days
///   considered.
///
/// # Returns
///
/// The set of country codes believed to be blocking the bridge. If `date`
/// precedes `first_seen` for *any* country, an empty set is returned
/// immediately, regardless of what was found for other countries.
pub fn blocked_in(
    analyzer: &dyn Analyzer,
    bridge_info: &BridgeInfo,
    confidence: f64,
    date: u32,
    min_historical_days: u32,
    max_historical_days: u32,
) -> HashSet<String> {
    let mut blocked_in = HashSet::<String>::new();
    let today = date;

    // Reads one counter from a day's statistics; a missing entry counts as 0.
    let count = |day_info: &BTreeMap<BridgeInfoType, u32>, kind: BridgeInfoType| {
        day_info.get(&kind).copied().unwrap_or(0)
    };

    for (country, info) in &bridge_info.info_by_country {
        // If we haven't seen this bridge yet, return empty set
        if today < info.first_seen {
            return HashSet::<String>::new();
        }

        // (The check above guarantees this subtraction cannot underflow.)
        let age = today - info.first_seen;

        if info.blocked {
            // Assume bridges never become unblocked
            blocked_in.insert(country.to_string());
            continue;
        }

        // Statistics for `day`, but only if the bridge published bridge-ips
        // on that day.
        let with_bridge_ips = |day: u32| {
            info.info_by_day
                .get(&day)
                .filter(|day_info| day_info.contains_key(&BridgeInfoType::BridgeIps))
        };

        // Get today's values, or yesterday's if no bridge-ips for today.
        // `checked_sub` keeps a `date` of 0 from underflowing.
        let today_info: &BTreeMap<BridgeInfoType, u32> = match with_bridge_ips(today)
            .or_else(|| today.checked_sub(1).and_then(with_bridge_ips))
        {
            Some(day_info) => day_info,
            // If we don't have data today or yesterday, assume the bridge
            // is down, not blocked.
            None => continue,
        };

        let bridge_ips_today = count(today_info, BridgeInfoType::BridgeIps);
        let negative_reports_today = count(today_info, BridgeInfoType::NegativeReports);
        let positive_reports_today = count(today_info, BridgeInfoType::PositiveReports);

        let num_days = min(age, max_historical_days);

        // Get time series for last num_days
        let mut bridge_ips = Vec::new();
        let mut negative_reports = Vec::new();
        let mut positive_reports = Vec::new();

        // The window covers the `num_days` days ending at `today - 2`.
        // NOTE(review): yesterday is never part of the history, presumably
        // because it may already be standing in for "today" - confirm.
        for offset in 0..num_days {
            // `today - num_days` cannot underflow because `num_days <= age
            // <= today`; only the final `- 1` can, when the window would
            // start before day 0. Such a day has no data, so skip it.
            let day = match (today - num_days + offset).checked_sub(1) {
                Some(day) => day,
                None => continue,
            };
            let day_info = match info.info_by_day.get(&day) {
                Some(day_info) => day_info,
                None => continue,
            };

            // If the bridge did not publish bridge-ips, ignore this day.
            // Otherwise add all 3 values to our time series.
            if let Some(&bip) = day_info.get(&BridgeInfoType::BridgeIps) {
                bridge_ips.push(bip);
                negative_reports.push(count(day_info, BridgeInfoType::NegativeReports));
                positive_reports.push(count(day_info, BridgeInfoType::PositiveReports));
            }
        }

        // True if no positive report has been seen yet, or the first one is
        // less than min_historical_days old.
        let lacks_positive_history = info
            .first_pr
            .map_or(true, |first_pr| today < first_pr + min_historical_days);

        // Evaluate using appropriate stage based on age of the bridge
        let blocked = if age < UNTRUSTED_INTERVAL || age < min_historical_days {
            // open-entry bridge and/or not enough days of
            // historical days for stages 2 and 3
            analyzer.stage_one(
                age,
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
            )
        } else if lacks_positive_history {
            // invite-only bridge without min_historical_days of
            // historical data on positive reports
            analyzer.stage_two(
                age,
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
            )
        } else {
            // invite-only bridge that has min_historical_days or
            // more of historical data since the first positive report
            let blocked = analyzer.stage_three(
                age,
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
                &positive_reports,
                positive_reports_today,
            );

            // Logging in simulation mode: report bridges that stage two
            // would have flagged but stage three did not.
            #[cfg(feature = "simulation")]
            if !blocked
                && analyzer.stage_two(
                    age,
                    confidence,
                    &bridge_ips,
                    bridge_ips_today,
                    &negative_reports,
                    negative_reports_today,
                )
            {
                println!(
                    "{} detected not blocked due to positive reports on day {}",
                    array_bytes::bytes2hex("", bridge_info.fingerprint),
                    get_date()
                );
            }

            blocked
        };

        if blocked {
            blocked_in.insert(country.to_string());
        }
    }
    blocked_in
}

// Analyzer implementations

/// Dummy example that never thinks bridges are blocked
///
/// Its [`Analyzer`] implementation ignores every input and returns `false`
/// from all three stages.
pub struct ExampleAnalyzer {}

// Every stage ignores all of its arguments (hence the leading underscores)
// and reports the bridge as not blocked.
impl Analyzer for ExampleAnalyzer {
    /// Always returns `false` (not blocked).
    fn stage_one(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Always returns `false` (not blocked).
    fn stage_two(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Always returns `false` (not blocked).
    fn stage_three(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
        _positive_reports: &[u32],
        _positive_reports_today: u32,
    ) -> bool {
        false
    }
}

/// Model data as multivariate normal distribution
///
/// The two fields are only consulted by `stage_one`; stages two and three
/// work from the historical data and the confidence value they are given.
pub struct NormalAnalyzer {
    /// Stage one reports a bridge as blocked when today's negative reports
    /// exceed this count.
    max_threshold: u32,
    /// Stage one reports a bridge as blocked when today's negative reports
    /// exceed `scaling_factor` times today's bridge-ips count.
    scaling_factor: f64,
}

impl NormalAnalyzer {
    pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
        Self {
            max_threshold,
            scaling_factor,
        }
    }
}

impl Analyzer for NormalAnalyzer {
    /// Evaluate open-entry bridge based on only today's data
    ///
    /// Returns `true` (blocked) when today's negative reports exceed
    /// `max_threshold`, or exceed `scaling_factor` times today's bridge-ips
    /// count. The age, confidence and historical series are ignored.
    fn stage_one(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        bridge_ips_today: u32,
        _negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        negative_reports_today > self.max_threshold
            || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
    }

    /// Evaluate invite-only bridge based on historical data
    ///
    /// Returns `true` (blocked) if today's negative reports are unusually
    /// high, or today's bridge-ips count is unusually low, compared with a
    /// normal distribution fitted to the respective history. "Unusually"
    /// means a tail probability below `1 - confidence`.
    ///
    /// # Panics
    ///
    /// Panics if `bridge_ips` holds fewer than `UNTRUSTED_INTERVAL` entries
    /// or if `bridge_ips` and `negative_reports` differ in length.
    fn stage_two(
        &self,
        _age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
        assert_eq!(bridge_ips.len(), negative_reports.len());

        let alpha = 1.0 - confidence;

        negative_reports_spiked(&to_f64_series(negative_reports), negative_reports_today, alpha)
            || bridge_ips_dropped(&to_f64_series(bridge_ips), bridge_ips_today, alpha)
    }

    /// Evaluate invite-only bridge with lv3+ users submitting positive reports
    ///
    /// Runs the same negative-report test as stage two first (it is cheap).
    /// Then, if today's bridge-ips count or today's positive reports are
    /// below their historical mean, fits a bivariate normal distribution to
    /// (bridge-ips, positive reports) and reports the bridge as blocked when
    /// the estimated probability of values at or below today's is less than
    /// `1 - confidence`. If that distribution cannot be constructed, falls
    /// back to stage two.
    ///
    /// # Panics
    ///
    /// Panics if `bridge_ips` holds fewer than `UNTRUSTED_INTERVAL` entries
    /// or if the three historical slices differ in length.
    fn stage_three(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool {
        assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
        assert_eq!(bridge_ips.len(), negative_reports.len());
        assert_eq!(bridge_ips.len(), positive_reports.len());

        let alpha = 1.0 - confidence;

        // Evaluate based on negative reports. It is better to compute
        // negative reports test first because the positive test may be
        // expensive.
        let negative_reports_f64 = to_f64_series(negative_reports);
        if negative_reports_spiked(&negative_reports_f64, negative_reports_today, alpha) {
            return true;
        }

        // Evaluate based on bridge stats and positive reports.
        let bridge_ips_f64 = to_f64_series(bridge_ips);
        let positive_reports_f64 = to_f64_series(positive_reports);
        let x: &[f64] = &bridge_ips_f64;
        let y: &[f64] = &positive_reports_f64;

        let bridge_ips_mean = x.mean();
        let positive_reports_mean = y.mean();

        // Only use CDF test if today's numbers are worse than average
        let worse_than_average = f64::from(bridge_ips_today) < bridge_ips_mean
            || f64::from(positive_reports_today) < positive_reports_mean;
        if !worse_than_average {
            return false;
        }

        // 2x2 covariance matrix, flattened as [xx, xy, xy, yy]
        let cov_mat = {
            let xy = x.covariance(y);
            vec![x.covariance(x), xy, xy, y.covariance(y)]
        };

        match MultivariateNormal::new(vec![bridge_ips_mean, positive_reports_mean], cov_mat) {
            Ok(mvn) => {
                // Start 3 standard deviations below the mean, based on
                // 68-95-99.7 rule, assuming the confidence will be high
                // enough that 99.7 is close enough to "the whole
                // distribution" to be reasonable
                let bip_start = (bridge_ips_mean - (3.0 * x.std_dev()).ceil()) as i32;
                let pr_start = (positive_reports_mean - (3.0 * y.std_dev()).ceil()) as i32;

                // Estimate the CDF by integrating the PDF by hand with step
                // size 1
                let mut cdf = 0.0;
                for bip in bip_start..bridge_ips_today as i32 {
                    for pr in pr_start..positive_reports_today as i32 {
                        cdf += mvn.pdf(&DVector::from_vec(vec![f64::from(bip), f64::from(pr)]));

                        // Densities are non-negative, so the running sum
                        // never decreases: once it reaches alpha the verdict
                        // is settled and the remaining terms can be skipped.
                        if cdf >= alpha {
                            return false;
                        }
                    }
                }
                cdf < alpha
            }
            // If we have 0 standard deviation or a covariance matrix
            // that is not positive definite, we need another way to
            // evaluate each variable. Ignore positive reports and
            // compute as in stage 2
            Err(_) => self.stage_two(
                age,
                confidence,
                bridge_ips,
                bridge_ips_today,
                negative_reports,
                negative_reports_today,
            ),
        }
    }
}

/// Converts a series of counts to `f64` for use with the statistics routines.
fn to_f64_series(values: &[u32]) -> Vec<f64> {
    values.iter().copied().map(f64::from).collect()
}

/// Returns true if today's negative reports are significantly above the
/// historical level.
///
/// The test only applies when `today` exceeds the historical mean. With a
/// positive standard deviation the complementary CDF of a fitted normal
/// distribution at `today` must be below `alpha`; otherwise `today` must
/// exceed the mean by more than 1.
fn negative_reports_spiked(history: &[f64], today: u32, alpha: f64) -> bool {
    let mean = history.mean();
    let sd = history.std_dev();
    let today = f64::from(today);

    // Only use CCDF test if today's numbers are worse than average
    if today > mean {
        if sd > 0.0 {
            let normal = Normal::new(mean, sd)
                .expect("mean is not NaN (today > mean) and standard deviation is positive");
            // We use CCDF because more negative reports is worse.
            1.0 - normal.cdf(today) < alpha
        } else {
            // If the standard deviation is 0, we need another option.
            // Consider the bridge blocked if negative reports increase by
            // more than 1 after a long static period. (Note that the
            // mean is the exact value because we had no deviation.)
            today > mean + 1.0
        }
    } else {
        false
    }
}

/// Returns true if today's bridge-ips count is significantly below the
/// historical level.
///
/// The test only applies when `today` is below the historical mean. With a
/// positive standard deviation the CDF of a fitted normal distribution at
/// `today` must be below `alpha`; otherwise `today` must be more than one
/// bin below the mean.
fn bridge_ips_dropped(history: &[f64], today: u32, alpha: f64) -> bool {
    /// Width of one bridge-ips bin.
    const BRIDGE_IPS_BIN: f64 = 8.0;

    let mean = history.mean();
    let sd = history.std_dev();
    let today = f64::from(today);

    // Only use CDF test if today's numbers are worse than average
    if today < mean {
        if sd > 0.0 {
            let normal = Normal::new(mean, sd)
                .expect("mean is not NaN (today < mean) and standard deviation is positive");
            normal.cdf(today) < alpha
        } else {
            // If the standard deviation is 0, we need another option.
            // Consider the bridge blocked if its usage dropped by more
            // than 1 bin. (Note that the mean is the exact value
            // because we had no deviation.)
            today < mean - BRIDGE_IPS_BIN
        }
    } else {
        false
    }
}