troll-patrol/src/analysis.rs

495 lines
18 KiB
Rust

use crate::{BridgeInfo, BridgeInfoType};
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
use nalgebra::DVector;
use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
use statrs::statistics::Statistics;
use std::{
cmp::min,
collections::{BTreeMap, HashSet},
};
#[cfg(feature = "simulation")]
use crate::get_date;
/// Strategy for predicting whether a bridge is blocked in a given country.
///
/// Every stage answers `true` for "blocked" and `false` for "not blocked".
/// [`blocked_in`] decides, per country, which of the three stages to run.
///
/// Arguments shared by all stages:
///
/// * `age` - days between the date being evaluated and the day the bridge was
///   first seen in the country.
/// * `confidence` - confidence value handed through from the caller.
/// * `bridge_ips` / `negative_reports` / `positive_reports` - historical
///   per-day counts, oldest first. Days on which the bridge published no
///   bridge-ips value are left out, so the slices always have equal length.
/// * `*_today` - the counts for the day being evaluated.
pub trait Analyzer {
    /// Stage 1: judge an open-entry bridge.
    ///
    /// Returns `true` if the bridge is considered blocked.
    fn stage_one(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Stage 2: judge an invite-only bridge that has no usable positive
    /// reports.
    ///
    /// Returns `true` if the bridge is considered blocked.
    fn stage_two(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Stage 3: judge an invite-only bridge using positive reports as well.
    ///
    /// Returns `true` if the bridge is considered blocked.
    fn stage_three(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool;
}
/// Accepts an analyzer, information about a bridge, and a confidence value.
/// Returns a set of country codes where the bridge is believed to be blocked.
///
/// Each country in `bridge_info` is evaluated independently:
///
/// * A country whose `first_seen` lies after `date` is skipped.
/// * A country already marked `blocked` is always reported (bridges are
///   assumed never to become unblocked).
/// * Otherwise the most recent counts are taken from `date`, or from the day
///   before if `date` has no bridge-ips value. If neither day has one, the
///   bridge is assumed to be down rather than blocked and the country is
///   skipped.
/// * The historical series covers up to `max_historical_days` days ending two
///   days before `date`, so neither of the two candidate "today" days is part
///   of its own baseline. Days without a bridge-ips value are left out.
/// * Stage one is used while the bridge is younger than `UNTRUSTED_INTERVAL`
///   or `min_historical_days`; stage two while there are fewer than
///   `min_historical_days` days since the first positive report (or no
///   positive report at all); stage three otherwise.
///
/// With the `simulation` feature enabled, a line is printed whenever stage
/// three clears a bridge that stage two would have flagged.
pub fn blocked_in(
    analyzer: &dyn Analyzer,
    bridge_info: &BridgeInfo,
    confidence: f64,
    date: u32,
    min_historical_days: u32,
    max_historical_days: u32,
) -> HashSet<String> {
    let mut blocked_in = HashSet::<String>::new();
    let today = date;

    for (country, info) in &bridge_info.info_by_country {
        // Nothing to evaluate for a country we have not seen the bridge in
        // yet. Skip only this country: returning here would discard verdicts
        // already reached for other countries, and which ones were reached
        // would depend on the map's iteration order.
        if today < info.first_seen {
            continue;
        }
        // (The check above guarantees this cannot underflow.)
        let age = today - info.first_seen;

        if info.blocked {
            // Assume bridges never become unblocked
            blocked_in.insert(country.to_string());
            continue;
        }

        // Get today's values, or yesterday's if no bridge-ips for today.
        // `checked_sub` keeps date 0 from underflowing.
        let candidate_days = [Some(today), today.checked_sub(1)];
        let today_info: &BTreeMap<BridgeInfoType, u32> = match candidate_days
            .iter()
            .flatten()
            .filter_map(|day| info.info_by_day.get(day))
            .find(|counts| counts.contains_key(&BridgeInfoType::BridgeIps))
        {
            Some(counts) => counts,
            // If we don't have data today or yesterday, assume the bridge is
            // down, not blocked.
            None => continue,
        };

        let bridge_ips_today = today_info
            .get(&BridgeInfoType::BridgeIps)
            .copied()
            .unwrap_or(0);
        let negative_reports_today = today_info
            .get(&BridgeInfoType::NegativeReports)
            .copied()
            .unwrap_or(0);
        let positive_reports_today = today_info
            .get(&BridgeInfoType::PositiveReports)
            .copied()
            .unwrap_or(0);

        // Get time series for last num_days
        let num_days = min(age, max_historical_days);
        let mut bridge_ips = vec![];
        let mut negative_reports = vec![];
        let mut positive_reports = vec![];

        for i in 0..num_days {
            // num_days <= age <= today, so only the final `- 1` can underflow
            // (when the window would start before date 0). There is no data
            // before date 0, so such a day is simply skipped.
            let date = match (today - num_days + i).checked_sub(1) {
                Some(date) => date,
                None => continue,
            };
            let day_info = match info.info_by_day.get(&date) {
                Some(day_info) => day_info,
                None => continue,
            };
            // If the bridge did not publish bridge-ips, ignore this day.
            // Otherwise add all 3 values to our time series.
            if let Some(&bip) = day_info.get(&BridgeInfoType::BridgeIps) {
                bridge_ips.push(bip);
                negative_reports.push(
                    day_info
                        .get(&BridgeInfoType::NegativeReports)
                        .copied()
                        .unwrap_or(0),
                );
                positive_reports.push(
                    day_info
                        .get(&BridgeInfoType::PositiveReports)
                        .copied()
                        .unwrap_or(0),
                );
            }
        }

        // True if there are no positive reports yet, or fewer than
        // min_historical_days days since the first one.
        let lacks_positive_history = match info.first_pr {
            Some(first_pr) => today < first_pr + min_historical_days,
            None => true,
        };

        // Evaluate using appropriate stage based on age of the bridge
        let blocked = if age < UNTRUSTED_INTERVAL || age < min_historical_days {
            // open-entry bridge and/or not enough days of historical data
            // for stages 2 and 3
            analyzer.stage_one(
                age,
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
            )
        } else if lacks_positive_history {
            // invite-only bridge without min_historical_days of historical
            // data on positive reports
            analyzer.stage_two(
                age,
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
            )
        } else {
            // invite-only bridge that has min_historical_days or more of
            // historical data since the first positive report
            let blocked = analyzer.stage_three(
                age,
                confidence,
                &bridge_ips,
                bridge_ips_today,
                &negative_reports,
                negative_reports_today,
                &positive_reports,
                positive_reports_today,
            );

            // Logging in simulation mode
            #[cfg(feature = "simulation")]
            if !blocked
                && analyzer.stage_two(
                    age,
                    confidence,
                    &bridge_ips,
                    bridge_ips_today,
                    &negative_reports,
                    negative_reports_today,
                )
            {
                println!(
                    "{} detected not blocked due to positive reports on day {}",
                    array_bytes::bytes2hex("", bridge_info.fingerprint),
                    get_date()
                );
            }

            blocked
        };

        if blocked {
            blocked_in.insert(country.to_string());
        }
    }

    blocked_in
}
// Analyzer implementations
/// Placeholder analyzer: every stage reports "not blocked", whatever the
/// input.
pub struct ExampleAnalyzer {}

impl Analyzer for ExampleAnalyzer {
    /// Always `false`; all arguments are ignored.
    fn stage_one(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Always `false`; all arguments are ignored.
    fn stage_two(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Always `false`; all arguments are ignored.
    fn stage_three(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
        _positive_reports: &[u32],
        _positive_reports_today: u32,
    ) -> bool {
        false
    }
}
/// Analyzer that models the observed counts with normal distributions
/// (multivariate when positive reports are available).
pub struct NormalAnalyzer {
    /// Stage one flags a bridge once today's negative reports exceed this
    /// absolute count.
    max_threshold: u32,
    /// Stage one flags a bridge once today's negative reports exceed this
    /// factor times today's bridge-ips count.
    scaling_factor: f64,
}

impl NormalAnalyzer {
    /// Builds an analyzer from the two stage-one parameters.
    pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
        Self {
            max_threshold,
            scaling_factor,
        }
    }
}
impl Analyzer for NormalAnalyzer {
    /// Evaluate open-entry bridge based on only today's data.
    ///
    /// Returns true if today's negative reports exceed `max_threshold`, or
    /// exceed `scaling_factor` times today's bridge-ips count. Age,
    /// confidence and the historical series are not used.
    fn stage_one(
        &self,
        _age: u32,
        _confidence: f64,
        _bridge_ips: &[u32],
        bridge_ips_today: u32,
        _negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        negative_reports_today > self.max_threshold
            || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
    }

    /// Evaluate invite-only bridge based on historical data.
    ///
    /// Returns true if today's negative reports are improbably high, or
    /// today's bridge-ips count is improbably low, relative to a normal
    /// distribution fitted to the respective history. "Improbably" means a
    /// tail probability below `1 - confidence`.
    ///
    /// # Panics
    ///
    /// Panics if `bridge_ips` has fewer than `UNTRUSTED_INTERVAL` entries or
    /// if `bridge_ips` and `negative_reports` differ in length.
    fn stage_two(
        &self,
        _age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool {
        assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
        assert_eq!(bridge_ips.len(), negative_reports.len());

        let alpha = 1.0 - confidence;

        // Convert to f64 for stats
        let bridge_ips_f64 = Self::counts_to_f64(bridge_ips);
        let negative_reports_f64 = Self::counts_to_f64(negative_reports);

        // Evaluate based on negative reports first, then on bridge stats.
        // If neither test concludes that the bridge is blocked, it is not.
        Self::negative_reports_spiked(&negative_reports_f64, negative_reports_today, alpha)
            || Self::bridge_ips_dropped(&bridge_ips_f64, bridge_ips_today, alpha)
    }

    /// Evaluate invite-only bridge with lv3+ users submitting positive reports.
    ///
    /// Runs the same negative-reports test as stage two, then tests today's
    /// bridge-ips and positive reports jointly against a bivariate normal
    /// distribution fitted to their history. If that distribution cannot be
    /// constructed, falls back to stage two.
    ///
    /// # Panics
    ///
    /// Panics if `bridge_ips` has fewer than `UNTRUSTED_INTERVAL` entries or
    /// if the three historical slices differ in length.
    fn stage_three(
        &self,
        age: u32,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool {
        assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
        assert_eq!(bridge_ips.len(), negative_reports.len());
        assert_eq!(bridge_ips.len(), positive_reports.len());

        let alpha = 1.0 - confidence;

        // Convert to f64 for stats. The statistics methods take their
        // receiver by value, so work on slices rather than the Vecs.
        let bridge_ips_f64 = Self::counts_to_f64(bridge_ips);
        let negative_reports_f64 = Self::counts_to_f64(negative_reports);
        let positive_reports_f64 = Self::counts_to_f64(positive_reports);
        let bip = bridge_ips_f64.as_slice();
        let pr = positive_reports_f64.as_slice();

        // Evaluate based on negative reports. It is better to compute
        // negative reports test first because the positive test may be
        // expensive.
        if Self::negative_reports_spiked(&negative_reports_f64, negative_reports_today, alpha) {
            return true;
        }

        // Evaluate based on bridge stats and positive reports.
        let bridge_ips_mean = bip.mean();
        let positive_reports_mean = pr.mean();

        // Only use CDF test if today's numbers are worse than average
        let worse_than_average = f64::from(bridge_ips_today) < bridge_ips_mean
            || f64::from(positive_reports_today) < positive_reports_mean;
        if !worse_than_average {
            return false;
        }

        // Row-major 2x2 covariance matrix of (bridge-ips, positive reports)
        let cov_mat = {
            let xx = bip.covariance(bip);
            let xy = bip.covariance(pr);
            let yy = pr.covariance(pr);
            vec![xx, xy, xy, yy]
        };

        match MultivariateNormal::new(vec![bridge_ips_mean, positive_reports_mean], cov_mat) {
            Ok(mvn) => {
                // Start 3 standard deviations below the mean, based on
                // 68-95-99.7 rule, assuming the confidence will be high
                // enough that 99.7 is close enough to "the whole
                // distribution" to be reasonable
                let bip_start = (bridge_ips_mean - (3.0 * bip.std_dev()).ceil()) as i32;
                let pr_start = (positive_reports_mean - (3.0 * pr.std_dev()).ceil()) as i32;

                // Estimate the CDF by integrating the PDF by hand with step
                // size 1
                let mut cdf = 0.0;
                for bip_value in bip_start..bridge_ips_today as i32 {
                    for pr_value in pr_start..positive_reports_today as i32 {
                        cdf += mvn.pdf(&DVector::from_vec(vec![
                            f64::from(bip_value),
                            f64::from(pr_value),
                        ]));
                    }
                }

                cdf < alpha
            }
            // If we have 0 standard deviation or a covariance matrix that is
            // not positive definite, we need another way to evaluate each
            // variable. Ignore positive reports and compute as in stage 2
            Err(_) => self.stage_two(
                age,
                confidence,
                bridge_ips,
                bridge_ips_today,
                negative_reports,
                negative_reports_today,
            ),
        }
    }
}

/// Private helpers shared by the stages of the `Analyzer` implementation.
impl NormalAnalyzer {
    /// Drop in bridge-ips that counts as "more than 1 bin" when the history
    /// shows no deviation at all.
    const BRIDGE_IPS_BIN: f64 = 8.0;

    /// Converts a series of counts to `f64` for the statistics routines.
    fn counts_to_f64(counts: &[u32]) -> Vec<f64> {
        counts.iter().map(|&n| f64::from(n)).collect()
    }

    /// Returns true if `today`'s negative reports are improbably high given
    /// `history` (upper-tail probability below `alpha`).
    fn negative_reports_spiked(history: &[f64], today: u32, alpha: f64) -> bool {
        let today = f64::from(today);
        let mean = history.mean();

        // Only use CCDF test if today's numbers are worse than average
        if today > mean {
            let sd = history.std_dev();
            if sd > 0.0 {
                // We use CCDF because more negative reports is worse.
                let normal = Normal::new(mean, sd)
                    .expect("mean is not NaN and standard deviation is positive");
                (1.0 - normal.cdf(today)) < alpha
            } else {
                // If the standard deviation is 0, we need another option.
                // Consider the bridge blocked if negative reports increase by
                // more than 1 after a long static period. (Note that the
                // mean is the exact value because we had no deviation.)
                today > mean + 1.0
            }
        } else {
            false
        }
    }

    /// Returns true if `today`'s bridge-ips count is improbably low given
    /// `history` (lower-tail probability below `alpha`).
    fn bridge_ips_dropped(history: &[f64], today: u32, alpha: f64) -> bool {
        let today = f64::from(today);
        let mean = history.mean();

        // Only use CDF test if today's numbers are worse than average
        if today < mean {
            let sd = history.std_dev();
            if sd > 0.0 {
                let normal = Normal::new(mean, sd)
                    .expect("mean is not NaN and standard deviation is positive");
                normal.cdf(today) < alpha
            } else {
                // If the standard deviation is 0, we need another option.
                // Consider the bridge blocked if its usage dropped by more
                // than 1 bin. (Note that the mean is the exact value
                // because we had no deviation.)
                today < mean - Self::BRIDGE_IPS_BIN
            }
        } else {
            false
        }
    }
}