2024-04-15 15:21:35 -04:00
|
|
|
use crate::{BridgeInfo, BridgeInfoType};
|
2024-04-27 14:32:26 -04:00
|
|
|
use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
|
2024-05-04 14:26:18 -04:00
|
|
|
use nalgebra::{Cholesky, DMatrix, DVector};
|
|
|
|
use rand::Rng;
|
2024-05-20 20:54:28 -04:00
|
|
|
use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal};
|
2024-04-15 13:19:56 -04:00
|
|
|
use std::{
|
2024-05-20 20:38:06 -04:00
|
|
|
cmp::{max, min},
|
2024-04-15 13:19:56 -04:00
|
|
|
collections::{BTreeMap, HashSet},
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Strategy for predicting which countries block a bridge.
///
/// Each method evaluates one country's data for one bridge and returns
/// `true` if the bridge is judged to be blocked there, `false` otherwise.
/// In every method:
///
/// * `confidence` is the confidence value supplied by the caller,
/// * the slice arguments hold historical per-day counts, and
/// * the `*_today` arguments hold the counts for the day being evaluated.
pub trait Analyzer {
    /// Evaluates an open-entry bridge.
    ///
    /// Returns `true` if the bridge is believed to be blocked.
    fn stage_one(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Evaluates an invite-only bridge for which positive reports are not
    /// (yet) taken into account.
    ///
    /// Returns `true` if the bridge is believed to be blocked.
    fn stage_two(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
    ) -> bool;

    /// Evaluates an invite-only bridge using positive reports in addition to
    /// connection counts and negative reports.
    ///
    /// Returns `true` if the bridge is believed to be blocked.
    fn stage_three(
        &self,
        confidence: f64,
        bridge_ips: &[u32],
        bridge_ips_today: u32,
        negative_reports: &[u32],
        negative_reports_today: u32,
        positive_reports: &[u32],
        positive_reports_today: u32,
    ) -> bool;
}
|
|
|
|
|
|
|
|
/// Accepts an analyzer, information about a bridge, and a confidence value.
|
|
|
|
/// Returns a set of country codes where the bridge is believed to be blocked.
|
|
|
|
pub fn blocked_in(
|
|
|
|
analyzer: &dyn Analyzer,
|
|
|
|
bridge_info: &BridgeInfo,
|
|
|
|
confidence: f64,
|
2024-04-15 15:21:35 -04:00
|
|
|
date: u32,
|
2024-05-20 20:38:06 -04:00
|
|
|
min_historical_days: u32,
|
|
|
|
max_historical_days: u32,
|
2024-04-15 13:19:56 -04:00
|
|
|
) -> HashSet<String> {
|
|
|
|
let mut blocked_in = HashSet::<String>::new();
|
2024-04-15 15:21:35 -04:00
|
|
|
let today = date;
|
2024-04-15 13:19:56 -04:00
|
|
|
for (country, info) in &bridge_info.info_by_country {
|
2024-04-27 13:20:05 -04:00
|
|
|
let age = today - info.first_seen;
|
2024-04-15 13:19:56 -04:00
|
|
|
if info.blocked {
|
|
|
|
// Assume bridges never become unblocked
|
|
|
|
blocked_in.insert(country.to_string());
|
|
|
|
} else {
|
|
|
|
// Get today's values
|
|
|
|
let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
|
|
|
|
// TODO: Evaluate on yesterday if we don't have data for today?
|
|
|
|
let today_info = match info.info_by_day.get(&today) {
|
|
|
|
Some(v) => v,
|
|
|
|
None => &new_map_binding,
|
|
|
|
};
|
|
|
|
let bridge_ips_today = match today_info.get(&BridgeInfoType::BridgeIps) {
|
2024-04-26 12:58:03 -04:00
|
|
|
Some(&v) => v,
|
2024-04-15 13:19:56 -04:00
|
|
|
None => 0,
|
|
|
|
};
|
|
|
|
let negative_reports_today = match today_info.get(&BridgeInfoType::NegativeReports) {
|
2024-04-18 22:27:57 -04:00
|
|
|
Some(&v) => v,
|
2024-04-15 13:19:56 -04:00
|
|
|
None => 0,
|
|
|
|
};
|
|
|
|
let positive_reports_today = match today_info.get(&BridgeInfoType::PositiveReports) {
|
2024-04-18 22:27:57 -04:00
|
|
|
Some(&v) => v,
|
2024-04-15 13:19:56 -04:00
|
|
|
None => 0,
|
|
|
|
};
|
|
|
|
|
2024-05-20 20:38:06 -04:00
|
|
|
let num_days = min(age, max_historical_days);
|
2024-04-15 13:19:56 -04:00
|
|
|
|
|
|
|
// Get time series for last num_days
|
|
|
|
let mut bridge_ips = vec![0; num_days as usize];
|
|
|
|
let mut negative_reports = vec![0; num_days as usize];
|
|
|
|
let mut positive_reports = vec![0; num_days as usize];
|
|
|
|
|
|
|
|
for i in 0..num_days {
|
|
|
|
let date = today - num_days + i - 1;
|
|
|
|
let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
|
|
|
|
let day_info = match info.info_by_day.get(&date) {
|
|
|
|
Some(v) => v,
|
|
|
|
None => &new_map_binding,
|
|
|
|
};
|
|
|
|
bridge_ips[i as usize] = match day_info.get(&BridgeInfoType::BridgeIps) {
|
2024-04-26 12:58:03 -04:00
|
|
|
Some(&v) => v,
|
2024-04-15 13:19:56 -04:00
|
|
|
None => 0,
|
|
|
|
};
|
|
|
|
negative_reports[i as usize] = match day_info.get(&BridgeInfoType::NegativeReports)
|
|
|
|
{
|
|
|
|
Some(&v) => v,
|
|
|
|
None => 0,
|
|
|
|
};
|
|
|
|
positive_reports[i as usize] = match day_info.get(&BridgeInfoType::PositiveReports)
|
|
|
|
{
|
|
|
|
Some(&v) => v,
|
|
|
|
None => 0,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
// Evaluate using appropriate stage based on age of the bridge
|
2024-05-20 20:38:06 -04:00
|
|
|
if age < UNTRUSTED_INTERVAL || age < min_historical_days {
|
|
|
|
// open-entry bridge and/or not enough days of
|
|
|
|
// historical days for stages 2 and 3
|
2024-04-15 13:19:56 -04:00
|
|
|
if analyzer.stage_one(
|
|
|
|
confidence,
|
|
|
|
&bridge_ips,
|
|
|
|
bridge_ips_today,
|
|
|
|
&negative_reports,
|
|
|
|
negative_reports_today,
|
|
|
|
) {
|
|
|
|
blocked_in.insert(country.to_string());
|
|
|
|
}
|
2024-05-20 20:38:06 -04:00
|
|
|
} else if info.first_pr.is_none()
|
|
|
|
|| today < info.first_pr.unwrap() + min_historical_days
|
2024-04-15 13:19:56 -04:00
|
|
|
{
|
2024-05-20 20:38:06 -04:00
|
|
|
// invite-only bridge without min_historical_days of
|
|
|
|
// historical data on positive reports
|
2024-04-15 13:19:56 -04:00
|
|
|
if analyzer.stage_two(
|
|
|
|
confidence,
|
|
|
|
&bridge_ips,
|
|
|
|
bridge_ips_today,
|
|
|
|
&negative_reports,
|
|
|
|
negative_reports_today,
|
|
|
|
) {
|
|
|
|
blocked_in.insert(country.to_string());
|
|
|
|
}
|
|
|
|
} else {
|
2024-05-20 20:38:06 -04:00
|
|
|
// invite-only bridge that has min_historical_days or
|
|
|
|
// more of historical data since the first positive report
|
2024-04-15 13:19:56 -04:00
|
|
|
if analyzer.stage_three(
|
|
|
|
confidence,
|
|
|
|
&bridge_ips,
|
|
|
|
bridge_ips_today,
|
|
|
|
&negative_reports,
|
|
|
|
negative_reports_today,
|
|
|
|
&positive_reports,
|
|
|
|
positive_reports_today,
|
|
|
|
) {
|
|
|
|
blocked_in.insert(country.to_string());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
blocked_in
|
|
|
|
}
|
|
|
|
|
|
|
|
// Analyzer implementations
|
|
|
|
|
|
|
|
/// Dummy example that never thinks bridges are blocked
#[derive(Debug, Clone, Default)]
pub struct ExampleAnalyzer {}
|
|
|
|
|
|
|
|
impl Analyzer for ExampleAnalyzer {
    /// Ignores all inputs; an open-entry bridge is never reported as blocked.
    fn stage_one(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Ignores all inputs; an invite-only bridge is never reported as blocked.
    fn stage_two(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
    ) -> bool {
        false
    }

    /// Ignores all inputs, including positive reports; the bridge is never
    /// reported as blocked.
    fn stage_three(
        &self,
        _confidence: f64,
        _bridge_ips: &[u32],
        _bridge_ips_today: u32,
        _negative_reports: &[u32],
        _negative_reports_today: u32,
        _positive_reports: &[u32],
        _positive_reports_today: u32,
    ) -> bool {
        false
    }
}
|
|
|
|
|
|
|
|
/// Model data as multivariate normal distribution
#[derive(Debug, Clone, PartialEq)]
pub struct NormalAnalyzer {
    /// Stage one reports a bridge as blocked when today's negative report
    /// count is strictly greater than this value.
    max_threshold: u32,
    /// Stage one reports a bridge as blocked when today's negative report
    /// count is strictly greater than this factor times today's bridge IP
    /// count.
    scaling_factor: f64,
}
|
|
|
|
|
|
|
|
impl NormalAnalyzer {
|
|
|
|
pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
|
|
|
|
Self {
|
|
|
|
max_threshold,
|
|
|
|
scaling_factor,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-04-18 22:27:57 -04:00
|
|
|
// Returns the mean vector, vector of individual standard deviations, and
|
2024-05-04 14:26:18 -04:00
|
|
|
// covariance matrix. If the standard deviation for a variable is 0 and/or
|
|
|
|
// the covariance matrix is not positive definite, add some noise to the
|
|
|
|
// data and recompute.
|
2024-04-18 22:27:57 -04:00
|
|
|
fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
|
2024-04-15 13:19:56 -04:00
|
|
|
let n = data.len();
|
|
|
|
|
2024-04-18 22:27:57 -04:00
|
|
|
// Compute mean and standard deviation vectors
|
|
|
|
let (mean_vec, sd_vec) = {
|
2024-04-15 13:19:56 -04:00
|
|
|
let mut mean_vec = Vec::<f64>::new();
|
2024-04-18 22:27:57 -04:00
|
|
|
let mut sd_vec = Vec::<f64>::new();
|
2024-04-15 13:19:56 -04:00
|
|
|
for var in data {
|
2024-04-18 22:27:57 -04:00
|
|
|
// Compute mean
|
|
|
|
let mut sum = 0.0;
|
|
|
|
for count in *var {
|
|
|
|
sum += *count as f64;
|
|
|
|
}
|
|
|
|
let mean = sum / var.len() as f64;
|
|
|
|
|
|
|
|
// Compute standard deviation
|
|
|
|
let mut sum = 0.0;
|
|
|
|
for count in *var {
|
|
|
|
sum += (*count as f64 - mean).powi(2);
|
|
|
|
}
|
|
|
|
let sd = (sum / var.len() as f64).sqrt();
|
|
|
|
mean_vec.push(mean);
|
|
|
|
sd_vec.push(sd);
|
2024-04-15 13:19:56 -04:00
|
|
|
}
|
2024-04-18 22:27:57 -04:00
|
|
|
(mean_vec, sd_vec)
|
2024-04-15 13:19:56 -04:00
|
|
|
};
|
|
|
|
|
|
|
|
// Compute covariance matrix
|
|
|
|
let cov_mat = {
|
|
|
|
let mut cov_mat = Vec::<f64>::new();
|
|
|
|
// We don't need to recompute Syx, but we currently do
|
|
|
|
for i in 0..n {
|
|
|
|
for j in 0..n {
|
|
|
|
cov_mat.push({
|
|
|
|
let var1 = data[i];
|
|
|
|
let var1_mean = mean_vec[i];
|
|
|
|
|
|
|
|
let var2 = data[j];
|
|
|
|
let var2_mean = mean_vec[j];
|
|
|
|
|
|
|
|
assert_eq!(var1.len(), var2.len());
|
|
|
|
|
|
|
|
let mut sum = 0.0;
|
|
|
|
for index in 0..var1.len() {
|
|
|
|
sum +=
|
|
|
|
(var1[index] as f64 - var1_mean) * (var2[index] as f64 - var2_mean);
|
|
|
|
}
|
2024-04-26 12:58:03 -04:00
|
|
|
sum / (var1.len() - 1) as f64
|
2024-04-15 13:19:56 -04:00
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cov_mat
|
|
|
|
};
|
|
|
|
|
2024-05-04 14:26:18 -04:00
|
|
|
// If any standard deviation is 0 or the covariance matrix is not
|
|
|
|
// positive definite, add some noise and recompute.
|
|
|
|
let mut recompute = false;
|
|
|
|
for sd in &sd_vec {
|
|
|
|
if *sd <= 0.0 {
|
|
|
|
recompute = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
|
|
|
|
recompute = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if !recompute {
|
|
|
|
(mean_vec, sd_vec, cov_mat)
|
|
|
|
} else {
|
|
|
|
// Add random noise and recompute
|
|
|
|
let mut new_data = vec![vec![0; data[0].len()]; n];
|
|
|
|
let mut rng = rand::thread_rng();
|
|
|
|
for i in 0..n {
|
|
|
|
for j in 0..data[i].len() {
|
|
|
|
// Add 1 to some randomly selected values
|
|
|
|
new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Compute stats on modified data
|
|
|
|
Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
|
|
|
|
}
|
2024-04-15 13:19:56 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Analyzer for NormalAnalyzer {
|
|
|
|
/// Evaluate open-entry bridge based on only today's data
|
|
|
|
fn stage_one(
|
|
|
|
&self,
|
|
|
|
_confidence: f64,
|
2024-04-18 22:42:27 -04:00
|
|
|
_bridge_ips: &[u32],
|
2024-04-15 13:19:56 -04:00
|
|
|
bridge_ips_today: u32,
|
|
|
|
_negative_reports: &[u32],
|
|
|
|
negative_reports_today: u32,
|
|
|
|
) -> bool {
|
|
|
|
negative_reports_today > self.max_threshold
|
2024-04-26 13:11:32 -04:00
|
|
|
|| f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
|
2024-04-15 13:19:56 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Evaluate invite-only bridge based on last 30 days
|
|
|
|
fn stage_two(
|
|
|
|
&self,
|
|
|
|
confidence: f64,
|
|
|
|
bridge_ips: &[u32],
|
|
|
|
bridge_ips_today: u32,
|
|
|
|
negative_reports: &[u32],
|
|
|
|
negative_reports_today: u32,
|
|
|
|
) -> bool {
|
|
|
|
assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
|
|
|
|
assert_eq!(bridge_ips.len(), negative_reports.len());
|
|
|
|
|
2024-04-18 22:27:57 -04:00
|
|
|
let alpha = 1.0 - confidence;
|
|
|
|
|
|
|
|
let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]);
|
2024-05-20 20:54:28 -04:00
|
|
|
let bridge_ips_mean = mean_vec[0];
|
2024-04-15 13:19:56 -04:00
|
|
|
let negative_reports_mean = mean_vec[1];
|
2024-04-18 22:27:57 -04:00
|
|
|
let bridge_ips_sd = sd_vec[0];
|
|
|
|
let negative_reports_sd = sd_vec[1];
|
2024-04-15 13:19:56 -04:00
|
|
|
|
2024-05-20 20:54:28 -04:00
|
|
|
/*
|
|
|
|
let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
|
|
|
|
let pdf = mvn.pdf(&DVector::from_vec(vec![
|
|
|
|
bridge_ips_today as f64,
|
|
|
|
negative_reports_today as f64,
|
|
|
|
]));
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Model each variable in isolation. We use 1 - the CDF for
|
|
|
|
// negative reports because more negative reports is worse.
|
|
|
|
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
|
|
|
|
let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
|
2024-04-26 15:09:30 -04:00
|
|
|
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
|
2024-05-20 20:54:28 -04:00
|
|
|
let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
|
|
|
|
|
|
|
|
// For now, just look at each variable in isolation
|
|
|
|
// TODO: How do we do a multivariate normal CDF?
|
|
|
|
bip_cdf < alpha || nr_cdf < alpha
|
2024-04-15 13:19:56 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Evaluate invite-only bridge with lv3+ users submitting positive reports
|
|
|
|
fn stage_three(
|
|
|
|
&self,
|
|
|
|
confidence: f64,
|
|
|
|
bridge_ips: &[u32],
|
|
|
|
bridge_ips_today: u32,
|
|
|
|
negative_reports: &[u32],
|
|
|
|
negative_reports_today: u32,
|
|
|
|
positive_reports: &[u32],
|
|
|
|
positive_reports_today: u32,
|
|
|
|
) -> bool {
|
|
|
|
assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
|
|
|
|
assert_eq!(bridge_ips.len(), negative_reports.len());
|
|
|
|
assert_eq!(bridge_ips.len(), positive_reports.len());
|
|
|
|
|
2024-04-18 22:27:57 -04:00
|
|
|
let alpha = 1.0 - confidence;
|
|
|
|
|
|
|
|
let (mean_vec, sd_vec, cov_mat) =
|
|
|
|
Self::stats(&[bridge_ips, negative_reports, positive_reports]);
|
2024-05-20 20:54:28 -04:00
|
|
|
let bridge_ips_mean = mean_vec[0];
|
2024-04-15 13:19:56 -04:00
|
|
|
let negative_reports_mean = mean_vec[1];
|
2024-05-20 20:54:28 -04:00
|
|
|
let positive_reports_mean = mean_vec[2];
|
2024-04-18 22:27:57 -04:00
|
|
|
let bridge_ips_sd = sd_vec[0];
|
|
|
|
let negative_reports_sd = sd_vec[1];
|
|
|
|
let positive_reports_sd = sd_vec[2];
|
2024-04-15 13:19:56 -04:00
|
|
|
|
2024-05-20 20:54:28 -04:00
|
|
|
/*
|
|
|
|
let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
|
|
|
|
let pdf = mvn.pdf(&DVector::from_vec(vec![
|
|
|
|
bridge_ips_today as f64,
|
|
|
|
negative_reports_today as f64,
|
|
|
|
positive_reports_today as f64,
|
|
|
|
]));
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Model each variable in isolation. We use 1 - the CDF for
|
|
|
|
// negative reports because more negative reports is worse.
|
|
|
|
let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
|
|
|
|
let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
|
2024-04-26 15:09:30 -04:00
|
|
|
let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
|
2024-05-20 20:54:28 -04:00
|
|
|
let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
|
|
|
|
let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
|
|
|
|
let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
|
|
|
|
|
|
|
|
// For now, just look at each variable in isolation
|
|
|
|
// TODO: How do we do a multivariate normal CDF?
|
|
|
|
bip_cdf < alpha || nr_cdf < alpha || pr_cdf < alpha
|
2024-04-15 13:19:56 -04:00
|
|
|
}
|
|
|
|
}
|