diff --git a/src/analysis.rs b/src/analysis.rs index 005b756..d1d1978 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -4,7 +4,7 @@ use nalgebra::{Cholesky, DMatrix, DVector}; use rand::Rng; use statrs::distribution::{Continuous, MultivariateNormal, Normal}; use std::{ - cmp::min, + cmp::{max, min}, collections::{BTreeMap, HashSet}, }; @@ -52,6 +52,8 @@ pub fn blocked_in( bridge_info: &BridgeInfo, confidence: f64, date: u32, + min_historical_days: u32, + max_historical_days: u32, ) -> HashSet { let mut blocked_in = HashSet::::new(); let today = date; @@ -81,7 +83,7 @@ pub fn blocked_in( None => 0, }; - let num_days = min(age, UNTRUSTED_INTERVAL); + let num_days = min(age, max_historical_days); // Get time series for last num_days let mut bridge_ips = vec![0; num_days as usize]; @@ -112,8 +114,9 @@ pub fn blocked_in( } // Evaluate using appropriate stage based on age of the bridge - if age < UNTRUSTED_INTERVAL { - // open-entry bridge + if age < UNTRUSTED_INTERVAL || age < min_historical_days { + // open-entry bridge and/or not enough days of + // historical days for stages 2 and 3 if analyzer.stage_one( confidence, &bridge_ips, @@ -123,10 +126,11 @@ pub fn blocked_in( ) { blocked_in.insert(country.to_string()); } - } else if info.first_pr.is_none() || today < info.first_pr.unwrap() + UNTRUSTED_INTERVAL + } else if info.first_pr.is_none() + || today < info.first_pr.unwrap() + min_historical_days { - // invite-only bridge without 30+ days of historical data on - // positive reports + // invite-only bridge without min_historical_days of + // historical data on positive reports if analyzer.stage_two( confidence, &bridge_ips, @@ -137,8 +141,8 @@ pub fn blocked_in( blocked_in.insert(country.to_string()); } } else { - // invite-only bridge that has been up long enough that it - // might have 30+ days of historical data on positive reports + // invite-only bridge that has min_historical_days or + // more of historical data since the first positive report if analyzer.stage_three( confidence, &bridge_ips, diff --git a/src/bin/server.rs b/src/bin/server.rs index 3ce6cfe..184619d 100644 --- a/src/bin/server.rs +++ b/src/bin/server.rs @@ -52,6 +52,12 @@ pub struct Config { // scaling_factor * bridge_ips scaling_factor: f64, + // minimum number of historical days for statistical analysis + min_historical_days: u32, + + // maximum number of historical days to consider in historical analysis + max_historical_days: u32, + //require_bridge_token: bool, port: u16, updater_schedule: String, @@ -78,6 +84,8 @@ async fn update_daily_info( confidence: f64, max_threshold: u32, scaling_factor: f64, + min_historical_days: u32, + max_historical_days: u32, ) { update_extra_infos(&db, &extra_infos_base_url) .await @@ -88,6 +96,8 @@ async fn update_daily_info( &db, &analysis::NormalAnalyzer::new(max_threshold, scaling_factor), confidence, + min_historical_days, + max_historical_days, ); report_blockages(&distributors, new_blockages).await; @@ -106,11 +116,13 @@ async fn create_context_manager( confidence: f64, max_threshold: u32, scaling_factor: f64, + min_historical_days: u32, + max_historical_days: u32, context_rx: mpsc::Receiver, mut kill: broadcast::Receiver<()>, ) { tokio::select! { - create_context = context_manager(db_config, distributors, extra_infos_base_url, confidence, max_threshold, scaling_factor, context_rx) => create_context, + create_context = context_manager(db_config, distributors, extra_infos_base_url, confidence, max_threshold, scaling_factor, min_historical_days, max_historical_days, context_rx) => create_context, _ = kill.recv() => {println!("Shut down manager");}, } } @@ -122,6 +134,8 @@ async fn context_manager( confidence: f64, max_threshold: u32, scaling_factor: f64, + min_historical_days: u32, + max_historical_days: u32, mut context_rx: mpsc::Receiver, ) { let db: Db = sled::open(&db_config.db_path).unwrap(); @@ -149,6 +163,8 @@ async fn context_manager( confidence, max_threshold, scaling_factor, + min_historical_days, + max_historical_days, ) .await; } @@ -218,6 +234,8 @@ async fn main() { config.confidence, config.max_threshold, config.scaling_factor, + config.min_historical_days, + config.max_historical_days, request_rx, kill, ) diff --git a/src/lib.rs b/src/lib.rs index f3ac791..6a273eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -561,6 +561,14 @@ pub async fn update_negative_reports(db: &Db, distributors: &BTreeMap bincode::deserialize(&v).unwrap(), None => BTreeMap::>::new(), }; + let mut bridges_to_re_evaluate = match db.get("bridges-to-re-evaluate").unwrap() { + Some(v) => bincode::deserialize(&v).unwrap(), + None => HashMap::::new(), + // We map fingerprint:date where date is the earliest date for + // which we have new reports + }; + let today = get_date(); + // Key is [fingerprint]_[country]_[date] for bridge_country_date in all_negative_reports.keys() { let reports = all_negative_reports.get(bridge_country_date).unwrap(); @@ -571,6 +579,18 @@ pub async fn update_negative_reports(db: &Db, distributors: &BTreeMap 0 && date < today { + let fpr_str = array_bytes::bytes2hex("", fingerprint); + if bridges_to_re_evaluate.contains_key(&fpr_str) { + if *bridges_to_re_evaluate.get(&fpr_str).unwrap() > date { + bridges_to_re_evaluate.insert(fpr_str, date); + } + } else { + bridges_to_re_evaluate.insert(fpr_str, date); + } + } + // Get bridge info or make new one let mut bridge_info = match db.get(fingerprint).unwrap() { Some(v) => bincode::deserialize(&v).unwrap(), @@ -605,6 +625,12 @@ pub async fn update_negative_reports(db: &Db, distributors: &BTreeMap>::new()).unwrap(), ) .unwrap(); + // Commit new set of bridges to re-evaluate + db.insert( + "bridges-to-re-evaluate", + bincode::serialize(&bridges_to_re_evaluate).unwrap(), + ) + .unwrap(); } // Process positive reports @@ -674,6 +700,14 @@ pub async fn update_positive_reports(db: &Db, distributors: &BTreeMap bincode::deserialize(&v).unwrap(), None => BTreeMap::>::new(), }; + let mut bridges_to_re_evaluate = match db.get("bridges-to-re-evaluate").unwrap() { + Some(v) => bincode::deserialize(&v).unwrap(), + None => HashMap::::new(), + // We map fingerprint:date where date is the earliest date for + // which we have new reports + }; + let today = get_date(); + // Key is [fingerprint]_[country]_[date] for bridge_country_date in all_positive_reports.keys() { let reports = all_positive_reports.get(bridge_country_date).unwrap(); @@ -684,6 +718,18 @@ pub async fn update_positive_reports(db: &Db, distributors: &BTreeMap 0 && date < today { + let fpr_str = array_bytes::bytes2hex("", fingerprint); + if bridges_to_re_evaluate.contains_key(&fpr_str) { + if *bridges_to_re_evaluate.get(&fpr_str).unwrap() > date { + bridges_to_re_evaluate.insert(fpr_str, date); + } + } else { + bridges_to_re_evaluate.insert(fpr_str, date); + } + } + // Get bridge info or make new one let mut bridge_info = match db.get(fingerprint).unwrap() { Some(v) => bincode::deserialize(&v).unwrap(), @@ -718,6 +764,12 @@ pub async fn update_positive_reports(db: &Db, distributors: &BTreeMap>::new()).unwrap(), ) .unwrap(); + // Commit new set of bridges to re-evaluate + db.insert( + "bridges-to-re-evaluate", + bincode::serialize(&bridges_to_re_evaluate).unwrap(), + ) + .unwrap(); } // Verdict on bridge reachability @@ -728,6 +780,8 @@ pub fn guess_blockages( db: &Db, analyzer: &dyn Analyzer, confidence: f64, + min_historical_days: u32, + max_historical_days: u32, ) -> HashMap<[u8; 20], HashSet> { // Map of bridge fingerprint to set of countries which newly block it let mut blockages = HashMap::<[u8; 20], HashSet>::new(); @@ -737,21 +791,36 @@ pub fn guess_blockages( Some(v) => bincode::deserialize(&v).unwrap(), None => HashSet::<[u8; 20]>::new(), }; + // Get list of bridges with historical data to re-evaluate + let bridges_to_re_evaluate = match db.get("bridges-to-re-evaluate").unwrap() { + Some(v) => bincode::deserialize(&v).unwrap(), + None => HashMap::::new(), + }; // Guess for each bridge for fingerprint in bridges { + let today = get_date(); let mut bridge_info: BridgeInfo = bincode::deserialize(&db.get(fingerprint).unwrap().unwrap()).unwrap(); let mut new_blockages = HashSet::::new(); - // Re-evaluate the last MAX_BACKDATE + 1 days in case we received new - // reports for those days. For efficiency, we could instead keep track - // of which bridges received new reports and only re-evaluate those. - for i in 0..MAX_BACKDATE + 1 { + let fpr_str = array_bytes::bytes2hex("", &fingerprint); + let first_date = if bridges_to_re_evaluate.contains_key(&fpr_str) { + *bridges_to_re_evaluate.get(&fpr_str).unwrap() + } else { + today + }; + + // Re-evaluate the last days from first_date to today. + // (This approach is still suboptimal because we re-evaluate for + // countries that don't have new reports.) + for i in first_date..=today { let blocked_in = analysis::blocked_in( analyzer, &bridge_info, confidence, - get_date() - MAX_BACKDATE - 1 + i, + i, + min_historical_days, + max_historical_days, ); for country in blocked_in { let bridge_country_info = bridge_info.info_by_country.get_mut(&country).unwrap(); @@ -769,6 +838,13 @@ pub fn guess_blockages( .unwrap(); } + // Remove all bridges to re-evaluate from DB + db.insert( + "bridges-to-re-evaluate", + bincode::serialize(&HashMap::::new()).unwrap(), + ) + .unwrap(); + // Return map of new blockages blockages } diff --git a/src/tests/analysis/stage_one.rs b/src/tests/analysis/stage_one.rs index 62bfd5a..62c395e 100644 --- a/src/tests/analysis/stage_one.rs +++ b/src/tests/analysis/stage_one.rs @@ -20,7 +20,7 @@ fn test_stage_1_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -32,7 +32,7 @@ fn test_stage_1_analysis() { 8, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -44,7 +44,7 @@ fn test_stage_1_analysis() { 0, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -58,7 +58,7 @@ fn test_stage_1_analysis() { ); blocking_countries.insert("ru".to_string()); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -79,7 +79,7 @@ fn test_stage_1_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -96,7 +96,7 @@ fn test_stage_1_analysis() { 1, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -113,7 +113,7 @@ fn test_stage_1_analysis() { 2, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -132,7 +132,7 @@ fn test_stage_1_analysis() { ); blocking_countries.insert("ru".to_string()); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -163,7 +163,7 @@ fn test_stage_1_analysis() { 5, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -182,7 +182,7 @@ fn test_stage_1_analysis() { ); blocking_countries.insert("ru".to_string()); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } diff --git a/src/tests/analysis/stage_three.rs b/src/tests/analysis/stage_three.rs index 192a9c1..32bd924 100644 --- a/src/tests/analysis/stage_three.rs +++ b/src/tests/analysis/stage_three.rs @@ -20,7 +20,7 @@ async fn test_stage_3_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -43,7 +43,7 @@ async fn test_stage_3_analysis() { 13, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -69,7 +69,7 @@ async fn test_stage_3_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -95,7 +95,7 @@ async fn test_stage_3_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -121,7 +121,7 @@ async fn test_stage_3_analysis() { // This should be blocked because it's different in the bad direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -142,7 +142,7 @@ async fn test_stage_3_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -165,7 +165,7 @@ async fn test_stage_3_analysis() { 16 + i % 5, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -191,7 +191,7 @@ async fn test_stage_3_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -217,7 +217,7 @@ async fn test_stage_3_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -243,7 +243,7 @@ async fn test_stage_3_analysis() { // This should be blocked because it's different in the bad direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -264,7 +264,7 @@ async fn test_stage_3_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -287,7 +287,7 @@ async fn test_stage_3_analysis() { 16 + i % 5, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -313,7 +313,7 @@ async fn test_stage_3_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -339,7 +339,7 @@ async fn test_stage_3_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -367,7 +367,7 @@ async fn test_stage_3_analysis() { // The censor artificially inflated bridge stats to prevent detection. // Ensure we still detect the censorship from negative reports. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -388,7 +388,7 @@ async fn test_stage_3_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -411,7 +411,7 @@ async fn test_stage_3_analysis() { 16 + i % 5, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -437,7 +437,7 @@ async fn test_stage_3_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -463,7 +463,7 @@ async fn test_stage_3_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -490,7 +490,7 @@ async fn test_stage_3_analysis() { // This should be blocked because it's different in the bad direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } diff --git a/src/tests/analysis/stage_two.rs b/src/tests/analysis/stage_two.rs index 98f2b50..c052317 100644 --- a/src/tests/analysis/stage_two.rs +++ b/src/tests/analysis/stage_two.rs @@ -19,7 +19,7 @@ async fn test_stage_2_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -37,7 +37,7 @@ async fn test_stage_2_analysis() { i % 4, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -58,7 +58,7 @@ async fn test_stage_2_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -79,7 +79,7 @@ async fn test_stage_2_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -100,7 +100,7 @@ async fn test_stage_2_analysis() { // This should be blocked because it's different in the bad direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -122,7 +122,7 @@ async fn test_stage_2_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -140,7 +140,7 @@ async fn test_stage_2_analysis() { 1, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -161,7 +161,7 @@ async fn test_stage_2_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -182,7 +182,7 @@ async fn test_stage_2_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -204,7 +204,7 @@ async fn test_stage_2_analysis() { // The censor artificially inflated bridge stats to prevent detection. // Ensure we still detect the censorship from negative reports. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -225,7 +225,7 @@ async fn test_stage_2_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -243,7 +243,7 @@ async fn test_stage_2_analysis() { i % 4, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -264,7 +264,7 @@ async fn test_stage_2_analysis() { // Should not be blocked because we have similar data. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -285,7 +285,7 @@ async fn test_stage_2_analysis() { // This should not be blocked even though it's very different because // it's different in the good direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -302,11 +302,11 @@ async fn test_stage_2_analysis() { date, 0, ); - //blocking_countries.insert("ru".to_string()); + blocking_countries.insert("ru".to_string()); // This should be blocked because it's different in the bad direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -327,7 +327,7 @@ async fn test_stage_2_analysis() { // No data today assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); @@ -345,7 +345,7 @@ async fn test_stage_2_analysis() { i % 4, ); assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); } @@ -367,7 +367,7 @@ async fn test_stage_2_analysis() { // This should be blocked because it's different in the bad direction. assert_eq!( - blocked_in(&analyzer, &bridge_info, confidence, date), + blocked_in(&analyzer, &bridge_info, confidence, date, 30, 30), blocking_countries ); }