From 97d4622cd472007a71e7142716992f02743689e5 Mon Sep 17 00:00:00 2001 From: Vecna Date: Mon, 20 May 2024 20:54:28 -0400 Subject: [PATCH] Use CDF, not PDF with artificial 'blocked' data TODO: Figure out proper multivariate CDF --- src/analysis.rs | 131 ++++++++++++++---------------------------------- 1 file changed, 37 insertions(+), 94 deletions(-) diff --git a/src/analysis.rs b/src/analysis.rs index d1d1978..baf485c 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -2,7 +2,7 @@ use crate::{BridgeInfo, BridgeInfoType}; use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL; use nalgebra::{Cholesky, DMatrix, DVector}; use rand::Rng; -use statrs::distribution::{Continuous, MultivariateNormal, Normal}; +use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal}; use std::{ cmp::{max, min}, collections::{BTreeMap, HashSet}, @@ -333,52 +333,29 @@ impl Analyzer for NormalAnalyzer { let alpha = 1.0 - confidence; let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]); + let bridge_ips_mean = mean_vec[0]; let negative_reports_mean = mean_vec[1]; let bridge_ips_sd = sd_vec[0]; let negative_reports_sd = sd_vec[1]; - // Artificially create data for alternative hypothesis - let num_days = bridge_ips.len() as usize; - let mut bridge_ips_blocked = vec![0; num_days]; - let mut negative_reports_blocked = vec![0; num_days]; - let bridge_ips_deviation = (2.0 * bridge_ips_sd).round() as u32; - for i in 0..num_days { - // Suppose bridge stats will go down by 2 SDs - bridge_ips_blocked[i] = if bridge_ips_deviation > bridge_ips[i] { - 0 - } else { - bridge_ips[i] - bridge_ips_deviation - }; - // Suppose negative reports will go up by 2 SDs - negative_reports_blocked[i] = - negative_reports[i] + (2.0 * negative_reports_sd).round() as u32; - } - let (mean_vec_blocked, _sd_vec_blocked, cov_mat_blocked) = - Self::stats(&[&bridge_ips_blocked, &negative_reports_blocked]); + /* + let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); + let pdf = mvn.pdf(&DVector::from_vec(vec![ + bridge_ips_today as f64, + negative_reports_today as f64, + ])); + */ - let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); - let pdf = mvn.pdf(&DVector::from_vec(vec![ - bridge_ips_today as f64, - negative_reports_today as f64, - ])); - - let mvn_blocked = MultivariateNormal::new(mean_vec_blocked, cov_mat_blocked).unwrap(); - let pdf_blocked = mvn_blocked.pdf(&DVector::from_vec(vec![ - bridge_ips_today as f64, - negative_reports_today as f64, - ])); - - // Also model negative reports in isolation + // Model each variable in isolation. We use 1 - the CDF for + // negative reports because more negative reports is worse. + let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap(); + let bip_cdf = bip_normal.cdf(bridge_ips_today as f64); let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap(); - let nr_pdf = nr_normal.pdf(negative_reports_today as f64); - let nr_normal_blocked = Normal::new( - negative_reports_mean + 2.0 * negative_reports_sd, - negative_reports_sd, - ) - .unwrap(); - let nr_pdf_blocked = nr_normal_blocked.pdf(negative_reports_today as f64); + let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64); - (pdf / pdf_blocked).ln() < alpha || (nr_pdf / nr_pdf_blocked).ln() < alpha + // For now, just look at each variable in isolation + // TODO: How do we do a multivariate normal CDF? + bip_cdf < alpha || nr_cdf < alpha } /// Evaluate invite-only bridge with lv3+ users submitting positive reports @@ -400,67 +377,33 @@ impl Analyzer for NormalAnalyzer { let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports, positive_reports]); + let bridge_ips_mean = mean_vec[0]; let negative_reports_mean = mean_vec[1]; + let positive_reports_mean = mean_vec[2]; let bridge_ips_sd = sd_vec[0]; let negative_reports_sd = sd_vec[1]; let positive_reports_sd = sd_vec[2]; - // Artificially create data for alternative hypothesis - let num_days = bridge_ips.len() as usize; - let mut bridge_ips_blocked = vec![0; num_days]; - let mut negative_reports_blocked = vec![0; num_days]; - let mut positive_reports_blocked = vec![0; num_days]; - let bridge_ips_deviation = (2.0 * bridge_ips_sd).round() as u32; - let positive_reports_deviation = (2.0 * positive_reports_sd).round() as u32; - for i in 0..num_days { - // Suppose positive reports will go down by 2 SDs - positive_reports_blocked[i] = if positive_reports_deviation > positive_reports[i] { - 0 - } else { - positive_reports[i] - positive_reports_deviation - }; - // Suppose bridge stats will go down by 2 SDs - bridge_ips_blocked[i] = if bridge_ips_deviation > bridge_ips[i] { - 0 - } else { - bridge_ips[i] - bridge_ips_deviation - }; - // Suppose each user who would have submitted a positive report but - // didn't submits a negative report instead. - negative_reports_blocked[i] = - negative_reports[i] + positive_reports[i] - positive_reports_blocked[i]; - } - let (mean_vec_blocked, _sd_vec_blocked, cov_mat_blocked) = Self::stats(&[ - &bridge_ips_blocked, - &negative_reports_blocked, - &positive_reports_blocked, - ]); + /* + let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); + let pdf = mvn.pdf(&DVector::from_vec(vec![ + bridge_ips_today as f64, + negative_reports_today as f64, + positive_reports_today as f64, + ])); + */ - let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap(); - let pdf = mvn.pdf(&DVector::from_vec(vec![ - bridge_ips_today as f64, - negative_reports_today as f64, - positive_reports_today as f64, - ])); - - let mvn_blocked = MultivariateNormal::new(mean_vec_blocked, cov_mat_blocked).unwrap(); - let pdf_blocked = mvn_blocked.pdf(&DVector::from_vec(vec![ - bridge_ips_today as f64, - negative_reports_today as f64, - positive_reports_today as f64, - ])); - - // Also model negative reports in isolation + // Model each variable in isolation. We use 1 - the CDF for + // negative reports because more negative reports is worse. + let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap(); + let bip_cdf = bip_normal.cdf(bridge_ips_today as f64); let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap(); - let nr_pdf = nr_normal.pdf(negative_reports_today as f64); - // Note we do NOT make this a function of positive signals - let nr_normal_blocked = Normal::new( - negative_reports_mean + 2.0 * negative_reports_sd, - negative_reports_sd, - ) - .unwrap(); - let nr_pdf_blocked = nr_normal_blocked.pdf(negative_reports_today as f64); + let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64); + let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap(); + let pr_cdf = pr_normal.cdf(positive_reports_today as f64); - (pdf / pdf_blocked).ln() < alpha || (nr_pdf / nr_pdf_blocked).ln() < alpha + // For now, just look at each variable in isolation + // TODO: How do we do a multivariate normal CDF? + bip_cdf < alpha || nr_cdf < alpha || pr_cdf < alpha } }