/*! Fields we need from the extra-info documents for bridges... Note, this is NOT a complete implementation of the document format. (https://spec.torproject.org/dir-spec/extra-info-document-format.html) */ use chrono::DateTime; use http_body_util::{BodyExt, Empty}; use hyper::body::Bytes; use hyper_util::{client::legacy::Client, rt::TokioExecutor}; use julianday::JulianDay; use select::{document::Document, predicate::Name}; use serde::{Deserialize, Serialize}; use std::{ collections::{BTreeMap, HashMap, HashSet}, fs::File, io::{prelude::*, BufReader, Write}, path::Path, }; /// Directory where we store these files pub const DIRECTORY: &str = "extra_infos"; /// Fields we need from extra-info document #[derive(Eq, PartialEq, Hash, Serialize, Deserialize)] pub struct ExtraInfo { /// Bridge nickname, probably unused pub nickname: String, /// Bridge fingerprint, a SHA-1 hash of the bridge ID pub fingerprint: [u8; 20], /// Date (in UTC) that this document covered (bridge-stats-end if /// available) or that the document was published (published), stored /// as a Julian date because we don't need to know more precisely than /// the day. pub date: u32, /// Map of country codes and how many users (rounded up to a multiple of /// 8) have connected to that bridge during the day. /// Uses BTreeMap instead of HashMap so ExtraInfo can implement Hash. pub bridge_ips: BTreeMap, // TODO: What size for count? } fn get_extra_info_or_error(entry: &HashMap) -> Result { if !entry.contains_key("nickname") || !entry.contains_key("fingerprint") { // How did we get here?? return Err("Cannot parse extra-info: Missing nickname or fingerprint".to_string()); } if !(entry.contains_key("bridge-stats-end") || entry.contains_key("published")) || !entry.contains_key("bridge-ips") { // Some extra-infos are missing data on connecting IPs... // But we can't do anything in that case. return Err(format!( "Failed to parse extra-info for {} {}", entry.get("nickname").unwrap(), entry.get("fingerprint").unwrap() )); } let nickname = entry.get("nickname").unwrap().to_string(); let fingerprint_str = entry.get("fingerprint").unwrap(); if fingerprint_str.len() != 40 { return Err("Fingerprint must be 20 bytes".to_string()); } let fingerprint = array_bytes::hex2array(fingerprint_str).unwrap(); let date: u32 = { let date_str = if entry.contains_key("bridge-stats-end") { let line = entry.get("bridge-stats-end").unwrap(); // Parse out (86400 s) from end of line &line[..line.find("(").unwrap()-1] } else { entry.get("published").unwrap().as_str() }; JulianDay::from( DateTime::parse_from_str( &(date_str.to_owned() + " +0000"), "%F %T %z", ) .unwrap() .date_naive(), ) .inner() .try_into() .unwrap() }; let bridge_ips_str = entry.get("bridge-ips").unwrap(); let mut bridge_ips: BTreeMap = BTreeMap::new(); let countries: Vec<&str> = bridge_ips_str.split(',').collect(); for country in countries { if country != "" { // bridge-ips may be empty let (cc, count) = country.split_once('=').unwrap(); bridge_ips.insert(cc.to_string(), count.parse::().unwrap()); } } Ok(ExtraInfo { nickname, fingerprint, date, bridge_ips, }) } pub fn add_extra_infos<'a>(filename: &str, set: &mut HashSet) { let infile = File::open(format!("{}/{}", DIRECTORY, filename)).unwrap(); let reader = BufReader::new(infile); let mut entry = HashMap::::new(); for line in reader.lines() { let line = line.unwrap(); if line.starts_with("@type bridge-extra-info ") { if !entry.is_empty() { let extra_info = get_extra_info_or_error(&entry); if extra_info.is_ok() { set.insert(extra_info.unwrap()); } else { // Just print the error and continue. println!("{}", extra_info.err().unwrap()); } entry = HashMap::::new(); } } else { if line.starts_with("extra-info ") { // extra-info line has format: // extra-info let line_split: Vec<&str> = line.split(' ').collect(); if line_split.len() != 3 { println!("Misformed extra-info line"); } else { entry.insert("nickname".to_string(), line_split[1].to_string()); entry.insert("fingerprint".to_string(), line_split[2].to_string()); } } else { let (key, value) = match line.split_once(' ') { Some((k, v)) => (k, v), None => (line.as_str(), ""), }; entry.insert(key.to_string(), value.to_string()); } } } // Do for the last one let extra_info = get_extra_info_or_error(&entry); if extra_info.is_ok() { set.insert(extra_info.unwrap()); } else { println!("{}", extra_info.err().unwrap()); } } /// Download new extra-infos files and save them in DIRECTORY. This function /// returns the set of newly downloaded filenames. pub async fn download_extra_infos( ) -> Result, Box> { // Download directory of recent extra-infos let base_url = "https://collector.torproject.org/recent/bridge-descriptors/extra-infos/"; let url = base_url.parse().unwrap(); let https = hyper_rustls::HttpsConnectorBuilder::new() .with_native_roots() // TODO: Pin certificate? Is this data signed/verifiable? .expect("no native root CA certificates found") .https_only() .enable_http1() .build(); let client: Client<_, Empty> = Client::builder(TokioExecutor::new()).build(https); println!("Downloading {}", base_url); let mut res = client.get(url).await?; assert_eq!(res.status(), hyper::StatusCode::OK); let mut body_str = String::from(""); while let Some(next) = res.frame().await { let frame = next?; if let Some(chunk) = frame.data_ref() { body_str.push_str(&String::from_utf8(chunk.to_vec())?); } } let doc = Document::from(body_str.as_str()); // Create extra-infos directory if it doesn't exist std::fs::create_dir_all(&DIRECTORY)?; let mut new_files = HashSet::::new(); // Go through all the links in the page and download new files let links = doc.find(Name("a")).filter_map(|n| n.attr("href")); for link in links { if link.ends_with("-extra-infos") { let filename = format!("{}/{}", DIRECTORY, link); // Download file if it's not already downloaded if !Path::new(&filename).exists() { let extra_infos_url = format!("{}{}", base_url, link); println!("Downloading {}", extra_infos_url); let mut res = client.get(extra_infos_url.parse().unwrap()).await?; assert_eq!(res.status(), hyper::StatusCode::OK); let mut file = std::fs::File::create(filename).unwrap(); while let Some(next) = res.frame().await { let frame = next?; if let Some(chunk) = frame.data_ref() { file.write_all(&chunk)?; } } new_files.insert(link.to_string()); } } } Ok(new_files) }