// troll-patrol/src/extra_info.rs
/*! Fields we need from the extra-info documents for bridges...
Note, this is NOT a complete implementation of the document format.
(https://spec.torproject.org/dir-spec/extra-info-document-format.html) */
use chrono::DateTime;
use http::status::StatusCode;
2024-02-07 18:36:40 -05:00
use http_body_util::{BodyExt, Empty};
use hyper::body::Bytes;
use hyper_util::{client::legacy::Client, rt::TokioExecutor};
use julianday::JulianDay;
use serde::{Deserialize, Serialize};
use std::{
collections::{BTreeMap, HashMap, HashSet},
fs::File,
io::{prelude::*, BufReader, Write},
path::Path,
};
/// Directory where downloaded extra-infos files are stored and read back from.
/// This is a relative path, so it is resolved against the process's current
/// working directory.
pub const DIRECTORY: &str = "extra_infos";
/// Fields we need from extra-info document
#[derive(Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct ExtraInfo {
/// Bridge nickname, probably unused
pub nickname: String,
2024-02-07 18:36:40 -05:00
/// Bridge fingerprint, a SHA-1 hash of the bridge ID
pub fingerprint: [u8; 20],
/// Date (in UTC) that this document covered (bridge-stats-end if
/// available) or that the document was published (published), stored
/// as a Julian date because we don't need to know more precisely than
/// the day.
pub date: u32,
2024-02-07 18:36:40 -05:00
/// Map of country codes and how many users (rounded up to a multiple of
/// 8) have connected to that bridge during the day.
/// Uses BTreeMap instead of HashMap so ExtraInfo can implement Hash.
pub bridge_ips: BTreeMap<String, u32>, // TODO: What size for count?
}
fn get_extra_info_or_error(entry: &HashMap<String, String>) -> Result<ExtraInfo, String> {
if !entry.contains_key("nickname") || !entry.contains_key("fingerprint") {
// How did we get here??
return Err("Cannot parse extra-info: Missing nickname or fingerprint".to_string());
}
2024-02-26 18:01:07 -05:00
if !(entry.contains_key("bridge-stats-end") || entry.contains_key("published"))
|| !entry.contains_key("bridge-ips")
{
2024-02-07 18:36:40 -05:00
// Some extra-infos are missing data on connecting IPs...
// But we can't do anything in that case.
return Err(format!(
"Failed to parse extra-info for {} {}",
entry.get("nickname").unwrap(),
entry.get("fingerprint").unwrap()
));
}
let nickname = entry.get("nickname").unwrap().to_string();
let fingerprint_str = entry.get("fingerprint").unwrap();
if fingerprint_str.len() != 40 {
return Err("Fingerprint must be 20 bytes".to_string());
}
let fingerprint = array_bytes::hex2array(fingerprint_str).unwrap();
let date: u32 = {
let date_str = if entry.contains_key("bridge-stats-end") {
let line = entry.get("bridge-stats-end").unwrap();
// Parse out (86400 s) from end of line
2024-02-26 18:01:07 -05:00
&line[..line.find("(").unwrap() - 1]
} else {
entry.get("published").unwrap().as_str()
};
JulianDay::from(
2024-02-26 18:01:07 -05:00
DateTime::parse_from_str(&(date_str.to_owned() + " +0000"), "%F %T %z")
.unwrap()
.date_naive(),
2024-02-07 18:36:40 -05:00
)
.inner()
.try_into()
2024-02-07 18:36:40 -05:00
.unwrap()
};
2024-02-07 18:36:40 -05:00
let bridge_ips_str = entry.get("bridge-ips").unwrap();
let mut bridge_ips: BTreeMap<String, u32> = BTreeMap::new();
let countries: Vec<&str> = bridge_ips_str.split(',').collect();
for country in countries {
if country != "" {
// bridge-ips may be empty
let (cc, count) = country.split_once('=').unwrap();
bridge_ips.insert(cc.to_string(), count.parse::<u32>().unwrap());
}
}
Ok(ExtraInfo {
nickname,
fingerprint,
date,
2024-02-07 18:36:40 -05:00
bridge_ips,
})
}
/// Parse one extra-infos file and add every usable document to `set`.
///
/// `filename` is looked up inside [`DIRECTORY`]. The file is a sequence of
/// documents, each introduced by an `@type bridge-extra-info ` line.
/// Documents that cannot be parsed are reported on stdout and skipped.
///
/// # Panics
///
/// Panics if the file cannot be opened or read; the panic message names the
/// file and the underlying I/O error.
pub fn add_extra_infos(filename: &str, set: &mut HashSet<ExtraInfo>) {
    let path = format!("{}/{}", DIRECTORY, filename);
    let infile = File::open(&path).unwrap_or_else(|e| panic!("Failed to open {}: {}", path, e));
    let reader = BufReader::new(infile);
    let mut entry = HashMap::<String, String>::new();
    for line in reader.lines() {
        let line = line.unwrap_or_else(|e| panic!("Failed to read {}: {}", path, e));
        if line.starts_with("@type bridge-extra-info ") {
            // Start of a new document: flush the one collected so far.
            insert_extra_info(&entry, set);
            entry.clear();
        } else if line.starts_with("extra-info ") {
            // extra-info line has format:
            // extra-info <nickname> <fingerprint>
            let mut fields = line.split(' ');
            match (fields.next(), fields.next(), fields.next(), fields.next()) {
                (Some(_), Some(nickname), Some(fingerprint), None) => {
                    entry.insert("nickname".to_string(), nickname.to_string());
                    entry.insert("fingerprint".to_string(), fingerprint.to_string());
                }
                _ => println!("Misformed extra-info line"),
            }
        } else {
            // Every other line is "<keyword> <rest>", or a bare keyword.
            let (key, value) = line.split_once(' ').unwrap_or((line.as_str(), ""));
            entry.insert(key.to_string(), value.to_string());
        }
    }
    // The last document has no following "@type" line to trigger a flush.
    insert_extra_info(&entry, set);
}

/// Parse `entry` and insert the result into `set`, printing any parse error
/// instead of propagating it. An empty `entry` (nothing collected yet, or an
/// empty file) is silently ignored.
fn insert_extra_info(entry: &HashMap<String, String>, set: &mut HashSet<ExtraInfo>) {
    if entry.is_empty() {
        return;
    }
    match get_extra_info_or_error(entry) {
        Ok(extra_info) => {
            set.insert(extra_info);
        }
        // Just print the error and continue.
        Err(e) => println!("{}", e),
    }
}
/// Download new extra-infos files and save them in DIRECTORY. This function
/// returns the set of newly downloaded filenames.
pub async fn download_extra_infos(
base_url: &str,
2024-02-07 18:36:40 -05:00
) -> Result<HashSet<String>, Box<dyn std::error::Error + Send + Sync>> {
// Download directory of recent extra-infos
let url = base_url.parse().unwrap();
let https = hyper_rustls::HttpsConnectorBuilder::new()
.with_native_roots() // TODO: Pin certificate? Is this data signed/verifiable?
.expect("no native root CA certificates found")
.https_only()
.enable_http1()
.build();
let client: Client<_, Empty<Bytes>> = Client::builder(TokioExecutor::new()).build(https);
println!("Downloading {}", base_url);
let mut res = client.get(url).await?;
assert_eq!(res.status(), StatusCode::OK);
2024-02-07 18:36:40 -05:00
let mut body_str = String::from("");
while let Some(next) = res.frame().await {
let frame = next?;
if let Some(chunk) = frame.data_ref() {
body_str.push_str(&String::from_utf8(chunk.to_vec())?);
}
}
// Removed because it caused some problem...
//let doc = Document::from(body_str.clone().as_str());
// Instead, do this
let mut links = HashSet::<String>::new();
for line in body_str.lines() {
let begin_match = "<a href=\"";
let end_match = "\">";
if line.contains(begin_match) {
let link = &line[line.find(begin_match).unwrap() + begin_match.len()..];
if link.contains(end_match) {
let link = &link[0..link.find(end_match).unwrap()];
links.insert(link.to_string());
}
}
}
2024-02-07 18:36:40 -05:00
// Create extra-infos directory if it doesn't exist
std::fs::create_dir_all(&DIRECTORY)?;
let mut new_files = HashSet::<String>::new();
// Go through all the links in the page and download new files
for link in links {
if link.ends_with("-extra-infos") {
let filename = format!("{}/{}", DIRECTORY, link);
// Download file if it's not already downloaded
if !Path::new(&filename).exists() {
let extra_infos_url = format!("{}{}", base_url, link);
println!("Downloading {}", extra_infos_url);
let mut res = client.get(extra_infos_url.parse().unwrap()).await.unwrap();
assert_eq!(res.status(), StatusCode::OK);
2024-02-07 18:36:40 -05:00
let mut file = std::fs::File::create(filename).unwrap();
while let Some(next) = res.frame().await {
let frame = next?;
if let Some(chunk) = frame.data_ref() {
file.write_all(&chunk)?;
}
}
new_files.insert(link.to_string());
}
}
}
Ok(new_files)
}