From b54500f04a4bccee5a50b3729943ff31d0eab6fa Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 7 Jan 2025 16:20:10 -0500 Subject: [PATCH 01/25] Process CSVs in order of first keys --- CHANGELOG.md | 2 + Cargo.toml | 2 +- README.md | 7 +-- src/consts.rs | 4 ++ src/inventory/item.rs | 10 +++ src/main.rs | 15 ++--- src/s3/mod.rs | 58 ++++++++++++++++- src/syncer.rs | 141 +++++++++++++++++++++++++----------------- 8 files changed, 164 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4ac243..bbf161b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ In Development - Add `--list-dates` option - The `` command-line argument is now optional and defaults to the current directory +- The `--inventory-jobs` and `--object-jobs` options have been eliminated in + favor of a new `--jobs` option v0.1.0-alpha.2 (2025-01-06) --------------------------- diff --git a/Cargo.toml b/Cargo.toml index 9d1836b..b324e2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ strum = { version = "0.26.3", features = ["derive"] } tempfile = "3.15.0" thiserror = "2.0.11" time = { version = "0.3.37", features = ["macros", "parsing"] } -tokio = { version = "1.43.0", features = ["macros", "rt-multi-thread", "signal"] } +tokio = { version = "1.43.0", features = ["macros", "rt-multi-thread", "signal", "sync"] } tokio-util = { version = "0.7.13", features = ["rt"] } tracing = "0.1.41" tracing-subscriber = { version = "0.3.19", features = ["local-time", "time"] } diff --git a/README.md b/README.md index bad085c..df606fd 100644 --- a/README.md +++ b/README.md @@ -110,8 +110,8 @@ Options inventory for the given date is used) or in the format `YYYY-MM-DDTHH-MMZ` (to specify a specific inventory). -- `-I `, `--inventory-jobs ` — Specify the maximum number of inventory - list files to download & process at once [default: 20] +- `-J `, `--jobs ` — Specify the maximum number of concurrent + download jobs [default: 20] - `--list-dates` — List available inventory manifest dates instead of backing anything up @@ -120,9 +120,6 @@ Options Possible values are "`ERROR`", "`WARN`", "`INFO`", "`DEBUG`", and "`TRACE`" (all case-insensitive). [default value: `DEBUG`] -- `-O `, `--object-jobs ` — Specify the maximum number of inventory - entries to download & process at once [default: 20] - - `--path-filter ` — Only download objects whose keys match the given [regular expression](https://docs.rs/regex/latest/regex/#syntax) diff --git a/src/consts.rs b/src/consts.rs index b66da2b..99dae40 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,3 +1,7 @@ /// The name of the file in which metadata (version ID and etag) are stored for /// the latest versions of objects in each directory pub(crate) static METADATA_FILENAME: &str = ".s3invsync.versions.json"; + +/// The number of initial bytes of an inventory csv.gz file to fetch when +/// peeking at just the first entry +pub(crate) const CSV_GZIP_PEEK_SIZE: usize = 1024; diff --git a/src/inventory/item.rs b/src/inventory/item.rs index c331ecc..7f7c2dc 100644 --- a/src/inventory/item.rs +++ b/src/inventory/item.rs @@ -9,6 +9,16 @@ pub(crate) enum InventoryEntry { Item(InventoryItem), } +impl InventoryEntry { + /// Returns the entry's key + pub(crate) fn key(&self) -> &str { + match self { + InventoryEntry::Directory(Directory { key, .. }) => key, + InventoryEntry::Item(InventoryItem { key, .. 
}) => key.as_ref(), + } + } +} + /// An entry in an inventory list file pointing to a directory object #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct Directory { diff --git a/src/main.rs b/src/main.rs index abfaa28..9c59eab 100644 --- a/src/main.rs +++ b/src/main.rs @@ -39,10 +39,9 @@ struct Arguments { #[arg(short, long)] date: Option, - /// Set the maximum number of inventory list files to download & process at - /// once - #[arg(short = 'I', long, default_value = "20")] - inventory_jobs: NonZeroUsize, + /// Set the maximum number of concurrent download jobs + #[arg(short = 'J', long, default_value = "20")] + jobs: NonZeroUsize, /// List available inventory manifest dates instead of backing anything up #[arg(long)] @@ -57,11 +56,6 @@ struct Arguments { )] log_level: Level, - /// Set the maximum number of inventory entries to download & process at - /// once - #[arg(short = 'O', long, default_value = "20")] - object_jobs: NonZeroUsize, - /// Only download objects whose keys match the given regular expression #[arg(long, value_name = "REGEX")] path_filter: Option, @@ -134,8 +128,7 @@ async fn run(args: Arguments) -> anyhow::Result<()> { outdir, manifest_date, start_time, - args.inventory_jobs, - args.object_jobs, + args.jobs, args.path_filter, args.compress_filter_msgs, ); diff --git a/src/s3/mod.rs b/src/s3/mod.rs index 5247c0d..455dcbd 100644 --- a/src/s3/mod.rs +++ b/src/s3/mod.rs @@ -3,7 +3,8 @@ mod location; mod streams; pub(crate) use self::location::S3Location; use self::streams::{ListManifestDates, ListObjectsError}; -use crate::inventory::{CsvReader, InventoryList}; +use crate::consts::CSV_GZIP_PEEK_SIZE; +use crate::inventory::{CsvReader, CsvReaderError, InventoryEntry, InventoryList}; use crate::manifest::{CsvManifest, FileSpec}; use crate::timestamps::{Date, DateHM, DateMaybeHM}; use aws_credential_types::{ @@ -240,6 +241,39 @@ impl S3Client { Ok(InventoryList::for_downloaded_csv(path, url, reader)) } + /// Fetch the first [`CSV_GZIP_PEEK_SIZE`] bytes of the CSV inventory list + /// file described by `fspec` and extract the first line. Returns `None` + /// if the file is empty. + #[tracing::instrument(skip_all, fields(key = fspec.key))] + pub(crate) async fn peek_inventory_csv( + &self, + fspec: &FileSpec, + ) -> Result, CsvPeekError> { + tracing::debug!("Peeking at first {CSV_GZIP_PEEK_SIZE} bytes of file"); + let url = self.inventory_base.with_key(&fspec.key); + let obj = self.get_object(&url).await?; + let mut bytestream = obj.body; + let mut header = std::collections::VecDeque::with_capacity(CSV_GZIP_PEEK_SIZE); + while let Some(blob) = + bytestream + .try_next() + .await + .map_err(|source| CsvPeekError::Download { + url: url.clone(), + source, + })? + { + header.extend(blob); + if header.len() >= CSV_GZIP_PEEK_SIZE { + break; + } + } + CsvReader::from_gzipped_reader(header, fspec.file_schema.clone()) + .next() + .transpose() + .map_err(|source| CsvPeekError::Decode { url, source }) + } + /// Download the object at `url` and write its bytes to `outfile`. If /// `md5_digest` is non-`None` (in which case it must be a 32-character /// lowercase hexadecimal string), it is used to validate the download. 
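The peek works because gzip decompresses incrementally: as long as the first inventory record ends before the `CSV_GZIP_PEEK_SIZE` cutoff, it can be decoded from the fetched prefix alone, without downloading the rest of the object. A minimal sketch of the idea using plain `flate2` and `std` types (the helper name is illustrative; the real code goes through `CsvReader::from_gzipped_reader()` so the line is parsed against the file's schema):

```rust
use flate2::read::GzDecoder;
use std::io::{BufRead, BufReader};

/// Extract the first line from a (possibly truncated) gzip prefix,
/// returning `None` if the prefix decompresses to nothing.
fn first_line_of_gzip_prefix(prefix: &[u8]) -> std::io::Result<Option<String>> {
    let mut reader = BufReader::new(GzDecoder::new(prefix));
    let mut line = String::new();
    // If the first record ends before the truncation point, read_line()
    // returns Ok before the decoder ever reaches the corrupt tail.
    if reader.read_line(&mut line)? == 0 {
        return Ok(None);
    }
    Ok(Some(line.trim_end().to_owned()))
}
```

A record that straddles the cutoff surfaces as a read error instead, which `peek_inventory_csv()` reports as `CsvPeekError::Decode`.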
@@ -442,6 +476,28 @@ pub(crate) enum CsvDownloadError {
     },
 }
 
+/// Error returned by [`S3Client::peek_inventory_csv()`]
+#[derive(Debug, Error)]
+pub(crate) enum CsvPeekError {
+    /// Failed to perform "Get Object" request
+    #[error(transparent)]
+    Get(#[from] GetError),
+
+    /// Error while receiving bytes for the object
+    #[error("failed downloading contents for {url}")]
+    Download {
+        url: S3Location,
+        source: ByteStreamError,
+    },
+
+    /// Failed to decode the first line from the peeked bytes
+    #[error("failed to decode first line from peeking at {url}")]
+    Decode {
+        url: S3Location,
+        source: CsvReaderError,
+    },
+}
+
 /// Error returned by [`S3Client::get_object()`] when a "Get Object" request
 /// fails
 #[derive(Debug, Error)]
diff --git a/src/syncer.rs b/src/syncer.rs
index 834913a..e76bbfc 100644
--- a/src/syncer.rs
+++ b/src/syncer.rs
@@ -1,6 +1,6 @@
 use crate::consts::METADATA_FILENAME;
 use crate::inventory::{InventoryEntry, InventoryItem, ItemDetails};
-use crate::manifest::CsvManifest;
+use crate::manifest::{CsvManifest, FileSpec};
 use crate::s3::S3Client;
 use crate::timestamps::DateHM;
 use crate::util::*;
@@ -38,11 +38,8 @@ pub(crate) struct Syncer {
     /// The time at which the overall backup procedure started
     start_time: std::time::Instant,
 
-    /// The number of concurrent downloads of CSV inventory lists
-    inventory_jobs: NonZeroUsize,
-
-    /// The number of concurrent downloads of S3 objects
-    object_jobs: NonZeroUsize,
+    /// The number of concurrent download jobs
+    jobs: NonZeroUsize,
 
     /// Only download objects whose keys match the given regex
     path_filter: Option<regex::Regex>,
@@ -76,8 +73,7 @@ impl Syncer {
         outdir: PathBuf,
         manifest_date: DateHM,
         start_time: std::time::Instant,
-        inventory_jobs: NonZeroUsize,
-        object_jobs: NonZeroUsize,
+        jobs: NonZeroUsize,
         path_filter: Option<regex::Regex>,
         compress_filter_msgs: Option<NonZeroUsize>,
     ) -> Arc<Syncer> {
@@ -87,8 +83,7 @@
             outdir,
             manifest_date,
             start_time,
-            inventory_jobs,
-            object_jobs,
+            jobs,
             path_filter,
             locks: lockable::LockPool::new(),
             token: CancellationToken::new(),
@@ -111,10 +106,11 @@ impl Syncer {
             }
         });
 
+        let fspecs = self.sort_csvs_by_first_line(manifest.files).await?;
+
         tracing::trace!(path = %self.outdir.display(), "Creating root output directory");
         fs_err::create_dir_all(&self.outdir).map_err(|e| MultiError(vec![e.into()]))?;
         let mut joinset = JoinSet::new();
-        let (fspec_sender, fspec_receiver) = async_channel::bounded(CHANNEL_SIZE);
         let obj_sender = {
             let guard = self
                 .obj_sender
@@ -126,43 +122,30 @@ impl Syncer {
                 .expect("obj_sender should not be None")
         };
 
-        for _ in 0..self.inventory_jobs.get() {
-            let clnt = self.client.clone();
-            let token = self.token.clone();
-            let recv = fspec_receiver.clone();
-            let sender = obj_sender.clone();
-            joinset.spawn(async move {
-                while let Ok(fspec) = recv.recv().await {
-                    let clnt = clnt.clone();
-                    let sender = sender.clone();
-                    let r = token
-                        .run_until_cancelled(async move {
-                            let entries = clnt.download_inventory_csv(fspec).await?;
-                            for entry in entries {
-                                match entry?
{ - InventoryEntry::Directory(d) => { - tracing::debug!(url = %d.url(), "Ignoring directory entry in inventory list"); - } - InventoryEntry::Item(item) => { - if sender.send(item).await.is_err() { - // Assume we're shutting down - return Ok(()); - } - } + let clnt = self.client.clone(); + let token = self.token.clone(); + let sender = obj_sender.clone(); + joinset.spawn(async move { + token.run_until_cancelled(async move { + for spec in fspecs { + let entries = clnt.download_inventory_csv(spec).await?; + for entry in entries { + match entry.context("error reading from inventory list file")? { + InventoryEntry::Directory(d) => { + tracing::debug!(url = %d.url(), "Ignoring directory entry in inventory list"); + } + InventoryEntry::Item(item) => { + if sender.send(item).await.is_err() { + // Assume we're shutting down + return Ok(()); } } - Ok(()) - }) - .await; - match r { - Some(Ok(())) => (), - Some(Err(e)) => return Err(e), - None => return Ok(()), + } } } Ok(()) - }); - } + }).await.unwrap_or(Ok(())) + }); drop(obj_sender); { let mut guard = self @@ -171,18 +154,8 @@ impl Syncer { .expect("obj_sender mutex should not be poisoned"); *guard = None; } - drop(fspec_receiver); - - joinset.spawn(async move { - for fspec in manifest.files { - if fspec_sender.send(fspec).await.is_err() { - return Ok(()); - } - } - Ok(()) - }); - for _ in 0..self.object_jobs.get() { + for _ in 0..self.jobs.get() { let this = self.clone(); let recv = self.obj_receiver.clone(); joinset.spawn(async move { @@ -196,6 +169,62 @@ impl Syncer { }); } + let r = self.await_joinset(joinset).await; + self.filterlog.finish(); + r + } + + /// Fetch the first line of each inventory list file in `specs` and sort + /// the list by the keys in those lines + async fn sort_csvs_by_first_line( + self: &Arc, + specs: Vec, + ) -> Result, MultiError> { + tracing::info!("Peeking at inventory lists in order to sort by first line ..."); + let mut joinset = JoinSet::new(); + let mut receiver = { + let specs = Arc::new(Mutex::new(specs)); + let (output_sender, output_receiver) = tokio::sync::mpsc::channel(CHANNEL_SIZE); + for _ in 0..self.jobs.get() { + let clnt = self.client.clone(); + let token = self.token.clone(); + let specs = specs.clone(); + let sender = output_sender.clone(); + joinset.spawn(async move { + token + .run_until_cancelled(async move { + while let Some(fspec) = { + let mut guard = + specs.lock().expect("specs mutex should not be poisoned"); + guard.pop() + } { + if let Some(entry) = clnt.peek_inventory_csv(&fspec).await? 
{ + if sender.send((fspec, entry)).await.is_err() { + // Assume we're shutting down + return Ok(()); + } + } + } + Ok(()) + }) + .await + .unwrap_or(Ok(())) + }); + } + output_receiver + }; + let mut firsts2fspecs = BTreeMap::new(); + while let Some((fspec, entry)) = receiver.recv().await { + firsts2fspecs.insert(entry.key().to_owned(), fspec); + } + self.await_joinset(joinset).await?; + Ok(firsts2fspecs.into_values().collect()) + } + + async fn await_joinset( + &self, + mut joinset: JoinSet>, + ) -> Result<(), MultiError> { let mut errors = Vec::new(); while let Some(r) = joinset.join_next().await { match r { @@ -212,8 +241,6 @@ impl Syncer { Err(_) => (), } } - self.filterlog.finish(); - if self.terminated.load(Ordering::Acquire) { errors.push(anyhow::anyhow!("Shut down due to Ctrl-C")); } @@ -224,7 +251,7 @@ impl Syncer { } } - fn shutdown(self: &Arc) { + fn shutdown(&self) { if !self.token.is_cancelled() { self.token.cancel(); self.obj_receiver.close(); From 7694a9d9604cb490d519d4d53f0b351236b6ea42 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 10:01:21 -0500 Subject: [PATCH 02/25] Move metadata code to submodule of syncer --- src/keypath.rs | 1 + src/syncer/metadata.rs | 151 +++++++++++++++++++++++++++++++ src/{syncer.rs => syncer/mod.rs} | 136 +--------------------------- 3 files changed, 155 insertions(+), 133 deletions(-) create mode 100644 src/syncer/metadata.rs rename src/{syncer.rs => syncer/mod.rs} (81%) diff --git a/src/keypath.rs b/src/keypath.rs index 4078114..7896c68 100644 --- a/src/keypath.rs +++ b/src/keypath.rs @@ -200,6 +200,7 @@ mod tests { #[case(".old.bar.baz", false)] #[case("foo.old..baz", false)] #[case("foo.old..", false)] + #[case(".s3invsync.versions.json", true)] fn test_is_special_component(#[case] s: &str, #[case] r: bool) { assert_eq!(is_special_component(s), r); } diff --git a/src/syncer/metadata.rs b/src/syncer/metadata.rs new file mode 100644 index 0000000..f58a0dc --- /dev/null +++ b/src/syncer/metadata.rs @@ -0,0 +1,151 @@ +use super::*; +use serde::{Deserialize, Serialize}; + +/// Metadata about the latest version of a key +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub(super) struct Metadata { + /// The object's version ID + pub(super) version_id: String, + + /// The object's etag + pub(super) etag: String, +} + +impl Metadata { + /// Return the filename used for backing up a non-latest object that has + /// `self` as its metadata and `basename` as the filename portion of its + /// key + pub(super) fn old_filename(&self, basename: &str) -> String { + format!("{}.old.{}.{}", basename, self.version_id, self.etag) + } +} + +/// Handle for manipulating the metadata for the latest version of a key in a +/// local JSON database +pub(super) struct FileMetadataManager<'a> { + syncer: &'a Syncer, + + /// The manager for the directory's database + inner: MetadataManager<'a>, + + /// The filename of the object + filename: &'a str, +} + +impl<'a> FileMetadataManager<'a> { + pub(super) fn new(syncer: &'a Syncer, parentdir: &'a Path, filename: &'a str) -> Self { + FileMetadataManager { + syncer, + inner: MetadataManager::new(parentdir), + filename, + } + } + + /// Acquire a lock on this JSON database + async fn lock(&self) -> Guard<'a> { + self.syncer.lock_path(self.database_path().to_owned()).await + } + + fn database_path(&self) -> &Path { + &self.inner.database_path + } + + /// Retrieve the metadata for the key from the database + pub(super) async fn get(&self) -> anyhow::Result { + tracing::trace!(file = 
self.filename, database = %self.database_path().display(), "Fetching object metadata for file from database");
+        let mut data = {
+            let _guard = self.lock().await;
+            self.inner.load()?
+        };
+        let Some(md) = data.remove(self.filename) else {
+            anyhow::bail!(
+                "No entry for {:?} in {}",
+                self.filename,
+                self.database_path().display()
+            );
+        };
+        Ok(md)
+    }
+
+    /// Set the metadata for the key in the database to `md`
+    pub(super) async fn set(&self, md: Metadata) -> anyhow::Result<()> {
+        tracing::trace!(file = self.filename, database = %self.database_path().display(), "Setting object metadata for file in database");
+        let _guard = self.lock().await;
+        let mut data = self.inner.load()?;
+        data.insert(self.filename.to_owned(), md);
+        self.inner.store(data)?;
+        Ok(())
+    }
+
+    /// Remove the metadata for the key from the database
+    pub(super) async fn delete(&self) -> anyhow::Result<()> {
+        tracing::trace!(file = self.filename, database = %self.database_path().display(), "Deleting object metadata for file from database");
+        let _guard = self.lock().await;
+        let mut data = self.inner.load()?;
+        if data.remove(self.filename).is_some() {
+            self.inner.store(data)?;
+        }
+        Ok(())
+    }
+}
+
+/// Handle for manipulating the metadata in a local JSON database
+pub(super) struct MetadataManager<'a> {
+    /// The local directory in which the downloaded object and the JSON
+    /// database are both located
+    dirpath: &'a Path,
+
+    /// The path to the JSON database
+    database_path: PathBuf,
+}
+
+impl<'a> MetadataManager<'a> {
+    pub(super) fn new(dirpath: &'a Path) -> MetadataManager<'a> {
+        MetadataManager {
+            dirpath,
+            database_path: dirpath.join(METADATA_FILENAME),
+        }
+    }
+
+    /// Read & parse the database file. If the file does not exist, return an
+    /// empty map.
+    fn load(&self) -> anyhow::Result<BTreeMap<String, Metadata>> {
+        let content = match fs_err::read_to_string(&self.database_path) {
+            Ok(content) => content,
+            Err(e) if e.kind() == ErrorKind::NotFound => String::from("{}"),
+            Err(e) => return Err(e.into()),
+        };
+        serde_json::from_str(&content).with_context(|| {
+            format!(
+                "failed to deserialize contents of {}",
+                self.database_path.display()
+            )
+        })
+    }
+
+    /// Set the content of the database file to the serialized map
+    fn store(&self, data: BTreeMap<String, Metadata>) -> anyhow::Result<()> {
+        let fp = tempfile::Builder::new()
+            .prefix(".s3invsync.versions.")
+            .tempfile_in(self.dirpath)
+            .with_context(|| {
+                format!(
+                    "failed to create temporary database file for updating {}",
+                    self.database_path.display()
+                )
+            })?;
+        serde_json::to_writer_pretty(fp.as_file(), &data).with_context(|| {
+            format!(
+                "failed to serialize metadata to {}",
+                self.database_path.display()
+            )
+        })?;
+        fp.persist(&self.database_path).with_context(|| {
+            format!(
+                "failed to persist temporary database file to {}",
+                self.database_path.display()
+            )
+        })?;
+        Ok(())
+    }
+}
diff --git a/src/syncer.rs b/src/syncer/mod.rs
similarity index 81%
rename from src/syncer.rs
rename to src/syncer/mod.rs
index e76bbfc..a122474 100644
--- a/src/syncer.rs
+++ b/src/syncer/mod.rs
@@ -1,3 +1,5 @@
+mod metadata;
+use self::metadata::*;
 use crate::consts::METADATA_FILENAME;
 use crate::inventory::{InventoryEntry, InventoryItem, ItemDetails};
 use crate::manifest::{CsvManifest, FileSpec};
@@ -5,7 +7,6 @@
 use crate::s3::S3Client;
 use crate::timestamps::DateHM;
 use crate::util::*;
 use anyhow::Context;
-use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 use std::io::ErrorKind;
 use std::num::NonZeroUsize;
@@ -290,7 +291,7 @@
self.outdir.clone() }; - let mdmanager = MetadataManager::new(self, &parentdir, filename); + let mdmanager = FileMetadataManager::new(self, &parentdir, filename); if item.is_latest { tracing::info!("Object is latest version of key"); @@ -473,137 +474,6 @@ impl Syncer { } } -/// Metadata about the latest version of a key -#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] -struct Metadata { - /// The object's version ID - version_id: String, - - /// The object's etag - etag: String, -} - -impl Metadata { - /// Return the filename used for backing up a non-latest object that has - /// `self` as its metadata and `basename` as the filename portion of its - /// key - fn old_filename(&self, basename: &str) -> String { - format!("{}.old.{}.{}", basename, self.version_id, self.etag) - } -} - -/// Handle for manipulating the metadata for the latest version of a key in a -/// local JSON database -struct MetadataManager<'a> { - syncer: &'a Syncer, - - /// The local directory in which the downloaded object and the JSON - /// database are both located - dirpath: &'a Path, - - /// The path to the JSON database - database_path: PathBuf, - - /// The filename of the object - filename: &'a str, -} - -impl<'a> MetadataManager<'a> { - fn new(syncer: &'a Syncer, parentdir: &'a Path, filename: &'a str) -> Self { - MetadataManager { - syncer, - dirpath: parentdir, - database_path: parentdir.join(METADATA_FILENAME), - filename, - } - } - - /// Acquire a lock on this JSON database - async fn lock(&self) -> Guard<'a> { - self.syncer.lock_path(self.database_path.clone()).await - } - - /// Read & parse the database file. If the file does not exist, return an - /// empty map. - fn load(&self) -> anyhow::Result> { - let content = match fs_err::read_to_string(&self.database_path) { - Ok(content) => content, - Err(e) if e.kind() == ErrorKind::NotFound => String::from("{}"), - Err(e) => return Err(e.into()), - }; - serde_json::from_str(&content).with_context(|| { - format!( - "failed to deserialize contents of {}", - self.database_path.display() - ) - }) - } - - /// Set the content of the database file to the serialized map - fn store(&self, data: BTreeMap) -> anyhow::Result<()> { - let fp = tempfile::Builder::new() - .prefix(".s3invsync.versions.") - .tempfile_in(self.dirpath) - .with_context(|| { - format!( - "failed to create temporary database file for updating {}", - self.database_path.display() - ) - })?; - serde_json::to_writer_pretty(fp.as_file(), &data).with_context(|| { - format!( - "failed to serialize metadata to {}", - self.database_path.display() - ) - })?; - fp.persist(&self.database_path).with_context(|| { - format!( - "failed to persist temporary database file to {}", - self.database_path.display() - ) - })?; - Ok(()) - } - - /// Retrieve the metadata for the key from the database - async fn get(&self) -> anyhow::Result { - tracing::trace!(file = self.filename, database = %self.database_path.display(), "Fetching object metadata for file from database"); - let mut data = { - let _guard = self.lock().await; - self.load()? 
- }; - let Some(md) = data.remove(self.filename) else { - anyhow::bail!( - "No entry for {:?} in {}", - self.filename, - self.database_path.display() - ); - }; - Ok(md) - } - - /// Set the metadata for the key in the database to `md` - async fn set(&self, md: Metadata) -> anyhow::Result<()> { - tracing::trace!(file = self.filename, database = %self.database_path.display(), "Setting object metadata for file in database"); - let _guard = self.lock().await; - let mut data = self.load()?; - data.insert(self.filename.to_owned(), md); - self.store(data)?; - Ok(()) - } - - /// Remove the metadata for the key from the database - async fn delete(&self) -> anyhow::Result<()> { - tracing::trace!(file = self.filename, database = %self.database_path.display(), "Deleting object metadata for file from database"); - let _guard = self.lock().await; - let mut data = self.load()?; - if data.remove(self.filename).is_some() { - self.store(data)?; - } - Ok(()) - } -} - /// An emitter of log messages about objects skipped due to `--path-filter` #[derive(Debug)] enum FilterLogger { From 0d572cbf03da0626493fc8f7f4330e42487572dc Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 10:02:39 -0500 Subject: [PATCH 03/25] First draft of TreeTracker --- Cargo.lock | 1 + Cargo.toml | 1 + src/syncer/metadata.rs | 4 +- src/syncer/mod.rs | 65 +++++- src/syncer/treetracker.rs | 441 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 508 insertions(+), 4 deletions(-) create mode 100644 src/syncer/treetracker.rs diff --git a/Cargo.lock b/Cargo.lock index 5adf174..17a1112 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1796,6 +1796,7 @@ dependencies = [ "aws-smithy-runtime-api", "clap", "csv", + "either", "flate2", "fs-err", "futures-util", diff --git a/Cargo.toml b/Cargo.toml index b324e2f..90f48b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ aws-smithy-async = "1.2.3" aws-smithy-runtime-api = "1.7.3" clap = { version = "4.5.26", default-features = false, features = ["derive", "error-context", "help", "std", "suggestions", "usage", "wrap_help"] } csv = "1.3.1" +either = "1.13.0" flate2 = "1.0.35" fs-err = { version = "3.0.0", features = ["tokio"] } futures-util = "0.3.31" diff --git a/src/syncer/metadata.rs b/src/syncer/metadata.rs index f58a0dc..a5d5b24 100644 --- a/src/syncer/metadata.rs +++ b/src/syncer/metadata.rs @@ -109,7 +109,7 @@ impl<'a> MetadataManager<'a> { /// Read & parse the database file. If the file does not exist, return an /// empty map. 
- fn load(&self) -> anyhow::Result> { + pub(super) fn load(&self) -> anyhow::Result> { let content = match fs_err::read_to_string(&self.database_path) { Ok(content) => content, Err(e) if e.kind() == ErrorKind::NotFound => String::from("{}"), @@ -124,7 +124,7 @@ impl<'a> MetadataManager<'a> { } /// Set the content of the database file to the serialized map - fn store(&self, data: BTreeMap) -> anyhow::Result<()> { + pub(super) fn store(&self, data: BTreeMap) -> anyhow::Result<()> { let fp = tempfile::Builder::new() .prefix(".s3invsync.versions.") .tempfile_in(self.dirpath) diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index a122474..3935928 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -1,7 +1,10 @@ mod metadata; +mod treetracker; use self::metadata::*; +use self::treetracker::*; use crate::consts::METADATA_FILENAME; use crate::inventory::{InventoryEntry, InventoryItem, ItemDetails}; +use crate::keypath::is_special_component; use crate::manifest::{CsvManifest, FileSpec}; use crate::s3::S3Client; use crate::timestamps::DateHM; @@ -123,19 +126,26 @@ impl Syncer { .expect("obj_sender should not be None") }; - let clnt = self.client.clone(); + let this = self.clone(); let token = self.token.clone(); let sender = obj_sender.clone(); joinset.spawn(async move { token.run_until_cancelled(async move { + let mut tracker = TreeTracker::new(); for spec in fspecs { - let entries = clnt.download_inventory_csv(spec).await?; + let entries = this.client.download_inventory_csv(spec).await?; for entry in entries { match entry.context("error reading from inventory list file")? { InventoryEntry::Directory(d) => { tracing::debug!(url = %d.url(), "Ignoring directory entry in inventory list"); } InventoryEntry::Item(item) => { + // TODO: Store a Cond for awaiting completion of item processing + for dir in tracker.add(&item.key, ())? { + // TODO: Spawn in JoinSet + let this2 = this.clone(); + tokio::task::spawn_blocking(move || this2.cleanup_dir(dir)); + } if sender.send(item).await.is_err() { // Assume we're shutting down return Ok(()); @@ -144,6 +154,11 @@ impl Syncer { } } } + for dir in tracker.finish() { + // TODO: Spawn in JoinSet + let this2 = this.clone(); + tokio::task::spawn_blocking(move || this2.cleanup_dir(dir)); + } Ok(()) }).await.unwrap_or(Ok(())) }); @@ -472,6 +487,52 @@ impl Syncer { "Process info", ); } + + fn cleanup_dir(&self, dir: Directory<()>) -> anyhow::Result<()> { + // TODO: Wait for directory's jobs to finish + let dirpath = match dir.path() { + Some(p) => self.outdir.join(p), + None => self.outdir.clone(), + }; + let mut dbdeletions = Vec::new(); + for entry in fs_err::read_dir(&dirpath)? { + let entry = entry?; + let epath = entry.path(); + let is_dir = entry.file_type()?.is_dir(); + let to_delete = match entry.file_name().to_str() { + Some(name) => { + if is_dir { + !dir.contains_dir(name) + } else { + if !is_special_component(name) { + dbdeletions.push(name.to_owned()); + } + !dir.contains_file(name) && name != METADATA_FILENAME + } + } + None => true, + }; + if to_delete { + // TODO: Log + if is_dir { + // TODO: Use async here? 
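+                    // Note: cleanup_dir() runs under tokio's spawn_blocking()
+                    // (see the callers above), so these synchronous fs_err
+                    // calls do not stall the async runtime.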
+ fs_err::remove_dir_all(&epath)?; + } else { + fs_err::remove_file(&epath)?; + } + } + } + if !dbdeletions.is_empty() { + // TODO: Log + let manager = MetadataManager::new(&dirpath); + let mut data = manager.load()?; + for name in dbdeletions { + data.remove(&name); + } + manager.store(data)?; + } + Ok(()) + } } /// An emitter of log messages about objects skipped due to `--path-filter` diff --git a/src/syncer/treetracker.rs b/src/syncer/treetracker.rs new file mode 100644 index 0000000..a4166a9 --- /dev/null +++ b/src/syncer/treetracker.rs @@ -0,0 +1,441 @@ +use crate::keypath::KeyPath; +use either::Either; +use std::cmp::Ordering; +use thiserror::Error; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) struct TreeTracker(Vec>); + +impl TreeTracker { + pub(super) fn new() -> Self { + TreeTracker(vec![PartialDirectory::new()]) + } + + pub(super) fn add( + &mut self, + key: &KeyPath, + //old_filename: Option, // TODO + value: T, + ) -> Result>, TreeTrackerError> { + let (dirpath, filename) = key.split(); + let mut popped_dirs = Vec::new(); + let mut parts = dirpath.unwrap_or_default().split('/').enumerate(); + for (i, dirname) in parts.by_ref() { + let cmp_dirname = CmpName::Dir(dirname); + let Some(pd) = self.0.get_mut(i) else { + unreachable!( + "TreeTracker::add() iteration should not go past the end of the stack" + ); + }; + if let Some(cd) = pd.current_subdir.as_ref() { + match cmp_dirname.cmp(&CmpName::Dir(cd)) { + Ordering::Equal => continue, + Ordering::Greater => { + // Close current dirs & push + for _ in (i + 1)..(self.0.len()) { + popped_dirs.push(self.pop()); + } + self.push_dir(dirname); + break; // GOTO push + } + Ordering::Less => { + return Err(TreeTrackerError::Unsorted { + before: self.last_key(), + after: key.into(), + }); + } + } + } else if let Some(en) = pd.entries.last() { + match cmp_dirname.cmp(&en.cmp_name()) { + Ordering::Equal => { + assert!(en.is_file(), "last element of PartialDirectory::entries should be a file when current_subdir is None"); + return Err(TreeTrackerError::Conflict(self.last_key())); + } + Ordering::Greater => { + self.push_dir(dirname); + break; // GOTO push + } + Ordering::Less => { + return Err(TreeTrackerError::Unsorted { + before: self.last_key(), + after: key.into(), + }); + } + } + } else { + assert!( + self.is_empty(), + "top dir of TreeTracker should be root when empty" + ); + self.push_dir(dirname); + break; // GOTO push + } + } + for (_, dirname) in parts { + self.push_dir(dirname); + } + if self.push_file(filename, value) { + Ok(popped_dirs) + } else { + Err(TreeTrackerError::Unsorted { + before: self.last_key(), + after: key.into(), + }) + } + } + + pub(super) fn finish(mut self) -> Vec> { + let mut dirs = Vec::new(); + while !self.0.is_empty() { + dirs.push(self.pop()); + } + dirs + } + + fn is_empty(&self) -> bool { + self.0.is_empty() || (self.0.len() == 1 && self.0[0].is_empty()) + } + + fn push_dir(&mut self, name: &str) { + let Some(pd) = self.0.last_mut() else { + panic!("TreeTracker::push_dir() called on void tracker"); + }; + assert!( + pd.current_subdir.is_none(), + "TreeTracker::push_dir() called when top dir has subdir" + ); + pd.current_subdir = Some(name.to_owned()); + self.0.push(PartialDirectory::new()); + } + + fn push_file(&mut self, name: &str, value: T) -> bool { + let Some(pd) = self.0.last_mut() else { + panic!("TreeTracker::push_file() called on void tracker"); + }; + assert!( + pd.current_subdir.is_none(), + "TreeTracker::push_file() called when top dir has subdir" + ); + if let Some(en) = 
pd.entries.last() { + if en.cmp_name() >= CmpName::File(name) { + return false; + } + } + pd.entries.push(Entry::file(name, value)); + true + } + + fn pop(&mut self) -> Directory { + let Some(pd) = self.0.pop() else { + panic!("TreeTracker::pop() called on void tracker"); + }; + assert!( + pd.current_subdir.is_none(), + "TreeTracker::pop() called when top dir has subdir" + ); + let entries = pd.entries; + let path = (!self.0.is_empty()).then(|| self.last_key()); + if let Some(ppd) = self.0.last_mut() { + ppd.close_current(); + } + Directory { path, entries } + } + + fn last_key(&self) -> String { + let mut s = String::new(); + for pd in &self.0 { + if let Some(name) = pd + .current_subdir + .as_deref() + .or_else(|| pd.entries.last().map(Entry::name)) + { + if !s.is_empty() { + s.push('/'); + } + s.push_str(name); + } else { + assert!( + self.is_empty(), + "TreeTracker dir should be empty root when empty" + ); + panic!("TreeTracker::last_key() called on empty tracker"); + } + } + s + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct PartialDirectory { + entries: Vec>, + current_subdir: Option, +} + +impl PartialDirectory { + fn new() -> Self { + PartialDirectory { + entries: Vec::new(), + current_subdir: None, + } + } + + fn is_empty(&self) -> bool { + self.entries.is_empty() && self.current_subdir.is_none() + } + + fn close_current(&mut self) { + let Some(name) = self.current_subdir.take() else { + panic!("PartialDirectory::close_current() called without a current directory"); + }; + self.entries.push(Entry::dir(name)); + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +enum Entry { + File { + name: String, + //old_filenames: Vec, // TODO + value: T, + }, + Dir { + name: String, + }, +} + +impl Entry { + fn file>(name: S, value: T) -> Entry { + Entry::File { + name: name.into(), + //old_filenames: Vec::new(), // TODO + value, + } + } + + fn dir>(name: S) -> Entry { + Entry::Dir { name: name.into() } + } + + fn name(&self) -> &str { + match self { + Entry::File { name, .. } => name, + Entry::Dir { name } => name, + } + } + + fn is_file(&self) -> bool { + matches!(self, Entry::File { .. }) + } + + fn is_dir(&self) -> bool { + matches!(self, Entry::Dir { .. }) + } + + fn cmp_name(&self) -> CmpName<'_> { + match self { + Entry::File { name, .. 
} => CmpName::File(name.as_ref()), + Entry::Dir { name } => CmpName::Dir(name.as_ref()), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) struct Directory { + path: Option, // `None` for the root + entries: Vec>, // TODO: Flatten out the old_filenames +} + +impl Directory { + pub(super) fn path(&self) -> Option<&str> { + self.path.as_deref() + } + + fn find(&self, name: &str) -> Option<&Entry> { + self.entries + .binary_search_by(|en| en.name().cmp(name)) + .ok() + .map(|i| &self.entries[i]) + } + + pub(super) fn contains_file(&self, name: &str) -> bool { + self.find(name).is_some_and(Entry::is_file) + } + + pub(super) fn contains_dir(&self, name: &str) -> bool { + self.find(name).is_some_and(Entry::is_dir) + } + + #[allow(dead_code)] + pub(super) fn map U>(self, mut f: F) -> Directory { + Directory { + path: self.path, + entries: self + .entries + .into_iter() + .map(|en| match en { + Entry::File { name, value } => Entry::File { + name, + value: f(value), + }, + Entry::Dir { name } => Entry::Dir { name }, + }) + .collect(), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum CmpName<'a> { + File(&'a str), + Dir(&'a str), +} + +impl CmpName<'_> { + fn chars(&self) -> impl Iterator + '_ { + match self { + CmpName::File(s) => Either::Left(s.chars()), + CmpName::Dir(s) => Either::Right(s.chars().chain(std::iter::once('/'))), + } + } +} + +impl PartialOrd for CmpName<'_> { + fn partial_cmp(&self, other: &CmpName<'_>) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for CmpName<'_> { + fn cmp(&self, other: &CmpName<'_>) -> Ordering { + self.chars().cmp(other.chars()) + } +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub(super) enum TreeTrackerError { + #[error("received keys in unsorted order: {before:?} came before {after:?}")] + Unsorted { before: String, after: String }, + #[error("path {0:?} is used as both a file and a directory")] + Conflict(String), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn same_dir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add(&"foo/quux.txt".parse::().unwrap(), 2), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!(dirs[0].path(), Some("foo")); + assert_eq!( + dirs[0].entries, + vec![Entry::file("bar.txt", 1), Entry::file("quux.txt", 2)] + ); + assert_eq!(dirs[1].path(), None); + assert_eq!(dirs[1].entries, vec![Entry::dir("foo")]); + } + + #[test] + fn different_dir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let dirs = tracker + .add(&"glarch/quux.txt".parse::().unwrap(), 2) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("foo")); + assert_eq!(dirs[0].entries, vec![Entry::file("bar.txt", 1)]); + } + + #[test] + fn different_subdir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar/apple.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let dirs = tracker + .add(&"foo/quux/banana.txt".parse::().unwrap(), 2) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("foo/bar")); + assert_eq!(dirs[0].entries, vec![Entry::file("apple.txt", 1)]); + } + + #[test] + fn preslash_dir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"foo/apple!banana/gnusto.txt".parse::().unwrap(), + 1, + ), + Ok(Vec::new()) + ); + let dirs = tracker + 
.add(&"foo/apple/cleesh.txt".parse::().unwrap(), 2) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("foo/apple!banana")); + assert_eq!(dirs[0].entries, vec![Entry::file("gnusto.txt", 1)]); + } + + #[test] + fn preslash_file() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 1,), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/bar/apple".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!( + e, + TreeTrackerError::Unsorted { + before: "foo/bar/apple!banana.txt".into(), + after: "foo/bar/apple".into() + } + ); + } + + #[test] + fn preslash_file_rev() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar/apple".parse::().unwrap(), 1,), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 2), + Ok(Vec::new()) + ); + } +} + +// TESTS TO ADD: +// - "pre-slash" directory followed by file cut off at pre-slash → error +// - "pre-slash" file followed by directory with `/` at pre-slash location → +// success +// - path is both a file and a directory → error +// - second path closes multiple directories +// - close multiple directories down to root +// - finish() when multiple directories are open +// - finish() without calling add() +// - close a subdirectory, continue on with parent dir +// - close a directory in the root, continue on +// - mix of files & directories in a directory +// - working with *.old.*.* filenames, especially ones out of order From 7bdb8e6fb18b7bc411a7fb3269fb682717b3a3da Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 12:05:21 -0500 Subject: [PATCH 04/25] Work on CmpName --- src/syncer/treetracker.rs | 57 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/src/syncer/treetracker.rs b/src/syncer/treetracker.rs index a4166a9..7abde39 100644 --- a/src/syncer/treetracker.rs +++ b/src/syncer/treetracker.rs @@ -284,13 +284,25 @@ impl Directory { } } -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +/// A wrapper around an individual path name component that compares it to +/// other components as though they were part of longer paths, i.e., directory +/// names have an implicit trailing '/' added. As an exception, if a file name +/// and a directory name are equal aside from the trailing '/', this type +/// compares them as equal. 
+#[derive(Clone, Copy, Debug)] enum CmpName<'a> { File(&'a str), Dir(&'a str), } impl CmpName<'_> { + fn name(&self) -> &str { + match self { + CmpName::File(s) => s, + CmpName::Dir(s) => s, + } + } + fn chars(&self) -> impl Iterator + '_ { match self { CmpName::File(s) => Either::Left(s.chars()), @@ -299,6 +311,14 @@ impl CmpName<'_> { } } +impl PartialEq for CmpName<'_> { + fn eq(&self, other: &CmpName<'_>) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for CmpName<'_> {} + impl PartialOrd for CmpName<'_> { fn partial_cmp(&self, other: &CmpName<'_>) -> Option { Some(self.cmp(other)) @@ -307,7 +327,11 @@ impl PartialOrd for CmpName<'_> { impl Ord for CmpName<'_> { fn cmp(&self, other: &CmpName<'_>) -> Ordering { - self.chars().cmp(other.chars()) + if self.name() == other.name() { + Ordering::Equal + } else { + self.chars().cmp(other.chars()) + } } } @@ -424,6 +448,35 @@ mod tests { Ok(Vec::new()) ); } + + mod cmp_name { + use super::*; + + #[test] + fn dir_eq_file() { + assert!(CmpName::File("foo") == CmpName::Dir("foo")); + } + + #[test] + fn pre_slash_dir_before_dir() { + assert!(CmpName::Dir("apple!banana") < CmpName::Dir("apple")); + } + + #[test] + fn pre_slash_file_before_dir() { + assert!(CmpName::File("apple!banana") < CmpName::Dir("apple")); + } + + #[test] + fn pre_slash_dir_after_file() { + assert!(CmpName::Dir("apple!banana") > CmpName::File("apple")); + } + + #[test] + fn pre_slash_file_after_file() { + assert!(CmpName::File("apple!banana") > CmpName::File("apple")); + } + } } // TESTS TO ADD: From 774d947a12fa6a70f82396d8e114d0028f798978 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 13:03:55 -0500 Subject: [PATCH 05/25] Add more tests, including a failing one --- src/syncer/treetracker.rs | 182 +++++++++++++++++++++++++++++++++++--- 1 file changed, 169 insertions(+), 13 deletions(-) diff --git a/src/syncer/treetracker.rs b/src/syncer/treetracker.rs index 7abde39..0ac5eba 100644 --- a/src/syncer/treetracker.rs +++ b/src/syncer/treetracker.rs @@ -382,6 +382,15 @@ mod tests { assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("foo")); assert_eq!(dirs[0].entries, vec![Entry::file("bar.txt", 1)]); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!(dirs[0].path(), Some("glarch")); + assert_eq!(dirs[0].entries, vec![Entry::file("quux.txt", 2)]); + assert_eq!(dirs[1].path(), None); + assert_eq!( + dirs[1].entries, + vec![Entry::dir("foo"), Entry::dir("glarch")] + ); } #[test] @@ -397,10 +406,18 @@ mod tests { assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("foo/bar")); assert_eq!(dirs[0].entries, vec![Entry::file("apple.txt", 1)]); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 3); + assert_eq!(dirs[0].path(), Some("foo/quux")); + assert_eq!(dirs[0].entries, vec![Entry::file("banana.txt", 2)]); + assert_eq!(dirs[1].path(), Some("foo")); + assert_eq!(dirs[1].entries, vec![Entry::dir("bar"), Entry::dir("quux")]); + assert_eq!(dirs[2].path(), None); + assert_eq!(dirs[2].entries, vec![Entry::dir("foo")]); } #[test] - fn preslash_dir() { + fn preslash_dir_then_toslash_dir() { let mut tracker = TreeTracker::new(); assert_eq!( tracker.add( @@ -415,13 +432,24 @@ mod tests { assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("foo/apple!banana")); assert_eq!(dirs[0].entries, vec![Entry::file("gnusto.txt", 1)]); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 3); + assert_eq!(dirs[0].path(), Some("foo/apple")); + assert_eq!(dirs[0].entries, vec![Entry::file("cleesh.txt", 2)]); + 
assert_eq!(dirs[1].path(), Some("foo")); + assert_eq!( + dirs[1].entries, + vec![Entry::dir("apple!banana"), Entry::dir("apple")] + ); + assert_eq!(dirs[2].path(), None); + assert_eq!(dirs[2].entries, vec![Entry::dir("foo")]); } #[test] - fn preslash_file() { + fn preslash_file_then_toslash_file() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 1,), + tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 1), Ok(Vec::new()) ); let e = tracker @@ -437,16 +465,151 @@ mod tests { } #[test] - fn preslash_file_rev() { + fn tostash_file_then_preslash_file() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/apple".parse::().unwrap(), 1,), + tracker.add(&"foo/bar/apple".parse::().unwrap(), 1), Ok(Vec::new()) ); assert_eq!( tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 2), Ok(Vec::new()) ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 3); + assert_eq!(dirs[0].path(), Some("foo/bar")); + assert_eq!( + dirs[0].entries, + vec![Entry::file("apple", 1), Entry::file("apple!banana.txt", 2)] + ); + assert_eq!(dirs[1].path(), Some("foo")); + assert_eq!(dirs[1].entries, vec![Entry::dir("bar")]); + assert_eq!(dirs[2].path(), None); + assert_eq!(dirs[2].entries, vec![Entry::dir("foo")]); + } + + #[test] + fn preslash_dir_then_toslash_file() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"foo/apple!banana/gnusto.txt".parse::().unwrap(), + 1, + ), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/apple".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!( + e, + TreeTrackerError::Unsorted { + before: "foo/apple!banana/gnusto.txt".into(), + after: "foo/apple".into() + } + ); + } + + #[test] + fn preslash_file_then_toslash_dir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add(&"foo/bar/apple/apricot.txt".parse::().unwrap(), 2), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 4); + assert_eq!(dirs[0].path(), Some("foo/bar/apple")); + assert_eq!(dirs[0].entries, vec![Entry::file("apricot.txt", 2)]); + assert_eq!(dirs[1].path(), Some("foo/bar")); + assert_eq!( + dirs[1].entries, + vec![Entry::file("apple!banana.txt", 1), Entry::dir("apple")] + ); + assert_eq!(dirs[2].path(), Some("foo")); + assert_eq!(dirs[2].entries, vec![Entry::dir("bar")]); + assert_eq!(dirs[3].path(), None); + assert_eq!(dirs[3].entries, vec![Entry::dir("foo")]); + } + + #[test] + fn path_conflict_file_then_dir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/bar/apple.txt".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!(e, TreeTrackerError::Conflict("foo/bar".into())); + } + + #[test] + fn just_finish() { + let tracker = TreeTracker::<()>::new(); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), None); + assert!(dirs[0].entries.is_empty()); + } + + #[test] + fn multidir_finish() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"apple/banana/coconut/date.txt".parse::().unwrap(), + 1 + ), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 4); + assert_eq!(dirs[0].path(), Some("apple/banana/coconut")); + assert_eq!(dirs[0].entries, vec![Entry::file("date.txt", 1)]); + assert_eq!(dirs[1].path(), Some("apple/banana")); + 
assert_eq!(dirs[1].entries, vec![Entry::dir("coconut")]); + assert_eq!(dirs[2].path(), Some("apple")); + assert_eq!(dirs[2].entries, vec![Entry::dir("banana")]); + assert_eq!(dirs[3].path(), None); + assert_eq!(dirs[3].entries, vec![Entry::dir("apple")]); + } + + #[test] + fn closedir_then_files_in_parent() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"apple/banana/coconut.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let dirs = tracker + .add(&"apple/kumquat.txt".parse::().unwrap(), 2) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("apple/banana")); + assert_eq!(dirs[0].entries, vec![Entry::file("coconut.txt", 1)]); + assert_eq!( + tracker.add(&"apple/mango.txt".parse::().unwrap(), 3), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!(dirs[0].path(), Some("apple")); + assert_eq!( + dirs[0].entries, + vec![ + Entry::dir("banana"), + Entry::file("kumquat.txt", 2), + Entry::file("mango.txt", 3), + ] + ); + assert_eq!(dirs[1].path(), None); + assert_eq!(dirs[1].entries, vec![Entry::dir("apple")]); } mod cmp_name { @@ -480,15 +643,8 @@ mod tests { } // TESTS TO ADD: -// - "pre-slash" directory followed by file cut off at pre-slash → error -// - "pre-slash" file followed by directory with `/` at pre-slash location → -// success -// - path is both a file and a directory → error // - second path closes multiple directories // - close multiple directories down to root -// - finish() when multiple directories are open -// - finish() without calling add() -// - close a subdirectory, continue on with parent dir +// - close a subdirectory, then start a new directory in its parent // - close a directory in the root, continue on // - mix of files & directories in a directory -// - working with *.old.*.* filenames, especially ones out of order From e228aa193f9ac3c93949b8e577bdb489f321ad1a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 14:09:18 -0500 Subject: [PATCH 06/25] Fix failing test --- src/syncer/mod.rs | 2 + src/syncer/treetracker.rs | 227 ++++++++++++++++++++++++++++---------- 2 files changed, 173 insertions(+), 56 deletions(-) diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index 3935928..4bb0ca4 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -513,6 +513,8 @@ impl Syncer { None => true, }; if to_delete { + // TODO: Store the deletion in a Vec and delete them *after* + // finishing iterating over the directory // TODO: Log if is_dir { // TODO: Use async here? diff --git a/src/syncer/treetracker.rs b/src/syncer/treetracker.rs index 0ac5eba..e5b4ac0 100644 --- a/src/syncer/treetracker.rs +++ b/src/syncer/treetracker.rs @@ -17,71 +17,97 @@ impl TreeTracker { //old_filename: Option, // TODO value: T, ) -> Result>, TreeTrackerError> { - let (dirpath, filename) = key.split(); + fn after_error(key: &KeyPath, mut e: TreeTrackerError) -> TreeTrackerError { + if let TreeTrackerError::Unsorted { ref mut after, .. 
} = e { + *after = key.into(); + } + e + } let mut popped_dirs = Vec::new(); - let mut parts = dirpath.unwrap_or_default().split('/').enumerate(); - for (i, dirname) in parts.by_ref() { - let cmp_dirname = CmpName::Dir(dirname); + let mut partiter = KeyComponents::new(key, value); + while let Some((i, part)) = partiter.next() { let Some(pd) = self.0.get_mut(i) else { unreachable!( "TreeTracker::add() iteration should not go past the end of the stack" ); }; - if let Some(cd) = pd.current_subdir.as_ref() { - match cmp_dirname.cmp(&CmpName::Dir(cd)) { - Ordering::Equal => continue, - Ordering::Greater => { - // Close current dirs & push - for _ in (i + 1)..(self.0.len()) { - popped_dirs.push(self.pop()); + let cmp_name = part.cmp_name(); + match part { + Component::File(name, value) => { + match (pd.last_entry_is_dir(), pd.cmp_vs_last_entry(cmp_name)) { + (in_dir, Some(Ordering::Greater)) => { + if in_dir { + // Close current dirs + for _ in (i + 1)..(self.0.len()) { + popped_dirs.push(self.pop()); + } + } + self.push_file(name, value) + .map_err(|e| after_error(key, e))?; + break; + } + (true, Some(Ordering::Equal)) => { + return Err(TreeTrackerError::Conflict(self.last_key())); + } + (false, Some(Ordering::Equal)) => { + // XXX: Change this when support for old filenames is + // added: + return Err(TreeTrackerError::DuplicateFile(key.into())); + } + (_, Some(Ordering::Less)) => { + return Err(TreeTrackerError::Unsorted { + before: self.last_key(), + after: key.into(), + }); + } + (_, None) => { + assert!( + self.is_empty(), + "top dir of TreeTracker should be root when empty" + ); + self.push_file(name, value) + .map_err(|e| after_error(key, e))?; + break; } - self.push_dir(dirname); - break; // GOTO push - } - Ordering::Less => { - return Err(TreeTrackerError::Unsorted { - before: self.last_key(), - after: key.into(), - }); } } - } else if let Some(en) = pd.entries.last() { - match cmp_dirname.cmp(&en.cmp_name()) { - Ordering::Equal => { - assert!(en.is_file(), "last element of PartialDirectory::entries should be a file when current_subdir is None"); - return Err(TreeTrackerError::Conflict(self.last_key())); - } - Ordering::Greater => { - self.push_dir(dirname); - break; // GOTO push - } - Ordering::Less => { - return Err(TreeTrackerError::Unsorted { - before: self.last_key(), - after: key.into(), - }); + Component::Dir(name) => { + match (pd.last_entry_is_dir(), pd.cmp_vs_last_entry(cmp_name)) { + (in_dir, Some(Ordering::Greater)) => { + if in_dir { + // Close current dirs + for _ in (i + 1)..(self.0.len()) { + popped_dirs.push(self.pop()); + } + } + self.push_parts(name, partiter) + .map_err(|e| after_error(key, e))?; + break; + } + (true, Some(Ordering::Equal)) => continue, + (false, Some(Ordering::Equal)) => { + return Err(TreeTrackerError::Conflict(self.last_key())); + } + (_, Some(Ordering::Less)) => { + return Err(TreeTrackerError::Unsorted { + before: self.last_key(), + after: key.into(), + }); + } + (_, None) => { + assert!( + self.is_empty(), + "top dir of TreeTracker should be root when empty" + ); + self.push_parts(name, partiter) + .map_err(|e| after_error(key, e))?; + break; + } } } - } else { - assert!( - self.is_empty(), - "top dir of TreeTracker should be root when empty" - ); - self.push_dir(dirname); - break; // GOTO push } } - for (_, dirname) in parts { - self.push_dir(dirname); - } - if self.push_file(filename, value) { - Ok(popped_dirs) - } else { - Err(TreeTrackerError::Unsorted { - before: self.last_key(), - after: key.into(), - }) - } + Ok(popped_dirs) } 
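+    /// Close every directory that is still open, from the deepest one up to
+    /// the root, and return them in the order in which they were closed.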
pub(super) fn finish(mut self) -> Vec> { @@ -96,6 +122,21 @@ impl TreeTracker { self.0.is_empty() || (self.0.len() == 1 && self.0[0].is_empty()) } + fn push_parts( + &mut self, + first_dirname: &str, + rest: KeyComponents<'_, T>, + ) -> Result<(), TreeTrackerError> { + self.push_dir(first_dirname); + for (_, part) in rest { + match part { + Component::Dir(name) => self.push_dir(name), + Component::File(name, value) => self.push_file(name, value)?, + } + } + Ok(()) + } + fn push_dir(&mut self, name: &str) { let Some(pd) = self.0.last_mut() else { panic!("TreeTracker::push_dir() called on void tracker"); @@ -108,7 +149,7 @@ impl TreeTracker { self.0.push(PartialDirectory::new()); } - fn push_file(&mut self, name: &str, value: T) -> bool { + fn push_file(&mut self, name: &str, value: T) -> Result<(), TreeTrackerError> { let Some(pd) = self.0.last_mut() else { panic!("TreeTracker::push_file() called on void tracker"); }; @@ -117,12 +158,21 @@ impl TreeTracker { "TreeTracker::push_file() called when top dir has subdir" ); if let Some(en) = pd.entries.last() { - if en.cmp_name() >= CmpName::File(name) { - return false; + match CmpName::File(name).cmp(&en.cmp_name()) { + Ordering::Equal => return Err(TreeTrackerError::DuplicateFile(self.last_key())), + // IMPORTANT: The `after` needs to be replaced with the full path in the + // calling context: + Ordering::Less => { + return Err(TreeTrackerError::Unsorted { + before: self.last_key(), + after: name.into(), + }) + } + Ordering::Greater => (), } } pd.entries.push(Entry::file(name, value)); - true + Ok(()) } fn pop(&mut self) -> Directory { @@ -189,6 +239,17 @@ impl PartialDirectory { }; self.entries.push(Entry::dir(name)); } + + fn last_entry_is_dir(&self) -> bool { + self.current_subdir.is_some() + } + + fn cmp_vs_last_entry(&self, cname: CmpName<'_>) -> Option { + self.current_subdir + .as_deref() + .map(|cd| cname.cmp(&CmpName::Dir(cd))) + .or_else(|| self.entries.last().map(|en| cname.cmp(&en.cmp_name()))) + } } #[derive(Clone, Debug, Eq, PartialEq)] @@ -341,6 +402,58 @@ pub(super) enum TreeTrackerError { Unsorted { before: String, after: String }, #[error("path {0:?} is used as both a file and a directory")] Conflict(String), + #[error("file key {0:?} encountered more than once")] + DuplicateFile(String), +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct KeyComponents<'a, T> { + i: usize, + path: &'a str, + value: Option, +} + +impl<'a, T> KeyComponents<'a, T> { + fn new(key: &'a KeyPath, value: T) -> Self { + KeyComponents { + i: 0, + path: key.as_ref(), + value: Some(value), + } + } +} + +impl<'a, T> Iterator for KeyComponents<'a, T> { + type Item = (usize, Component<'a, T>); + + fn next(&mut self) -> Option { + let c = match self.path.find('/') { + Some(i) => { + let name = &self.path[..i]; + self.path = &self.path[(i + 1)..]; + Component::Dir(name) + } + None => Component::File(self.path, self.value.take()?), + }; + let i = self.i; + self.i += 1; + Some((i, c)) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Component<'a, T> { + Dir(&'a str), + File(&'a str, T), +} + +impl<'a, T> Component<'a, T> { + fn cmp_name(&self) -> CmpName<'a> { + match self { + Component::Dir(name) => CmpName::Dir(name), + Component::File(name, _) => CmpName::File(name), + } + } } #[cfg(test)] @@ -648,3 +761,5 @@ mod tests { // - close a subdirectory, then start a new directory in its parent // - close a directory in the root, continue on // - mix of files & directories in a directory +// - file in root dir (with & without preceding entries) +// - 
KeyComponents From 9f25b4c671f9cdb92ac2a31c040f340bb8f4260b Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 14:13:23 -0500 Subject: [PATCH 07/25] Move TreeTracker utilities to their own file --- src/syncer/treetracker/inner.rs | 223 ++++++++++++++++++ .../{treetracker.rs => treetracker/mod.rs} | 218 +---------------- 2 files changed, 225 insertions(+), 216 deletions(-) create mode 100644 src/syncer/treetracker/inner.rs rename src/syncer/{treetracker.rs => treetracker/mod.rs} (78%) diff --git a/src/syncer/treetracker/inner.rs b/src/syncer/treetracker/inner.rs new file mode 100644 index 0000000..f50289e --- /dev/null +++ b/src/syncer/treetracker/inner.rs @@ -0,0 +1,223 @@ +use crate::keypath::KeyPath; +use either::Either; +use std::cmp::Ordering; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) struct PartialDirectory { + pub(super) entries: Vec>, + pub(super) current_subdir: Option, +} + +impl PartialDirectory { + pub(super) fn new() -> Self { + PartialDirectory { + entries: Vec::new(), + current_subdir: None, + } + } + + pub(super) fn is_empty(&self) -> bool { + self.entries.is_empty() && self.current_subdir.is_none() + } + + pub(super) fn close_current(&mut self) { + let Some(name) = self.current_subdir.take() else { + panic!("PartialDirectory::close_current() called without a current directory"); + }; + self.entries.push(Entry::dir(name)); + } + + pub(super) fn last_entry_is_dir(&self) -> bool { + self.current_subdir.is_some() + } + + pub(super) fn cmp_vs_last_entry(&self, cname: CmpName<'_>) -> Option { + self.current_subdir + .as_deref() + .map(|cd| cname.cmp(&CmpName::Dir(cd))) + .or_else(|| self.entries.last().map(|en| cname.cmp(&en.cmp_name()))) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) enum Entry { + File { + name: String, + //old_filenames: Vec, // TODO + value: T, + }, + Dir { + name: String, + }, +} + +impl Entry { + pub(super) fn file>(name: S, value: T) -> Entry { + Entry::File { + name: name.into(), + //old_filenames: Vec::new(), // TODO + value, + } + } + + pub(super) fn dir>(name: S) -> Entry { + Entry::Dir { name: name.into() } + } + + pub(super) fn name(&self) -> &str { + match self { + Entry::File { name, .. } => name, + Entry::Dir { name } => name, + } + } + + pub(super) fn is_file(&self) -> bool { + matches!(self, Entry::File { .. }) + } + + pub(super) fn is_dir(&self) -> bool { + matches!(self, Entry::Dir { .. }) + } + + pub(super) fn cmp_name(&self) -> CmpName<'_> { + match self { + Entry::File { name, .. } => CmpName::File(name.as_ref()), + Entry::Dir { name } => CmpName::Dir(name.as_ref()), + } + } +} + +/// A wrapper around an individual path name component that compares it to +/// other components as though they were part of longer paths, i.e., directory +/// names have an implicit trailing '/' added. As an exception, if a file name +/// and a directory name are equal aside from the trailing '/', this type +/// compares them as equal. 
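+///
+/// The equal-names exception is what makes `File("x")` and `Dir("x")` compare
+/// equal, which is how `TreeTracker::add()` detects a key that is used as
+/// both a file and a directory and reports it as a `Conflict`.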
+#[derive(Clone, Copy, Debug)] +pub(super) enum CmpName<'a> { + File(&'a str), + Dir(&'a str), +} + +impl CmpName<'_> { + pub(super) fn name(&self) -> &str { + match self { + CmpName::File(s) => s, + CmpName::Dir(s) => s, + } + } + + pub(super) fn chars(&self) -> impl Iterator + '_ { + match self { + CmpName::File(s) => Either::Left(s.chars()), + CmpName::Dir(s) => Either::Right(s.chars().chain(std::iter::once('/'))), + } + } +} + +impl PartialEq for CmpName<'_> { + fn eq(&self, other: &CmpName<'_>) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for CmpName<'_> {} + +impl PartialOrd for CmpName<'_> { + fn partial_cmp(&self, other: &CmpName<'_>) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for CmpName<'_> { + fn cmp(&self, other: &CmpName<'_>) -> Ordering { + if self.name() == other.name() { + Ordering::Equal + } else { + self.chars().cmp(other.chars()) + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) struct KeyComponents<'a, T> { + i: usize, + path: &'a str, + value: Option, +} + +impl<'a, T> KeyComponents<'a, T> { + pub(super) fn new(key: &'a KeyPath, value: T) -> Self { + KeyComponents { + i: 0, + path: key.as_ref(), + value: Some(value), + } + } +} + +impl<'a, T> Iterator for KeyComponents<'a, T> { + type Item = (usize, Component<'a, T>); + + fn next(&mut self) -> Option { + let c = match self.path.find('/') { + Some(i) => { + let name = &self.path[..i]; + self.path = &self.path[(i + 1)..]; + Component::Dir(name) + } + None => Component::File(self.path, self.value.take()?), + }; + let i = self.i; + self.i += 1; + Some((i, c)) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(super) enum Component<'a, T> { + Dir(&'a str), + File(&'a str, T), +} + +impl<'a, T> Component<'a, T> { + pub(super) fn cmp_name(&self) -> CmpName<'a> { + match self { + Component::Dir(name) => CmpName::Dir(name), + Component::File(name, _) => CmpName::File(name), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + mod cmp_name { + use super::*; + + #[test] + fn dir_eq_file() { + assert!(CmpName::File("foo") == CmpName::Dir("foo")); + } + + #[test] + fn pre_slash_dir_before_dir() { + assert!(CmpName::Dir("apple!banana") < CmpName::Dir("apple")); + } + + #[test] + fn pre_slash_file_before_dir() { + assert!(CmpName::File("apple!banana") < CmpName::Dir("apple")); + } + + #[test] + fn pre_slash_dir_after_file() { + assert!(CmpName::Dir("apple!banana") > CmpName::File("apple")); + } + + #[test] + fn pre_slash_file_after_file() { + assert!(CmpName::File("apple!banana") > CmpName::File("apple")); + } + } +} diff --git a/src/syncer/treetracker.rs b/src/syncer/treetracker/mod.rs similarity index 78% rename from src/syncer/treetracker.rs rename to src/syncer/treetracker/mod.rs index e5b4ac0..a62db3f 100644 --- a/src/syncer/treetracker.rs +++ b/src/syncer/treetracker/mod.rs @@ -1,5 +1,6 @@ +mod inner; +use self::inner::*; use crate::keypath::KeyPath; -use either::Either; use std::cmp::Ordering; use thiserror::Error; @@ -215,91 +216,6 @@ impl TreeTracker { } } -#[derive(Clone, Debug, Eq, PartialEq)] -struct PartialDirectory { - entries: Vec>, - current_subdir: Option, -} - -impl PartialDirectory { - fn new() -> Self { - PartialDirectory { - entries: Vec::new(), - current_subdir: None, - } - } - - fn is_empty(&self) -> bool { - self.entries.is_empty() && self.current_subdir.is_none() - } - - fn close_current(&mut self) { - let Some(name) = self.current_subdir.take() else { - panic!("PartialDirectory::close_current() called without a current directory"); - }; - 
self.entries.push(Entry::dir(name)); - } - - fn last_entry_is_dir(&self) -> bool { - self.current_subdir.is_some() - } - - fn cmp_vs_last_entry(&self, cname: CmpName<'_>) -> Option { - self.current_subdir - .as_deref() - .map(|cd| cname.cmp(&CmpName::Dir(cd))) - .or_else(|| self.entries.last().map(|en| cname.cmp(&en.cmp_name()))) - } -} - -#[derive(Clone, Debug, Eq, PartialEq)] -enum Entry { - File { - name: String, - //old_filenames: Vec, // TODO - value: T, - }, - Dir { - name: String, - }, -} - -impl Entry { - fn file>(name: S, value: T) -> Entry { - Entry::File { - name: name.into(), - //old_filenames: Vec::new(), // TODO - value, - } - } - - fn dir>(name: S) -> Entry { - Entry::Dir { name: name.into() } - } - - fn name(&self) -> &str { - match self { - Entry::File { name, .. } => name, - Entry::Dir { name } => name, - } - } - - fn is_file(&self) -> bool { - matches!(self, Entry::File { .. }) - } - - fn is_dir(&self) -> bool { - matches!(self, Entry::Dir { .. }) - } - - fn cmp_name(&self) -> CmpName<'_> { - match self { - Entry::File { name, .. } => CmpName::File(name.as_ref()), - Entry::Dir { name } => CmpName::Dir(name.as_ref()), - } - } -} - #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct Directory { path: Option, // `None` for the root @@ -345,57 +261,6 @@ impl Directory { } } -/// A wrapper around an individual path name component that compares it to -/// other components as though they were part of longer paths, i.e., directory -/// names have an implicit trailing '/' added. As an exception, if a file name -/// and a directory name are equal aside from the trailing '/', this type -/// compares them as equal. -#[derive(Clone, Copy, Debug)] -enum CmpName<'a> { - File(&'a str), - Dir(&'a str), -} - -impl CmpName<'_> { - fn name(&self) -> &str { - match self { - CmpName::File(s) => s, - CmpName::Dir(s) => s, - } - } - - fn chars(&self) -> impl Iterator + '_ { - match self { - CmpName::File(s) => Either::Left(s.chars()), - CmpName::Dir(s) => Either::Right(s.chars().chain(std::iter::once('/'))), - } - } -} - -impl PartialEq for CmpName<'_> { - fn eq(&self, other: &CmpName<'_>) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for CmpName<'_> {} - -impl PartialOrd for CmpName<'_> { - fn partial_cmp(&self, other: &CmpName<'_>) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for CmpName<'_> { - fn cmp(&self, other: &CmpName<'_>) -> Ordering { - if self.name() == other.name() { - Ordering::Equal - } else { - self.chars().cmp(other.chars()) - } - } -} - #[derive(Clone, Debug, Eq, Error, PartialEq)] pub(super) enum TreeTrackerError { #[error("received keys in unsorted order: {before:?} came before {after:?}")] @@ -406,56 +271,6 @@ pub(super) enum TreeTrackerError { DuplicateFile(String), } -#[derive(Clone, Debug, Eq, PartialEq)] -struct KeyComponents<'a, T> { - i: usize, - path: &'a str, - value: Option, -} - -impl<'a, T> KeyComponents<'a, T> { - fn new(key: &'a KeyPath, value: T) -> Self { - KeyComponents { - i: 0, - path: key.as_ref(), - value: Some(value), - } - } -} - -impl<'a, T> Iterator for KeyComponents<'a, T> { - type Item = (usize, Component<'a, T>); - - fn next(&mut self) -> Option { - let c = match self.path.find('/') { - Some(i) => { - let name = &self.path[..i]; - self.path = &self.path[(i + 1)..]; - Component::Dir(name) - } - None => Component::File(self.path, self.value.take()?), - }; - let i = self.i; - self.i += 1; - Some((i, c)) - } -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -enum Component<'a, T> { - Dir(&'a str), - File(&'a str, T), 
-} - -impl<'a, T> Component<'a, T> { - fn cmp_name(&self) -> CmpName<'a> { - match self { - Component::Dir(name) => CmpName::Dir(name), - Component::File(name, _) => CmpName::File(name), - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -724,35 +539,6 @@ mod tests { assert_eq!(dirs[1].path(), None); assert_eq!(dirs[1].entries, vec![Entry::dir("apple")]); } - - mod cmp_name { - use super::*; - - #[test] - fn dir_eq_file() { - assert!(CmpName::File("foo") == CmpName::Dir("foo")); - } - - #[test] - fn pre_slash_dir_before_dir() { - assert!(CmpName::Dir("apple!banana") < CmpName::Dir("apple")); - } - - #[test] - fn pre_slash_file_before_dir() { - assert!(CmpName::File("apple!banana") < CmpName::Dir("apple")); - } - - #[test] - fn pre_slash_dir_after_file() { - assert!(CmpName::Dir("apple!banana") > CmpName::File("apple")); - } - - #[test] - fn pre_slash_file_after_file() { - assert!(CmpName::File("apple!banana") > CmpName::File("apple")); - } - } } // TESTS TO ADD: From a1605bebb7a148b40052ed114f3f273e931cb110 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 9 Jan 2025 14:16:42 -0500 Subject: [PATCH 08/25] Test KeyComponents --- src/syncer/treetracker/inner.rs | 24 ++++++++++++++++++++++++ src/syncer/treetracker/mod.rs | 1 - 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/syncer/treetracker/inner.rs b/src/syncer/treetracker/inner.rs index f50289e..a815ce9 100644 --- a/src/syncer/treetracker/inner.rs +++ b/src/syncer/treetracker/inner.rs @@ -220,4 +220,28 @@ mod tests { assert!(CmpName::File("apple!banana") > CmpName::File("apple")); } } + + mod key_components { + use super::*; + + #[test] + fn plain() { + let key = "foo/bar/quux.txt".parse::().unwrap(); + let mut iter = KeyComponents::new(&key, 1); + assert_eq!(iter.next(), Some((0, Component::Dir("foo")))); + assert_eq!(iter.next(), Some((1, Component::Dir("bar")))); + assert_eq!(iter.next(), Some((2, Component::File("quux.txt", 1)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + #[test] + fn filename_only() { + let key = "quux.txt".parse::().unwrap(); + let mut iter = KeyComponents::new(&key, 1); + assert_eq!(iter.next(), Some((0, Component::File("quux.txt", 1)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + } } diff --git a/src/syncer/treetracker/mod.rs b/src/syncer/treetracker/mod.rs index a62db3f..8bdba52 100644 --- a/src/syncer/treetracker/mod.rs +++ b/src/syncer/treetracker/mod.rs @@ -548,4 +548,3 @@ mod tests { // - close a directory in the root, continue on // - mix of files & directories in a directory // - file in root dir (with & without preceding entries) -// - KeyComponents From c4173338367f12a2f281da0df329399ca1939108 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Thu, 9 Jan 2025 16:54:57 -0500 Subject: [PATCH 09/25] More TreeTracker tests --- src/syncer/treetracker/mod.rs | 172 ++++++++++++++++++++++++++++++++-- 1 file changed, 163 insertions(+), 9 deletions(-) diff --git a/src/syncer/treetracker/mod.rs b/src/syncer/treetracker/mod.rs index 8bdba52..c52de7c 100644 --- a/src/syncer/treetracker/mod.rs +++ b/src/syncer/treetracker/mod.rs @@ -48,7 +48,7 @@ impl TreeTracker { break; } (true, Some(Ordering::Equal)) => { - return Err(TreeTrackerError::Conflict(self.last_key())); + return Err(TreeTrackerError::Conflict(key.into())); } (false, Some(Ordering::Equal)) => { // XXX: Change this when support for old filenames is @@ -477,6 +477,19 @@ mod tests { assert_eq!(e, TreeTrackerError::Conflict("foo/bar".into())); } + #[test] + fn path_conflict_dir_then_file() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar/quux.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/bar".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!(e, TreeTrackerError::Conflict("foo/bar".into())); + } + #[test] fn just_finish() { let tracker = TreeTracker::<()>::new(); @@ -539,12 +552,153 @@ mod tests { assert_eq!(dirs[1].path(), None); assert_eq!(dirs[1].entries, vec![Entry::dir("apple")]); } -} -// TESTS TO ADD: -// - second path closes multiple directories -// - close multiple directories down to root -// - close a subdirectory, then start a new directory in its parent -// - close a directory in the root, continue on -// - mix of files & directories in a directory -// - file in root dir (with & without preceding entries) + #[test] + fn closedir_then_dirs_in_parent() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"apple/banana/coconut.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let dirs = tracker + .add(&"apple/eggplant/kumquat.txt".parse::().unwrap(), 2) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("apple/banana")); + assert_eq!(dirs[0].entries, vec![Entry::file("coconut.txt", 1)]); + let dirs = tracker + .add(&"apple/mango/tangerine.txt".parse::().unwrap(), 3) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("apple/eggplant")); + assert_eq!(dirs[0].entries, vec![Entry::file("kumquat.txt", 2)]); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 3); + assert_eq!(dirs[0].path(), Some("apple/mango")); + assert_eq!(dirs[0].entries, vec![Entry::file("tangerine.txt", 3)]); + assert_eq!(dirs[1].path(), Some("apple")); + assert_eq!( + dirs[1].entries, + vec![ + Entry::dir("banana"), + Entry::dir("eggplant"), + Entry::dir("mango"), + ] + ); + assert_eq!(dirs[2].path(), None); + assert_eq!(dirs[2].entries, vec![Entry::dir("apple")]); + } + + #[test] + fn close_multiple_dirs() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"apple/banana/coconut/date.txt".parse::().unwrap(), + 1 + ), + Ok(Vec::new()) + ); + let dirs = tracker + .add(&"foo.txt".parse::().unwrap(), 2) + .unwrap(); + assert_eq!(dirs.len(), 3); + assert_eq!(dirs[0].path(), Some("apple/banana/coconut")); + assert_eq!(dirs[0].entries, vec![Entry::file("date.txt", 1)]); + assert_eq!(dirs[1].path(), Some("apple/banana")); + assert_eq!(dirs[1].entries, vec![Entry::dir("coconut")]); + assert_eq!(dirs[2].path(), Some("apple")); + assert_eq!(dirs[2].entries, vec![Entry::dir("banana")]); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), None); + assert_eq!( + dirs[0].entries, + 
vec![Entry::dir("apple"), Entry::file("foo.txt", 2)] + ); + } + + #[test] + fn same_file_twice() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar/quux.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/bar/quux.txt".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!( + e, + TreeTrackerError::DuplicateFile("foo/bar/quux.txt".into()) + ); + } + + #[test] + fn unsorted_parent_dirs() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/gnusto/quux.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/bar/cleesh.txt".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!( + e, + TreeTrackerError::Unsorted { + before: "foo/gnusto/quux.txt".into(), + after: "foo/bar/cleesh.txt".into() + } + ); + } + + #[test] + fn file_then_preceding_dir() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/gnusto.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + let e = tracker + .add(&"foo/bar/cleesh.txt".parse::().unwrap(), 2) + .unwrap_err(); + assert_eq!( + e, + TreeTrackerError::Unsorted { + before: "foo/gnusto.txt".into(), + after: "foo/bar/cleesh.txt".into() + } + ); + } + + #[test] + fn files_in_root() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo.txt".parse::().unwrap(), 1), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add(&"gnusto/cleesh.txt".parse::().unwrap(), 2), + Ok(Vec::new()) + ); + let dirs = tracker + .add(&"quux.txt".parse::().unwrap(), 3) + .unwrap(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), Some("gnusto")); + assert_eq!(dirs[0].entries, vec![Entry::file("cleesh.txt", 2)]); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 1); + assert_eq!(dirs[0].path(), None); + assert_eq!( + dirs[0].entries, + vec![ + Entry::file("foo.txt", 1), + Entry::dir("gnusto"), + Entry::file("quux.txt", 3) + ] + ); + } +} From dd070c8a7687a678ae38ea18df48fc6c7c4abeaa Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Jan 2025 08:46:09 -0500 Subject: [PATCH 10/25] Remove some unreachable code --- src/syncer/treetracker/mod.rs | 48 ++++++++++------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/src/syncer/treetracker/mod.rs b/src/syncer/treetracker/mod.rs index c52de7c..c15d411 100644 --- a/src/syncer/treetracker/mod.rs +++ b/src/syncer/treetracker/mod.rs @@ -18,12 +18,6 @@ impl TreeTracker { //old_filename: Option, // TODO value: T, ) -> Result>, TreeTrackerError> { - fn after_error(key: &KeyPath, mut e: TreeTrackerError) -> TreeTrackerError { - if let TreeTrackerError::Unsorted { ref mut after, .. 
} = e { - *after = key.into(); - } - e - } let mut popped_dirs = Vec::new(); let mut partiter = KeyComponents::new(key, value); while let Some((i, part)) = partiter.next() { @@ -43,8 +37,7 @@ impl TreeTracker { popped_dirs.push(self.pop()); } } - self.push_file(name, value) - .map_err(|e| after_error(key, e))?; + self.push_file(name, value); break; } (true, Some(Ordering::Equal)) => { @@ -66,8 +59,7 @@ impl TreeTracker { self.is_empty(), "top dir of TreeTracker should be root when empty" ); - self.push_file(name, value) - .map_err(|e| after_error(key, e))?; + self.push_file(name, value); break; } } @@ -81,8 +73,7 @@ impl TreeTracker { popped_dirs.push(self.pop()); } } - self.push_parts(name, partiter) - .map_err(|e| after_error(key, e))?; + self.push_parts(name, partiter); break; } (true, Some(Ordering::Equal)) => continue, @@ -100,8 +91,7 @@ impl TreeTracker { self.is_empty(), "top dir of TreeTracker should be root when empty" ); - self.push_parts(name, partiter) - .map_err(|e| after_error(key, e))?; + self.push_parts(name, partiter); break; } } @@ -123,19 +113,14 @@ impl TreeTracker { self.0.is_empty() || (self.0.len() == 1 && self.0[0].is_empty()) } - fn push_parts( - &mut self, - first_dirname: &str, - rest: KeyComponents<'_, T>, - ) -> Result<(), TreeTrackerError> { + fn push_parts(&mut self, first_dirname: &str, rest: KeyComponents<'_, T>) { self.push_dir(first_dirname); for (_, part) in rest { match part { Component::Dir(name) => self.push_dir(name), - Component::File(name, value) => self.push_file(name, value)?, + Component::File(name, value) => self.push_file(name, value), } } - Ok(()) } fn push_dir(&mut self, name: &str) { @@ -150,7 +135,7 @@ impl TreeTracker { self.0.push(PartialDirectory::new()); } - fn push_file(&mut self, name: &str, value: T) -> Result<(), TreeTrackerError> { + fn push_file(&mut self, name: &str, value: T) { let Some(pd) = self.0.last_mut() else { panic!("TreeTracker::push_file() called on void tracker"); }; @@ -159,21 +144,14 @@ impl TreeTracker { "TreeTracker::push_file() called when top dir has subdir" ); if let Some(en) = pd.entries.last() { - match CmpName::File(name).cmp(&en.cmp_name()) { - Ordering::Equal => return Err(TreeTrackerError::DuplicateFile(self.last_key())), - // IMPORTANT: The `after` needs to be replaced with the full path in the - // calling context: - Ordering::Less => { - return Err(TreeTrackerError::Unsorted { - before: self.last_key(), - after: name.into(), - }) - } - Ordering::Greater => (), - } + // The following assert should not fail, due to the code leading up + // to it in `add()` + assert!( + CmpName::File(name) > en.cmp_name(), + "TreeTracker::push_file() called with filename not after previous name" + ); } pd.entries.push(Entry::file(name, value)); - Ok(()) } fn pop(&mut self) -> Directory { From ebe1b7d724fbc6691194e764e97cc919a02df1d1 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Jan 2025 10:49:59 -0500 Subject: [PATCH 11/25] Update TreeTracker to track filenames of non-latest versions --- src/syncer/mod.rs | 3 +- src/syncer/treetracker/inner.rs | 72 ++-- src/syncer/treetracker/mod.rs | 563 ++++++++++++++++++++++++-------- 3 files changed, 473 insertions(+), 165 deletions(-) diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index 4bb0ca4..be4515c 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -141,7 +141,8 @@ impl Syncer { } InventoryEntry::Item(item) => { // TODO: Store a Cond for awaiting completion of item processing - for dir in tracker.add(&item.key, ())? 
{ + // TODO: Old filenames: + for dir in tracker.add(&item.key, (), None)? { // TODO: Spawn in JoinSet let this2 = this.clone(); tokio::task::spawn_blocking(move || this2.cleanup_dir(dir)); diff --git a/src/syncer/treetracker/inner.rs b/src/syncer/treetracker/inner.rs index a815ce9..300f874 100644 --- a/src/syncer/treetracker/inner.rs +++ b/src/syncer/treetracker/inner.rs @@ -1,6 +1,7 @@ use crate::keypath::KeyPath; use either::Either; use std::cmp::Ordering; +use std::collections::HashMap; #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct PartialDirectory { @@ -43,8 +44,8 @@ impl PartialDirectory { pub(super) enum Entry { File { name: String, - //old_filenames: Vec, // TODO - value: T, + value: Option, + old_filenames: HashMap, }, Dir { name: String, @@ -52,11 +53,23 @@ pub(super) enum Entry { } impl Entry { - pub(super) fn file>(name: S, value: T) -> Entry { - Entry::File { - name: name.into(), - //old_filenames: Vec::new(), // TODO - value, + pub(super) fn file>( + name: S, + value: T, + old_filename: Option, + ) -> Entry { + if let Some(of) = old_filename { + Entry::File { + name: name.into(), + value: None, + old_filenames: HashMap::from([(of, value)]), + } + } else { + Entry::File { + name: name.into(), + value: Some(value), + old_filenames: HashMap::new(), + } } } @@ -71,14 +84,6 @@ impl Entry { } } - pub(super) fn is_file(&self) -> bool { - matches!(self, Entry::File { .. }) - } - - pub(super) fn is_dir(&self) -> bool { - matches!(self, Entry::Dir { .. }) - } - pub(super) fn cmp_name(&self) -> CmpName<'_> { match self { Entry::File { name, .. } => CmpName::File(name.as_ref()), @@ -143,14 +148,16 @@ pub(super) struct KeyComponents<'a, T> { i: usize, path: &'a str, value: Option, + old_filename: Option>, } impl<'a, T> KeyComponents<'a, T> { - pub(super) fn new(key: &'a KeyPath, value: T) -> Self { + pub(super) fn new(key: &'a KeyPath, value: T, old_filename: Option) -> Self { KeyComponents { i: 0, path: key.as_ref(), value: Some(value), + old_filename: Some(old_filename), } } } @@ -165,7 +172,7 @@ impl<'a, T> Iterator for KeyComponents<'a, T> { self.path = &self.path[(i + 1)..]; Component::Dir(name) } - None => Component::File(self.path, self.value.take()?), + None => Component::File(self.path, self.value.take()?, self.old_filename.take()?), }; let i = self.i; self.i += 1; @@ -173,17 +180,17 @@ impl<'a, T> Iterator for KeyComponents<'a, T> { } } -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum Component<'a, T> { Dir(&'a str), - File(&'a str, T), + File(&'a str, T, Option), } impl<'a, T> Component<'a, T> { pub(super) fn cmp_name(&self) -> CmpName<'a> { match self { Component::Dir(name) => CmpName::Dir(name), - Component::File(name, _) => CmpName::File(name), + Component::File(name, _, _) => CmpName::File(name), } } } @@ -227,10 +234,10 @@ mod tests { #[test] fn plain() { let key = "foo/bar/quux.txt".parse::().unwrap(); - let mut iter = KeyComponents::new(&key, 1); + let mut iter = KeyComponents::new(&key, 1, None); assert_eq!(iter.next(), Some((0, Component::Dir("foo")))); assert_eq!(iter.next(), Some((1, Component::Dir("bar")))); - assert_eq!(iter.next(), Some((2, Component::File("quux.txt", 1)))); + assert_eq!(iter.next(), Some((2, Component::File("quux.txt", 1, None)))); assert_eq!(iter.next(), None); assert_eq!(iter.next(), None); } @@ -238,8 +245,25 @@ mod tests { #[test] fn filename_only() { let key = "quux.txt".parse::().unwrap(); - let mut iter = KeyComponents::new(&key, 1); - assert_eq!(iter.next(), Some((0, 
Component::File("quux.txt", 1)))); + let mut iter = KeyComponents::new(&key, 1, None); + assert_eq!(iter.next(), Some((0, Component::File("quux.txt", 1, None)))); + assert_eq!(iter.next(), None); + assert_eq!(iter.next(), None); + } + + #[test] + fn with_old_filename() { + let key = "foo/bar/quux.txt".parse::().unwrap(); + let mut iter = KeyComponents::new(&key, 1, Some("quux.old.1.2".into())); + assert_eq!(iter.next(), Some((0, Component::Dir("foo")))); + assert_eq!(iter.next(), Some((1, Component::Dir("bar")))); + assert_eq!( + iter.next(), + Some(( + 2, + Component::File("quux.txt", 1, Some("quux.old.1.2".into())) + )) + ); assert_eq!(iter.next(), None); assert_eq!(iter.next(), None); } diff --git a/src/syncer/treetracker/mod.rs b/src/syncer/treetracker/mod.rs index c15d411..a809574 100644 --- a/src/syncer/treetracker/mod.rs +++ b/src/syncer/treetracker/mod.rs @@ -2,6 +2,7 @@ mod inner; use self::inner::*; use crate::keypath::KeyPath; use std::cmp::Ordering; +use std::collections::{HashMap, HashSet}; use thiserror::Error; #[derive(Clone, Debug, Eq, PartialEq)] @@ -15,11 +16,11 @@ impl TreeTracker { pub(super) fn add( &mut self, key: &KeyPath, - //old_filename: Option, // TODO value: T, + old_filename: Option, ) -> Result>, TreeTrackerError> { let mut popped_dirs = Vec::new(); - let mut partiter = KeyComponents::new(key, value); + let mut partiter = KeyComponents::new(key, value, old_filename); while let Some((i, part)) = partiter.next() { let Some(pd) = self.0.get_mut(i) else { unreachable!( @@ -28,7 +29,7 @@ impl TreeTracker { }; let cmp_name = part.cmp_name(); match part { - Component::File(name, value) => { + Component::File(name, value, old_filename) => { match (pd.last_entry_is_dir(), pd.cmp_vs_last_entry(cmp_name)) { (in_dir, Some(Ordering::Greater)) => { if in_dir { @@ -37,16 +38,14 @@ impl TreeTracker { popped_dirs.push(self.pop()); } } - self.push_file(name, value); + self.push_file(name, value, old_filename)?; break; } (true, Some(Ordering::Equal)) => { return Err(TreeTrackerError::Conflict(key.into())); } (false, Some(Ordering::Equal)) => { - // XXX: Change this when support for old filenames is - // added: - return Err(TreeTrackerError::DuplicateFile(key.into())); + self.push_file(name, value, old_filename)?; } (_, Some(Ordering::Less)) => { return Err(TreeTrackerError::Unsorted { @@ -59,7 +58,7 @@ impl TreeTracker { self.is_empty(), "top dir of TreeTracker should be root when empty" ); - self.push_file(name, value); + self.push_file(name, value, old_filename)?; break; } } @@ -73,7 +72,7 @@ impl TreeTracker { popped_dirs.push(self.pop()); } } - self.push_parts(name, partiter); + self.push_parts(name, partiter)?; break; } (true, Some(Ordering::Equal)) => continue, @@ -91,7 +90,7 @@ impl TreeTracker { self.is_empty(), "top dir of TreeTracker should be root when empty" ); - self.push_parts(name, partiter); + self.push_parts(name, partiter)?; break; } } @@ -113,14 +112,21 @@ impl TreeTracker { self.0.is_empty() || (self.0.len() == 1 && self.0[0].is_empty()) } - fn push_parts(&mut self, first_dirname: &str, rest: KeyComponents<'_, T>) { + fn push_parts( + &mut self, + first_dirname: &str, + rest: KeyComponents<'_, T>, + ) -> Result<(), TreeTrackerError> { self.push_dir(first_dirname); for (_, part) in rest { match part { Component::Dir(name) => self.push_dir(name), - Component::File(name, value) => self.push_file(name, value), + Component::File(name, value, old_filename) => { + self.push_file(name, value, old_filename)?; + } } } + Ok(()) } fn push_dir(&mut self, name: &str) { @@ 
-135,7 +141,12 @@ impl TreeTracker { self.0.push(PartialDirectory::new()); } - fn push_file(&mut self, name: &str, value: T) { + fn push_file( + &mut self, + name: &str, + value: T, + old_filename: Option, + ) -> Result<(), TreeTrackerError> { let Some(pd) = self.0.last_mut() else { panic!("TreeTracker::push_file() called on void tracker"); }; @@ -143,15 +154,39 @@ impl TreeTracker { pd.current_subdir.is_none(), "TreeTracker::push_file() called when top dir has subdir" ); - if let Some(en) = pd.entries.last() { - // The following assert should not fail, due to the code leading up - // to it in `add()` - assert!( - CmpName::File(name) > en.cmp_name(), - "TreeTracker::push_file() called with filename not after previous name" - ); + if let Some(en) = pd.entries.last_mut() { + match CmpName::File(name).cmp(&en.cmp_name()) { + Ordering::Less => { + panic!("TreeTracker::push_file() called with filename less than previous name") + } + Ordering::Equal => { + let Entry::File { + old_filenames, + value: envalue, + .. + } = en + else { + panic!("TreeTracker::push_file() called with filename equal to previous name and previous is not a file"); + }; + if let Some(of) = old_filename { + if old_filenames.insert(of.clone(), value).is_some() { + return Err(TreeTrackerError::DuplicateOldFile { + key: self.last_key(), + old_filename: of, + }); + } + } else if envalue.is_none() { + *envalue = Some(value); + } else { + return Err(TreeTrackerError::DuplicateFile(self.last_key())); + } + } + Ordering::Greater => pd.entries.push(Entry::file(name, value, old_filename)), + } + } else { + pd.entries.push(Entry::file(name, value, old_filename)); } - pd.entries.push(Entry::file(name, value)); + Ok(()) } fn pop(&mut self) -> Directory { @@ -167,7 +202,30 @@ impl TreeTracker { if let Some(ppd) = self.0.last_mut() { ppd.close_current(); } - Directory { path, entries } + let mut files = HashMap::new(); + let mut directories = HashSet::new(); + for en in entries { + match en { + Entry::File { + name, + value, + old_filenames, + } => { + if let Some(value) = value { + files.insert(name, value); + } + files.extend(old_filenames); + } + Entry::Dir { name } => { + directories.insert(name); + } + } + } + Directory { + path, + files, + directories, + } } fn last_key(&self) -> String { @@ -196,8 +254,9 @@ impl TreeTracker { #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct Directory { - path: Option, // `None` for the root - entries: Vec>, // TODO: Flatten out the old_filenames + path: Option, // `None` for the root + files: HashMap, + directories: HashSet, } impl Directory { @@ -205,36 +264,24 @@ impl Directory { self.path.as_deref() } - fn find(&self, name: &str) -> Option<&Entry> { - self.entries - .binary_search_by(|en| en.name().cmp(name)) - .ok() - .map(|i| &self.entries[i]) - } - pub(super) fn contains_file(&self, name: &str) -> bool { - self.find(name).is_some_and(Entry::is_file) + self.files.contains_key(name) } pub(super) fn contains_dir(&self, name: &str) -> bool { - self.find(name).is_some_and(Entry::is_dir) + self.directories.contains(name) } #[allow(dead_code)] pub(super) fn map U>(self, mut f: F) -> Directory { Directory { path: self.path, - entries: self - .entries + files: self + .files .into_iter() - .map(|en| match en { - Entry::File { name, value } => Entry::File { - name, - value: f(value), - }, - Entry::Dir { name } => Entry::Dir { name }, - }) + .map(|(name, value)| (name, f(value))) .collect(), + directories: self.directories, } } } @@ -247,6 +294,8 @@ pub(super) enum TreeTrackerError { 
Conflict(String), #[error("file key {0:?} encountered more than once")] DuplicateFile(String), + #[error("key {key:?} has multiple non-latest versions with filename {old_filename:?}")] + DuplicateOldFile { key: String, old_filename: String }, } #[cfg(test)] @@ -257,45 +306,50 @@ mod tests { fn same_dir() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar.txt".parse::().unwrap(), 1), + tracker.add(&"foo/bar.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); assert_eq!( - tracker.add(&"foo/quux.txt".parse::().unwrap(), 2), + tracker.add(&"foo/quux.txt".parse::().unwrap(), 2, None), Ok(Vec::new()) ); let dirs = tracker.finish(); assert_eq!(dirs.len(), 2); assert_eq!(dirs[0].path(), Some("foo")); assert_eq!( - dirs[0].entries, - vec![Entry::file("bar.txt", 1), Entry::file("quux.txt", 2)] + dirs[0].files, + HashMap::from([("bar.txt".into(), 1), ("quux.txt".into(), 2),]) ); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), None); - assert_eq!(dirs[1].entries, vec![Entry::dir("foo")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[1].directories, HashSet::from(["foo".into()])); } #[test] fn different_dir() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar.txt".parse::().unwrap(), 1), + tracker.add(&"foo/bar.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let dirs = tracker - .add(&"glarch/quux.txt".parse::().unwrap(), 2) + .add(&"glarch/quux.txt".parse::().unwrap(), 2, None) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("foo")); - assert_eq!(dirs[0].entries, vec![Entry::file("bar.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("bar.txt".into(), 1)])); + assert!(dirs[0].directories.is_empty()); let dirs = tracker.finish(); assert_eq!(dirs.len(), 2); assert_eq!(dirs[0].path(), Some("glarch")); - assert_eq!(dirs[0].entries, vec![Entry::file("quux.txt", 2)]); + assert_eq!(dirs[0].files, HashMap::from([("quux.txt".into(), 2)])); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), None); + assert!(dirs[1].files.is_empty()); assert_eq!( - dirs[1].entries, - vec![Entry::dir("foo"), Entry::dir("glarch")] + dirs[1].directories, + HashSet::from(["foo".into(), "glarch".into()]) ); } @@ -303,23 +357,30 @@ mod tests { fn different_subdir() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/apple.txt".parse::().unwrap(), 1), + tracker.add(&"foo/bar/apple.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let dirs = tracker - .add(&"foo/quux/banana.txt".parse::().unwrap(), 2) + .add(&"foo/quux/banana.txt".parse::().unwrap(), 2, None) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("foo/bar")); - assert_eq!(dirs[0].entries, vec![Entry::file("apple.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("apple.txt".into(), 1)])); + assert!(dirs[0].directories.is_empty()); let dirs = tracker.finish(); assert_eq!(dirs.len(), 3); assert_eq!(dirs[0].path(), Some("foo/quux")); - assert_eq!(dirs[0].entries, vec![Entry::file("banana.txt", 2)]); + assert_eq!(dirs[0].files, HashMap::from([("banana.txt".into(), 2)])); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("foo")); - assert_eq!(dirs[1].entries, vec![Entry::dir("bar"), Entry::dir("quux")]); + assert!(dirs[1].files.is_empty()); + assert_eq!( + dirs[1].directories, + HashSet::from(["bar".into(), "quux".into()]) + ); assert_eq!(dirs[2].path(), None); - assert_eq!(dirs[2].entries, vec![Entry::dir("foo")]); + assert!(dirs[2].files.is_empty()); + 
assert_eq!(dirs[2].directories, HashSet::from(["foo".into()])); } #[test] @@ -329,37 +390,46 @@ mod tests { tracker.add( &"foo/apple!banana/gnusto.txt".parse::().unwrap(), 1, + None ), Ok(Vec::new()) ); let dirs = tracker - .add(&"foo/apple/cleesh.txt".parse::().unwrap(), 2) + .add(&"foo/apple/cleesh.txt".parse::().unwrap(), 2, None) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("foo/apple!banana")); - assert_eq!(dirs[0].entries, vec![Entry::file("gnusto.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("gnusto.txt".into(), 1)])); + assert!(dirs[0].directories.is_empty()); let dirs = tracker.finish(); assert_eq!(dirs.len(), 3); assert_eq!(dirs[0].path(), Some("foo/apple")); - assert_eq!(dirs[0].entries, vec![Entry::file("cleesh.txt", 2)]); + assert_eq!(dirs[0].files, HashMap::from([("cleesh.txt".into(), 2)])); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("foo")); + assert!(dirs[1].files.is_empty()); assert_eq!( - dirs[1].entries, - vec![Entry::dir("apple!banana"), Entry::dir("apple")] + dirs[1].directories, + HashSet::from(["apple!banana".into(), "apple".into()]) ); assert_eq!(dirs[2].path(), None); - assert_eq!(dirs[2].entries, vec![Entry::dir("foo")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[2].directories, HashSet::from([("foo".into())])); } #[test] fn preslash_file_then_toslash_file() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 1), + tracker.add( + &"foo/bar/apple!banana.txt".parse::().unwrap(), + 1, + None + ), Ok(Vec::new()) ); let e = tracker - .add(&"foo/bar/apple".parse::().unwrap(), 2) + .add(&"foo/bar/apple".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!( e, @@ -374,24 +444,31 @@ mod tests { fn tostash_file_then_preslash_file() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/apple".parse::().unwrap(), 1), + tracker.add(&"foo/bar/apple".parse::().unwrap(), 1, None), Ok(Vec::new()) ); assert_eq!( - tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 2), + tracker.add( + &"foo/bar/apple!banana.txt".parse::().unwrap(), + 2, + None + ), Ok(Vec::new()) ); let dirs = tracker.finish(); assert_eq!(dirs.len(), 3); assert_eq!(dirs[0].path(), Some("foo/bar")); assert_eq!( - dirs[0].entries, - vec![Entry::file("apple", 1), Entry::file("apple!banana.txt", 2)] + dirs[0].files, + HashMap::from([("apple".into(), 1), ("apple!banana.txt".into(), 2)]) ); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("foo")); - assert_eq!(dirs[1].entries, vec![Entry::dir("bar")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[1].directories, HashSet::from(["bar".into()])); assert_eq!(dirs[2].path(), None); - assert_eq!(dirs[2].entries, vec![Entry::dir("foo")]); + assert!(dirs[2].files.is_empty()); + assert_eq!(dirs[2].directories, HashSet::from(["foo".into()])); } #[test] @@ -401,11 +478,12 @@ mod tests { tracker.add( &"foo/apple!banana/gnusto.txt".parse::().unwrap(), 1, + None, ), Ok(Vec::new()) ); let e = tracker - .add(&"foo/apple".parse::().unwrap(), 2) + .add(&"foo/apple".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!( e, @@ -420,37 +498,49 @@ mod tests { fn preslash_file_then_toslash_dir() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/apple!banana.txt".parse::().unwrap(), 1), + tracker.add( + &"foo/bar/apple!banana.txt".parse::().unwrap(), + 1, + None + ), Ok(Vec::new()) ); assert_eq!( - tracker.add(&"foo/bar/apple/apricot.txt".parse::().unwrap(), 
2), + tracker.add( + &"foo/bar/apple/apricot.txt".parse::().unwrap(), + 2, + None + ), Ok(Vec::new()) ); let dirs = tracker.finish(); assert_eq!(dirs.len(), 4); assert_eq!(dirs[0].path(), Some("foo/bar/apple")); - assert_eq!(dirs[0].entries, vec![Entry::file("apricot.txt", 2)]); + assert_eq!(dirs[0].files, HashMap::from([("apricot.txt".into(), 2)])); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("foo/bar")); assert_eq!( - dirs[1].entries, - vec![Entry::file("apple!banana.txt", 1), Entry::dir("apple")] + dirs[1].files, + HashMap::from([("apple!banana.txt".into(), 1)]) ); + assert_eq!(dirs[1].directories, HashSet::from(["apple".into()])); assert_eq!(dirs[2].path(), Some("foo")); - assert_eq!(dirs[2].entries, vec![Entry::dir("bar")]); + assert!(dirs[2].files.is_empty()); + assert_eq!(dirs[2].directories, HashSet::from(["bar".into()])); assert_eq!(dirs[3].path(), None); - assert_eq!(dirs[3].entries, vec![Entry::dir("foo")]); + assert!(dirs[3].files.is_empty()); + assert_eq!(dirs[3].directories, HashSet::from(["foo".into()])); } #[test] fn path_conflict_file_then_dir() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar".parse::().unwrap(), 1), + tracker.add(&"foo/bar".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let e = tracker - .add(&"foo/bar/apple.txt".parse::().unwrap(), 2) + .add(&"foo/bar/apple.txt".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!(e, TreeTrackerError::Conflict("foo/bar".into())); } @@ -459,11 +549,11 @@ mod tests { fn path_conflict_dir_then_file() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/quux.txt".parse::().unwrap(), 1), + tracker.add(&"foo/bar/quux.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let e = tracker - .add(&"foo/bar".parse::().unwrap(), 2) + .add(&"foo/bar".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!(e, TreeTrackerError::Conflict("foo/bar".into())); } @@ -474,7 +564,8 @@ mod tests { let dirs = tracker.finish(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), None); - assert!(dirs[0].entries.is_empty()); + assert!(dirs[0].files.is_empty()); + assert!(dirs[0].directories.is_empty()); } #[test] @@ -483,88 +574,110 @@ mod tests { assert_eq!( tracker.add( &"apple/banana/coconut/date.txt".parse::().unwrap(), - 1 + 1, + None ), Ok(Vec::new()) ); let dirs = tracker.finish(); assert_eq!(dirs.len(), 4); assert_eq!(dirs[0].path(), Some("apple/banana/coconut")); - assert_eq!(dirs[0].entries, vec![Entry::file("date.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("date.txt".into(), 1)])); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("apple/banana")); - assert_eq!(dirs[1].entries, vec![Entry::dir("coconut")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[1].directories, HashSet::from(["coconut".into()])); assert_eq!(dirs[2].path(), Some("apple")); - assert_eq!(dirs[2].entries, vec![Entry::dir("banana")]); + assert!(dirs[2].files.is_empty()); + assert_eq!(dirs[2].directories, HashSet::from(["banana".into()])); assert_eq!(dirs[3].path(), None); - assert_eq!(dirs[3].entries, vec![Entry::dir("apple")]); + assert!(dirs[3].files.is_empty()); + assert_eq!(dirs[3].directories, HashSet::from(["apple".into()])); } #[test] fn closedir_then_files_in_parent() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"apple/banana/coconut.txt".parse::().unwrap(), 1), + tracker.add( + &"apple/banana/coconut.txt".parse::().unwrap(), + 1, + None + ), Ok(Vec::new()) ); let dirs = tracker - 
.add(&"apple/kumquat.txt".parse::().unwrap(), 2) + .add(&"apple/kumquat.txt".parse::().unwrap(), 2, None) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("apple/banana")); - assert_eq!(dirs[0].entries, vec![Entry::file("coconut.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("coconut.txt".into(), 1)])); + assert!(dirs[0].directories.is_empty()); + assert_eq!( - tracker.add(&"apple/mango.txt".parse::().unwrap(), 3), + tracker.add(&"apple/mango.txt".parse::().unwrap(), 3, None), Ok(Vec::new()) ); let dirs = tracker.finish(); assert_eq!(dirs.len(), 2); assert_eq!(dirs[0].path(), Some("apple")); assert_eq!( - dirs[0].entries, - vec![ - Entry::dir("banana"), - Entry::file("kumquat.txt", 2), - Entry::file("mango.txt", 3), - ] + dirs[0].files, + HashMap::from([("kumquat.txt".into(), 2), ("mango.txt".into(), 3)]) ); + assert_eq!(dirs[0].directories, HashSet::from(["banana".into()])); assert_eq!(dirs[1].path(), None); - assert_eq!(dirs[1].entries, vec![Entry::dir("apple")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[1].directories, HashSet::from(["apple".into()])); } #[test] fn closedir_then_dirs_in_parent() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"apple/banana/coconut.txt".parse::().unwrap(), 1), + tracker.add( + &"apple/banana/coconut.txt".parse::().unwrap(), + 1, + None + ), Ok(Vec::new()) ); let dirs = tracker - .add(&"apple/eggplant/kumquat.txt".parse::().unwrap(), 2) + .add( + &"apple/eggplant/kumquat.txt".parse::().unwrap(), + 2, + None, + ) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("apple/banana")); - assert_eq!(dirs[0].entries, vec![Entry::file("coconut.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("coconut.txt".into(), 1)])); + assert!(dirs[0].directories.is_empty()); let dirs = tracker - .add(&"apple/mango/tangerine.txt".parse::().unwrap(), 3) + .add( + &"apple/mango/tangerine.txt".parse::().unwrap(), + 3, + None, + ) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("apple/eggplant")); - assert_eq!(dirs[0].entries, vec![Entry::file("kumquat.txt", 2)]); + assert_eq!(dirs[0].files, HashMap::from([("kumquat.txt".into(), 2)])); + assert!(dirs[0].directories.is_empty()); let dirs = tracker.finish(); assert_eq!(dirs.len(), 3); assert_eq!(dirs[0].path(), Some("apple/mango")); - assert_eq!(dirs[0].entries, vec![Entry::file("tangerine.txt", 3)]); + assert_eq!(dirs[0].files, HashMap::from([("tangerine.txt".into(), 3)])); + assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("apple")); + assert!(dirs[1].files.is_empty()); assert_eq!( - dirs[1].entries, - vec![ - Entry::dir("banana"), - Entry::dir("eggplant"), - Entry::dir("mango"), - ] + dirs[1].directories, + HashSet::from(["banana".into(), "eggplant".into(), "mango".into()]) ); assert_eq!(dirs[2].path(), None); - assert_eq!(dirs[2].entries, vec![Entry::dir("apple")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[2].directories, HashSet::from(["apple".into()])); } #[test] @@ -573,38 +686,40 @@ mod tests { assert_eq!( tracker.add( &"apple/banana/coconut/date.txt".parse::().unwrap(), - 1 + 1, + None ), Ok(Vec::new()) ); let dirs = tracker - .add(&"foo.txt".parse::().unwrap(), 2) + .add(&"foo.txt".parse::().unwrap(), 2, None) .unwrap(); assert_eq!(dirs.len(), 3); assert_eq!(dirs[0].path(), Some("apple/banana/coconut")); - assert_eq!(dirs[0].entries, vec![Entry::file("date.txt", 1)]); + assert_eq!(dirs[0].files, HashMap::from([("date.txt".into(), 1)])); + 
assert!(dirs[0].directories.is_empty()); assert_eq!(dirs[1].path(), Some("apple/banana")); - assert_eq!(dirs[1].entries, vec![Entry::dir("coconut")]); + assert!(dirs[1].files.is_empty()); + assert_eq!(dirs[1].directories, HashSet::from(["coconut".into()])); assert_eq!(dirs[2].path(), Some("apple")); - assert_eq!(dirs[2].entries, vec![Entry::dir("banana")]); + assert!(dirs[2].files.is_empty()); + assert_eq!(dirs[2].directories, HashSet::from(["banana".into()])); let dirs = tracker.finish(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), None); - assert_eq!( - dirs[0].entries, - vec![Entry::dir("apple"), Entry::file("foo.txt", 2)] - ); + assert_eq!(dirs[0].files, HashMap::from([("foo.txt".into(), 2)])); + assert_eq!(dirs[0].directories, HashSet::from(["apple".into()])); } #[test] fn same_file_twice() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/bar/quux.txt".parse::().unwrap(), 1), + tracker.add(&"foo/bar/quux.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let e = tracker - .add(&"foo/bar/quux.txt".parse::().unwrap(), 2) + .add(&"foo/bar/quux.txt".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!( e, @@ -616,11 +731,11 @@ mod tests { fn unsorted_parent_dirs() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/gnusto/quux.txt".parse::().unwrap(), 1), + tracker.add(&"foo/gnusto/quux.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let e = tracker - .add(&"foo/bar/cleesh.txt".parse::().unwrap(), 2) + .add(&"foo/bar/cleesh.txt".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!( e, @@ -635,11 +750,11 @@ mod tests { fn file_then_preceding_dir() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo/gnusto.txt".parse::().unwrap(), 1), + tracker.add(&"foo/gnusto.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); let e = tracker - .add(&"foo/bar/cleesh.txt".parse::().unwrap(), 2) + .add(&"foo/bar/cleesh.txt".parse::().unwrap(), 2, None) .unwrap_err(); assert_eq!( e, @@ -654,29 +769,197 @@ mod tests { fn files_in_root() { let mut tracker = TreeTracker::new(); assert_eq!( - tracker.add(&"foo.txt".parse::().unwrap(), 1), + tracker.add(&"foo.txt".parse::().unwrap(), 1, None), Ok(Vec::new()) ); assert_eq!( - tracker.add(&"gnusto/cleesh.txt".parse::().unwrap(), 2), + tracker.add(&"gnusto/cleesh.txt".parse::().unwrap(), 2, None), Ok(Vec::new()) ); let dirs = tracker - .add(&"quux.txt".parse::().unwrap(), 3) + .add(&"quux.txt".parse::().unwrap(), 3, None) .unwrap(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), Some("gnusto")); - assert_eq!(dirs[0].entries, vec![Entry::file("cleesh.txt", 2)]); + assert_eq!(dirs[0].files, HashMap::from([("cleesh.txt".into(), 2)])); + assert!(dirs[0].directories.is_empty()); let dirs = tracker.finish(); assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].path(), None); assert_eq!( - dirs[0].entries, - vec![ - Entry::file("foo.txt", 1), - Entry::dir("gnusto"), - Entry::file("quux.txt", 3) - ] + dirs[0].files, + HashMap::from([("foo.txt".into(), 1), ("quux.txt".into(), 3)]) + ); + assert_eq!(dirs[0].directories, HashSet::from(["gnusto".into()])); + } + + #[test] + fn old_filename() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"foo/bar.txt".parse::().unwrap(), + 1, + Some("bar.txt.old.1.2".into()) + ), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!( + dirs[0], + Directory { + path: Some("foo".into()), + files: HashMap::from([("bar.txt.old.1.2".into(), 1)]), + directories: HashSet::new(), + } + ); + assert_eq!( 
+ dirs[1], + Directory { + path: None, + files: HashMap::new(), + directories: HashSet::from(["foo".into()]), + } + ); + } + + #[test] + fn multiple_old_filenames() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"foo/bar.txt".parse::().unwrap(), + 1, + Some("bar.txt.old.a.b".into()) + ), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add( + &"foo/bar.txt".parse::().unwrap(), + 2, + Some("bar.txt.old.1.2".into()) + ), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!( + dirs[0], + Directory { + path: Some("foo".into()), + files: HashMap::from([ + ("bar.txt.old.a.b".into(), 1), + ("bar.txt.old.1.2".into(), 2), + ]), + directories: HashSet::new(), + } + ); + assert_eq!( + dirs[1], + Directory { + path: None, + files: HashMap::new(), + directories: HashSet::from(["foo".into()]), + } + ); + } + + #[test] + fn old_and_non_old() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"foo/bar.txt".parse::().unwrap(), + 1, + Some("bar.txt.old.a.b".into()) + ), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add(&"foo/bar.txt".parse::().unwrap(), 2, None), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!( + dirs[0], + Directory { + path: Some("foo".into()), + files: HashMap::from([("bar.txt.old.a.b".into(), 1), ("bar.txt".into(), 2),]), + directories: HashSet::new(), + } + ); + assert_eq!( + dirs[1], + Directory { + path: None, + files: HashMap::new(), + directories: HashSet::from(["foo".into()]), + } + ); + } + + #[test] + fn non_old_and_old() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add(&"foo/bar.txt".parse::().unwrap(), 1, None,), + Ok(Vec::new()) + ); + assert_eq!( + tracker.add( + &"foo/bar.txt".parse::().unwrap(), + 2, + Some("bar.txt.old.a.b".into()) + ), + Ok(Vec::new()) + ); + let dirs = tracker.finish(); + assert_eq!(dirs.len(), 2); + assert_eq!( + dirs[0], + Directory { + path: Some("foo".into()), + files: HashMap::from([("bar.txt".into(), 1), ("bar.txt.old.a.b".into(), 2),]), + directories: HashSet::new(), + } + ); + assert_eq!( + dirs[1], + Directory { + path: None, + files: HashMap::new(), + directories: HashSet::from(["foo".into()]), + } + ); + } + + #[test] + fn duplicate_old_filenames() { + let mut tracker = TreeTracker::new(); + assert_eq!( + tracker.add( + &"foo/bar.txt".parse::().unwrap(), + 1, + Some("bar.txt.old.1.2".into()) + ), + Ok(Vec::new()) + ); + let e = tracker + .add( + &"foo/bar.txt".parse::().unwrap(), + 2, + Some("bar.txt.old.1.2".into()), + ) + .unwrap_err(); + assert_eq!( + e, + TreeTrackerError::DuplicateOldFile { + key: "foo/bar.txt".into(), + old_filename: "bar.txt.old.1.2".into() + } ); } } From 96182739135865127ba92cf60cb45c1bfd1c4221 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Fri, 10 Jan 2025 11:43:41 -0500 Subject: [PATCH 12/25] Finish up syncer/mod.rs --- Cargo.lock | 173 ++++++++++++++++++++++++++++-------------- Cargo.toml | 4 +- src/inventory/item.rs | 5 ++ src/syncer/mod.rs | 137 +++++++++++++++++++++------------ 4 files changed, 216 insertions(+), 103 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 17a1112..82bd6fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,6 +62,34 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async_executors" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b2463773401e1f684136f9cdb956cf611f22172472cf3f049e72123f59e359" +dependencies = [ + "blanket", + "futures-core", + "futures-task", + "futures-util", + "pin-project", + "rustc_version", + "tokio", +] + +[[package]] +name = "async_nursery" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f211dbbf310df25f4814c16f8e8f827456af323fc9c156a0f9042c2dd8278437" +dependencies = [ + "async_executors", + "futures", + "futures-channel", + "futures-task", + "rustc_version", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -70,9 +98,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.10" +version = "1.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" +checksum = "c03a50b30228d3af8865ce83376b4e99e1ffa34728220fe2860e4df0bb5278d6" dependencies = [ "aws-credential-types", "aws-runtime", @@ -81,7 +109,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.60.7", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -112,9 +140,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.4" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" +checksum = "b16d1aa50accc11a4b4d5c50f7fb81cc0cf60328259c587d0e6b0f11385bde46" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -138,9 +166,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.65.0" +version = "1.68.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" +checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc" dependencies = [ "aws-credential-types", "aws-runtime", @@ -149,7 +177,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -172,15 +200,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.50.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" +checksum = "1605dc0bf9f0a4b05b451441a17fcb0bda229db384f23bf5cead3adbab0664ac" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -194,15 +222,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.51.0" +version = "1.54.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" +checksum = "59f3f73466ff24f6ad109095e0f3f2c830bfb4cd6c8b12f744c8e61ebf4d3ba1" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -216,15 +244,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.51.0" +version = "1.54.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" +checksum = "861d324ef69247c6f3c6823755f408a68877ffb1a9afaff6dd8b0057c760de60" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -254,7 +282,7 @@ dependencies = [ "hex", "hmac", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "once_cell", "p256", "percent-encoding", @@ -330,15 +358,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-json" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" -dependencies = [ - "aws-smithy-types", -] - [[package]] name = "aws-smithy-json" version = "0.61.1" @@ -395,7 +414,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "pin-project-lite", "tokio", "tracing", @@ -413,7 +432,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -500,6 +519,17 @@ version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" +[[package]] +name = "blanket" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b04ce3d2372d05d1ef4ea3fdf427da6ae3c17ca06d688a107b5344836276bc3" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -571,7 +601,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -733,7 +763,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", "unicode-xid", ] @@ -756,7 +786,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -810,7 +840,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -947,7 +977,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -1051,9 +1081,9 @@ checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = 
"bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", @@ -1094,9 +1124,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -1121,7 +1151,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] @@ -1132,7 +1162,7 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "pin-project-lite", ] @@ -1304,7 +1334,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -1335,7 +1365,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -1402,7 +1432,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -1550,6 +1580,26 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pin-project" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -1695,7 +1745,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn", + "syn 2.0.96", "unicode-ident", ] @@ -1724,7 +1774,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1789,6 +1839,8 @@ dependencies = [ "anyhow", "assert_matches", "async-channel", + "async_executors", + "async_nursery", "aws-config", "aws-credential-types", "aws-sdk-s3", @@ -1898,7 +1950,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2041,7 +2093,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.96", ] [[package]] @@ -2050,6 +2102,17 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + 
"unicode-ident", +] + [[package]] name = "syn" version = "2.0.96" @@ -2069,7 +2132,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2083,7 +2146,7 @@ dependencies = [ "getrandom", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2113,7 +2176,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2194,7 +2257,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2247,7 +2310,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2526,7 +2589,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", "synstructure", ] @@ -2547,7 +2610,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", "synstructure", ] @@ -2576,5 +2639,5 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] diff --git a/Cargo.toml b/Cargo.toml index 90f48b8..e8eec1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,8 @@ publish = false [dependencies] anyhow = "1.0.95" async-channel = "2.3.1" +async_executors = { version = "0.6.0", features = ["tokio_tp"] } +async_nursery = "0.5.0" aws-config = { version = "1.5.10", features = ["behavior-version-latest", "rustls"] } aws-credential-types = "1.2.1" aws-sdk-s3 = "1.65.0" @@ -27,7 +29,7 @@ csv = "1.3.1" either = "1.13.0" flate2 = "1.0.35" fs-err = { version = "3.0.0", features = ["tokio"] } -futures-util = "0.3.31" +futures-util = { version = "0.3.31", default-features = false, features = ["std"] } hex = "0.4.3" lockable = "0.1.1" md-5 = "0.10.6" diff --git a/src/inventory/item.rs b/src/inventory/item.rs index 7f7c2dc..856cd6e 100644 --- a/src/inventory/item.rs +++ b/src/inventory/item.rs @@ -70,6 +70,11 @@ impl InventoryItem { S3Location::new(self.bucket.clone(), String::from(&self.key)) .with_version_id(self.version_id.clone()) } + + /// Returns whether the object is a delete marker + pub(crate) fn is_deleted(&self) -> bool { + self.details == ItemDetails::Deleted + } } /// Metadata about an object's content diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index be4515c..88a968d 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -10,6 +10,9 @@ use crate::s3::S3Client; use crate::timestamps::DateHM; use crate::util::*; use anyhow::Context; +use async_executors::TokioTpBuilder; +use async_nursery::{NurseExt, Nursery, NurseryStream}; +use futures_util::StreamExt; use std::collections::BTreeMap; use std::io::ErrorKind; use std::num::NonZeroUsize; @@ -18,7 +21,7 @@ use std::sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, }; -use tokio::task::JoinSet; +use tokio::sync::Notify; use tokio_util::sync::CancellationToken; /// Capacity of async channels @@ -27,6 +30,8 @@ const CHANNEL_SIZE: usize = 65535; /// Lock guard returned by [`Syncer::lock_path()`] type Guard<'a> = as lockable::Lockable>::Guard<'a>; +type ObjChannelItem = (InventoryItem, Option>); + /// Object responsible for syncing an S3 bucket to a 
local backup by means of /// the bucket's S3 Inventory pub(crate) struct Syncer { @@ -57,10 +62,10 @@ pub(crate) struct Syncer { /// A clone of the channel used for sending inventory entries off to be /// downloaded. This is set to `None` after spawning all of the inventory /// list download tasks. - obj_sender: Mutex>>, + obj_sender: Mutex>>, /// A clone of the channel used for receiving inventory entries to download - obj_receiver: async_channel::Receiver, + obj_receiver: async_channel::Receiver, /// Whether the backup was terminated by Ctrl-C terminated: AtomicBool, @@ -114,7 +119,11 @@ impl Syncer { tracing::trace!(path = %self.outdir.display(), "Creating root output directory"); fs_err::create_dir_all(&self.outdir).map_err(|e| MultiError(vec![e.into()]))?; - let mut joinset = JoinSet::new(); + let (nursery, nursery_stream) = Nursery::new( + TokioTpBuilder::new() + .build() + .expect("building tokio runtime should not fail"), + ); let obj_sender = { let guard = self .obj_sender @@ -129,7 +138,9 @@ impl Syncer { let this = self.clone(); let token = self.token.clone(); let sender = obj_sender.clone(); - joinset.spawn(async move { + let subnursery = nursery.clone(); + let _ = nursery.nurse(async move { + let inner_token = token.clone(); token.run_until_cancelled(async move { let mut tracker = TreeTracker::new(); for spec in fspecs { @@ -140,14 +151,22 @@ impl Syncer { tracing::debug!(url = %d.url(), "Ignoring directory entry in inventory list"); } InventoryEntry::Item(item) => { - // TODO: Store a Cond for awaiting completion of item processing - // TODO: Old filenames: - for dir in tracker.add(&item.key, (), None)? { - // TODO: Spawn in JoinSet - let this2 = this.clone(); - tokio::task::spawn_blocking(move || this2.cleanup_dir(dir)); - } - if sender.send(item).await.is_err() { + let notify = if !item.is_deleted() { + let notify = Arc::new(Notify::new()); + for dir in tracker.add(&item.key, notify.clone(), None)? 
{ + let _ = subnursery.nurse({ + let this = this.clone(); + let token = inner_token.clone(); + async move { + token.run_until_cancelled(async move { this.cleanup_dir(dir).await }).await.unwrap_or(Ok(())) + } + }); + } + Some(notify) + } else { + None + }; + if sender.send((item, notify)).await.is_err() { // Assume we're shutting down return Ok(()); } @@ -156,9 +175,13 @@ impl Syncer { } } for dir in tracker.finish() { - // TODO: Spawn in JoinSet - let this2 = this.clone(); - tokio::task::spawn_blocking(move || this2.cleanup_dir(dir)); + let _ = subnursery.nurse({ + let this = this.clone(); + let token = inner_token.clone(); + async move { + token.run_until_cancelled(async move { this.cleanup_dir(dir).await }).await.unwrap_or(Ok(())) + } + }); } Ok(()) }).await.unwrap_or(Ok(())) @@ -175,18 +198,23 @@ impl Syncer { for _ in 0..self.jobs.get() { let this = self.clone(); let recv = self.obj_receiver.clone(); - joinset.spawn(async move { - while let Ok(item) = recv.recv().await { + let _ = nursery.nurse(async move { + while let Ok((item, notify)) = recv.recv().await { if this.token.is_cancelled() { return Ok(()); } - Box::pin(this.process_item(item)).await?; + let r = Box::pin(this.process_item(item)).await; + if let Some(n) = notify { + n.notify_one(); + } + r?; } Ok(()) }); } - let r = self.await_joinset(joinset).await; + drop(nursery); + let r = self.await_nursery(nursery_stream).await; self.filterlog.finish(); r } @@ -198,7 +226,11 @@ impl Syncer { specs: Vec, ) -> Result, MultiError> { tracing::info!("Peeking at inventory lists in order to sort by first line ..."); - let mut joinset = JoinSet::new(); + let (nursery, nursery_stream) = Nursery::new( + TokioTpBuilder::new() + .build() + .expect("building tokio runtime should not fail"), + ); let mut receiver = { let specs = Arc::new(Mutex::new(specs)); let (output_sender, output_receiver) = tokio::sync::mpsc::channel(CHANNEL_SIZE); @@ -207,7 +239,7 @@ impl Syncer { let token = self.token.clone(); let specs = specs.clone(); let sender = output_sender.clone(); - joinset.spawn(async move { + let _ = nursery.nurse(async move { token .run_until_cancelled(async move { while let Some(fspec) = { @@ -230,32 +262,28 @@ impl Syncer { } output_receiver }; + drop(nursery); let mut firsts2fspecs = BTreeMap::new(); while let Some((fspec, entry)) = receiver.recv().await { firsts2fspecs.insert(entry.key().to_owned(), fspec); } - self.await_joinset(joinset).await?; + self.await_nursery(nursery_stream).await?; Ok(firsts2fspecs.into_values().collect()) } - async fn await_joinset( + async fn await_nursery( &self, - mut joinset: JoinSet>, + mut stream: NurseryStream>, ) -> Result<(), MultiError> { let mut errors = Vec::new(); - while let Some(r) = joinset.join_next().await { - match r { - Ok(Ok(())) => (), - Ok(Err(e)) => { - tracing::error!(error = ?e, "Error occurred"); - if errors.is_empty() { - tracing::info!("Shutting down in response to error"); - self.shutdown(); - } - errors.push(e); + while let Some(r) = stream.next().await { + if let Err(e) = r { + tracing::error!(error = ?e, "Error occurred"); + if errors.is_empty() { + tracing::info!("Shutting down in response to error"); + self.shutdown(); } - Err(e) if e.is_panic() => std::panic::resume_unwind(e.into_panic()), - Err(_) => (), + errors.push(e); } } if self.terminated.load(Ordering::Acquire) { @@ -489,16 +517,24 @@ impl Syncer { ); } - fn cleanup_dir(&self, dir: Directory<()>) -> anyhow::Result<()> { - // TODO: Wait for directory's jobs to finish + #[tracing::instrument(skip_all, fields(dirpath = 
%dir.path().unwrap_or("")))] + async fn cleanup_dir(&self, dir: Directory>) -> anyhow::Result<()> { + let mut notifiers = Vec::new(); + let dir = dir.map(|n| { + notifiers.push(n); + }); + for n in notifiers { + n.notified().await; + } let dirpath = match dir.path() { Some(p) => self.outdir.join(p), None => self.outdir.clone(), }; + let mut files_to_delete = Vec::new(); + let mut dirs_to_delete = Vec::new(); let mut dbdeletions = Vec::new(); for entry in fs_err::read_dir(&dirpath)? { let entry = entry?; - let epath = entry.path(); let is_dir = entry.file_type()?.is_dir(); let to_delete = match entry.file_name().to_str() { Some(name) => { @@ -514,19 +550,26 @@ impl Syncer { None => true, }; if to_delete { - // TODO: Store the deletion in a Vec and delete them *after* - // finishing iterating over the directory - // TODO: Log if is_dir { - // TODO: Use async here? - fs_err::remove_dir_all(&epath)?; + dirs_to_delete.push(entry.path()); } else { - fs_err::remove_file(&epath)?; + files_to_delete.push(entry.path()); } } } + for p in files_to_delete { + tracing::debug!(path = %p.display(), "File does not belong in backup; deleting"); + if let Err(e) = fs_err::remove_file(&p) { + tracing::warn!(error = %e, path = %p.display(), "Failed to delete file"); + } + } + for p in dirs_to_delete { + tracing::debug!(path = %p.display(), "Directory does not belong in backup; deleting"); + if let Err(e) = fs_err::tokio::remove_dir_all(&p).await { + tracing::warn!(error = %e, path = %p.display(), "Failed to delete directory"); + } + } if !dbdeletions.is_empty() { - // TODO: Log let manager = MetadataManager::new(&dirpath); let mut data = manager.load()?; for name in dbdeletions { From 136a725947f317088ff96bd5c0dcfb26a5b4d2b5 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Jan 2025 11:53:12 -0500 Subject: [PATCH 13/25] Improve some code --- src/syncer/mod.rs | 73 ++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index 88a968d..f74f907 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -14,6 +14,7 @@ use async_executors::TokioTpBuilder; use async_nursery::{NurseExt, Nursery, NurseryStream}; use futures_util::StreamExt; use std::collections::BTreeMap; +use std::future::Future; use std::io::ErrorKind; use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; @@ -136,12 +137,10 @@ impl Syncer { }; let this = self.clone(); - let token = self.token.clone(); let sender = obj_sender.clone(); let subnursery = nursery.clone(); - let _ = nursery.nurse(async move { - let inner_token = token.clone(); - token.run_until_cancelled(async move { + let _ = nursery.nurse( + self.until_cancelled_ok(async move { let mut tracker = TreeTracker::new(); for spec in fspecs { let entries = this.client.download_inventory_csv(spec).await?; @@ -155,11 +154,10 @@ impl Syncer { let notify = Arc::new(Notify::new()); for dir in tracker.add(&item.key, notify.clone(), None)? 
{ let _ = subnursery.nurse({ - let this = this.clone(); - let token = inner_token.clone(); - async move { - token.run_until_cancelled(async move { this.cleanup_dir(dir).await }).await.unwrap_or(Ok(())) - } + this.until_cancelled_ok({ + let this = this.clone(); + async move { this.cleanup_dir(dir).await } + }) }); } Some(notify) @@ -176,16 +174,15 @@ impl Syncer { } for dir in tracker.finish() { let _ = subnursery.nurse({ - let this = this.clone(); - let token = inner_token.clone(); - async move { - token.run_until_cancelled(async move { this.cleanup_dir(dir).await }).await.unwrap_or(Ok(())) - } + this.until_cancelled_ok({ + let this = this.clone(); + async move { this.cleanup_dir(dir).await } + }) }); } Ok(()) - }).await.unwrap_or(Ok(())) - }); + }) + ); drop(obj_sender); { let mut guard = self @@ -236,29 +233,22 @@ impl Syncer { let (output_sender, output_receiver) = tokio::sync::mpsc::channel(CHANNEL_SIZE); for _ in 0..self.jobs.get() { let clnt = self.client.clone(); - let token = self.token.clone(); let specs = specs.clone(); let sender = output_sender.clone(); - let _ = nursery.nurse(async move { - token - .run_until_cancelled(async move { - while let Some(fspec) = { - let mut guard = - specs.lock().expect("specs mutex should not be poisoned"); - guard.pop() - } { - if let Some(entry) = clnt.peek_inventory_csv(&fspec).await? { - if sender.send((fspec, entry)).await.is_err() { - // Assume we're shutting down - return Ok(()); - } - } + let _ = nursery.nurse(self.until_cancelled_ok(async move { + while let Some(fspec) = { + let mut guard = specs.lock().expect("specs mutex should not be poisoned"); + guard.pop() + } { + if let Some(entry) = clnt.peek_inventory_csv(&fspec).await? { + if sender.send((fspec, entry)).await.is_err() { + // Assume we're shutting down + return Ok(()); } - Ok(()) - }) - .await - .unwrap_or(Ok(())) - }); + } + } + Ok(()) + })); } output_receiver }; @@ -271,6 +261,17 @@ impl Syncer { Ok(firsts2fspecs.into_values().collect()) } + fn until_cancelled_ok( + &self, + fut: Fut, + ) -> impl Future> + Send + 'static + where + Fut: Future> + Send + 'static, + { + let token = self.token.clone(); + async move { token.run_until_cancelled(fut).await.unwrap_or(Ok(())) } + } + async fn await_nursery( &self, mut stream: NurseryStream>, From 240169ee08c4944d1da10854bda4c246b69e2c38 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Fri, 10 Jan 2025 12:55:56 -0500 Subject: [PATCH 14/25] Roll my own task group --- Cargo.lock | 102 +++++++--------------------------------------- Cargo.toml | 2 - src/main.rs | 1 + src/nursery.rs | 63 ++++++++++++++++++++++++++++ src/syncer/mod.rs | 25 ++++-------- 5 files changed, 87 insertions(+), 106 deletions(-) create mode 100644 src/nursery.rs diff --git a/Cargo.lock b/Cargo.lock index 82bd6fa..6744e3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,34 +62,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "async_executors" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0b2463773401e1f684136f9cdb956cf611f22172472cf3f049e72123f59e359" -dependencies = [ - "blanket", - "futures-core", - "futures-task", - "futures-util", - "pin-project", - "rustc_version", - "tokio", -] - -[[package]] -name = "async_nursery" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f211dbbf310df25f4814c16f8e8f827456af323fc9c156a0f9042c2dd8278437" -dependencies = [ - "async_executors", - "futures", - "futures-channel", - "futures-task", - "rustc_version", -] - [[package]] name = "autocfg" version = "1.4.0" @@ -519,17 +491,6 @@ version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" -[[package]] -name = "blanket" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b04ce3d2372d05d1ef4ea3fdf427da6ae3c17ca06d688a107b5344836276bc3" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "block-buffer" version = "0.10.4" @@ -601,7 +562,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -763,7 +724,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", "unicode-xid", ] @@ -786,7 +747,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -977,7 +938,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -1334,7 +1295,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -1580,26 +1541,6 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "pin-project" -version = "1.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.96", -] - [[package]] name = "pin-project-lite" version = "0.2.16" @@ -1745,7 +1686,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.96", + "syn", "unicode-ident", ] @@ -1839,8 +1780,6 @@ dependencies = [ "anyhow", "assert_matches", "async-channel", - "async_executors", - 
"async_nursery", "aws-config", "aws-credential-types", "aws-sdk-s3", @@ -1950,7 +1889,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2093,7 +2032,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn", ] [[package]] @@ -2102,17 +2041,6 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - [[package]] name = "syn" version = "2.0.96" @@ -2132,7 +2060,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2176,7 +2104,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2257,7 +2185,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2310,7 +2238,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] [[package]] @@ -2589,7 +2517,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", "synstructure", ] @@ -2610,7 +2538,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", "synstructure", ] @@ -2639,5 +2567,5 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn", ] diff --git a/Cargo.toml b/Cargo.toml index e8eec1c..06172ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,8 +17,6 @@ publish = false [dependencies] anyhow = "1.0.95" async-channel = "2.3.1" -async_executors = { version = "0.6.0", features = ["tokio_tp"] } -async_nursery = "0.5.0" aws-config = { version = "1.5.10", features = ["behavior-version-latest", "rustls"] } aws-credential-types = "1.2.1" aws-sdk-s3 = "1.65.0" diff --git a/src/main.rs b/src/main.rs index 9c59eab..946e7bc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ mod consts; mod inventory; mod keypath; mod manifest; +mod nursery; mod s3; mod syncer; mod timestamps; diff --git a/src/nursery.rs b/src/nursery.rs new file mode 100644 index 0000000..094ccd2 --- /dev/null +++ b/src/nursery.rs @@ -0,0 +1,63 @@ +use futures_util::{FutureExt, Stream}; +use std::future::Future; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; + +type UnwindResult = Result>; + +#[derive(Debug)] +pub(crate) struct Nursery { + sender: UnboundedSender>, +} + +impl Nursery { + pub(crate) fn new() -> (Nursery, NurseryStream) { + let (sender, receiver) = unbounded_channel(); + (Nursery { sender }, NurseryStream { receiver }) + } + + pub(crate) fn spawn(&self, fut: Fut) + where + Fut: Future + Send + 'static, + { + let sender = self.sender.clone(); + tokio::spawn(async move { + let task = 
std::panic::AssertUnwindSafe(fut).catch_unwind(); + let _ = sender.send(task.await); + }); + } +} + +// Clone can't be derived, as that would erroneously add `T: Clone` bounds to +// the impl. +impl Clone for Nursery { + fn clone(&self) -> Nursery { + Nursery { + sender: self.sender.clone(), + } + } +} + +#[derive(Debug)] +pub(crate) struct NurseryStream { + receiver: UnboundedReceiver>, +} + +impl Stream for NurseryStream { + type Item = T; + + /// Poll for one of the tasks in the group to complete and return its + /// return value. + /// + /// # Panics + /// + /// If a task panics, this method resumes unwinding the panic. + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match ready!(self.receiver.poll_recv(cx)) { + Some(Ok(r)) => Some(r).into(), + Some(Err(e)) => std::panic::resume_unwind(e), + None => None.into(), + } + } +} diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index f74f907..b281964 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -6,12 +6,11 @@ use crate::consts::METADATA_FILENAME; use crate::inventory::{InventoryEntry, InventoryItem, ItemDetails}; use crate::keypath::is_special_component; use crate::manifest::{CsvManifest, FileSpec}; +use crate::nursery::{Nursery, NurseryStream}; use crate::s3::S3Client; use crate::timestamps::DateHM; use crate::util::*; use anyhow::Context; -use async_executors::TokioTpBuilder; -use async_nursery::{NurseExt, Nursery, NurseryStream}; use futures_util::StreamExt; use std::collections::BTreeMap; use std::future::Future; @@ -120,11 +119,7 @@ impl Syncer { tracing::trace!(path = %self.outdir.display(), "Creating root output directory"); fs_err::create_dir_all(&self.outdir).map_err(|e| MultiError(vec![e.into()]))?; - let (nursery, nursery_stream) = Nursery::new( - TokioTpBuilder::new() - .build() - .expect("building tokio runtime should not fail"), - ); + let (nursery, nursery_stream) = Nursery::new(); let obj_sender = { let guard = self .obj_sender @@ -139,7 +134,7 @@ impl Syncer { let this = self.clone(); let sender = obj_sender.clone(); let subnursery = nursery.clone(); - let _ = nursery.nurse( + nursery.spawn( self.until_cancelled_ok(async move { let mut tracker = TreeTracker::new(); for spec in fspecs { @@ -153,7 +148,7 @@ impl Syncer { let notify = if !item.is_deleted() { let notify = Arc::new(Notify::new()); for dir in tracker.add(&item.key, notify.clone(), None)? 
{ - let _ = subnursery.nurse({ + subnursery.spawn({ this.until_cancelled_ok({ let this = this.clone(); async move { this.cleanup_dir(dir).await } @@ -173,7 +168,7 @@ impl Syncer { } } for dir in tracker.finish() { - let _ = subnursery.nurse({ + subnursery.spawn({ this.until_cancelled_ok({ let this = this.clone(); async move { this.cleanup_dir(dir).await } @@ -195,7 +190,7 @@ impl Syncer { for _ in 0..self.jobs.get() { let this = self.clone(); let recv = self.obj_receiver.clone(); - let _ = nursery.nurse(async move { + nursery.spawn(async move { while let Ok((item, notify)) = recv.recv().await { if this.token.is_cancelled() { return Ok(()); @@ -223,11 +218,7 @@ impl Syncer { specs: Vec, ) -> Result, MultiError> { tracing::info!("Peeking at inventory lists in order to sort by first line ..."); - let (nursery, nursery_stream) = Nursery::new( - TokioTpBuilder::new() - .build() - .expect("building tokio runtime should not fail"), - ); + let (nursery, nursery_stream) = Nursery::new(); let mut receiver = { let specs = Arc::new(Mutex::new(specs)); let (output_sender, output_receiver) = tokio::sync::mpsc::channel(CHANNEL_SIZE); @@ -235,7 +226,7 @@ impl Syncer { let clnt = self.client.clone(); let specs = specs.clone(); let sender = output_sender.clone(); - let _ = nursery.nurse(self.until_cancelled_ok(async move { + nursery.spawn(self.until_cancelled_ok(async move { while let Some(fspec) = { let mut guard = specs.lock().expect("specs mutex should not be poisoned"); guard.pop() From 9fd66317ece1e1626219b800957631f9e1c4d21f Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Jan 2025 13:01:53 -0500 Subject: [PATCH 15/25] Don't try to clean up nonexistent dirs --- src/syncer/metadata.rs | 1 + src/syncer/mod.rs | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/syncer/metadata.rs b/src/syncer/metadata.rs index a5d5b24..8bf3df4 100644 --- a/src/syncer/metadata.rs +++ b/src/syncer/metadata.rs @@ -1,5 +1,6 @@ use super::*; use serde::{Deserialize, Serialize}; +use std::io::ErrorKind; /// Metadata about the latest version of a key #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index b281964..0c64f45 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -525,7 +525,12 @@ impl Syncer { let mut files_to_delete = Vec::new(); let mut dirs_to_delete = Vec::new(); let mut dbdeletions = Vec::new(); - for entry in fs_err::read_dir(&dirpath)? { + let iter = match fs_err::read_dir(&dirpath) { + Ok(iter) => iter, + Err(e) if e.kind() == ErrorKind::NotFound => return Ok(()), + Err(e) => return Err(e.into()), + }; + for entry in iter { let entry = entry?; let is_dir = entry.file_type()?.is_dir(); let to_delete = match entry.file_name().to_str() { From bee489e8bb1b0eacf6f0b71add6458e0c604ff70 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Fri, 10 Jan 2025 13:11:18 -0500 Subject: [PATCH 16/25] Missed a thing --- src/inventory/item.rs | 9 +++++++++ src/keypath.rs | 7 +++++++ src/syncer/metadata.rs | 3 ++- src/syncer/mod.rs | 2 +- src/util.rs | 4 ++++ 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/inventory/item.rs b/src/inventory/item.rs index 856cd6e..69ee6fb 100644 --- a/src/inventory/item.rs +++ b/src/inventory/item.rs @@ -1,5 +1,6 @@ use crate::keypath::KeyPath; use crate::s3::S3Location; +use crate::util::make_old_filename; use time::OffsetDateTime; /// An entry in an inventory list file @@ -75,6 +76,14 @@ impl InventoryItem { pub(crate) fn is_deleted(&self) -> bool { self.details == ItemDetails::Deleted } + + pub(crate) fn old_filename(&self) -> Option { + let ItemDetails::Present { ref etag, .. } = self.details else { + return None; + }; + (!self.is_deleted() && !self.is_latest) + .then(|| make_old_filename(self.key.name(), &self.version_id, etag)) + } } /// Metadata about an object's content diff --git a/src/keypath.rs b/src/keypath.rs index 7896c68..2a94f36 100644 --- a/src/keypath.rs +++ b/src/keypath.rs @@ -15,6 +15,13 @@ use thiserror::Error; pub(crate) struct KeyPath(String); impl KeyPath { + pub(crate) fn name(&self) -> &str { + self.0 + .split('/') + .next_back() + .expect("path should be nonempty") + } + /// Split the path into the directory component (if any) and filename pub(crate) fn split(&self) -> (Option<&str>, &str) { match self.0.rsplit_once('/') { diff --git a/src/syncer/metadata.rs b/src/syncer/metadata.rs index 8bf3df4..c96292d 100644 --- a/src/syncer/metadata.rs +++ b/src/syncer/metadata.rs @@ -1,4 +1,5 @@ use super::*; +use crate::util::make_old_filename; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; @@ -17,7 +18,7 @@ impl Metadata { /// `self` as its metadata and `basename` as the filename portion of its /// key pub(super) fn old_filename(&self, basename: &str) -> String { - format!("{}.old.{}.{}", basename, self.version_id, self.etag) + make_old_filename(basename, &self.version_id, &self.etag) } } diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index 0c64f45..a1460b6 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -147,7 +147,7 @@ impl Syncer { InventoryEntry::Item(item) => { let notify = if !item.is_deleted() { let notify = Arc::new(Notify::new()); - for dir in tracker.add(&item.key, notify.clone(), None)? { + for dir in tracker.add(&item.key, notify.clone(), item.old_filename())? { subnursery.spawn({ this.until_cancelled_ok({ let this = this.clone(); diff --git a/src/util.rs b/src/util.rs index 4c60be1..43da560 100644 --- a/src/util.rs +++ b/src/util.rs @@ -113,6 +113,10 @@ pub(crate) fn force_create_dir_all>>( Ok(()) } +pub(crate) fn make_old_filename(basename: &str, version_id: &str, etag: &str) -> String { + format!("{basename}.old.{version_id}.{etag}") +} + #[cfg(test)] mod tests { use super::*; From 1afa117109198bb9bdc0488f19ad49dd34741200 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Fri, 10 Jan 2025 14:12:14 -0500 Subject: [PATCH 17/25] Add some basic tests for Nursery --- src/nursery.rs | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/nursery.rs b/src/nursery.rs index 094ccd2..b38b843 100644 --- a/src/nursery.rs +++ b/src/nursery.rs @@ -61,3 +61,60 @@ impl Stream for NurseryStream { } } } + +#[cfg(test)] +mod tests { + use super::*; + use futures_util::StreamExt; + + #[test] + fn nursery_is_send() { + #[allow(dead_code)] + fn require_send(_t: T) {} + + #[allow(dead_code)] + fn check_nursery_send() { + let (nursery, _) = Nursery::::new(); + require_send(nursery); + } + } + + #[tokio::test] + async fn collect() { + let (nursery, nursery_stream) = Nursery::new(); + nursery.spawn(std::future::ready(1)); + nursery.spawn(std::future::ready(2)); + nursery.spawn(std::future::ready(3)); + drop(nursery); + let mut values = nursery_stream.collect::>().await; + values.sort_unstable(); + assert_eq!(values, vec![1, 2, 3]); + } + + #[tokio::test] + async fn nested_spawn() { + let (nursery, nursery_stream) = Nursery::new(); + let inner = nursery.clone(); + nursery.spawn(async move { + inner.spawn(std::future::ready(0)); + std::future::ready(1).await + }); + nursery.spawn(std::future::ready(2)); + nursery.spawn(std::future::ready(3)); + drop(nursery); + let mut values = nursery_stream.collect::>().await; + values.sort_unstable(); + assert_eq!(values, vec![0, 1, 2, 3]); + } + + #[tokio::test] + async fn reraise_panic() { + let (nursery, mut nursery_stream) = Nursery::new(); + nursery.spawn(async { panic!("I can't take this anymore!") }); + drop(nursery); + let r = std::panic::AssertUnwindSafe(nursery_stream.next()) + .catch_unwind() + .await; + assert!(r.is_err()); + } +} From 8daea98a331057e74a61870ab3528710eeb15b0a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Jan 2025 14:17:12 -0500 Subject: [PATCH 18/25] Update changelog & README --- CHANGELOG.md | 2 ++ README.md | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbf161b..b9b2ba5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ In Development current directory - The `--inventory-jobs` and `--object-jobs` options have been eliminated in favor of a new `--jobs` option +- Files & directories in the backup tree that are not listed in the inventory + are deleted v0.1.0-alpha.2 (2025-01-06) --------------------------- diff --git a/README.md b/README.md index df606fd..d3c9ca5 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,9 @@ When downloading a given key from S3, the latest version (if not deleted) is stored at `{outdir}/{key}`, and the versionIds and etags of all latest object versions in a given directory are stored in `.s3invsync.versions.json` in that directory. Each non-latest, non-deleted version of a given key is stored at -`{outdir}/{key}.old.{versionId}.{etag}`. +`{outdir}/{key}.old.{versionId}.{etag}`. Any other files or directories under +`` that do not correspond to an object listed in the inventory are +deleted. Options ------- From c5efd2dd9a483c0d70e6de5c50bfd90dc3ccd110 Mon Sep 17 00:00:00 2001 From: "John T. 
Wodder II" Date: Fri, 10 Jan 2025 15:54:10 -0500 Subject: [PATCH 19/25] Doc comments --- src/inventory/item.rs | 5 +- src/keypath.rs | 1 + src/nursery.rs | 17 +++++ src/syncer/mod.rs | 8 ++ src/syncer/treetracker/inner.rs | 44 +++++++++++ src/syncer/treetracker/mod.rs | 131 +++++++++++++++++++++++++++++++- src/util.rs | 3 + 7 files changed, 203 insertions(+), 6 deletions(-) diff --git a/src/inventory/item.rs b/src/inventory/item.rs index 69ee6fb..bd8bbf3 100644 --- a/src/inventory/item.rs +++ b/src/inventory/item.rs @@ -77,12 +77,13 @@ impl InventoryItem { self.details == ItemDetails::Deleted } + /// If the object is not a delete marker and is not the latest version of + /// the key, return the base filename at which it will be backed up. pub(crate) fn old_filename(&self) -> Option { let ItemDetails::Present { ref etag, .. } = self.details else { return None; }; - (!self.is_deleted() && !self.is_latest) - .then(|| make_old_filename(self.key.name(), &self.version_id, etag)) + (!self.is_latest).then(|| make_old_filename(self.key.name(), &self.version_id, etag)) } } diff --git a/src/keypath.rs b/src/keypath.rs index 2a94f36..ab919c9 100644 --- a/src/keypath.rs +++ b/src/keypath.rs @@ -15,6 +15,7 @@ use thiserror::Error; pub(crate) struct KeyPath(String); impl KeyPath { + /// Return the filename portion of the path pub(crate) fn name(&self) -> &str { self.0 .split('/') diff --git a/src/nursery.rs b/src/nursery.rs index b38b843..40c3f1a 100644 --- a/src/nursery.rs +++ b/src/nursery.rs @@ -1,22 +1,35 @@ +//! Simple tokio-based task group use futures_util::{FutureExt, Stream}; use std::future::Future; use std::pin::Pin; use std::task::{ready, Context, Poll}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +/// Type returned by [`FutureExt::catch_unwind()`]. If the inner task ran to +/// completion, this is `Ok`; otherwise, if the taks panicked, this is `Err`. type UnwindResult = Result>; +/// A handle for spawning new tasks in a task group/nursery. +/// +/// `Nursery` is cloneable and sendable, and so it can be used to spawn tasks +/// from inside other tasks in the nursery. The nursery returned by +/// [`Nursery::new()`] and all clones thereof must be dropped before the +/// corresponding [`NurseryStream`] can yield `None`. #[derive(Debug)] pub(crate) struct Nursery { sender: UnboundedSender>, } impl Nursery { + /// Create a new nursery and return a handle for spawning tasks and a + /// [`Stream`] of task return values. `T` is the `Output` type of the + /// futures that will be spawned in the nursery. pub(crate) fn new() -> (Nursery, NurseryStream) { let (sender, receiver) = unbounded_channel(); (Nursery { sender }, NurseryStream { receiver }) } + /// Spawn a future that returns `T` in the nursery. pub(crate) fn spawn(&self, fut: Fut) where Fut: Future + Send + 'static, @@ -39,6 +52,10 @@ impl Clone for Nursery { } } +/// A [`Stream`] of the values returned by the tasks spawned in a nursery. +/// +/// The corresponding [`Nursery`] and all clones thereof must be dropped before +/// the stream can yield `None`. #[derive(Debug)] pub(crate) struct NurseryStream { receiver: UnboundedReceiver>, diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index a1460b6..3130d9b 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -252,6 +252,8 @@ impl Syncer { Ok(firsts2fspecs.into_values().collect()) } + /// Run the given future to completion, cancelling it if `token` is + /// cancelled, in which case `Ok(())` is returned. 
fn until_cancelled_ok( &self, fut: Fut, @@ -259,10 +261,16 @@ impl Syncer { where Fut: Future> + Send + 'static, { + // Use an async block instead of making the method async so that the + // future won't capture &self and thus will be 'static let token = self.token.clone(); async move { token.run_until_cancelled(fut).await.unwrap_or(Ok(())) } } + /// Wait for all tasks in a nursery to complete. If any errors occur, + /// [`Syncer::shutdown()`] is called, and a [`MultiError`] of all errors + /// (including a message about Ctrl-C being received if that happened) is + /// returned. async fn await_nursery( &self, mut stream: NurseryStream>, diff --git a/src/syncer/treetracker/inner.rs b/src/syncer/treetracker/inner.rs index 300f874..3bf25b0 100644 --- a/src/syncer/treetracker/inner.rs +++ b/src/syncer/treetracker/inner.rs @@ -3,13 +3,22 @@ use either::Either; use std::cmp::Ordering; use std::collections::HashMap; +/// An "open" directory within a [`TreeTracker`][super::TreeTracker], i.e., one +/// to which keys are currently being added (either directly or to a descendant +/// directory) #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct PartialDirectory { + /// All files & directories in this directory that have been seen so far, + /// excluding `current_subdir` pub(super) entries: Vec>, + + /// The name of the subdirectory of this directory that is currently + /// "open", if any pub(super) current_subdir: Option, } impl PartialDirectory { + /// Create a new, empty `PartialDirectory` pub(super) fn new() -> Self { PartialDirectory { entries: Vec::new(), @@ -17,10 +26,17 @@ impl PartialDirectory { } } + /// Returns true if the directory is empty, i.e., if no entries have been + /// registered in it pub(super) fn is_empty(&self) -> bool { self.entries.is_empty() && self.current_subdir.is_none() } + /// Mark the current "open" subdirectory as closed, adding to `entries` + /// + /// # Panics + /// + /// Panics if there is no current open subdirectory. pub(super) fn close_current(&mut self) { let Some(name) = self.current_subdir.take() else { panic!("PartialDirectory::close_current() called without a current directory"); @@ -28,10 +44,13 @@ impl PartialDirectory { self.entries.push(Entry::dir(name)); } + /// Returns true if the last entry added to this directory is a subdirectory pub(super) fn last_entry_is_dir(&self) -> bool { self.current_subdir.is_some() } + /// Compare `cname` against the name of the last entry added to this + /// directory pub(super) fn cmp_vs_last_entry(&self, cname: CmpName<'_>) -> Option { self.current_subdir .as_deref() @@ -40,19 +59,33 @@ impl PartialDirectory { } } +/// A file or directory entry in an "open" directory tracked by +/// [`TreeTracker`][super::TreeTracker] #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum Entry { File { + /// The filename name: String, + + /// If the latest version of the corresponding key has been added, this + /// is its payload. value: Option, + + /// Mapping from "old filenames" registered for the key to their + /// payloads old_filenames: HashMap, }, Dir { + /// The name of the directory name: String, }, } impl Entry { + /// Create a new `Entry::File` with the given `name`. If `old_filename` is + /// `None`, `value` is used as the payload for the latest version of the + /// key; otherwise, the given old filename is registered with `value` as + /// its payload. 
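+    ///
+    /// For example (with hypothetical arguments), `Entry::file("a.txt", v, None)`
+    /// records `v` as the payload for the latest version of `a.txt`, while
+    /// `Entry::file("a.txt", w, Some(old))` records `w` under the old
+    /// filename `old` instead.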
     pub(super) fn file<S: Into<String>>(
         name: S,
         value: T,
         old_filename: Option<String>,
     ) -> Entry<T> {
@@ -73,10 +106,12 @@
         }
     }
 
+    /// Create a new `Entry::Dir` with the given `name`
     pub(super) fn dir<S: Into<String>>(name: S) -> Entry<T> {
         Entry::Dir { name: name.into() }
     }
 
+    /// Returns the name of the entry
     pub(super) fn name(&self) -> &str {
         match self {
             Entry::File { name, .. } => name,
@@ -84,6 +119,7 @@
         }
     }
 
+    /// Returns the name of the entry as a [`CmpName`]
    pub(super) fn cmp_name(&self) -> CmpName<'_> {
        match self {
            Entry::File { name, .. } => CmpName::File(name.as_ref()),
@@ -104,6 +140,7 @@ pub(super) enum CmpName<'a> {
 }
 
 impl CmpName<'_> {
+    /// Returns the inner name, without any trailing slashes
     pub(super) fn name(&self) -> &str {
         match self {
             CmpName::File(s) => s,
@@ -111,6 +148,8 @@
         }
     }
 
+    /// Returns an iterator over all characters in the name. If the name is
+    /// for a directory, a `'/'` is emitted at the end of the iterator.
     pub(super) fn chars(&self) -> impl Iterator<Item = char> + '_ {
         match self {
             CmpName::File(s) => Either::Left(s.chars()),
@@ -143,6 +182,7 @@ impl Ord for CmpName<'_> {
     }
 }
 
+/// An iterator over the path components of a key path
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub(super) struct KeyComponents<'a, T> {
     i: usize,
@@ -180,9 +220,13 @@ impl<'a, T> Iterator for KeyComponents<'a, T> {
     }
 }
 
+/// A path component of a key path
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub(super) enum Component<'a, T> {
+    /// `name` (no trailing slash)
     Dir(&'a str),
+
+    /// `name`, `value`, `old_filename`
     File(&'a str, T, Option<String>),
 }
diff --git a/src/syncer/treetracker/mod.rs b/src/syncer/treetracker/mod.rs
index a809574..fc8f55c 100644
--- a/src/syncer/treetracker/mod.rs
+++ b/src/syncer/treetracker/mod.rs
@@ -5,14 +5,61 @@ use std::cmp::Ordering;
 use std::collections::{HashMap, HashSet};
 use thiserror::Error;
 
+/// A type for tracking keys as they're encountered and — assuming the keys are
+/// in sorted order — reporting the contents of directories for which no
+/// further keys will be seen.
+///
+/// When a key is encountered, it can be for either its latest version or a
+/// non-latest version; non-latest versions are identified by "old filename"
+/// strings (the base filenames with which they are saved), which need not be
+/// sorted, but must be unique.
+///
+/// Each key version additionally has a payload of type `T`, used by
+/// `s3invsync` for storing [`tokio::sync::Notify`] values.
 #[derive(Clone, Debug, Eq, PartialEq)]
-pub(super) struct TreeTracker<T>(Vec<PartialDirectory<T>>);
+pub(super) struct TreeTracker<T>(
+    /// A stack of currently "open" directories, i.e., directories for which
+    /// the `TreeTracker` is currently receiving keys. The first element
+    /// represents the root of the directory tree, the next element (if any)
+    /// represents the current "open" directory within that, etc.
+    ///
+    /// # Invariants
+    ///
+    /// - The stack is nonempty. (A `TreeTracker` with an empty stack is
+    ///   referred to as "void" by the documentation and by error messages.)
+    ///
+    /// - For all elements `pd` other than the last,
+    ///   `pd.current_subdir.is_some()`.
+    ///
+    /// - For the last element `pd`, `pd.current_subdir.is_none()`.
+    ///
+    /// - Once at least one key has been added to the tracker, for all elements
+    ///   `pd`, either `pd.entries.last()` or `pd.current_subdir` is
+    ///   non-`None`.
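+    ///
+    /// Illustrative example (not taken from real data): after adding the
+    /// keys `a/x` and `a/y`, the stack is `[root, "a"]`; when key `b` is
+    /// then added, `"a"` is closed and emitted as a finished [`Directory`],
+    /// and `b` is recorded in the root.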
+    Vec<PartialDirectory<T>>,
+);
 
 impl<T> TreeTracker<T> {
+    /// Create a new, empty `TreeTracker`
     pub(super) fn new() -> Self {
         TreeTracker(vec![PartialDirectory::new()])
     }
 
+    /// Register the key `key` with payload `value` and the given old filename
+    /// (if this is not the latest version of the key).
+    ///
+    /// If encountering the key indicates that the currently "open" directory and
+    /// zero or more of its parents will no longer be receiving any further
+    /// keys, those directories are returned, innermost first.
+    ///
+    /// # Errors
+    ///
+    /// This method returns `Err` if `key` is lexicographically less than the
+    /// previous distinct key, if `key` has already been encountered as a
+    /// directory path, if a parent directory of `key` has already been
+    /// encountered as a file path, or if `key` equals the previous key and
+    /// this is the second time that `add()` was called with that key &
+    /// `old_filename` value.
     pub(super) fn add(
         &mut self,
         key: &KeyPath,
@@ -100,6 +147,8 @@
         Ok(popped_dirs)
     }
 
+    /// Indicate to the `TreeTracker` that all keys have been encountered.
+    /// Returns all remaining "open" directories, innermost first.
     pub(super) fn finish(mut self) -> Vec<Directory<T>> {
         let mut dirs = Vec::new();
         while !self.0.is_empty() {
@@ -108,10 +157,16 @@
         dirs
     }
 
+    /// Returns `true` if the `TreeTracker` is empty, i.e., if the stack is
+    /// empty or its only item is empty.
     fn is_empty(&self) -> bool {
         self.0.is_empty() || (self.0.len() == 1 && self.0[0].is_empty())
     }
 
+    /// Call [`TreeTracker::push_dir()`] on `first_dirname`, and then call
+    /// [`TreeTracker::push_dir()`] or [`TreeTracker::push_file()`], as
+    /// appropriate, on each element of the iterator
+    /// `rest`.
     fn push_parts(
         &mut self,
         first_dirname: &str,
@@ -129,6 +184,13 @@
         Ok(())
     }
 
+    /// Open a directory named `name` inside the current innermost open
+    /// directory.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the tracker is void or if the current innermost directory
+    /// already contains an open directory.
     fn push_dir(&mut self, name: &str) {
         let Some(pd) = self.0.last_mut() else {
             panic!("TreeTracker::push_dir() called on void tracker");
@@ -141,6 +203,23 @@
         self.0.push(PartialDirectory::new());
     }
 
+    /// Add the key with filename `name` to the current innermost open
+    /// directory if it is not already present. If `old_filename` is `None`,
+    /// `value` is used as the payload for the latest version of the key;
+    /// otherwise, the given old filename is added to the file's collection of
+    /// old filenames, and `value` is used as the corresponding payload.
+    ///
+    /// # Errors
+    ///
+    /// This method returns `Err` if `key` equals the previous key and there is
+    /// already a payload for the given `old_filename` value.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the tracker is void, if the current innermost directory
+    /// already contains an open directory, if `name` is less than the previous
+    /// name added to the innermost open directory, or if `name` equals the
+    /// previous name and the previously-added entry is a directory.
     fn push_file(
         &mut self,
         name: &str,
@@ -189,6 +268,13 @@
         Ok(())
     }
 
+    /// "Close" the current innermost open directory and return it as a
+    /// [`Directory`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if the tracker is void or if the current innermost directory
+    /// already contains an open directory.
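+    ///
+    /// For example (illustrative): if the innermost open directory is `"a"`
+    /// and it has received files `x` and `y`, `pop()` returns a [`Directory`]
+    /// for `"a"` containing `x` and `y`, and `"a"` is marked as a closed
+    /// entry of its parent.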
fn pop(&mut self) -> Directory { let Some(pd) = self.0.pop() else { panic!("TreeTracker::pop() called on void tracker"); @@ -228,6 +314,12 @@ impl TreeTracker { } } + /// Returns the key most recently added to the tracker. + /// + /// # Panics + /// + /// Panics if no keys have been added to the tracker or the stack contains + /// more than one element yet one of them is empty. fn last_key(&self) -> String { let mut s = String::new(); for pd in &self.0 { @@ -252,27 +344,42 @@ impl TreeTracker { } } +/// A directory, along with a collection of the names of the files & +/// subdirectories within. #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct Directory { - path: Option, // `None` for the root + /// The forward-slash-separated path to the directory, relative to the root + /// of the tree tracked by the creating [`TreeTracker`]. If the directory + /// is the root directory, this is `None`. + path: Option, + + /// A mapping from file names (including "old filenames") in the directory + /// to their payloads. files: HashMap, + + /// A set of the names of the subdirectories within the directory directories: HashSet, } impl Directory { + /// Returns the forward-slash-separated path to the directory, relative to + /// the root of the tree tracked by the creating [`TreeTracker`]. If the + /// directory is the root directory, this is `None`. pub(super) fn path(&self) -> Option<&str> { self.path.as_deref() } + /// Returns `true` if the directory contains a file with the given name pub(super) fn contains_file(&self, name: &str) -> bool { self.files.contains_key(name) } + /// Returns `true` if the directory contains a subdirectory with the given name pub(super) fn contains_dir(&self, name: &str) -> bool { self.directories.contains(name) } - #[allow(dead_code)] + /// Apply the given function to all file payloads in the directory pub(super) fn map U>(self, mut f: F) -> Directory { Directory { path: self.path, @@ -286,14 +393,30 @@ impl Directory { } } +/// Error returned by [`TreeTracker::add()`] #[derive(Clone, Debug, Eq, Error, PartialEq)] pub(super) enum TreeTrackerError { + /// The given key is lexicographically less than the previously-added key #[error("received keys in unsorted order: {before:?} came before {after:?}")] - Unsorted { before: String, after: String }, + Unsorted { + /// The previously-added key + before: String, + + /// The key passed to the [`TreeTracker::add()`] call + after: String, + }, + + /// A path was registered as both a file and a directory #[error("path {0:?} is used as both a file and a directory")] Conflict(String), + + /// The given key was provided with an `old_filename` of `None`, and this + /// is the second time that happened. #[error("file key {0:?} encountered more than once")] DuplicateFile(String), + + /// The given key was provided with a non-`None` `old_filename`, and this + /// is the second time that key & old filename were provided. #[error("key {key:?} has multiple non-latest versions with filename {old_filename:?}")] DuplicateOldFile { key: String, old_filename: String }, } diff --git a/src/util.rs b/src/util.rs index 43da560..1f7f753 100644 --- a/src/util.rs +++ b/src/util.rs @@ -113,6 +113,9 @@ pub(crate) fn force_create_dir_all>>( Ok(()) } +/// Construct the base filename for backing up an object that is not the latest +/// version of its key, where `basename` is the filename portion of the key, +/// `version_id` is the object's version ID, and `etag` is its etag. 
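+///
+/// For example, `make_old_filename("data.csv", "3HL4kqt", "d41d8cd9")` returns
+/// `"data.csv.old.3HL4kqt.d41d8cd9"` (the version ID and etag shown are
+/// illustrative).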
pub(crate) fn make_old_filename(basename: &str, version_id: &str, etag: &str) -> String { format!("{basename}.old.{version_id}.{etag}") } From 5637bc50499d7f4fd3dd17ee72edeb9a73fc11e5 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 10 Jan 2025 16:27:50 -0500 Subject: [PATCH 20/25] Fix --- src/syncer/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs index 3130d9b..6f2ca0f 100644 --- a/src/syncer/mod.rs +++ b/src/syncer/mod.rs @@ -546,10 +546,11 @@ impl Syncer { if is_dir { !dir.contains_dir(name) } else { - if !is_special_component(name) { + let b = !dir.contains_file(name) && name != METADATA_FILENAME; + if b && !is_special_component(name) { dbdeletions.push(name.to_owned()); } - !dir.contains_file(name) && name != METADATA_FILENAME + b } } None => true, From a4abc26657d2b0f982776c1b50fe1f4e505f508f Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Sat, 11 Jan 2025 10:33:04 -0500 Subject: [PATCH 21/25] Typo caught by codespell --- src/nursery.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nursery.rs b/src/nursery.rs index 40c3f1a..b95d70d 100644 --- a/src/nursery.rs +++ b/src/nursery.rs @@ -6,7 +6,7 @@ use std::task::{ready, Context, Poll}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; /// Type returned by [`FutureExt::catch_unwind()`]. If the inner task ran to -/// completion, this is `Ok`; otherwise, if the taks panicked, this is `Err`. +/// completion, this is `Ok`; otherwise, if the task panicked, this is `Err`. type UnwindResult = Result>; /// A handle for spawning new tasks in a task group/nursery. From cd658fe3361657c621c7b895a89cc0e73085ae7d Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 13 Jan 2025 08:26:55 -0500 Subject: [PATCH 22/25] Increase MSRV to 1.81 For AWS SDK crates --- CHANGELOG.md | 1 + Cargo.toml | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9b2ba5..3d0d556 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ In Development favor of a new `--jobs` option - Files & directories in the backup tree that are not listed in the inventory are deleted +- Increased MSRV to 1.81 v0.1.0-alpha.2 (2025-01-06) --------------------------- diff --git a/Cargo.toml b/Cargo.toml index 06172ac..c7bedb9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "s3invsync" version = "0.1.0-alpha.2" edition = "2021" -rust-version = "1.80" +rust-version = "1.81" description = "AWS S3 Inventory-based backup tool with efficient incremental & versionId support" authors = [ "DANDI Developers ", diff --git a/README.md b/README.md index d3c9ca5..d52f60d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) [![CI Status](https://github.com/dandi/s3invsync/actions/workflows/test.yml/badge.svg)](https://github.com/dandi/s3invsync/actions/workflows/test.yml) [![codecov.io](https://codecov.io/gh/dandi/s3invsync/branch/main/graph/badge.svg)](https://codecov.io/gh/dandi/s3invsync) -[![Minimum Supported Rust Version](https://img.shields.io/badge/MSRV-1.80-orange)](https://www.rust-lang.org) +[![Minimum Supported Rust Version](https://img.shields.io/badge/MSRV-1.81-orange)](https://www.rust-lang.org) [![MIT 
From a4abc26657d2b0f982776c1b50fe1f4e505f508f Mon Sep 17 00:00:00 2001
From: "John T. Wodder II"
Date: Sat, 11 Jan 2025 10:33:04 -0500
Subject: [PATCH 21/25] Typo caught by codespell

---
 src/nursery.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nursery.rs b/src/nursery.rs
index 40c3f1a..b95d70d 100644
--- a/src/nursery.rs
+++ b/src/nursery.rs
@@ -6,7 +6,7 @@ use std::task::{ready, Context, Poll};
 use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
 
 /// Type returned by [`FutureExt::catch_unwind()`].  If the inner task ran to
-/// completion, this is `Ok`; otherwise, if the taks panicked, this is `Err`.
+/// completion, this is `Ok`; otherwise, if the task panicked, this is `Err`.
 type UnwindResult<T> = Result<T, Box<dyn std::any::Any + Send>>;
 
 /// A handle for spawning new tasks in a task group/nursery.

From cd658fe3361657c621c7b895a89cc0e73085ae7d Mon Sep 17 00:00:00 2001
From: "John T. Wodder II"
Date: Mon, 13 Jan 2025 08:26:55 -0500
Subject: [PATCH 22/25] Increase MSRV to 1.81

For AWS SDK crates
---
 CHANGELOG.md | 1 +
 Cargo.toml   | 2 +-
 README.md    | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9b2ba5..3d0d556 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ In Development
   favor of a new `--jobs` option
 - Files & directories in the backup tree that are not listed in the inventory
   are deleted
+- Increased MSRV to 1.81
 
 v0.1.0-alpha.2 (2025-01-06)
 ---------------------------
diff --git a/Cargo.toml b/Cargo.toml
index 06172ac..c7bedb9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 name = "s3invsync"
 version = "0.1.0-alpha.2"
 edition = "2021"
-rust-version = "1.80"
+rust-version = "1.81"
 description = "AWS S3 Inventory-based backup tool with efficient incremental & versionId support"
 authors = [
     "DANDI Developers",
diff --git a/README.md b/README.md
index d3c9ca5..d52f60d 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
 [![CI Status](https://github.com/dandi/s3invsync/actions/workflows/test.yml/badge.svg)](https://github.com/dandi/s3invsync/actions/workflows/test.yml)
 [![codecov.io](https://codecov.io/gh/dandi/s3invsync/branch/main/graph/badge.svg)](https://codecov.io/gh/dandi/s3invsync)
-[![Minimum Supported Rust Version](https://img.shields.io/badge/MSRV-1.80-orange)](https://www.rust-lang.org)
+[![Minimum Supported Rust Version](https://img.shields.io/badge/MSRV-1.81-orange)](https://www.rust-lang.org)
 [![MIT License](https://img.shields.io/github/license/dandi/s3invsync.svg)](https://opensource.org/licenses/MIT)
 
 [GitHub](https://github.com/dandi/s3invsync) | [Issues](https://github.com/dandi/s3invsync/issues) | [Changelog](https://github.com/dandi/s3invsync/blob/main/CHANGELOG.md)
From 1c089339e2ced61f89ad770e1c97a1980a06ab5a Mon Sep 17 00:00:00 2001
From: "John T. Wodder II"
Date: Mon, 13 Jan 2025 08:31:51 -0500
Subject: [PATCH 23/25] Improve nursery.rs

---
 Cargo.lock     |   1 +
 Cargo.toml     |   1 +
 src/nursery.rs | 154 ++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 136 insertions(+), 20 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6744e3a..cab046e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1796,6 +1796,7 @@ dependencies = [
  "md-5",
  "memory-stats",
  "percent-encoding",
+ "pin-project-lite",
  "regex",
  "rstest",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index c7bedb9..0a10738 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,7 @@ lockable = "0.1.1"
 md-5 = "0.10.6"
 memory-stats = "1.2.0"
 percent-encoding = "2.3.1"
+pin-project-lite = "0.2.16"
 regex = "1.11.1"
 serde = { version = "1.0.217", features = ["derive"] }
 serde_json = "1.0.135"
diff --git a/src/nursery.rs b/src/nursery.rs
index b95d70d..d45b56e 100644
--- a/src/nursery.rs
+++ b/src/nursery.rs
@@ -1,13 +1,13 @@
-//! Simple tokio-based task group
-use futures_util::{FutureExt, Stream};
+//! Simple tokio-based task group/nursery
+use futures_util::{stream::FuturesUnordered, Stream, StreamExt};
+use pin_project_lite::pin_project;
 use std::future::Future;
 use std::pin::Pin;
 use std::task::{ready, Context, Poll};
-use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
-
-/// Type returned by [`FutureExt::catch_unwind()`].  If the inner task ran to
-/// completion, this is `Ok`; otherwise, if the task panicked, this is `Err`.
-type UnwindResult<T> = Result<T, Box<dyn std::any::Any + Send>>;
+use tokio::{
+    sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
+    task::{JoinError, JoinHandle},
+};
 
 /// A handle for spawning new tasks in a task group/nursery.
 ///
@@ -17,7 +17,7 @@ type UnwindResult<T> = Result<T, Box<dyn std::any::Any + Send>>;
 /// corresponding [`NurseryStream`] can yield `None`.
 #[derive(Debug)]
 pub(crate) struct Nursery<T> {
-    sender: UnboundedSender<UnwindResult<T>>,
+    sender: UnboundedSender<FragileHandle<T>>,
 }
 
 impl<T: Send + 'static> Nursery<T> {
@@ -26,7 +26,13 @@ impl<T: Send + 'static> Nursery<T> {
     /// futures that will be spawned in the nursery.
     pub(crate) fn new() -> (Nursery<T>, NurseryStream<T>) {
         let (sender, receiver) = unbounded_channel();
-        (Nursery { sender }, NurseryStream { receiver })
+        (
+            Nursery { sender },
+            NurseryStream {
+                receiver,
+                tasks: FuturesUnordered::new(),
+            },
+        )
     }
 
     /// Spawn a future that returns `T` in the nursery.
@@ -34,11 +40,7 @@ impl<T: Send + 'static> Nursery<T> {
     where
         Fut: Future<Output = T> + Send + 'static,
     {
-        let sender = self.sender.clone();
-        tokio::spawn(async move {
-            let task = std::panic::AssertUnwindSafe(fut).catch_unwind();
-            let _ = sender.send(task.await);
-        });
+        let _ = self.sender.send(FragileHandle::new(tokio::spawn(fut)));
     }
 }
 
@@ -56,33 +58,88 @@ impl<T> Clone for Nursery<T> {
 ///
 /// The corresponding [`Nursery`] and all clones thereof must be dropped before
 /// the stream can yield `None`.
+///
+/// When a `NurseryStream` is dropped, all tasks in the nursery are aborted.
 #[derive(Debug)]
 pub(crate) struct NurseryStream<T> {
-    receiver: UnboundedReceiver<UnwindResult<T>>,
+    receiver: UnboundedReceiver<FragileHandle<T>>,
+    tasks: FuturesUnordered<FragileHandle<T>>,
 }
 
 impl<T> Stream for NurseryStream<T> {
     type Item = T;
 
-    /// Poll for one of the tasks in the group to complete and return its
+    /// Poll for one of the tasks in the nursery to complete and return its
     /// return value.
     ///
     /// # Panics
     ///
     /// If a task panics, this method resumes unwinding the panic.
     fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<T>> {
-        match ready!(self.receiver.poll_recv(cx)) {
+        let closed = loop {
+            match self.receiver.poll_recv(cx) {
+                Poll::Pending => break false,
+                Poll::Ready(Some(handle)) => self.tasks.push(handle),
+                Poll::Ready(None) => break true,
+            }
+        };
+        match ready!(self.tasks.poll_next_unpin(cx)) {
             Some(Ok(r)) => Some(r).into(),
-            Some(Err(e)) => std::panic::resume_unwind(e),
-            None => None.into(),
+            Some(Err(e)) => match e.try_into_panic() {
+                Ok(barf) => std::panic::resume_unwind(barf),
+                Err(e) => unreachable!(
+                    "Task in nursery should not have been aborted before dropping stream, but got {e:?}"
+                ),
+            },
+            None => {
+                if closed {
+                    // All Nursery clones dropped and all results yielded; end
+                    // of stream
+                    None.into()
+                } else {
+                    Poll::Pending
+                }
+            }
         }
     }
 }
 
+pin_project! {
+    /// A wrapper around `tokio::task::JoinHandle` that aborts the task on drop.
+    #[derive(Debug)]
+    struct FragileHandle<T> {
+        #[pin]
+        inner: JoinHandle<T>
+    }
+
+    impl<T> PinnedDrop for FragileHandle<T> {
+        fn drop(this: Pin<&mut Self>) {
+            this.project().inner.abort();
+        }
+    }
+}
+
+impl<T> FragileHandle<T> {
+    fn new(inner: JoinHandle<T>) -> Self {
+        FragileHandle { inner }
+    }
+}
+
+impl<T> Future for FragileHandle<T> {
+    type Output = Result<T, JoinError>;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let this = self.project();
+        this.inner.poll(cx)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
-    use futures_util::StreamExt;
+    use futures_util::{FutureExt, StreamExt};
+    use std::time::Duration;
+    use tokio::{sync::oneshot, time::timeout};
 
     #[test]
     fn nursery_is_send() {
@@ -134,4 +191,61 @@ }
         .await;
         assert!(r.is_err());
     }
+
+    #[tokio::test]
+    async fn no_close_until_drop() {
+        let (nursery, mut nursery_stream) = Nursery::new();
+        nursery.spawn(std::future::ready(1));
+        nursery.spawn(std::future::ready(2));
+        nursery.spawn(std::future::ready(3));
+        let mut values = Vec::new();
+        values.push(nursery_stream.next().await.unwrap());
+        values.push(nursery_stream.next().await.unwrap());
+        values.push(nursery_stream.next().await.unwrap());
+        values.sort_unstable();
+        assert_eq!(values, vec![1, 2, 3]);
+        let r = timeout(Duration::from_millis(100), nursery_stream.next()).await;
+        assert!(r.is_err());
+        drop(nursery);
+        let r = timeout(Duration::from_millis(100), nursery_stream.next()).await;
+        assert_eq!(r, Ok(None));
+    }
+
+    #[tokio::test]
+    async fn drop_tasks_on_drop_stream() {
+        enum Void {}
+
+        let (nursery, nursery_stream) = Nursery::new();
+        let (sender, receiver) = oneshot::channel::<Void>();
+        nursery.spawn({
+            async move {
+                std::future::pending::<()>().await;
+                drop(sender);
+            }
+        });
+        drop(nursery);
+        drop(nursery_stream);
+        assert!(receiver.await.is_err());
+    }
+
+    #[tokio::test]
+    async fn nest_nurseries() {
+        let (nursery, nursery_stream) = Nursery::new();
+        nursery.spawn(async {
+            let (nursery, nursery_stream) = Nursery::new();
+            nursery.spawn(std::future::ready(1));
+            nursery.spawn(std::future::ready(2));
+            nursery.spawn(std::future::ready(3));
+            drop(nursery);
+            nursery_stream
+                .fold(0, |accum, i| async move { accum + i })
+                .await
+        });
+        nursery.spawn(std::future::ready(4));
+        nursery.spawn(std::future::ready(5));
+        drop(nursery);
+        let mut values = nursery_stream.collect::<Vec<_>>().await;
+        values.sort_unstable();
+        assert_eq!(values, vec![4, 5, 6]);
+    }
 }
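For orientation, this is how the reworked nursery is meant to be driven, mirroring the patch's own tests. A sketch only: `Nursery` is `pub(crate)`, so this would live inside the crate (e.g. with `use crate::nursery::Nursery;`), not as an external program:

```rust
// Assumes the crate-internal Nursery from src/nursery.rs is in scope.
use futures_util::StreamExt;

#[tokio::main]
async fn main() {
    let (nursery, nursery_stream) = Nursery::new();
    for n in 1..=3 {
        nursery.spawn(async move { n * 10 });
    }
    // The stream cannot end until every Nursery clone is dropped, and results
    // arrive in completion order, so sort before comparing.
    drop(nursery);
    let mut values = nursery_stream.collect::<Vec<_>>().await;
    values.sort_unstable();
    assert_eq!(values, vec![10, 20, 30]);
}
```

Dropping the `NurseryStream` itself aborts any still-running tasks, which is what the new `FragileHandle` wrapper buys over bare `tokio::spawn`.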
From b3b1deb179b8778f4f630d20c8fd39481c10bfcc Mon Sep 17 00:00:00 2001
From: "John T. Wodder II"
Date: Mon, 13 Jan 2025 08:53:38 -0500
Subject: [PATCH 24/25] Break up `Syncer::run()`

---
 src/syncer/mod.rs | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/syncer/mod.rs b/src/syncer/mod.rs
index 6f2ca0f..2f8cb32 100644
--- a/src/syncer/mod.rs
+++ b/src/syncer/mod.rs
@@ -104,6 +104,20 @@ impl Syncer {
         }
     }
 
     pub(crate) async fn run(self: &Arc<Self>, manifest: CsvManifest) -> Result<(), MultiError> {
+        self.spawn_ctrlc_listener();
+        let fspecs = self.sort_csvs_by_first_line(manifest.files).await?;
+        tracing::trace!(path = %self.outdir.display(), "Creating root output directory");
+        fs_err::create_dir_all(&self.outdir).map_err(|e| MultiError(vec![e.into()]))?;
+        let (nursery, nursery_stream) = Nursery::new();
+        self.spawn_inventory_task(&nursery, fspecs);
+        self.spawn_object_tasks(&nursery);
+        drop(nursery);
+        let r = self.await_nursery(nursery_stream).await;
+        self.filterlog.finish();
+        r
+    }
+
+    fn spawn_ctrlc_listener(self: &Arc<Self>) {
         tokio::spawn({
             let this = self.clone();
             async move {
@@ -114,12 +128,13 @@
                 }
             }
         });
+    }
 
-        let fspecs = self.sort_csvs_by_first_line(manifest.files).await?;
-
-        tracing::trace!(path = %self.outdir.display(), "Creating root output directory");
-        fs_err::create_dir_all(&self.outdir).map_err(|e| MultiError(vec![e.into()]))?;
-        let (nursery, nursery_stream) = Nursery::new();
+    fn spawn_inventory_task(
+        self: &Arc<Self>,
+        nursery: &Nursery<anyhow::Result<()>>,
+        fspecs: Vec<FileSpec>,
+    ) {
         let obj_sender = {
             let guard = self
                 .obj_sender
@@ -130,9 +145,7 @@
                 .cloned()
                 .expect("obj_sender should not be None")
         };
-
         let this = self.clone();
-        let sender = obj_sender.clone();
         let subnursery = nursery.clone();
         nursery.spawn(
             self.until_cancelled_ok(async move {
@@ -159,7 +172,7 @@
                     } else {
                         None
                     };
-                    if sender.send((item, notify)).await.is_err() {
+                    if obj_sender.send((item, notify)).await.is_err() {
                         // Assume we're shutting down
                         return Ok(());
                     }
@@ -178,7 +191,6 @@
                 Ok(())
             })
         );
-        drop(obj_sender);
         {
             let mut guard = self
                 .obj_sender
@@ -186,7 +198,9 @@
                 .expect("obj_sender mutex should not be poisoned");
             *guard = None;
         }
+    }
 
+    fn spawn_object_tasks(self: &Arc<Self>, nursery: &Nursery<anyhow::Result<()>>) {
         for _ in 0..self.jobs.get() {
             let this = self.clone();
             let recv = self.obj_receiver.clone();
@@ -204,11 +218,6 @@
                 Ok(())
             });
         }
-
-        drop(nursery);
-        let r = self.await_nursery(nursery_stream).await;
-        self.filterlog.finish();
-        r
     }
 
     /// Fetch the first line of each inventory list file in `specs` and sort
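The decomposition above leaves `run()` as a thin pipeline: one inventory task feeds a channel of entries, and `--jobs` worker tasks drain it. A freestanding approximation of that shape using plain tokio primitives (the real code routes both sides through the `Nursery`, sends `(item, notify)` pairs, and layers in cancellation and error collection, none of which is reproduced here):

```rust
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};

#[tokio::main]
async fn main() {
    let (obj_sender, obj_receiver) = mpsc::channel::<String>(64);

    // Counterpart of spawn_inventory_task(): a single producer queues items.
    let inventory = tokio::spawn(async move {
        for key in ["a.csv.gz", "b.csv.gz", "c.csv.gz"] {
            if obj_sender.send(key.to_owned()).await.is_err() {
                return; // all receivers gone; assume we're shutting down
            }
        }
        // Dropping obj_sender here closes the channel, ending the workers.
    });

    // Counterpart of spawn_object_tasks(): `jobs` workers share one receiver.
    let jobs = 4;
    let obj_receiver = Arc::new(Mutex::new(obj_receiver));
    let workers: Vec<_> = (0..jobs)
        .map(|_| {
            let recv = Arc::clone(&obj_receiver);
            tokio::spawn(async move {
                loop {
                    // Hold the lock only while waiting for the next item.
                    let item = { recv.lock().await.recv().await };
                    let Some(key) = item else { break };
                    println!("processing {key}");
                }
            })
        })
        .collect();

    inventory.await.unwrap();
    for w in workers {
        w.await.unwrap();
    }
}
```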
Wodder II" Date: Mon, 13 Jan 2025 09:02:58 -0500 Subject: [PATCH 25/25] More tests for KeyPath --- src/keypath.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/keypath.rs b/src/keypath.rs index ab919c9..774ca41 100644 --- a/src/keypath.rs +++ b/src/keypath.rs @@ -170,6 +170,23 @@ mod tests { use assert_matches::assert_matches; use rstest::rstest; + #[rstest] + #[case("foo", "foo")] + #[case("foo/bar/baz", "baz")] + fn test_name(#[case] p: KeyPath, #[case] name: &str) { + assert_eq!(p.name(), name); + } + + #[rstest] + #[case("foo", None, "foo")] + #[case("foo/bar", Some("foo"), "bar")] + #[case("foo/bar/baz", Some("foo/bar"), "baz")] + fn test_split(#[case] p: KeyPath, #[case] dirname: Option<&str>, #[case] filename: &str) { + let (d, f) = p.split(); + assert_eq!(d, dirname); + assert_eq!(f, filename); + } + #[rstest] #[case("foo.nwb")] #[case("foo/bar.nwb")]