diff --git a/CHANGELOG.md b/CHANGELOG.md index 673e934..8ba6c6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- - Reduced the sizes of a number of streams & futures +- Added doc comments to much of the code v0.4.0 (2024-07-09) ------------------- diff --git a/Cargo.lock b/Cargo.lock index faa0114..2c5e1c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -852,7 +852,7 @@ dependencies = [ "memory-stats", "moka", "percent-encoding", - "pin-project-lite", + "pin-project", "pretty_assertions", "reqwest", "reqwest-middleware", diff --git a/Cargo.toml b/Cargo.toml index 4dad865..de57ace 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ itertools = "0.13.0" memory-stats = "1.2.0" moka = { version = "0.12.8", features = ["future"] } percent-encoding = "2.3.1" -pin-project-lite = "0.2.14" +pin-project = "1.1.5" reqwest = { version = "0.12.5", default-features = false, features = ["json", "rustls-tls-native-roots"] } reqwest-middleware = "0.3.2" reqwest-retry = "0.6.0" diff --git a/README.md b/README.md index 4b2b029..e60ac5b 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,16 @@ Features - Hierarchies served: - - `/dandisets/`: A view of Dandisets & assets (including Zarr entries, and - including version metadata as virtual `dandiset.yaml` files) in Dandi - Archive, retrieved via the Dandi Archive and S3 APIs + - `/dandisets/`: A view of Dandisets & assets in Dandi Archive, retrieved + via the Dandi Archive and S3 APIs + + - Dandiset versions include a `dandiset.yaml` file as a top-level + resource. + + - Zarr assets are represented as collections of their entries. + + - HTML views of collections include links to version & asset metadata + on the API server. - `/zarrs/`: A view of all Zarrs in the Dandi Archive at various points in time, as recorded by/at diff --git a/src/consts.rs b/src/consts.rs index 4db1281..48bbd93 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,3 +1,4 @@ +//! 
Constants and program-wide compile-time configuration use time::{format_description::FormatItem, macros::format_description}; /// The "User-Agent" value sent in outgoing HTTP requests diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index b023570..1002ca8 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -1,3 +1,4 @@ +//! The implementation of the data source for the `/dandisets/` hierarchy mod dandiset_id; mod streams; mod types; @@ -20,14 +21,35 @@ use std::sync::Arc; use thiserror::Error; use url::Url; +/// A client for fetching data about Dandisets, their versions, and their +/// assets from a DANDI Archive instance #[derive(Clone, Debug)] pub(crate) struct DandiClient { + /// The HTTP client used for making requests to the Archive instance's API inner: Client, + + /// The base API URL of the Archive instance api_url: Url, + + /// A cache of [`S3Client`] instances that are used for listing Zarr + /// entries on the Archive's S3 bucket. + /// + /// In order to avoid the user running `dandidav` having to supply details + /// on the Archive instance's S3 bucket, these details are instead derived + /// automatically from the `contentUrl` fields of Zarr assets' metadata + /// once they are needed. Each bucket needs an `S3Client` to access it, + /// and as construction of the inner `aws_sdk_s3::Client` is expensive, we + /// cache them. 
s3clients: Cache>, } impl DandiClient { + /// Construct a new `DandiClient` for the Archive instance with the given + /// base API URL + /// + /// # Errors + /// + /// Returns an error if construction of the inner `reqwest::Client` fails pub(crate) fn new(api_url: Url) -> Result { let inner = Client::new()?; let s3clients = CacheBuilder::new(S3CLIENT_CACHE_SIZE) @@ -40,6 +62,8 @@ impl DandiClient { }) } + /// Return the URL formed by appending the given path segments and a + /// trailing slash to the path of the API base URL fn get_url(&self, segments: I) -> Url where I: IntoIterator, @@ -48,59 +72,76 @@ impl DandiClient { urljoin_slashed(&self.api_url, segments) } + /// Perform a `GET` request to the given URL and return the deserialized + /// JSON response body async fn get(&self, url: Url) -> Result { self.inner.get_json(url).await.map_err(Into::into) } + /// Return a [`futures_util::Stream`] that makes paginated `GET` requests + /// to the given URL and its subsequent pages and yields a `Result` value for each item deserialized from the responses fn paginate(&self, url: Url) -> Paginate { Paginate::new(self, url) } - async fn get_s3client(&self, loc: S3Location) -> Result { - let S3Location { + /// Given a Zarr asset, return a [`PrefixedS3Client`] for fetching + /// information from S3 about the keys under the Zarr's key prefix on its + /// bucket. If a client has not already been constructed for the bucket in + /// question, one is constructed & cached. + /// + /// Specifically, the first `contentUrl` of the Zarr that can be parsed by + /// [`S3Location::parse_url()`] into a bucket, optional region, and key + /// prefix is used to construct the `PrefixedS3Client` (with a trailing + /// slash appended to the key prefix if one isn't already present), with + /// the assumption that the Zarr's entries are laid out under the given key + /// prefix on the given bucket using the same names & directory structure + /// as the actual Zarr. 
+ async fn get_s3client_for_zarr( + &self, + zarr: &ZarrAsset, + ) -> Result { + let Some(S3Location { bucket_spec, mut key, - } = loc; + }) = zarr.s3location() + else { + return Err(DandiError::ZarrToS3Error { + asset_id: zarr.asset_id.clone(), + source: ZarrToS3Error::ZarrLacksS3Url, + }); + }; if !key.ends_with('/') { key.push('/'); } - let prefix = PureDirPath::try_from(key).map_err(ZarrToS3Error::BadS3Key)?; - // Box the future passed to moka in order to minimize the size of the - // moka future (cf. ): + let prefix = PureDirPath::try_from(key).map_err(|source| DandiError::ZarrToS3Error { + asset_id: zarr.asset_id.clone(), + source: ZarrToS3Error::BadS3Key(source), + })?; match self .s3clients .try_get_with_by_ref( &bucket_spec, + // Box the future passed to moka in order to minimize the size + // of the moka future (cf. + // ): Box::pin(async { bucket_spec.clone().into_s3client().await.map(Arc::new) }), ) .await { Ok(client) => Ok(client.with_prefix(prefix)), - Err(source) => Err(ZarrToS3Error::LocateBucket { - bucket: bucket_spec.bucket, - source, + Err(source) => Err(DandiError::ZarrToS3Error { + asset_id: zarr.asset_id.clone(), + source: ZarrToS3Error::LocateBucket { + bucket: bucket_spec.bucket, + source, + }, }), } } - async fn get_s3client_for_zarr( - &self, - zarr: &ZarrAsset, - ) -> Result { - let Some(s3loc) = zarr.s3location() else { - return Err(DandiError::ZarrToS3Error { - asset_id: zarr.asset_id.clone(), - source: ZarrToS3Error::ZarrLacksS3Url, - }); - }; - self.get_s3client(s3loc) - .await - .map_err(|source| DandiError::ZarrToS3Error { - asset_id: zarr.asset_id.clone(), - source, - }) - } - + /// Return a [`futures_util::Stream`] that yields a `Dandiset` for each + /// Dandiset on the Archive instance pub(crate) fn get_all_dandisets( &self, ) -> impl Stream> + '_ { @@ -108,10 +149,14 @@ impl DandiClient { .map_ok(|ds| ds.with_metadata_urls(self)) } + /// Return an endpoint object for making requests for information about the + /// given 
Dandiset pub(crate) fn dandiset(&self, dandiset_id: DandisetId) -> DandisetEndpoint<'_> { DandisetEndpoint::new(self, dandiset_id) } + /// Return the URL for the metadata for the given version of the given + /// Dandiset fn version_metadata_url(&self, dandiset_id: &DandisetId, version_id: &VersionId) -> Url { self.get_url([ "dandisets", @@ -122,13 +167,18 @@ impl DandiClient { } } +/// An object for making requests relating to a specific Dandiset #[derive(Clone, Debug)] pub(crate) struct DandisetEndpoint<'a> { + /// Client for the Dandiset's Archive instance client: &'a DandiClient, + + /// The ID of the Dandiset this instance operates on dandiset_id: DandisetId, } impl<'a> DandisetEndpoint<'a> { + /// Construct a `DandisetEndpoint` for the given `client` and `dandiset_id` fn new(client: &'a DandiClient, dandiset_id: DandisetId) -> Self { Self { client, @@ -136,10 +186,13 @@ impl<'a> DandisetEndpoint<'a> { } } + /// Return an endpoint object for making requests for information about the + /// given version of the Dandiset pub(crate) fn version(self, version_id: VersionId) -> VersionEndpoint<'a> { VersionEndpoint::new(self, version_id) } + /// Retrieve information about the Dandiset pub(crate) async fn get(&self) -> Result { self.client .get::( @@ -150,6 +203,8 @@ impl<'a> DandisetEndpoint<'a> { .map(|ds| ds.with_metadata_urls(self.client)) } + /// Return a [`futures_util::Stream`] that yields a `DandisetVersion` for + /// each version of the Dandiset pub(crate) fn get_all_versions( &self, ) -> impl Stream> + '_ { @@ -168,14 +223,21 @@ impl<'a> DandisetEndpoint<'a> { } } +/// An object for making requests relating to a specific version of a Dandiset #[derive(Clone, Debug)] pub(crate) struct VersionEndpoint<'a> { + /// Client for the Archive instance client: &'a DandiClient, + + /// The ID of the Dandiset this instance operates on dandiset_id: DandisetId, + + /// The ID of the version this instance operates on version_id: VersionId, } impl<'a> VersionEndpoint<'a> { + 
/// Construct a `VersionEndpoint` from a `DandisetEndpoint` and `VersionId` fn new(upper: DandisetEndpoint<'a>, version_id: VersionId) -> Self { Self { client: upper.client, @@ -184,6 +246,7 @@ impl<'a> VersionEndpoint<'a> { } } + /// Retrieve information about the version pub(crate) async fn get(&self) -> Result { self.client .get::(self.client.get_url([ @@ -197,22 +260,7 @@ impl<'a> VersionEndpoint<'a> { .map(|v| v.with_metadata_url(self.metadata_url())) } - fn metadata_url(&self) -> Url { - self.client - .version_metadata_url(&self.dandiset_id, &self.version_id) - } - - fn asset_metadata_url(&self, asset_id: &str) -> Url { - self.client.get_url([ - "dandisets", - self.dandiset_id.as_ref(), - "versions", - self.version_id.as_ref(), - "assets", - asset_id, - ]) - } - + /// Retrieve the version's metadata as serialized YAML pub(crate) async fn get_metadata(&self) -> Result { let data = self .client @@ -221,22 +269,68 @@ impl<'a> VersionEndpoint<'a> { Ok(VersionMetadata(dump_json_as_yaml(data).into_bytes())) } - async fn get_asset_by_id(&self, id: &str) -> Result { - let raw_asset = self - .client - .get::(self.client.get_url([ - "dandisets", - self.dandiset_id.as_ref(), - "versions", - self.version_id.as_ref(), - "assets", - id, - "info", - ])) - .await?; - raw_asset.try_into_asset(self).map_err(Into::into) + /// Get details on the resource at the given `path` in the version's file + /// hierarchy, treating Zarrs as directories of their entries + /// + /// Although `path` is a `PurePath`, the resulting resource may be a + /// collection. + pub(crate) async fn get_resource(&self, path: &PurePath) -> Result { + self.get_resource_with_s3(path).await.map(Into::into) } + /// Get details on the resource at the given `path` in the version's file + /// hierarchy (treating Zarrs as directories of their entries) along with + /// its immediate child resources (if any). + /// + /// Although `path` is a `PurePath`, the resulting resource may be a + /// collection. 
+ pub(crate) async fn get_resource_with_children( + &self, + path: &PurePath, + ) -> Result { + match self.get_resource_with_s3(path).await? { + DandiResourceWithS3::Folder(folder) => { + let mut children = Vec::new(); + let mut stream = self.get_folder_entries(&folder); + while let Some(child) = stream.try_next().await? { + let child = match child { + FolderEntry::Folder(subf) => DandiResource::Folder(subf), + FolderEntry::Asset { id, path } => match self.get_asset_by_id(&id).await { + Ok(asset) => DandiResource::Asset(asset), + Err(DandiError::Http(HttpError::NotFound { .. })) => { + return Err(DandiError::DisappearingAsset { asset_id: id, path }) + } + Err(e) => return Err(e), + }, + }; + children.push(child); + } + Ok(DandiResourceWithChildren::Folder { folder, children }) + } + DandiResourceWithS3::Asset(Asset::Blob(r)) => Ok(DandiResourceWithChildren::Blob(r)), + DandiResourceWithS3::Asset(Asset::Zarr(zarr)) => { + let s3 = self.client.get_s3client_for_zarr(&zarr).await?; + let children = s3 + .get_root_entries() + .map_ok(|child| zarr.make_resource(child)) + .try_collect::>() + .await?; + Ok(DandiResourceWithChildren::Zarr { zarr, children }) + } + DandiResourceWithS3::ZarrFolder { folder, s3 } => { + let children = s3 + .get_folder_entries(&folder.path) + .map_ok(|child| folder.make_resource(child)) + .try_collect::>() + .await?; + Ok(DandiResourceWithChildren::ZarrFolder { folder, children }) + } + DandiResourceWithS3::ZarrEntry(r) => Ok(DandiResourceWithChildren::ZarrEntry(r)), + } + } + + /// Return a [`futures_util::Stream`] that yields the resources at the root + /// of the version's file hierarchy pub(crate) fn get_root_children( &self, ) -> impl Stream> + '_ { @@ -255,26 +349,95 @@ impl<'a> VersionEndpoint<'a> { }) } - fn get_folder_entries(&self, path: &AssetFolder) -> Paginate { - self.get_entries_under_path(Some(&path.path)) + /// Get details on the resource at the given `path` in the version's file + /// hierarchy, treating Zarrs as directories 
of their entries + /// + /// In order to determine whether `path` consists of a path to a Zarr asset + /// followed by a path to a resource within that Zarr, we perform the + /// following algorithm, which is efficient but not always correct (cf. + /// and + /// ). + /// + /// - For each non-final component in `path` from left to right that has a + /// `.zarr` or `.ngff` extension (case sensitive), query the asset path + /// up through that component. If 404, return 404. If blob asset, + /// return 404. If folder, go to next candidate. Otherwise, we have a + /// Zarr asset, and the rest of the original path is the Zarr entry path. + /// + /// - If all components are exhausted without erroring or finding a Zarr, + /// treat the entirety of `path` as an asset/folder path. + async fn get_resource_with_s3( + &self, + path: &PurePath, + ) -> Result { + for (zarr_path, entry_path) in path.split_zarr_candidates() { + match self.get_path(&zarr_path).await? { + AtAssetPath::Folder(_) => continue, + AtAssetPath::Asset(Asset::Blob(_)) => { + return Err(DandiError::PathUnderBlob { + path: path.clone(), + blob_path: zarr_path, + }) + } + AtAssetPath::Asset(Asset::Zarr(zarr)) => { + let s3 = self.client.get_s3client_for_zarr(&zarr).await?; + return match s3.get_path(&entry_path).await? 
{ + Some(entry) => Ok(zarr.make_resource(entry).with_s3(s3)), + None => Err(DandiError::ZarrEntryNotFound { + zarr_path, + entry_path, + }), + }; + } + } + } + self.get_path(path).await.map(Into::into) } - fn get_entries_under_path(&self, path: Option<&PureDirPath>) -> Paginate { - let mut url = self.client.get_url([ + /// Return the URL for the version's metadata + fn metadata_url(&self) -> Url { + self.client + .version_metadata_url(&self.dandiset_id, &self.version_id) + } + + /// Retrieve information on the asset in this version with the given asset + /// ID + async fn get_asset_by_id(&self, id: &str) -> Result { + self.client + .get::(self.client.get_url([ + "dandisets", + self.dandiset_id.as_ref(), + "versions", + self.version_id.as_ref(), + "assets", + id, + "info", + ])) + .await? + .try_into_asset(self) + .map_err(Into::into) + } + + /// Return the URL for the metadata of the asset in this version with the + /// given asset ID + fn asset_metadata_url(&self, asset_id: &str) -> Url { + self.client.get_url([ "dandisets", self.dandiset_id.as_ref(), "versions", self.version_id.as_ref(), "assets", - "paths", - ]); - if let Some(path) = path { - url.query_pairs_mut() - .append_pair("path_prefix", path.as_ref()); - } - self.client.paginate(url) + asset_id, + ]) } + /// Get details on the resource (an asset or folder) at the given `path` in + /// the version's file hierarchy, treating Zarrs as non-collections. + /// + /// This method paginates over all assets in the version whose paths start + /// with `path`, sorted by asset paths in lexicographic order. If an exact + /// match is found, that asset is returned. If an asset is found whose + /// path is a descendant of `path`, then `path` is a folder. 
async fn get_path(&self, path: &PurePath) -> Result { let mut url = self.client.get_url([ "dandisets", @@ -301,95 +464,33 @@ impl<'a> VersionEndpoint<'a> { Err(DandiError::PathNotFound { path: path.clone() }) } - async fn get_resource_with_s3( - &self, - path: &PurePath, - ) -> Result { - /* - Algorithm for efficiently (yet not always correctly) splitting `path` - into an asset path and an optional Zarr entry path (cf. - ): - - - For each non-final component in `path` from left to right that has a - `.zarr` or `.ngff` extension (case sensitive), query the asset path - up through that component. If 404, return 404. If blob asset, - return 404. If folder, go to next candidate. Otherwise, we have a - Zarr asset, and the rest of the original path is the Zarr entry path. - - - If all components are exhausted without erroring or finding a Zarr, - treat the entirety of `path` as an asset/folder path. - */ - for (zarr_path, entry_path) in path.split_zarr_candidates() { - match self.get_path(&zarr_path).await? { - AtAssetPath::Folder(_) => continue, - AtAssetPath::Asset(Asset::Blob(_)) => { - return Err(DandiError::PathUnderBlob { - path: path.clone(), - blob_path: zarr_path, - }) - } - AtAssetPath::Asset(Asset::Zarr(zarr)) => { - let s3 = self.client.get_s3client_for_zarr(&zarr).await?; - return match s3.get_path(&entry_path).await? { - Some(entry) => Ok(zarr.make_resource(entry).with_s3(s3)), - None => Err(DandiError::ZarrEntryNotFound { - zarr_path, - entry_path, - }), - }; - } - } + /// Return a [`futures_util::Stream`] that yields a [`FolderEntry`] object + /// for each immediate child resource (both assets and folders) of the + /// folder at `path` in the version's file hierarchy, treating Zarrs as + /// non-collections. If `path` is `None`, the resources at the root of the + /// file hierarchy are yielded. 
+ fn get_entries_under_path(&self, path: Option<&PureDirPath>) -> Paginate { + let mut url = self.client.get_url([ + "dandisets", + self.dandiset_id.as_ref(), + "versions", + self.version_id.as_ref(), + "assets", + "paths", + ]); + if let Some(path) = path { + url.query_pairs_mut() + .append_pair("path_prefix", path.as_ref()); } - self.get_path(path).await.map(Into::into) - } - - pub(crate) async fn get_resource(&self, path: &PurePath) -> Result { - self.get_resource_with_s3(path).await.map(Into::into) + self.client.paginate(url) } - pub(crate) async fn get_resource_with_children( - &self, - path: &PurePath, - ) -> Result { - match self.get_resource_with_s3(path).await? { - DandiResourceWithS3::Folder(folder) => { - let mut children = Vec::new(); - let mut stream = self.get_folder_entries(&folder); - while let Some(child) = stream.try_next().await? { - let child = match child { - FolderEntry::Folder(subf) => DandiResource::Folder(subf), - FolderEntry::Asset { id, path } => match self.get_asset_by_id(&id).await { - Ok(asset) => DandiResource::Asset(asset), - Err(DandiError::Http(HttpError::NotFound { .. 
})) => { - return Err(DandiError::DisappearingAsset { asset_id: id, path }) - } - Err(e) => return Err(e), - }, - }; - children.push(child); - } - Ok(DandiResourceWithChildren::Folder { folder, children }) - } - DandiResourceWithS3::Asset(Asset::Blob(r)) => Ok(DandiResourceWithChildren::Blob(r)), - DandiResourceWithS3::Asset(Asset::Zarr(zarr)) => { - let s3 = self.client.get_s3client_for_zarr(&zarr).await?; - let children = s3 - .get_root_entries() - .map_ok(|child| zarr.make_resource(child)) - .try_collect::>() - .await?; - Ok(DandiResourceWithChildren::Zarr { zarr, children }) - } - DandiResourceWithS3::ZarrFolder { folder, s3 } => { - let children = s3 - .get_folder_entries(&folder.path) - .map_ok(|child| folder.make_resource(child)) - .try_collect::>() - .await?; - Ok(DandiResourceWithChildren::ZarrFolder { folder, children }) - } - DandiResourceWithS3::ZarrEntry(r) => Ok(DandiResourceWithChildren::ZarrEntry(r)), - } + /// Return a [`futures_util::Stream`] that yields a [`FolderEntry`] object + /// for each immediate child resource (both assets and folders) of the + /// folder at `path` in the version's file hierarchy, treating Zarrs as + /// non-collections. + fn get_folder_entries(&self, path: &AssetFolder) -> Paginate { + self.get_entries_under_path(Some(&path.path)) } } @@ -406,7 +507,7 @@ pub(crate) enum DandiError { zarr_path: PurePath, entry_path: PurePath, }, - #[error("folder listing included asset ID={asset_id} at path {path:?}, but request to asset returned 404")] + #[error("folder listing included asset ID {asset_id} at path {path:?}, but request to asset returned 404")] DisappearingAsset { asset_id: String, path: PurePath }, #[error("failed to acquire S3 client for Zarr with asset ID {asset_id}")] ZarrToS3Error { @@ -420,6 +521,7 @@ pub(crate) enum DandiError { } impl DandiError { + /// Was the error ultimately caused by something not being found? 
pub(crate) fn is_404(&self) -> bool { matches!( self, @@ -444,6 +546,11 @@ pub(crate) enum ZarrToS3Error { }, } +/// Serialize the given deserialized JSON value as YAML +/// +/// # Panics +/// +/// Panics if the value cannot be serialized. This should not happen. fn dump_json_as_yaml(data: serde_json::Value) -> String { serde_yaml::to_string(&data).expect("converting JSON to YAML should not fail") } diff --git a/src/dandi/streams.rs b/src/dandi/streams.rs index 7c8bf14..624149f 100644 --- a/src/dandi/streams.rs +++ b/src/dandi/streams.rs @@ -2,21 +2,19 @@ use super::types::Page; use super::{DandiClient, DandiError}; use crate::httputil::{Client, HttpError}; use futures_util::{future::BoxFuture, FutureExt, Stream}; -use pin_project_lite::pin_project; +use pin_project::pin_project; use serde::de::DeserializeOwned; use std::pin::Pin; use std::task::{ready, Context, Poll}; use url::Url; -pin_project! { - // Implementing paginate() as a manually-implemented Stream instead of via - // async_stream lets us save about 4700 bytes on dandidav's top-level - // Futures. - #[must_use = "streams do nothing unless polled"] - pub(super) struct Paginate { - client: Client, - state: PaginateState, - } +// Implementing paginate() as a manually-implemented Stream instead of via +// async_stream lets us save about 4700 bytes on dandidav's top-level Futures. 
+#[pin_project] +#[must_use = "streams do nothing unless polled"] +pub(super) struct Paginate { + client: Client, + state: PaginateState, } enum PaginateState { diff --git a/src/dandi/types.rs b/src/dandi/types.rs index 5b5116b..0b3c8d2 100644 --- a/src/dandi/types.rs +++ b/src/dandi/types.rs @@ -13,7 +13,7 @@ pub(super) struct Page { } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct RawDandiset { +pub(super) struct RawDandiset { identifier: DandisetId, #[serde(with = "time::serde::rfc3339")] created: OffsetDateTime, @@ -54,7 +54,7 @@ pub(crate) struct Dandiset { } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct RawDandisetVersion { +pub(super) struct RawDandisetVersion { pub(super) version: VersionId, //name: String, //asset_count: u64, @@ -102,7 +102,7 @@ impl From for Vec { } } -// Item in a `/dandisets/{dandiset_id}/versions/{version_id}/assets/paths` +// Item in a `/dandisets/{dandiset_id}/versions/{version_id}/assets/paths/` // response #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(from = "RawFolderEntry")] @@ -131,7 +131,7 @@ impl From for FolderEntry { } } -// Raw item in a `/dandisets/{dandiset_id}/versions/{version_id}/assets/paths` +// Raw item in a `/dandisets/{dandiset_id}/versions/{version_id}/assets/paths/` // response #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] struct RawFolderEntry { @@ -220,14 +220,17 @@ impl ZarrAsset { } } - pub(crate) fn make_folder(&self, folder: S3Folder) -> ZarrFolder { + /// Return a `ZarrFolder` for the folder within this Zarr described by + /// `folder` + fn make_folder(&self, folder: S3Folder) -> ZarrFolder { ZarrFolder { zarr_path: self.path.clone(), path: folder.key_prefix, } } - pub(crate) fn make_entry(&self, obj: S3Object) -> ZarrEntry { + /// Return a `ZarrEntry` for the entry within this Zarr described by `obj` + fn make_entry(&self, obj: S3Object) -> ZarrEntry { ZarrEntry { zarr_path: self.path.clone(), path: obj.key, diff --git a/src/dav/html.rs 
b/src/dav/html.rs index e1a2e6e..3c07cfa 100644 --- a/src/dav/html.rs +++ b/src/dav/html.rs @@ -1,3 +1,4 @@ +//! Rendering resource listings as HTML documents use super::util::Href; use super::{DavCollection, DavItem, DavResource, ResourceKind}; use crate::consts::HTML_TIMESTAMP_FORMAT; @@ -9,58 +10,89 @@ use tera::{Context, Error, Filter, Tera, Value}; use thiserror::Error; use time::OffsetDateTime; +/// The [Tera](https://keats.github.io/tera/) template for HTML collection +/// views static COLLECTION_TEMPLATE: &str = include_str!("templates/collection.html.tera"); +/// A template manager pub(crate) struct Templater(Tera); impl Templater { + /// Create a new templater and load all templates into it + /// + /// # Errors + /// + /// If any template fails to load, a [`TemplateError::Load`] is returned. pub(crate) fn load() -> Result { let mut engine = Tera::default(); engine.register_filter("formatsize", FormatSizeFilter); engine .add_raw_template("collection.html", COLLECTION_TEMPLATE) .map_err(|source| TemplateError::Load { - path: "collection.html", + template_name: "collection.html", source, })?; Ok(Templater(engine)) } + /// Render an HTML document containing a table listing the resources in + /// `entries` using the site title `title`. `pathparts` contains the + /// individual components of the request URL path. 
pub(super) fn render_collection( + &self, + entries: Vec, + title: &str, + pathparts: Vec, + ) -> Result { + self.render_collection_from_context(CollectionContext::new(entries, title, pathparts)) + } + + fn render_collection_from_context( &self, context: CollectionContext, ) -> Result { let context = Context::from_serialize(context).map_err(|source| TemplateError::MakeContext { - path: "collection.html", + template_name: "collection.html", source, })?; self.0 .render("collection.html", &context) .map_err(|source| TemplateError::Render { - path: "collection.html", + template_name: "collection.html", source, }) } } +/// Context to provide to the `collection.html` template #[derive(Clone, Debug, Eq, PartialEq, Serialize)] -pub(super) struct CollectionContext { - pub(super) title: String, - pub(super) breadcrumbs: Vec, - pub(super) rows: Vec, - pub(super) package_url: &'static str, - pub(super) package_version: &'static str, +struct CollectionContext { + /// Page title + title: String, + + /// Breadcrumb links + breadcrumbs: Vec, + + /// Rows of the table + rows: Vec, + + /// URL to link "dandidav" in the page's footer to + package_url: &'static str, + + /// `dandidav` version + package_version: &'static str, + + /// Current `dandidav` commit hash (if known) #[serde(skip_serializing_if = "Option::is_none")] - pub(super) package_commit: Option<&'static str>, + package_commit: Option<&'static str>, } impl CollectionContext { - pub(super) fn new( - entries: Vec, - title: &str, - pathparts: Vec, - ) -> CollectionContext { + /// Construct the context for displaying the given `entries` using the site + /// title `title`. `pathparts` contains the individual components of the + /// request URL path. 
+ fn new(entries: Vec, title: &str, pathparts: Vec) -> CollectionContext { let mut rows = entries.into_iter().map(ColRow::from).collect::>(); rows.sort_unstable(); if let Some((_, pp)) = pathparts.split_last() { @@ -82,36 +114,59 @@ impl CollectionContext { } } +/// A hyperlink to display in an HTML document #[derive(Clone, Debug, Eq, PartialEq, Serialize)] -pub(super) struct Link { - name: String, +struct Link { + /// The text of the link + text: String, + + /// The value of the link's `href` attribute href: Href, } +/// A row of a table listing the resources within a collection #[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd, Serialize)] -pub(super) struct ColRow { +struct ColRow { + /// Resource basename name: String, + + /// URL to link the resource to href: Href, + + /// `true` iff the resource is a collection is_dir: bool, + + /// Type of resource kind: ResourceKind, + + /// The size of the resource #[serde(skip_serializing_if = "Option::is_none")] size: Option, + + /// The timestamp at which the resource was created #[serde( skip_serializing_if = "Option::is_none", serialize_with = "maybe_timestamp" )] created: Option, + + /// The timestamp at which the resource was last modified #[serde( skip_serializing_if = "Option::is_none", serialize_with = "maybe_timestamp" )] modified: Option, + + /// A URL for retrieving the resource's associated metadata (if any) from + /// the Archive instance #[serde(skip_serializing_if = "Option::is_none")] metadata_url: Option, } impl ColRow { - pub(super) fn parentdir(href: Href) -> ColRow { + /// Construct a `ColRow` representing the parent of the current collection, + /// served at `href` + fn parentdir(href: Href) -> ColRow { ColRow { name: "..".to_owned(), href, @@ -166,14 +221,30 @@ impl From for ColRow { #[derive(Debug, Error)] pub(crate) enum TemplateError { - #[error("failed to load template {path:?}")] - Load { path: &'static str, source: Error }, - #[error("failed to create context for template {path:?}")] - 
MakeContext { path: &'static str, source: Error }, - #[error("failed to render template {path:?}")] - Render { path: &'static str, source: Error }, + /// Failed to load a template + #[error("failed to load template {template_name:?}")] + Load { + template_name: &'static str, + source: Error, + }, + + /// Failed to create context for a template + #[error("failed to create context for template {template_name:?}")] + MakeContext { + template_name: &'static str, + source: Error, + }, + + /// Failed to render a template + #[error("failed to render template {template_name:?}")] + Render { + template_name: &'static str, + source: Error, + }, } +/// If `ts` is non-`None`, format it and serialize the resulting string to +/// `serializer` fn maybe_timestamp( ts: &Option, serializer: S, @@ -190,24 +261,29 @@ fn maybe_timestamp( } } +/// Create breadcrumbs for the given request URL path components. +/// +/// `title` is the site title, for use as the text of the first breadcrumb. fn make_breadcrumbs(title: &str, pathparts: Vec) -> Vec { let mut links = Vec::with_capacity(pathparts.len().saturating_add(1)); let mut cumpath = String::from("/"); links.push(Link { - name: title.to_owned(), + text: title.to_owned(), href: Href::from_path(&cumpath), }); for p in pathparts { cumpath.push_str(&p); cumpath.push('/'); links.push(Link { - name: p.into(), + text: p.into(), href: Href::from_path(&cumpath), }); } links } +/// Given an iterator of `&Component` values, join them together with forward +/// slashes and add a leading & trailing slash. fn abs_dir_from_components<'a, I>(iter: I) -> String where I: IntoIterator, @@ -220,6 +296,10 @@ where s } +/// A custom Tera filter for formatting file sizes. +/// +/// Unlike the `filesizeformat` filter built into Tera, this filter uses binary +/// units with unambiguous abbreviations, e.g., "10 KiB". 
#[derive(Copy, Clone, Debug, Eq, PartialEq)] struct FormatSizeFilter; @@ -236,6 +316,9 @@ impl Filter for FormatSizeFilter { } } +/// Format a file size in binary units using unambiguous abbreviations. +/// +/// This function is separate from `FormatSizeFilter` for testing purposes. fn formatsize(size: i64) -> String { format_size_i(size, BINARY) } @@ -261,7 +344,7 @@ mod tests { assert_eq!(formatsize(size), s); } - mod render_collection { + mod render_collection_from_context { use super::*; use crate::dav::{DavContent, DavResourceWithChildren}; use pretty_assertions::assert_eq; @@ -347,7 +430,7 @@ mod tests { "baz".parse().unwrap(), ], ); - let rendered = templater.render_collection(context).unwrap(); + let rendered = templater.render_collection_from_context(context).unwrap(); let commit_str = match option_env!("GIT_COMMIT") { Some(s) => Cow::from(format!(", commit {s}")), None => Cow::from(""), @@ -375,7 +458,7 @@ mod tests { panic!("DavResourceWithChildren::root() should be a Collection"); }; let context = CollectionContext::new(children, "Dandidav Test", Vec::new()); - let rendered = templater.render_collection(context).unwrap(); + let rendered = templater.render_collection_from_context(context).unwrap(); let commit_str = match option_env!("GIT_COMMIT") { Some(s) => Cow::from(format!(", commit {s}")), None => Cow::from(""), diff --git a/src/dav/mod.rs b/src/dav/mod.rs index f8c862b..d7086a9 100644 --- a/src/dav/mod.rs +++ b/src/dav/mod.rs @@ -1,3 +1,4 @@ +//! 
The WebDAV component of `dandidav` mod html; mod path; mod types; @@ -25,21 +26,48 @@ use futures_util::TryStreamExt; use std::convert::Infallible; use thiserror::Error; +/// HTTP headers to include in all responses for WebDAV resources const WEBDAV_RESPONSE_HEADERS: [(&str, &str); 2] = [ ("Allow", "GET, HEAD, OPTIONS, PROPFIND"), // ("DAV", "1, 3"), ]; +/// Manager for handling WebDAV requests pub(crate) struct DandiDav { + /// A client for fetching data from the Dandi Archive pub(crate) dandi: DandiClient, + + /// A client for fetching data from + /// pub(crate) zarrman: ZarrManClient, + + /// Manager for templating of HTML responses pub(crate) templater: Templater, + + /// Site title to display in HTML responses pub(crate) title: String, + + /// Whether `GET` requests for blob assets should be responded to with + /// redirects to S3 (`true`) or to Archive download URLs that then redirect + /// to S3 (`false`). The latter setting results in the final response + /// having a `Content-Disposition` header, so that the blob is downloaded + /// to the same filename as the asset, rather than to a file named after + /// the blob ID. On the other hand, certain WebDAV clients (i.e., davfs2) + /// do not support multi-step redirects, so setting this to `true` is + /// necessary to allow such clients to download from `dandidav`. pub(crate) prefer_s3_redirects: bool, } impl DandiDav { + /// Handle an incoming HTTP request and return a response. This method + /// must return `Result` for compatibility with `axum`. + /// + /// This method delegates almost all work to + /// [`DandiDav::inner_handle_request()`], after which it handles any + /// errors returned by logging them and converting them to 404 or 500 + /// responses, as appropriate. The final response also has + /// [`WEBDAV_RESPONSE_HEADERS`] added. 
pub(crate) async fn handle_request( &self, req: Request, @@ -60,14 +88,20 @@ impl DandiDav { Ok((WEBDAV_RESPONSE_HEADERS, resp).into_response()) } + /// Extract & parse request parameters from the URL path and (for + /// `PROPFIND`) "Depth" header and request body. The parsed parameters are + /// then passed to the appropriate method for the request's verb for + /// dedicated handling. async fn inner_handle_request(&self, req: Request) -> Result, DavError> { let uri_path = req.uri().path(); match req.method() { &Method::GET => { let Some(parts) = split_uri_path(uri_path) else { + // TODO: Log something return Ok(not_found()); }; let Some(path) = DavPath::from_components(parts.clone()) else { + // TODO: Log something return Ok(not_found()); }; self.get(&path, parts).await @@ -75,6 +109,7 @@ impl DandiDav { &Method::OPTIONS => Ok(StatusCode::NO_CONTENT.into_response()), m if m.as_str().eq_ignore_ascii_case("PROPFIND") => { let Some(path) = split_uri_path(uri_path).and_then(DavPath::from_components) else { + // TODO: Log something return Ok(not_found()); }; match req.extract::<(FiniteDepth, PropFind), _>().await { @@ -86,15 +121,21 @@ impl DandiDav { } } + /// Handle a `GET` request for the given `path`. + /// + /// `pathparts` contains the individual components of the request URL path + /// prior to parsing into `path`. It is needed for things like breadcrumbs + /// in HTML views of collection resources. async fn get( &self, path: &DavPath, pathparts: Vec, ) -> Result, DavError> { - match self.resolve_with_children(path).await? { + match self.get_resource_with_children(path).await? { DavResourceWithChildren::Collection { children, .. 
} => { - let context = CollectionContext::new(children, &self.title, pathparts); - let html = self.templater.render_collection(context)?; + let html = self + .templater + .render_collection(children, &self.title, pathparts)?; Ok(([(CONTENT_TYPE, HTML_CONTENT_TYPE)], html).into_response()) } DavResourceWithChildren::Item(DavItem { @@ -112,10 +153,16 @@ impl DandiDav { DavResourceWithChildren::Item(DavItem { content: DavContent::Missing, .. - }) => Ok(not_found()), + }) => { + // TODO: Log something + Ok(not_found()) + } } } + /// Handle a `PROPFIND` request for the given `path`. `depth` is the value + /// of the `Depth` header, and `query` is the parsed request body (with an + /// empty body already defaulted to "allprop" as per the RFC). async fn propfind( &self, path: &DavPath, @@ -123,16 +170,8 @@ impl DandiDav { query: PropFind, ) -> Result, DavError> { let resources = match depth { - FiniteDepth::Zero => vec![self.resolve(path).await?], - FiniteDepth::One => match self.resolve_with_children(path).await? { - DavResourceWithChildren::Collection { col, children } => { - let mut reses = Vec::with_capacity(children.len().saturating_add(1)); - reses.push(DavResource::from(col)); - reses.extend(children); - reses - } - DavResourceWithChildren::Item(item) => vec![DavResource::Item(item)], - }, + FiniteDepth::Zero => vec![self.get_resource(path).await?], + FiniteDepth::One => self.get_resource_with_children(path).await?.into_vec(), }; let response = resources .into_iter() @@ -146,6 +185,9 @@ impl DandiDav { .into_response()) } + /// Obtain a handler for fetching resources for the given version of the + /// given Dandiset. If `version` is `VersionSpec::Latest`, the most recent + /// published version of the Dandiset is used. 
async fn get_version_handler<'a>( &'a self, dandiset_id: &'a DandisetId, @@ -171,7 +213,8 @@ impl DandiDav { }) } - async fn resolve(&self, path: &DavPath) -> Result { + /// Get details on the resource at the given `path` + async fn get_resource(&self, path: &DavPath) -> Result { match path { DavPath::Root => Ok(DavResource::root()), DavPath::DandisetIndex => Ok(DavResource::Collection(DavCollection::dandiset_index())), @@ -222,7 +265,12 @@ impl DandiDav { } } - async fn resolve_with_children( + /// Get details on the resource at the given `path` along with its + /// immediate child resources (if any). + /// + /// If `path` points to a Dandiset version, the child resources will + /// include `dandiset.yaml` as a virtual asset. + async fn get_resource_with_children( &self, path: &DavPath, ) -> Result { @@ -325,6 +373,10 @@ impl DandiDav { } } +/// A handler for fetching resources belonging to a certain Dandiset & version. +/// +/// Resources returned by this type's methods all have their paths prefixed +/// with the path to the Dandiset & version. 
#[derive(Clone, Debug)] struct VersionHandler<'a> { dandiset_id: &'a DandisetId, @@ -333,12 +385,15 @@ struct VersionHandler<'a> { } impl VersionHandler<'_> { + /// Get details on the version itself as a collection sans children async fn get(&self) -> Result { let v = self.endpoint.get().await?; let path = version_path(self.dandiset_id, self.version_spec); Ok(DavCollection::dandiset_version(v, path)) } + /// Get details on all resources at the root of the version's file tree + /// (not including the `dandiset.yaml` file) async fn get_root_children(&self) -> Result, DandiError> { self.endpoint .get_root_children() @@ -349,16 +404,20 @@ impl VersionHandler<'_> { .await } + /// Get the version's `dandiset.yaml` file async fn get_dandiset_yaml(&self) -> Result { let md = self.endpoint.get_metadata().await?; Ok(DavItem::from(md).under_version_path(self.dandiset_id, self.version_spec)) } + /// Get details on the resource at the given `path` async fn get_resource(&self, path: &PurePath) -> Result { let res = self.endpoint.get_resource(path).await?; Ok(DavResource::from(res).under_version_path(self.dandiset_id, self.version_spec)) } + /// Get details on the resource at the given `path` along with its + /// immediate child resources (if any) async fn get_resource_with_children( &self, path: &PurePath, @@ -386,6 +445,7 @@ pub(crate) enum DavError { } impl DavError { + /// Was the error ultimately caused by something not being found? pub(crate) fn is_404(&self) -> bool { match self { DavError::Dandi(e) => e.is_404(), @@ -396,6 +456,7 @@ impl DavError { } } +/// Generate a 404 response fn not_found() -> Response { (StatusCode::NOT_FOUND, "404\n").into_response() } diff --git a/src/dav/path.rs b/src/dav/path.rs index 0092fff..85d91e9 100644 --- a/src/dav/path.rs +++ b/src/dav/path.rs @@ -1,37 +1,72 @@ +//! 
Parsing request paths use crate::consts::FAST_NOT_EXIST; use crate::dandi::{DandisetId, PublishedVersionId}; use crate::paths::{Component, ParseComponentError, PurePath}; +/// A parsed request path #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum DavPath { + /// The root of the hierarchy served by `dandidav` Root, + + /// The list of Dandisets at `/dandisets/` DandisetIndex, - Dandiset { - dandiset_id: DandisetId, - }, - DandisetReleases { - dandiset_id: DandisetId, - }, + + /// A listing for a Dandiset at `/dandiset/{dandiset_id}/` + Dandiset { dandiset_id: DandisetId }, + + /// A listing for a Dandiset's published versions at + /// `/dandiset/{dandiset_id}/releases/` + DandisetReleases { dandiset_id: DandisetId }, + + /// A listing of the top level of a Dandiset version's file hierarchy + /// + /// This corresponds to the following request paths: + /// + /// - `/dandiset/{dandiset_id}/draft/` + /// - `/dandiset/{dandiset_id}/latest/` + /// - `/dandiset/{dandiset_id}/releases/{version_id}/` Version { + /// The Dandiset ID dandiset_id: DandisetId, + + /// The version specifier version: VersionSpec, }, + + /// The `dandiset.yaml` file for a given Dandiset version, served at the + /// path `dandiset.yaml` immediately beneath each version path DandisetYaml { + /// The Dandiset ID dandiset_id: DandisetId, + + /// The version specifier version: VersionSpec, }, + + /// Any other path beneath a Dandiset version path DandiResource { + /// The Dandiset ID dandiset_id: DandisetId, + + /// The version specifier version: VersionSpec, + + /// The portion of the path after the version specifier path: PurePath, }, + + /// The top of the Zarr manifest tree at `/zarrs/` ZarrIndex, - ZarrPath { - path: PurePath, - }, + + /// A path beneath `/zarrs/` + ZarrPath { path: PurePath }, } impl DavPath { + /// Parse a sequence of request path components into a `DavPath`. + /// + /// Returns `None` if the request path is invalid/does not exist. 
pub(super) fn from_components(parts: Vec) -> Option { let mut iter = parts.into_iter(); let Some(p1) = iter.next() else { @@ -88,13 +123,29 @@ impl DavPath { } } +/// A Dandiset version as specified in a request path #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum VersionSpec { + /// Draft version Draft, + + /// Published version Published(PublishedVersionId), + + /// Most recent published version Latest, } +/// Given a request path `path`, percent-decode it as UTF-8 and split it into +/// its path components/path segments. +/// +/// Splitting is performed on runs of forward slashes after stripping leading & +/// trailing slashes. Single-dot components are ignored. Double-dot +/// components are discarded along with the immediately preceding components. +/// +/// Returns `None` if the path is invalid (i.e., cannot be percent-decoded or +/// contains a NUL character) or if any component is accepted by +/// [`is_fast_not_exist()`]. pub(super) fn split_uri_path(s: &str) -> Option> { // TODO: Convert decoding-failures into DavError: let path = percent_encoding::percent_decode_str(s).decode_utf8().ok()?; @@ -120,10 +171,13 @@ pub(super) fn split_uri_path(s: &str) -> Option> { Some(parts) } +/// An iterator over the substrings of a given string, separated by runs of +/// forward slashes after stripping leading & trailing slashes #[derive(Clone, Debug, Eq, PartialEq)] struct SplitComponents<'a>(&'a str); impl<'a> SplitComponents<'a> { + /// Construct an iterator over the components of `s` fn new(s: &'a str) -> Self { SplitComponents(s.trim_start_matches('/')) } @@ -148,6 +202,8 @@ impl<'a> Iterator for SplitComponents<'a> { impl std::iter::FusedIterator for SplitComponents<'_> {} +/// Returns `true` if the request path component `s` should be treated as +/// nonexistent without having to make any requests to outside services fn is_fast_not_exist(s: &str) -> bool { let s = s.to_ascii_lowercase(); FAST_NOT_EXIST.binary_search(&&*s).is_ok() diff --git 
a/src/dav/templates/collection.html.tera b/src/dav/templates/collection.html.tera index 3299543..d7adad8 100644 --- a/src/dav/templates/collection.html.tera +++ b/src/dav/templates/collection.html.tera @@ -7,7 +7,7 @@ diff --git a/src/dav/types.rs b/src/dav/types.rs index 52e7fb4..fb7d9a2 100644 --- a/src/dav/types.rs +++ b/src/dav/types.rs @@ -10,17 +10,47 @@ use serde::{ser::Serializer, Serialize}; use time::OffsetDateTime; use url::Url; +/// Trait for querying the values of WebDAV properties from WebDAV resources +/// +/// If a property is queried on a resource that does not have it defined, the +/// query method should return `None`. #[enum_dispatch] pub(super) trait HasProperties { + /// Return the value of the "href" element to use in a "response" for this + /// resource. + /// + /// For `dandidav`, this is the absolute path at which the resource is + /// served. + /// + /// This is technically not a WebDAV property, but it's close enough for + /// our purposes. fn href(&self) -> Href; + + /// Return the value of the "creationdate" property in RFC 3339 format fn creationdate(&self) -> Option; + + /// Return the value of the "displayname" property + /// + /// For `dandidav`, this is the same as the resource's filename. fn displayname(&self) -> Option; + + /// Return the value of the "getcontentlength" property fn getcontentlength(&self) -> Option; + + /// Return the value of the "getcontenttype" property fn getcontenttype(&self) -> Option; + + /// Return the value of the "getetag" property fn getetag(&self) -> Option; + + /// Return the value of the "getlastmodified" property in RFC 1123 format fn getlastmodified(&self) -> Option; + + /// Return `true` iff this is a collection resource fn is_collection(&self) -> bool; + /// Return the value of the given property. `Property::Custom` inputs will + /// always evaluate to `None`. 
fn property(&self, prop: &Property) -> Option { match prop { Property::CreationDate => self.creationdate().map(Into::into), @@ -36,11 +66,12 @@ pub(super) trait HasProperties { Some(PropValue::Empty) } } - Property::Custom { .. } => None, + Property::Custom(_) => None, } } } +/// Information about a WebDAV resource, not including child resources #[allow(clippy::large_enum_variant)] #[enum_dispatch(HasProperties)] #[derive(Clone, Debug, Eq, PartialEq)] @@ -50,10 +81,16 @@ pub(super) enum DavResource { } impl DavResource { + /// Construct a `DavResource` representing the root of the hierarchy served + /// by `dandidav` pub(super) fn root() -> Self { DavResource::Collection(DavCollection::root()) } + /// Prefix the resource's path with the path at which `dandidav` serves the + /// given Dandiset & version under `/dandisets/`. + /// + /// See [`version_path()`] for more information. pub(super) fn under_version_path( self, dandiset_id: &DandisetId, @@ -93,16 +130,23 @@ impl From for DavResource { } } +/// Information about a WebDAV resource and its immediate child resources (if +/// any) #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum DavResourceWithChildren { Collection { + /// A collection resource col: DavCollection, + + /// The child resources of the collection children: Vec, }, Item(DavItem), } impl DavResourceWithChildren { + /// Construct a `DavResourceWithChildren` representing the root of the + /// hierarchy served by `dandidav` pub(super) fn root() -> Self { DavResourceWithChildren::Collection { col: DavCollection::root(), @@ -113,6 +157,11 @@ impl DavResourceWithChildren { } } + /// Prefix the paths of the resource and its child resources with the path + /// at which `dandidav` serves the given Dandiset & version under + /// `/dandisets/`. + /// + /// See [`version_path()`] for more information. 
pub(super) fn under_version_path( self, dandiset_id: &DandisetId, @@ -133,6 +182,19 @@ impl DavResourceWithChildren { } } } + + /// Convert to a `Vec` of all `DavResource`s represented within `self` + pub(super) fn into_vec(self) -> Vec { + match self { + DavResourceWithChildren::Collection { col, children } => { + let mut vec = Vec::with_capacity(children.len().saturating_add(1)); + vec.push(DavResource::from(col)); + vec.extend(children); + vec + } + DavResourceWithChildren::Item(item) => vec![DavResource::Item(item)], + } + } } impl From for DavResourceWithChildren { @@ -192,21 +254,45 @@ impl From for DavResourceWithChildren { } } +/// Information on a collection resource #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct DavCollection { - pub(super) path: Option, // None for root collection + /// The path at which the collection is served by `dandidav`. This is + /// `None` iff the collection is the root collection. + /// + /// Note that collections inside a Dandiset version need to have + /// `under_version_path()` called on them in order for `path` to be + /// complete. + pub(super) path: Option, + + /// The timestamp at which the resource was created pub(super) created: Option, + + /// The timestamp at which the resource was last modified pub(super) modified: Option, + + /// The size of the resource. + /// + /// When defined, this is the sum of the sizes of all descendant + /// non-collection resources within the collection. 
pub(super) size: Option, + + /// The type of resource, for display in the "Type" column of HTML tables pub(super) kind: ResourceKind, + + /// A URL for retrieving the resource's associated metadata (if any) from + /// the Archive instance pub(super) metadata_url: Option, } impl DavCollection { + /// Return the base name of the resource's path pub(super) fn name(&self) -> Option<&str> { self.path.as_ref().map(PureDirPath::name_str) } + /// Return the link to use for the resource in the HTML view of its parent + /// collection as an absolute URL path (including leading slash) pub(super) fn web_link(&self) -> Href { match self.path { Some(ref p) => Href::from_path(&format!("/{p}")), @@ -214,6 +300,10 @@ impl DavCollection { } } + /// Prefix the resource's path with the path at which `dandidav` serves the + /// given Dandiset & version under `/dandisets/`. + /// + /// See [`version_path()`] for more information. pub(super) fn under_version_path( mut self, dandiset_id: &DandisetId, @@ -228,6 +318,8 @@ impl DavCollection { self } + /// Construct a `DavCollection` representing the root of the hierarchy + /// served by `dandidav` pub(super) fn root() -> Self { DavCollection { path: None, @@ -239,6 +331,8 @@ impl DavCollection { } } + /// Construct a `DavCollection` representing the list of Dandisets at + /// `/dandisets/` pub(super) fn dandiset_index() -> Self { DavCollection { path: Some( @@ -254,6 +348,8 @@ impl DavCollection { } } + /// Construct a `DavCollection` representing the listing for the given + /// Dandiset's published versions at `/dandiset/{dandiset_id}/releases/` pub(super) fn dandiset_releases(dandiset_id: &DandisetId) -> Self { DavCollection { path: Some( @@ -268,6 +364,8 @@ impl DavCollection { } } + /// Construct a `DavCollection` representing the Dandiset version `v` + /// as served at path `path` pub(super) fn dandiset_version(v: DandisetVersion, path: PureDirPath) -> Self { DavCollection { path: Some(path), @@ -279,6 +377,8 @@ impl DavCollection { } 
} + /// Construct a `DavCollection` representing the top of the Zarr manifest + /// tree at `/zarrs/` pub(super) fn zarr_index() -> Self { DavCollection { path: Some( @@ -423,24 +523,46 @@ impl From for DavCollection { } } +/// Information on a non-collection resource #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct DavItem { + /// The path at which the resource is served by `dandidav` pub(super) path: PurePath, + + /// The timestamp at which the resource was created pub(super) created: Option, + + /// The timestamp at which the resource was last modified pub(super) modified: Option, + + /// The resource's Content-Type/MIME type pub(super) content_type: String, + + /// The size of the resource pub(super) size: Option, + + /// The resource's ETag pub(super) etag: Option, + + /// The type of resource, for display in the "Type" column of HTML tables pub(super) kind: ResourceKind, + + /// The content of the resource or a link to it pub(super) content: DavContent, + + /// A URL for retrieving the resource's associated metadata (if any) from + /// the Archive instance pub(super) metadata_url: Option, } impl DavItem { + /// Return the base name of the resource's path pub(super) fn name(&self) -> &str { self.path.name_str() } + /// Return the link to use for the resource in the HTML view of its parent + /// collection as an absolute URL path (including leading slash) pub(super) fn web_link(&self) -> Href { if let DavContent::Redirect(ref redir) = self.content { // Link directly to the download URL in the web view in order to @@ -451,6 +573,10 @@ impl DavItem { } } + /// Prefix the resource's path with the path at which `dandidav` serves the + /// given Dandiset & version under `/dandisets/`. + /// + /// See [`version_path()`] for more information. 
pub(super) fn under_version_path( mut self, dandiset_id: &DandisetId, @@ -579,21 +705,40 @@ impl From for DavItem { } } +/// The content of a non-collection resource or a link thereto #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum DavContent { + /// The raw content to serve in response to a `GET` request for the + /// resource. + /// + /// This is only used for `dandiset.yaml` resources, for which the content + /// is automatically generated by `dandidav`. Blob(Vec), + + /// A URL that `dandidav` should redirect to when a `GET` request is made + /// for the resource Redirect(Redirect), - // Used when a blob asset lacks an S3 download URL + + /// No download URL could be determined for the resource Missing, } +/// A URL or choice of URLs to redirect a request to #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum Redirect { + /// A single URL to always redirect to Direct(Url), + + /// An S3 URL and an Archive instance URL, to be selected between based on + /// whether `--prefer-s3-redirects` was supplied at program invocation Alt { s3: Url, archive: Url }, } impl Redirect { + /// Resolve to a single URL. + /// + /// If `prefer_s3` is `true`, `Alt` variants resolve to their `s3` field; + /// otherwise, they resolve to their `archive` field. 
pub(super) fn get_url(&self, prefer_s3: bool) -> &Url { match self { Redirect::Direct(u) => u, @@ -608,24 +753,49 @@ impl Redirect { } } -// For use in rendering the "Type" column in HTML views +/// An enumeration of resource types for use in the "Type" column of HTML views #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub(super) enum ResourceKind { + /// The root of the hierarchy served by `dandidav` Root, + + /// Link to parent directory Parent, + + /// The list of Dandisets at `/dandisets/` DandisetIndex, + + /// A listing for a Dandiset at `/dandiset/{dandiset_id}/` Dandiset, + + /// A listing for a Dandiset's published versions at + /// `/dandiset/{dandiset_id}/releases/` DandisetReleases, + + /// A listing of the top level of a Dandiset version's file hierarchy Version, + + /// The `dandiset.yaml` file for a Dandiset version VersionMetadata, + + /// A generic directory Directory, + + /// A blob asset Blob, + + /// A Zarr asset Zarr, + + /// A Zarr entry ZarrEntry, + + /// The top of the Zarr manifest tree at `/zarrs/` ZarrIndex, } impl ResourceKind { + /// Return a human-readable string to display in a "Type" column pub(super) fn as_str(&self) -> &'static str { match self { ResourceKind::Root => "Root", // Not actually shown diff --git a/src/dav/util.rs b/src/dav/util.rs index c3946e7..3bc11b6 100644 --- a/src/dav/util.rs +++ b/src/dav/util.rs @@ -19,11 +19,16 @@ use time::{ OffsetDateTime, }; +/// Timestamp format for display of the "getlastmodified" property in WebDAV +/// XML documents static RFC1123: &[FormatItem<'_>] = format_description!( "[weekday repr:short], [day] [month repr:short] [year] [hour]:[minute]:[second] GMT" ); -// Selection of safe characters based on Python's `urllib.parse.quote()` +/// ASCII bytes in "href" values to percent-encode +/// +/// The character set is based on the behavior of Python's +/// `urllib.parse.quote()` static PERCENT_ESCAPED: &AsciiSet = &NON_ALPHANUMERIC .remove(b'-') .remove(b'.') @@ -31,6 
+36,8 @@ static PERCENT_ESCAPED: &AsciiSet = &NON_ALPHANUMERIC .remove(b'-') .remove(b'.') .remove(b'_') .remove(b'~'); +/// Response body to return in reply to `PROPFIND` requests with missing or +/// "infinite" `Depth` headers static INFINITE_DEPTH_RESPONSE: &str = indoc! {r#" @@ -38,6 +45,14 @@ static INFINITE_DEPTH_RESPONSE: &str = indoc! {r#" "#}; +/// Return the path at which `dandidav` serves the given Dandiset & version +/// under `/dandisets/`. +/// +/// The returned value will have one of the following formats: +/// +/// - `dandisets/{dandiset_id}/draft/` +/// - `dandisets/{dandiset_id}/latest/` +/// - `dandisets/{dandiset_id}/releases/{version_id}/` pub(super) fn version_path(dandiset_id: &DandisetId, version: &VersionSpec) -> PureDirPath { fn writer(s: &mut String, dandiset_id: &DandisetId, version: &VersionSpec) -> fmt::Result { write!(s, "dandisets/{dandiset_id}/")?; @@ -55,17 +70,22 @@ pub(super) fn version_path(dandiset_id: &DandisetId, version: &VersionSpec) -> P PureDirPath::try_from(s).expect("should be a valid dir path") } +/// Format a timestamp for display as a "creationdate" property in a WebDAV XML +/// document pub(super) fn format_creationdate(dt: OffsetDateTime) -> String { dt.format(&Rfc3339) .expect("formatting an OffsetDateTime in RFC 3339 format should not fail") } +/// Format a timestamp for display as a "getlastmodified" property in a WebDAV +/// XML document pub(super) fn format_modifieddate(dt: OffsetDateTime) -> String { dt.to_offset(time::UtcOffset::UTC) .format(&RFC1123) .expect("formatting an OffsetDateTime in RFC 1123 format should not fail") } +/// A non-infinite `Depth` WebDAV header value #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub(super) enum FiniteDepth { Zero, diff --git a/src/dav/xml/mod.rs b/src/dav/xml/mod.rs index b29b383..5c1c134 100644 --- a/src/dav/xml/mod.rs +++ b/src/dav/xml/mod.rs @@ -1,3 +1,4 @@ +//! 
Working with WebDAV XML documents mod multistatus; mod propfind; pub(super) use self::multistatus::*; diff --git a/src/httputil.rs b/src/httputil.rs index 6451f1f..739bb34 100644 --- a/src/httputil.rs +++ b/src/httputil.rs @@ -1,3 +1,4 @@ +//! HTTP utilities use crate::consts::USER_AGENT; use reqwest::{Method, Request, Response, StatusCode}; use reqwest_middleware::{Middleware, Next}; @@ -8,10 +9,16 @@ use thiserror::Error; use tracing::Instrument; use url::Url; +/// An HTTP client that logs all requests and retries failed requests #[derive(Debug, Clone)] pub(crate) struct Client(reqwest_middleware::ClientWithMiddleware); impl Client { + /// Construct a new client + /// + /// # Errors + /// + /// Returns an error if construction of the inner `reqwest::Client` fails pub(crate) fn new() -> Result { let retry_policy = ExponentialBackoff::builder() .base(2) @@ -29,6 +36,12 @@ impl Client { Ok(Client(client)) } + /// Perform an HTTP request with the given method to the given URL + /// + /// # Errors + /// + /// If sending the request fails or the response has a 4xx or 5xx status, + /// an error is returned. pub(crate) async fn request(&self, method: Method, url: Url) -> Result { let r = self .0 @@ -46,14 +59,33 @@ impl Client { .map_err(|source| HttpError::Status { url, source }) } + /// Perform a `HEAD` request to the given URL + /// + /// # Errors + /// + /// If sending the request fails or the response has a 4xx or 5xx status, + /// an error is returned. pub(crate) async fn head(&self, url: Url) -> Result { self.request(Method::HEAD, url).await } + /// Perform a `GET` request to the given URL + /// + /// # Errors + /// + /// If sending the request fails or the response has a 4xx or 5xx status, + /// an error is returned. 
pub(crate) async fn get(&self, url: Url) -> Result { self.request(Method::GET, url).await } + /// Perform a `GET` request to the given URL and deserialize the response + /// body as JSON into `T` + /// + /// # Errors + /// + /// If sending the request fails, the response has a 4xx or 5xx status, or + /// deserialization of the response body fails, an error is returned. pub(crate) fn get_json( &self, url: Url, @@ -61,7 +93,7 @@ impl Client { // Clone the client and move it into an async block (as opposed to just // writing a "normal" async function) so that the resulting Future will // be 'static rather than retaining a reference to &self, thereby - // facilitating the Future's use by the Paginate stream. + // simplifying the Future's use by the Paginate stream. let client = self.clone(); async move { client @@ -74,6 +106,8 @@ impl Client { } } +/// Middleware for a `reqwest::Client` that adds logging of HTTP requests and +/// their responses #[derive(Copy, Clone, Debug, Eq, PartialEq)] struct SimpleReqwestLogger; @@ -101,25 +135,45 @@ impl Middleware for SimpleReqwestLogger { } } +/// Error returned if initializing an HTTP client fails #[derive(Debug, Error)] #[error("failed to initialize HTTP client")] pub(crate) struct BuildClientError(#[from] reqwest::Error); +/// Error returned if an outgoing HTTP request fails #[derive(Debug, Error)] pub(crate) enum HttpError { + /// Sending the request failed #[error("failed to make request to {url}")] Send { url: Url, source: reqwest_middleware::Error, }, + + /// The server returned a 404 response #[error("no such resource: {url}")] NotFound { url: Url }, + + /// The server returned a 4xx or 5xx response other than 404 #[error("request to {url} returned error")] Status { url: Url, source: reqwest::Error }, + + /// Deserializing the response body as JSON failed #[error("failed to deserialize response body from {url}")] Deserialize { url: Url, source: reqwest::Error }, } +/// Create a URL by extending `url`'s path with the 
path segments `segments`. +/// The resulting URL will not end with a slash (but see +/// [`urljoin_slashed()`]). +/// +/// If `url` does not end with a forward slash, one will be appended, and then +/// the segments will be added after that. +/// +/// # Panics +/// +/// Panics if `url` cannot be a base URL. (Note that HTTP(S) URLs can be base +/// URLs.) pub(crate) fn urljoin(url: &Url, segments: I) -> Url where I: IntoIterator, @@ -133,6 +187,16 @@ where url } +/// Create a URL by extending `url`'s path with the path segments `segments` +/// and then terminating the result with a forward slash. +/// +/// If `url` does not end with a forward slash, one will be appended, and then +/// the segments will be added after that. +/// +/// # Panics +/// +/// Panics if `url` cannot be a base URL. (Note that HTTP(S) URLs can be base +/// URLs.) pub(crate) fn urljoin_slashed(url: &Url, segments: I) -> Url where I: IntoIterator, diff --git a/src/main.rs b/src/main.rs index 3a94dbf..4a1104f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -36,6 +36,7 @@ use tower_http::{set_header::response::SetResponseHeaderLayer, trace::TraceLayer use tracing::Level; use tracing_subscriber::{filter::Targets, fmt::time::OffsetTime, prelude::*}; +/// The content of the CSS stylesheet to serve at `/.static/styles.css` static STYLESHEET: &str = include_str!("dav/static/styles.css"); /// WebDAV view to DANDI Archive @@ -140,6 +141,8 @@ async fn run() -> anyhow::Result<()> { Ok(()) } +/// Handle `HEAD` requests by converting them to `GET` requests and discarding +/// the resulting response body async fn handle_head(method: Method, mut request: Request, next: Next) -> Response { if method == Method::HEAD { *request.method_mut() = Method::GET; diff --git a/src/paths/mod.rs b/src/paths/mod.rs index 2e5f24d..8359fbb 100644 --- a/src/paths/mod.rs +++ b/src/paths/mod.rs @@ -1,3 +1,4 @@ +//! 
Path types with restricted formats mod component; mod dirpath; mod purepath; diff --git a/src/s3/mod.rs b/src/s3/mod.rs index e50d9bd..b55b9cd 100644 --- a/src/s3/mod.rs +++ b/src/s3/mod.rs @@ -1,3 +1,4 @@ +//! Facilities for retrieving information from an S3 bucket mod streams; use self::streams::ListEntryPages; use crate::httputil::{self, BuildClientError, HttpError}; @@ -160,6 +161,18 @@ pub(crate) struct S3Location { } impl S3Location { + /// Parse an S3 URL into an `S3Location`. The URL must have a scheme of + /// "http" or "https" and have a domain in one of the following formats: + /// + /// - `{bucket}.s3.{region}.amazonaws.com` + /// - `{bucket}.s3-{region}.amazonaws.com` + /// - `{bucket}.s3.amazonaws.com` + /// + /// The bucket and optional region are extracted from the domain and used + /// to construct the `bucket_spec` field of the resulting `S3Location`. + /// + /// The path component of the URL has its leading forward slash (if any) + /// stripped and is then percent-decoded to produce the `key` field. pub(crate) fn parse_url(url: &Url) -> Result { // cf. if !matches!(url.scheme(), "http" | "https") { diff --git a/src/streamutil.rs b/src/streamutil.rs index 5503002..b079733 100644 --- a/src/streamutil.rs +++ b/src/streamutil.rs @@ -1,9 +1,15 @@ +//! Extensions for stream types use futures_util::{Stream, TryStream}; -use pin_project_lite::pin_project; +use pin_project::pin_project; use std::pin::Pin; use std::task::{ready, Context, Poll}; +/// Extension methods for [`futures_util::TryStream`] pub(crate) trait TryStreamUtil: TryStream { + /// Wraps the current stream in a new stream that maps the success values + /// through `f` to produce an iterator; the success values of the new + /// stream will then be the elements of the concatenation of those + /// iterators. fn try_flat_iter_map(self, f: F) -> TryFlatIterMap where F: FnMut(Self::Ok) -> I, @@ -16,18 +22,27 @@ pub(crate) trait TryStreamUtil: TryStream { impl TryStreamUtil for S {} -pin_project! 
{ - #[derive(Clone, Debug)] - #[must_use = "streams do nothing unless polled"] - pub(crate) struct TryFlatIterMap { - #[pin] - inner: S, - f: F, - iter: Option, - } +/// Return type of [`TryStreamUtil::try_flat_iter_map()`] +#[derive(Clone, Debug)] +// We need to use pin_project instead of pin_project_lite because the latter +// doesn't seem to support comments on fields. +#[pin_project] +#[must_use = "streams do nothing unless polled"] +pub(crate) struct TryFlatIterMap { + /// The stream that `try_flat_iter_map()` was called on + #[pin] + inner: S, + + /// The function passed to `try_flat_iter_map()` + f: F, + + /// The iterator produced by the current success element of `inner`, if any + /// and if not yet exhausted + iter: Option, } impl TryFlatIterMap { + /// Construct a `TryFlatIterMap` for a call to `inner.try_flat_iter_map(f)` fn new(inner: S, f: F) -> Self { TryFlatIterMap { inner, @@ -63,3 +78,27 @@ where } } } + +#[cfg(test)] +mod tests { + use super::*; + use futures_util::TryStreamExt; + + #[tokio::test] + async fn test_try_flat_iter_map() { + let mut stream = futures_util::stream::iter(vec![Ok(5), Ok(2), Err(42), Ok(3)]) + .try_flat_iter_map(|x| 0..x); + assert_eq!(stream.try_next().await, Ok(Some(0))); + assert_eq!(stream.try_next().await, Ok(Some(1))); + assert_eq!(stream.try_next().await, Ok(Some(2))); + assert_eq!(stream.try_next().await, Ok(Some(3))); + assert_eq!(stream.try_next().await, Ok(Some(4))); + assert_eq!(stream.try_next().await, Ok(Some(0))); + assert_eq!(stream.try_next().await, Ok(Some(1))); + assert_eq!(stream.try_next().await, Err(42)); + assert_eq!(stream.try_next().await, Ok(Some(0))); + assert_eq!(stream.try_next().await, Ok(Some(1))); + assert_eq!(stream.try_next().await, Ok(Some(2))); + assert_eq!(stream.try_next().await, Ok(None)); + } +} diff --git a/src/validstr.rs b/src/validstr.rs index e21cc1a..1265756 100644 --- a/src/validstr.rs +++ b/src/validstr.rs @@ -1,12 +1,49 @@ +//! 
Autogenerating trait implementations for constrained strings use thiserror::Error; +/// The `Error` type of `TryFrom<String> for $t` implementations generated by +/// `validstr!()` #[derive(Clone, Debug, Eq, Error, PartialEq)] #[error("{source}: {string:?}")] pub(crate) struct TryFromStringError { + /// The error returned by `$validator` pub(crate) source: E, + + /// The string that failed validation pub(crate) string: String, } +/// Autogenerate trait implementations for constrained/"validated" strings +/// +/// Given: +/// +/// - A type `$t` defined as a tuple struct whose only field is either `String` +/// or a compatible type like [`smartstring::alias::CompactString`] +/// +/// - An error type `$err` +/// +/// - A function name `$validator` with the signature `(&str) -> Result<(), +/// $err>` +/// +/// - A string `$expecting` +/// +/// `validstr!($t, $err, $validator, $expecting)` defines a number of trait +/// implementations for `$t` that allow for, among other things: +/// +/// - Converting `$t` and `&$t` to a `String` +/// +/// - Converting `&str` and `String` values to `$t` if & only if they are +/// accepted by `$validator` +/// +/// - The error types returned by these operations are `$err` and +/// `TryFromStringError<$err>`, respectively. +/// +/// - Comparing `$t` to string values +/// +/// - Dereferencing `$t` as a `&str` +/// +/// - Using serde to serialize `$t` to a string and deserialize `$t` from a +/// string (with the latter using `$expecting` as the "expecting" message) macro_rules! validstr { ($t:ident, $err:ty, $validator:ident, $expecting:literal) => { impl From<$t> for String {