diff --git a/scripts/generate_test_resource.sh b/scripts/generate_test_resource.sh
new file mode 100755
index 00000000..c22c2439
--- /dev/null
+++ b/scripts/generate_test_resource.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Convert the raw MOBNEW test datasets (rds) into h5ad test resources.
+viash run src/process_datasets/convert/config.vsh.yaml -- \
+  --input_sc resources_test/datasets_raw/MOBNEW/dataset_sc.rds \
+  --input_sp resources_test/datasets_raw/MOBNEW/dataset_sp.rds \
+  --output_sc resources_test/datasets/MOBNEW/dataset_sc.h5ad \
+  --output_sp resources_test/datasets/MOBNEW/dataset_sp.h5ad \
+  --dataset_id MOBNEW \
+  --dataset_name "MOBNEW" \
+  --dataset_description "MOBNEW" \
+  --dataset_summary "MOBNEW" \
+  --dataset_reference "..." \
+  --dataset_organism "mus_musculus"
diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml
deleted file mode 100644
index 0a5a05fc..00000000
--- a/src/api/file_common_dataset.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-#TODO: Change to the required and/or optional fields of the anndata
-type: file
-example: "resources_test/common/pancreas/dataset.h5ad"
-info:
-  label: "Common Dataset"
-  summary: A subset of the common dataset.
-  slots:
-    layers:
-      - type: integer
-        name: counts
-        description: Raw counts
-        required: true
-    uns:
-      - type: string
-        name: dataset_id
-        description: "A unique identifier for the dataset"
-        required: true
-      - name: dataset_name
-        type: string
-        description: Nicely formatted name.
-        required: true
-      - type: string
-        name: dataset_url
-        description: Link to the original source of the dataset.
-        required: false
-      - name: dataset_reference
-        type: string
-        description: Bibtex reference of the paper in which the dataset was published.
-        required: false
-      - name: dataset_summary
-        type: string
-        description: Short description of the dataset.
-        required: true
-      - name: dataset_description
-        type: string
-        description: Long description of the dataset.
-        required: true
-      - name: dataset_organism
-        type: string
-        description: The organism of the sample in the dataset.
-        required: false
diff --git a/src/api/file_dataset_sc.yaml b/src/api/file_dataset_sc.yaml
new file mode 100644
index 00000000..426b97eb
--- /dev/null
+++ b/src/api/file_dataset_sc.yaml
@@ -0,0 +1,65 @@
+type: file
+example: "resources_test/datasets/MOBNEW/dataset_sc.h5ad"
+info:
+  label: Single-cell dataset
+  summary: An unprocessed single-cell dataset as output by a dataset loader.
+  description: |
+    This dataset contains raw counts and metadata as output by a dataset loader.
+
+    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: cell_type
+        description: Classification of the cell type based on its characteristics and function within the tissue or organism.
+        required: false
+
+      - type: string
+        name: donor_id
+        description: Identifier for the donor from whom the cell sample is obtained.
+        required: false
+    var:
+      - type: string
+        name: feature_id
+        description: Unique identifier for the feature, usually a ENSEMBL gene id.
+        required: false
+      - type: string
+        name: feature_name
+        description: A human-readable name for the feature, usually a gene symbol.
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
diff --git a/src/api/file_dataset_sp.yaml b/src/api/file_dataset_sp.yaml
new file mode 100644
index 00000000..131a4c47
--- /dev/null
+++ b/src/api/file_dataset_sp.yaml
@@ -0,0 +1,81 @@
+type: file
+example: "resources_test/datasets/MOBNEW/dataset_sp.h5ad"
+info:
+  label: Spatial dataset
+  summary: An unprocessed spatial dataset as output by a dataset loader.
+  description: |
+    This dataset contains raw counts and metadata as output by a dataset loader.
+
+    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+      - type: double
+        name: logcounts
+        description: Log-transformed counts
+        required: true
+    obs:
+      - type: double
+        name: col
+        description: Column index of the cell in the spatial grid.
+        required: true
+      - type: double
+        name: row
+        description: Row index of the cell in the spatial grid.
+        required: true
+      - type: double
+        name: sizeFactor
+        description: Size factor for the cell.
+        required: true
+      - type: integer
+        name: spatial_cluster
+        description: Spatial cluster assignment for the cell.
+        required: true
+    var:
+      - type: string
+        name: feature_id
+        description: Unique identifier for the feature, usually a ENSEMBL gene id.
+        required: false
+      - type: string
+        name: feature_name
+        description: A human-readable name for the feature, usually a gene symbol.
+        required: true
+    obsm:
+      - type: double
+        name: celltype_proportions
+        description: Spot-by-celltype matrix of celltype proportions generated by CARD.
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
diff --git a/src/process_datasets/convert/config.vsh.yaml b/src/process_datasets/convert/config.vsh.yaml
new file mode 100644
index 00000000..cef9c7b4
--- /dev/null
+++ b/src/process_datasets/convert/config.vsh.yaml
@@ -0,0 +1,78 @@
+name: convert
+namespace: process_datasets
+description: Transform the figshare rds into an HDF5-backed AnnData file.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: file
+        name: --input_sc
+        description: Raw single-cell dataset
+        example: resources_test/datasets_raw/MOBNEW/dataset_sc.rds
+        required: true
+
+      - type: file
+        name: --input_sp
+        description: Raw spatial dataset
+        example: resources_test/datasets_raw/MOBNEW/dataset_sp.rds
+        required: true
+  - name: Outputs
+    arguments:
+      - type: file
+        name: --output_sc
+        description: Processed single-cell dataset
+        example: resources_test/datasets/MOBNEW/dataset_sc.h5ad
+        direction: output
+        required: true
+
+      - type: file
+        name: --output_sp
+        description: Processed spatial dataset
+        example: resources_test/datasets/MOBNEW/dataset_sp.h5ad
+        direction: output
+        required: true
+  - name: Dataset metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: A unique identifier for the dataset.
+        required: true
+      - type: string
+        name: --dataset_name
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - type: string
+        name: --dataset_reference
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - type: string
+        name: --dataset_summary
+        description: Short description of the dataset.
+        required: true
+      - type: string
+        name: --dataset_description
+        description: Long description of the dataset.
+        required: true
+      - type: string
+        name: --dataset_organism
+        description: Organism from which the dataset was derived.
+        required: true
+
+resources:
+  - type: r_script
+    path: script.R
+
+engines:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_images/r:1.1.0
+    setup:
+      - type: r
+        bioc: [SingleCellExperiment]
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/process_datasets/convert/script.R b/src/process_datasets/convert/script.R
new file mode 100644
index 00000000..9e6f7899
--- /dev/null
+++ b/src/process_datasets/convert/script.R
@@ -0,0 +1,100 @@
+suppressMessages(library(SingleCellExperiment, quietly = TRUE))
+
+## VIASH START
+# Placeholder values for interactive debugging; viash substitutes the
+# command-line arguments for this block when the component is built.
+par <- list(
+  # inputs
+  input_sc = "resources_test/datasets_raw/MOBNEW/dataset_sc.rds",
+  input_sp = "resources_test/datasets_raw/MOBNEW/dataset_sp.rds",
+
+  # outputs
+  output_sc = "resources_test/datasets/MOBNEW/dataset_sc.h5ad",
+  output_sp = "resources_test/datasets/MOBNEW/dataset_sp.h5ad",
+
+  # dataset metadata
+  dataset_id = "MOBNEW",
+  dataset_name = "MOBNEW",
+  dataset_description = "MOBNEW",
+  dataset_url = "...",
+  dataset_reference = "...",
+  dataset_summary = "...",
+  dataset_organism = "..."
+)
+## VIASH END
+
+cat("Read input files\n")
+input_sc <- readRDS(par$input_sc)
+input_sp <- readRDS(par$input_sp)
+
+cat("Single cell dataset:\n")
+print(input_sc)
+
+cat("Spatial dataset:\n")
+print(input_sp)
+
+# Dataset-level metadata stored in `uns` of both outputs. Arguments that
+# were not supplied are NULL; they are dropped rather than written as
+# empty entries.
+uns_metadata <- Filter(
+  Negate(is.null),
+  list(
+    dataset_id = par$dataset_id,
+    dataset_name = par$dataset_name,
+    dataset_description = par$dataset_description,
+    dataset_url = par$dataset_url,
+    dataset_reference = par$dataset_reference,
+    dataset_summary = par$dataset_summary,
+    dataset_organism = par$dataset_organism
+  )
+)
+
+cat("Transforming single cell into AnnData\n")
+# AnnData is observations x features, so the features x cells assay is
+# transposed.
+output_sc <- anndata::AnnData(
+  layers = list(
+    counts = Matrix::t(assay(input_sc, "counts"))
+  ),
+  obs = data.frame(
+    row.names = colnames(input_sc),
+    cell_type = colData(input_sc)$cellType,
+    donor_id = colData(input_sc)$sampleInfo
+  ),
+  var = data.frame(
+    row.names = rownames(input_sc),
+    feature_id = rownames(input_sc),
+    feature_name = rownames(input_sc)
+  ),
+  uns = uns_metadata
+)
+
+cat("Transforming spatial into AnnData\n")
+celltype_proportions <- metadata(input_sp)[["celltype_prop"]]
+
+output_sp <- anndata::AnnData(
+  layers = list(
+    counts = Matrix::t(assay(input_sp, "counts")),
+    logcounts = Matrix::t(assay(input_sp, "logcounts"))
+  ),
+  obs = data.frame(
+    row.names = colnames(input_sp),
+    col = colData(input_sp)$col,
+    row = colData(input_sp)$row,
+    sizeFactor = colData(input_sp)$sizeFactor,
+    spatial_cluster = colData(input_sp)$spatial.cluster
+  ),
+  var = data.frame(
+    row.names = rownames(input_sp),
+    feature_id = rownames(input_sp),
+    feature_name = rownames(input_sp)
+  ),
+  obsm = list(
+    celltype_proportions = celltype_proportions
+  ),
+  uns = uns_metadata
+)
+
+cat("Write output files\n")
+output_sc$write_h5ad(par$output_sc, compression = "gzip")
+output_sp$write_h5ad(par$output_sp, compression = "gzip")