add conversion component

openproblems-bio · Jul 8, 2024 · 3de84d0 · 3de84d0
1 parent 8c33533
commit 3de84d0
Show file tree

Hide file tree

Showing 6 changed files with 332 additions and 41 deletions.
diff --git a/scripts/generate_test_resource.sh b/scripts/generate_test_resource.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+viash run src/process_datasets/convert/config.vsh.yaml -- \
+  --input_sc resources_test/datasets_raw/MOBNEW/dataset_sc.rds \
+  --input_sp resources_test/datasets_raw/MOBNEW/dataset_sp.rds \
+  --output_sc resources_test/datasets/MOBNEW/dataset_sc.h5ad \
+  --output_sp resources_test/datasets/MOBNEW/dataset_sp.h5ad \
+  --dataset_id MOBNEW \
+  --dataset_name "MOBNEW" \
+  --dataset_description "MOBNEW" \
+  --dataset_summary "MOBNEW" \
+  --dataset_reference "..." \
+  --dataset_organism "mus_musculus"
diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml
diff --git a/src/api/file_dataset_sc.yaml b/src/api/file_dataset_sc.yaml
@@ -0,0 +1,65 @@
+# File format specification for the single-cell dataset (.h5ad).
+# The example path matches the output location used by
+# scripts/generate_test_resource.sh and the convert component's config.
+type: file
+example: "resources_test/datasets/MOBNEW/dataset_sc.h5ad"
+info:
+  label: Single-cell dataset
+  summary: An unprocessed single-cell dataset as output by a dataset loader.
+  description: |
+    This dataset contains raw counts and metadata as output by a dataset loader.
+
+    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: cell_type
+        description: Classification of the cell type based on its characteristics and function within the tissue or organism.
+        required: false
+
+      - type: string
+        name: donor_id
+        description: Identifier for the donor from whom the cell sample is obtained.
+        required: false
+    var:
+      - type: string
+        name: feature_id
+        description: Unique identifier for the feature, usually a ENSEMBL gene id.
+        required: false
+      - type: string
+        name: feature_name
+        description: A human-readable name for the feature, usually a gene symbol.
+        required: true
+    # Dataset-level metadata; mirrors the "Dataset metadata" arguments of the
+    # convert component.
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
diff --git a/src/api/file_dataset_sp.yaml b/src/api/file_dataset_sp.yaml
@@ -0,0 +1,80 @@
+# File format specification for the spatial dataset (.h5ad).
+# The example path matches the output location used by
+# scripts/generate_test_resource.sh and the convert component's config.
+type: file
+example: "resources_test/datasets/MOBNEW/dataset_sp.h5ad"
+info:
+  label: Spatial dataset
+  summary: An unprocessed spatial dataset as output by a dataset loader.
+  description: |
+    This dataset contains raw counts and metadata as output by a dataset loader.
+
+    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+      # Log-transformed values are real-valued, so this layer is a double.
+      - type: double
+        name: logcounts
+        description: Log-transformed counts
+        required: true
+    obs:
+      - type: double
+        name: col
+        description: Column index of the cell in the spatial grid.
+        required: true
+      - type: double
+        name: row
+        description: Row index of the cell in the spatial grid.
+        required: true
+      - type: double
+        name: sizeFactor
+        description: Size factor for the cell.
+        required: true
+      - type: integer
+        name: spatial_cluster
+        description: Spatial cluster assignment for the cell.
+        required: true
+    var:
+      - type: string
+        name: feature_id
+        description: Unique identifier for the feature, usually a ENSEMBL gene id.
+        required: false
+      - type: string
+        name: feature_name
+        description: A human-readable name for the feature, usually a gene symbol.
+        required: true
+    obsm:
+      # Uses `double` like the other real-valued slots in this file. The
+      # convert script always passes this slot to AnnData; NOTE(review):
+      # confirm every raw input carries `celltype_prop` before relying on
+      # `required: true`.
+      - type: double
+        name: celltype_proportions
+        description: Spot-by-celltype matrix of celltype proportions generated by CARD.
+        required: true
+    # Dataset-level metadata; mirrors the "Dataset metadata" arguments of the
+    # convert component.
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
diff --git a/src/process_datasets/convert/config.vsh.yaml b/src/process_datasets/convert/config.vsh.yaml
@@ -0,0 +1,78 @@
+# Viash component definition for process_datasets/convert.
+# Takes two .rds inputs (single-cell and spatial), writes two .h5ad outputs,
+# and attaches the dataset metadata arguments below; the logic is in script.R.
+name: convert
+namespace: process_datasets
+description: Transform the figshare rds into an HDF5-backed AnnData file.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: file
+        name: --input_sc
+        description: Raw single-cell dataset
+        example: resources_test/datasets_raw/MOBNEW/dataset_sc.rds
+        required: true
+
+      - type: file
+        name: --input_sp
+        description: Raw spatial dataset
+        example: resources_test/datasets_raw/MOBNEW/dataset_sp.rds
+        required: true
+  - name: Outputs
+    arguments:
+      - type: file
+        name: --output_sc
+        description: Processed single-cell dataset
+        example: resources_test/datasets/MOBNEW/dataset_sc.h5ad
+        direction: output
+        required: true
+
+      - type: file
+        name: --output_sp
+        description: Processed spatial dataset
+        example: resources_test/datasets/MOBNEW/dataset_sp.h5ad
+        direction: output
+        required: true
+  # These values are copied by script.R into the `uns` slot of both outputs.
+  - name: Dataset metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: A unique identifier for the dataset.
+        required: true
+      - type: string
+        name: --dataset_name
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - type: string
+        name: --dataset_reference
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - type: string
+        name: --dataset_summary
+        description: Short description of the dataset.
+        required: true
+      - type: string
+        name: --dataset_description
+        description: Long description of the dataset.
+        required: true
+      - type: string
+        name: --dataset_organism
+        description: Organism from which the dataset was derived.
+        required: true
+
+resources:
+  - type: r_script
+    path: script.R
+
+engines:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_images/r:1.1.0
+    setup:
+      # NOTE(review): script.R also calls anndata:: and Matrix::, which are
+      # not installed here; presumably the base image provides them (confirm).
+      - type: r
+        bioc: [SingleCellExperiment]
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/process_datasets/convert/script.R b/src/process_datasets/convert/script.R
@@ -0,0 +1,96 @@
+# Convert the raw single-cell and spatial SingleCellExperiment objects (.rds)
+# into AnnData objects and write them to disk as gzip-compressed .h5ad files.
+suppressMessages(library(SingleCellExperiment, quietly = TRUE))
+
+## VIASH START
+# Placeholder parameters for running this script directly; when the component
+# is built, viash substitutes this block with the actual arguments.
+par <- list(
+  # inputs
+  input_sc = "resources_test/datasets_raw/MOBNEW/dataset_sc.rds",
+  input_sp = "resources_test/datasets_raw/MOBNEW/dataset_sp.rds",
+
+  # outputs (written with write_h5ad below, hence the .h5ad extension)
+  output_sc = "resources_test/datasets/MOBNEW/dataset_sc.h5ad",
+  output_sp = "resources_test/datasets/MOBNEW/dataset_sp.h5ad",
+
+  # dataset metadata
+  dataset_id = "MOBNEW",
+  dataset_name = "MOBNEW",
+  dataset_description = "MOBNEW",
+  dataset_url = "...",
+  dataset_reference = "...",
+  dataset_summary = "...",
+  dataset_organism = "..."
+)
+## VIASH END
+
+cat("Read input files\n")
+input_sc <- readRDS(par$input_sc)
+input_sp <- readRDS(par$input_sp)
+
+cat("Single cell dataset:\n")
+print(input_sc)
+
+cat("Spatial dataset:\n")
+print(input_sp)
+
+# Dataset-level metadata shared by both outputs. Optional arguments that were
+# not supplied are NULL; drop them so `uns` only holds fields with a value.
+dataset_uns <- Filter(
+  Negate(is.null),
+  list(
+    dataset_id = par$dataset_id,
+    dataset_name = par$dataset_name,
+    dataset_description = par$dataset_description,
+    dataset_url = par$dataset_url,
+    dataset_reference = par$dataset_reference,
+    dataset_summary = par$dataset_summary,
+    dataset_organism = par$dataset_organism
+  )
+)
+
+cat("Transforming single cell into AnnData\n")
+# The assay is transposed with Matrix::t() so that the AnnData rows line up
+# with `obs` (built from colnames) and the columns with `var` (from rownames).
+output_sc <- anndata::AnnData(
+  layers = list(
+    counts = Matrix::t(assay(input_sc, "counts"))
+  ),
+  obs = data.frame(
+    row.names = colnames(input_sc),
+    cell_type = colData(input_sc)$cellType,
+    donor_id = colData(input_sc)$sampleInfo
+  ),
+  # The input row names serve as both feature id and feature name.
+  var = data.frame(
+    row.names = rownames(input_sc),
+    feature_id = rownames(input_sc),
+    feature_name = rownames(input_sc)
+  ),
+  uns = dataset_uns
+)
+
+cat("Transforming spatial into AnnData\n")
+# NOTE(review): assumes the rows of `celltype_prop` are in the same order as
+# colnames(input_sp); this is not checked here.
+celltype_proportions <- metadata(input_sp)[["celltype_prop"]]
+
+output_sp <- anndata::AnnData(
+  layers = list(
+    counts = Matrix::t(assay(input_sp, "counts")),
+    logcounts = Matrix::t(assay(input_sp, "logcounts"))
+  ),
+  obs = data.frame(
+    row.names = colnames(input_sp),
+    col = colData(input_sp)$col,
+    row = colData(input_sp)$row,
+    sizeFactor = colData(input_sp)$sizeFactor,
+    spatial_cluster = colData(input_sp)$spatial.cluster
+  ),
+  var = data.frame(
+    row.names = rownames(input_sp),
+    feature_id = rownames(input_sp),
+    feature_name = rownames(input_sp)
+  ),
+  obsm = list(
+    celltype_proportions = celltype_proportions
+  ),
+  uns = dataset_uns
+)
+
+cat("Write output files\n")
+output_sc$write_h5ad(par$output_sc, compression = "gzip")
+output_sp$write_h5ad(par$output_sp, compression = "gzip")