Skip to content

Commit

Permalink
add conversion component
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Jul 8, 2024
1 parent 8c33533 commit 3de84d0
Show file tree
Hide file tree
Showing 6 changed files with 332 additions and 41 deletions.
13 changes: 13 additions & 0 deletions scripts/generate_test_resource.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Regenerate the MOBNEW test resources by running the `convert` component on
# the raw single-cell and spatial .rds files, producing .h5ad outputs.

# Stop at the first failing command, unset variable or broken pipe.
set -euo pipefail

# Every path below is relative to the repository root. This script lives in
# scripts/, so step one level up from its own location to make it runnable
# from any working directory.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

viash run src/process_datasets/convert/config.vsh.yaml -- \
  --input_sc resources_test/datasets_raw/MOBNEW/dataset_sc.rds \
  --input_sp resources_test/datasets_raw/MOBNEW/dataset_sp.rds \
  --output_sc resources_test/datasets/MOBNEW/dataset_sc.h5ad \
  --output_sp resources_test/datasets/MOBNEW/dataset_sp.h5ad \
  --dataset_id MOBNEW \
  --dataset_name "MOBNEW" \
  --dataset_description "MOBNEW" \
  --dataset_summary "MOBNEW" \
  --dataset_reference "..." \
  --dataset_organism "mus_musculus"
41 changes: 0 additions & 41 deletions src/api/file_common_dataset.yaml

This file was deleted.

65 changes: 65 additions & 0 deletions src/api/file_dataset_sc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Format specification for the single-cell AnnData file written by the
# `process_datasets/convert` component.
type: file
# Same location that scripts/generate_test_resource.sh writes to.
example: "resources_test/datasets/MOBNEW/dataset_sc.h5ad"
info:
  label: Single-cell dataset
  summary: An unprocessed single-cell dataset as output by a dataset loader.
  description: |
    This dataset contains raw counts and metadata as output by a dataset loader.
    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
  slots:
    layers:
      - type: integer
        name: counts
        description: Raw counts
        required: true
    obs:
      - type: string
        name: cell_type
        description: Classification of the cell type based on its characteristics and function within the tissue or organism.
        required: false
      - type: string
        name: donor_id
        description: Identifier for the donor from whom the cell sample is obtained.
        required: false
    var:
      - type: string
        name: feature_id
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: false
      - type: string
        name: feature_name
        description: A human-readable name for the feature, usually a gene symbol.
        required: true
    uns:
      # Dataset-level metadata; mirrors the `--dataset_*` arguments of the
      # convert component.
      - type: string
        name: dataset_id
        description: A unique identifier for the dataset.
        required: true
      - name: dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - type: string
        name: dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        multiple: true
80 changes: 80 additions & 0 deletions src/api/file_dataset_sp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Format specification for the spatial AnnData file written by the
# `process_datasets/convert` component.
type: file
# Same location that scripts/generate_test_resource.sh writes to.
example: "resources_test/datasets/MOBNEW/dataset_sp.h5ad"
info:
  label: Spatial dataset
  summary: An unprocessed spatial dataset as output by a dataset loader.
  description: |
    This dataset contains raw counts and metadata as output by a dataset loader.
    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
  slots:
    layers:
      - type: integer
        name: counts
        description: Raw counts
        required: true
      # Log-transformed values are real-valued, so this layer is `double`
      # rather than `integer`.
      - type: double
        name: logcounts
        description: Log-transformed counts
        required: true
    obs:
      - type: double
        name: col
        description: Column index of the cell in the spatial grid.
        required: true
      - type: double
        name: row
        description: Row index of the cell in the spatial grid.
        required: true
      - type: double
        name: sizeFactor
        description: Size factor for the cell.
        required: true
      - type: integer
        name: spatial_cluster
        description: Spatial cluster assignment for the cell.
        required: true
    var:
      - type: string
        name: feature_id
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: false
      - type: string
        name: feature_name
        description: A human-readable name for the feature, usually a gene symbol.
        required: true
    obsm:
      # The convert script always writes this slot, so it is marked required;
      # `double` matches the floating-point type used by the other slots.
      - type: double
        name: celltype_proportions
        description: Spot-by-celltype matrix of celltype proportions generated by CARD.
        required: true
    uns:
      # Dataset-level metadata; mirrors the `--dataset_*` arguments of the
      # convert component.
      - type: string
        name: dataset_id
        description: A unique identifier for the dataset.
        required: true
      - name: dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - type: string
        name: dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        multiple: true
78 changes: 78 additions & 0 deletions src/process_datasets/convert/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Viash component definition for `process_datasets/convert`.
name: convert
namespace: process_datasets
description: Transform the figshare rds into an HDF5-backed AnnData file.

argument_groups:
  # Raw .rds objects read by script.R.
  - name: Inputs
    arguments:
      - name: --input_sc
        type: file
        description: Raw single-cell dataset
        example: resources_test/datasets_raw/MOBNEW/dataset_sc.rds
        required: true
      - name: --input_sp
        type: file
        description: Raw spatial dataset
        example: resources_test/datasets_raw/MOBNEW/dataset_sp.rds
        required: true

  # AnnData (.h5ad) files written by script.R.
  - name: Outputs
    arguments:
      - name: --output_sc
        type: file
        direction: output
        description: Processed single-cell dataset
        example: resources_test/datasets/MOBNEW/dataset_sc.h5ad
        required: true
      - name: --output_sp
        type: file
        direction: output
        description: Processed spatial dataset
        example: resources_test/datasets/MOBNEW/dataset_sp.h5ad
        required: true

  # Values copied into the `uns` slot of both output files.
  - name: Dataset metadata
    arguments:
      - name: --dataset_id
        type: string
        description: A unique identifier for the dataset.
        required: true
      - name: --dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - name: --dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: Organism from which the dataset was derived.
        required: true

resources:
  - type: r_script
    path: script.R

engines:
  - type: docker
    image: ghcr.io/openproblems-bio/base_images/r:1.1.0
    setup:
      # Bioconductor packages installed on top of the base image.
      - type: r
        bioc:
          - SingleCellExperiment

runners:
  - type: executable
  - type: nextflow
96 changes: 96 additions & 0 deletions src/process_datasets/convert/script.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
suppressMessages(library(SingleCellExperiment, quietly = TRUE))

## VIASH START
# Placeholder arguments for running this script interactively. The block
# between the VIASH START/END markers is expected to be substituted with the
# real command-line arguments by viash -- see the viash documentation.
par <- list(
  # inputs
  input_sc = "resources_test/datasets_raw/MOBNEW/dataset_sc.rds",
  input_sp = "resources_test/datasets_raw/MOBNEW/dataset_sp.rds",

  # outputs (AnnData files, hence the .h5ad extension)
  output_sc = "resources_test/datasets/MOBNEW/dataset_sc.h5ad",
  output_sp = "resources_test/datasets/MOBNEW/dataset_sp.h5ad",

  # dataset metadata
  dataset_id = "MOBNEW",
  dataset_name = "MOBNEW",
  dataset_description = "MOBNEW",
  dataset_url = "...",
  dataset_reference = "...",
  dataset_summary = "...",
  dataset_organism = "..."
)
## VIASH END

cat("Read input files\n")
input_sc <- readRDS(par$input_sc)
input_sp <- readRDS(par$input_sp)

cat("Single cell dataset:\n")
print(input_sc)

cat("Spatial dataset:\n")
print(input_sp)

# Dataset-level metadata stored in `uns` of both outputs. Optional arguments
# that were not supplied are NULL; drop them so the key is simply absent from
# `uns` instead of being stored as an empty value.
metadata_fields <- c(
  "dataset_id",
  "dataset_name",
  "dataset_description",
  "dataset_url",
  "dataset_reference",
  "dataset_summary",
  "dataset_organism"
)
dataset_uns <- Filter(Negate(is.null), par[metadata_fields])

cat("Transforming single cell into AnnData\n")
output_sc <- anndata::AnnData(
  layers = list(
    # the input assay is transposed so that rows line up with `obs`
    counts = Matrix::t(assay(input_sc, "counts"))
  ),
  obs = data.frame(
    row.names = colnames(input_sc),
    # rename the input colData columns to the names used in the output
    cell_type = colData(input_sc)$cellType,
    donor_id = colData(input_sc)$sampleInfo
  ),
  var = data.frame(
    row.names = rownames(input_sc),
    # the input row names are used as both feature id and feature name
    feature_id = rownames(input_sc),
    feature_name = rownames(input_sc)
  ),
  uns = dataset_uns
)

cat("Transforming spatial into AnnData\n")
# NOTE(review): assumes the rows of this object are in the same order as
# colnames(input_sp) -- confirm against the raw dataset.
celltype_proportions <- metadata(input_sp)[["celltype_prop"]]

output_sp <- anndata::AnnData(
  layers = list(
    counts = Matrix::t(assay(input_sp, "counts")),
    logcounts = Matrix::t(assay(input_sp, "logcounts"))
  ),
  obs = data.frame(
    row.names = colnames(input_sp),
    col = colData(input_sp)$col,
    row = colData(input_sp)$row,
    sizeFactor = colData(input_sp)$sizeFactor,
    # `spatial.cluster` in the input becomes `spatial_cluster` in the output
    spatial_cluster = colData(input_sp)$spatial.cluster
  ),
  var = data.frame(
    row.names = rownames(input_sp),
    feature_id = rownames(input_sp),
    feature_name = rownames(input_sp)
  ),
  obsm = list(
    celltype_proportions = celltype_proportions
  ),
  uns = dataset_uns
)

cat("Write output files\n")
output_sc$write_h5ad(par$output_sc, compression = "gzip")
output_sp$write_h5ad(par$output_sp, compression = "gzip")

0 comments on commit 3de84d0

Please sign in to comment.