-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
332 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash

# Run the process_datasets/convert component on the MOBNEW test resources,
# turning the raw .rds inputs into .h5ad outputs.
dataset="MOBNEW"
raw_dir="resources_test/datasets_raw/MOBNEW"
out_dir="resources_test/datasets/MOBNEW"

viash run src/process_datasets/convert/config.vsh.yaml -- \
  --input_sc "$raw_dir/dataset_sc.rds" \
  --input_sp "$raw_dir/dataset_sp.rds" \
  --output_sc "$out_dir/dataset_sc.h5ad" \
  --output_sp "$out_dir/dataset_sp.h5ad" \
  --dataset_id "$dataset" \
  --dataset_name "$dataset" \
  --dataset_description "$dataset" \
  --dataset_summary "$dataset" \
  --dataset_reference "..." \
  --dataset_organism "mus_musculus"
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# File format specification for the single-cell dataset (.h5ad).
# NOTE(review): the convert runner script writes its output to
# resources_test/datasets/MOBNEW/dataset_sc.h5ad -- confirm that the example
# path below is where the test resource actually lives.
type: file
example: "resources_test/common/mobnew/dataset_sc.h5ad"
info:
  label: Single-cell dataset
  summary: An unprocessed single-cell dataset as output by a dataset loader.
  description: |
    This dataset contains raw counts and metadata as output by a dataset loader.
    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
  slots:
    # Expression matrices
    layers:
      - name: counts
        type: integer
        description: Raw counts
        required: true
    # Per-cell annotations
    obs:
      - name: cell_type
        type: string
        description: Classification of the cell type based on its characteristics and function within the tissue or organism.
        required: false
      - name: donor_id
        type: string
        description: Identifier for the donor from whom the cell sample is obtained.
        required: false
    # Per-feature annotations
    var:
      - name: feature_id
        type: string
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: false
      - name: feature_name
        type: string
        description: A human-readable name for the feature, usually a gene symbol.
        required: true
    # Dataset-level metadata
    uns:
      - name: dataset_id
        type: string
        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
        required: true
      - name: dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - name: dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        multiple: true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# File format specification for the spatial dataset (.h5ad).
# NOTE(review): the convert runner script writes its output to
# resources_test/datasets/MOBNEW/dataset_sp.h5ad -- confirm that the example
# path below is where the test resource actually lives.
type: file
example: "resources_test/common/mobnew/dataset_sp.h5ad"
info:
  label: Spatial dataset
  summary: An unprocessed spatial dataset as output by a dataset loader.
  description: |
    This dataset contains raw counts and metadata as output by a dataset loader.
    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
  slots:
    # Expression matrices
    layers:
      - type: integer
        name: counts
        description: Raw counts
        required: true
      # Log-transformed values are real-valued, hence double rather than integer.
      - type: double
        name: logcounts
        description: Log-transformed counts
        required: true
    # Per-spot annotations
    obs:
      - type: double
        name: col
        description: Column index of the cell in the spatial grid.
        required: true
      - type: double
        name: row
        description: Row index of the cell in the spatial grid.
        required: true
      - type: double
        name: sizeFactor
        description: Size factor for the cell.
        required: true
      - type: integer
        name: spatial_cluster
        description: Spatial cluster assignment for the cell.
        required: true
    # Per-feature annotations
    var:
      - type: string
        name: feature_id
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: false
      - type: string
        name: feature_name
        description: A human-readable name for the feature, usually a gene symbol.
        required: true
    # Per-spot matrices
    obsm:
      # Uses the same numeric type name (double) as the other slots; marked
      # required because the convert component always writes this matrix.
      - type: double
        name: celltype_proportions
        description: Spot-by-celltype matrix of celltype proportions generated by CARD.
        required: true
    # Dataset-level metadata
    uns:
      - type: string
        name: dataset_id
        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
        required: true
      - name: dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - type: string
        name: dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        multiple: true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# Viash component: converts the raw .rds datasets into .h5ad files.
name: convert
namespace: process_datasets
description: Transform the figshare rds into an HDF5-backed AnnData file.

argument_groups:
  # Raw input files
  - name: Inputs
    arguments:
      - name: --input_sc
        type: file
        description: Raw single-cell dataset
        example: resources_test/datasets_raw/MOBNEW/dataset_sc.rds
        required: true
      - name: --input_sp
        type: file
        description: Raw spatial dataset
        example: resources_test/datasets_raw/MOBNEW/dataset_sp.rds
        required: true

  # Converted output files
  - name: Outputs
    arguments:
      - name: --output_sc
        type: file
        description: Processed single-cell dataset
        example: resources_test/datasets/MOBNEW/dataset_sc.h5ad
        direction: output
        required: true
      - name: --output_sp
        type: file
        description: Processed spatial dataset
        example: resources_test/datasets/MOBNEW/dataset_sp.h5ad
        direction: output
        required: true

  # Metadata that script.R stores in the `uns` slot of both outputs
  - name: Dataset metadata
    arguments:
      - name: --dataset_id
        type: string
        description: A unique identifier for the dataset.
        required: true
      - name: --dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - name: --dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: Organism from which the dataset was derived.
        required: true

resources:
  - type: r_script
    path: script.R

engines:
  - type: docker
    image: ghcr.io/openproblems-bio/base_images/r:1.1.0
    setup:
      - type: r
        bioc:
          - SingleCellExperiment

runners:
  - type: executable
  - type: nextflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Convert the raw single-cell and spatial datasets (.rds) into AnnData .h5ad
# files. Both outputs receive the same dataset-level metadata in `uns`.
suppressMessages(library(SingleCellExperiment, quietly = TRUE))

## VIASH START
par <- list(
  # inputs
  input_sc = "resources_test/datasets_raw/MOBNEW/dataset_sc.rds",
  input_sp = "resources_test/datasets_raw/MOBNEW/dataset_sp.rds",

  # outputs (written with write_h5ad, hence the .h5ad extension)
  output_sc = "resources_test/datasets/MOBNEW/dataset_sc.h5ad",
  output_sp = "resources_test/datasets/MOBNEW/dataset_sp.h5ad",

  # dataset metadata
  dataset_id = "MOBNEW",
  dataset_name = "MOBNEW",
  dataset_description = "MOBNEW",
  dataset_url = "...",
  dataset_reference = "...",
  dataset_summary = "...",
  dataset_organism = "..."
)
## VIASH END

cat("Read input files\n")
input_sc <- readRDS(par$input_sc)
input_sp <- readRDS(par$input_sp)

cat("Single cell dataset:\n")
print(input_sc)

cat("Spatial dataset:\n")
print(input_sp)

# Dataset-level metadata shared by both outputs. Optional arguments that were
# not supplied are NULL in `par`; drop them so that no NULL (None on the Python
# side) ends up in `uns` when the file is written.
uns_fields <- c(
  "dataset_id",
  "dataset_name",
  "dataset_description",
  "dataset_url",
  "dataset_reference",
  "dataset_summary",
  "dataset_organism"
)
dataset_uns <- Filter(Negate(is.null), par[uns_fields])

cat("Transforming single cell into AnnData\n")
output_sc <- anndata::AnnData(
  layers = list(
    # the assay is transposed so that rows line up with `obs`
    counts = Matrix::t(assay(input_sc, "counts"))
  ),
  obs = data.frame(
    row.names = colnames(input_sc),
    cell_type = colData(input_sc)$cellType,
    donor_id = colData(input_sc)$sampleInfo
  ),
  var = data.frame(
    row.names = rownames(input_sc),
    # the row names are used for both the feature id and the feature name
    feature_id = rownames(input_sc),
    feature_name = rownames(input_sc)
  ),
  uns = dataset_uns
)

cat("Transforming spatial into AnnData\n")
# NOTE(review): presumably a spot-by-celltype matrix whose rows follow
# colnames(input_sp) -- confirm against the raw dataset.
celltype_proportions <- metadata(input_sp)[["celltype_prop"]]

output_sp <- anndata::AnnData(
  layers = list(
    counts = Matrix::t(assay(input_sp, "counts")),
    logcounts = Matrix::t(assay(input_sp, "logcounts"))
  ),
  obs = data.frame(
    row.names = colnames(input_sp),
    col = colData(input_sp)$col,
    row = colData(input_sp)$row,
    sizeFactor = colData(input_sp)$sizeFactor,
    spatial_cluster = colData(input_sp)$spatial.cluster
  ),
  var = data.frame(
    row.names = rownames(input_sp),
    feature_id = rownames(input_sp),
    feature_name = rownames(input_sp)
  ),
  obsm = list(
    celltype_proportions = celltype_proportions
  ),
  uns = dataset_uns
)

cat("Write output files\n")
output_sc$write_h5ad(par$output_sc, compression = "gzip")
output_sp$write_h5ad(par$output_sp, compression = "gzip")