From ec64db52931516cbf2a597aae6e6580c56b8d9e4 Mon Sep 17 00:00:00 2001
From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com>
Date: Mon, 6 Jan 2025 17:16:27 +0400
Subject: [PATCH] Add geneformer (#7)

* add geneformer
* add to workflow
* inherit from base method
* fix variable name
* fix mapping
* cleanup script
* update info
* minor changes
---
 src/methods/geneformer/config.vsh.yaml      |  46 +++++
 src/methods/geneformer/script.py            | 220 ++++++++++++++++++++
 src/workflows/run_benchmark/config.vsh.yaml |   1 +
 src/workflows/run_benchmark/main.nf         |   1 +
 4 files changed, 268 insertions(+)
 create mode 100644 src/methods/geneformer/config.vsh.yaml
 create mode 100644 src/methods/geneformer/script.py

diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml
new file mode 100644
index 0000000..582b9cd
--- /dev/null
+++ b/src/methods/geneformer/config.vsh.yaml
@@ -0,0 +1,46 @@
+__merge__: ../../api/base_method.yaml
+
+name: geneformer
+label: Geneformer
+summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
+description: |
+  Geneformer is a context-aware, attention-based deep learning model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-specific predictions in settings with limited data in network biology. Here, a pre-trained Geneformer model is fine-tuned and used to predict cell type labels for an unlabelled dataset.
+
+info:
+  preferred_normalization: counts
+
+references:
+  doi:
+    - 10.1038/s41586-023-06139-9
+    - 10.1101/2024.08.16.608180
+
+links:
+  documentation: https://geneformer.readthedocs.io/en/latest/index.html
+  repository: https://huggingface.co/ctheodoris/Geneformer
+
+arguments:
+  - name: "--model"
+    type: "string"
+    description: The Geneformer model variant to use
+    choices: ["gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"]
+    default: "gf-12L-95M-i4096"
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_pytorch_nvidia:1.0.0
+    setup:
+      - type: python
+        pip:
+          - pyarrow<15.0.0a0,>=14.0.1
+          - huggingface_hub
+          - git+https://huggingface.co/ctheodoris/Geneformer.git
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, midcpu, gpu]
diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py
new file mode 100644
index 0000000..463c96a
--- /dev/null
+++ b/src/methods/geneformer/script.py
@@ -0,0 +1,220 @@
+import os
+from tempfile import TemporaryDirectory
+import anndata as ad
+from geneformer import Classifier, TranscriptomeTokenizer, DataCollatorForCellClassification
+from huggingface_hub import hf_hub_download
+import numpy as np
+import datasets
+import pickle
+from transformers import BertForSequenceClassification, Trainer
+
+## VIASH START
+par = {
+    'input_train': 'resources_test/task_label_projection/cxg_immune_cell_atlas/train.h5ad',
+    'input_test': 'resources_test/task_label_projection/cxg_immune_cell_atlas/test.h5ad',
+    'output': 'output.h5ad',
+    "model": "gf-12L-95M-i4096",
+}
+meta = {
+    'name': 'geneformer'
+}
+## VIASH END
+
+n_processors = os.cpu_count()
+
+print('>>> Reading input files', flush=True)
+input_train = ad.read_h5ad(par['input_train'])
+input_test = ad.read_h5ad(par['input_test'])
+
+if input_train.uns["dataset_organism"] != "homo_sapiens":
+    raise ValueError(
+        "Geneformer can only be used with human data "
+        f"(dataset_organism == '{input_train.uns['dataset_organism']}')"
+    )
+
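+# Geneformer's token dictionary is keyed by Ensembl gene IDs, so input features
+# must carry "ENSG..." identifiers for tokenization to work.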
f"Geneformer can only be used with human data " + f"(dataset_organism == '{input_train.uns['dataset_organism']}')" + ) + +is_ensembl = all(var_name.startswith("ENSG") for var_name in input_train.var_names) +if not is_ensembl: + raise ValueError(f"Geneformer requires input_train.var_names to contain ENSEMBL gene ids") + +print(f">>> Getting settings for model '{par['model']}'...", flush=True) +model_split = par["model"].split("-") +model_details = { + "layers": model_split[1], + "dataset": model_split[2], + "input_size": int(model_split[3][1:]), +} +print(model_details, flush=True) + +print(">>> Getting model dictionary files...", flush=True) +if model_details["dataset"] == "95M": + dictionaries_subfolder = "geneformer" +elif model_details["dataset"] == "30M": + dictionaries_subfolder = "geneformer/gene_dictionaries_30m" +else: + raise ValueError(f"Invalid model dataset: {model_details['dataset']}") +print(f"Dictionaries subfolder: '{dictionaries_subfolder}'") + +dictionary_files = { + "ensembl_mapping": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder=dictionaries_subfolder, + filename=f"ensembl_mapping_dict_gc{model_details['dataset']}.pkl", + ), + "gene_median": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder=dictionaries_subfolder, + filename=f"gene_median_dictionary_gc{model_details['dataset']}.pkl", + ), + "gene_name_id": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder=dictionaries_subfolder, + filename=f"gene_name_id_dict_gc{model_details['dataset']}.pkl", + ), + "token": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder=dictionaries_subfolder, + filename=f"token_dictionary_gc{model_details['dataset']}.pkl", + ), +} + +print(">>> Creating working directory...", flush=True) +work_dir = TemporaryDirectory() + +input_train_dir = os.path.join(work_dir.name, "input_train") +os.makedirs(input_train_dir) +tokenized_train_dir = os.path.join(work_dir.name, "tokenized_train") +os.makedirs(tokenized_train_dir) +classifier_train_dir = os.path.join(work_dir.name, "classifier_train") +os.makedirs(classifier_train_dir) +classifier_fine_tuned_dir = os.path.join(work_dir.name, "classifier_fine_tuned") +os.makedirs(classifier_fine_tuned_dir) +input_test_dir = os.path.join(work_dir.name, "input_test") +os.makedirs(input_test_dir) +tokenized_test_dir = os.path.join(work_dir.name, "tokenized_test") +os.makedirs(tokenized_test_dir) + +print(f"Working directory: '{work_dir.name}'", flush=True) + +print(f">>> Getting model files for model '{par['model']}'...", flush=True) +model_files = { + "model": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder=par["model"], + filename="model.safetensors", + ), + "config": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder=par["model"], + filename="config.json", + ), +} +model_dir = os.path.dirname(model_files["model"]) + +print(">>> Preparing input data...", flush=True) +input_train.X = input_train.layers["counts"] +input_train.var["ensembl_id"] = input_train.var["feature_id"] +input_train.obs["n_counts"] = input_train.layers["counts"].sum(axis=1) +input_train.obs["celltype"] = input_train.obs["label"] +num_types = len(input_train.obs["celltype"].unique()) +input_train.write_h5ad(os.path.join(input_train_dir, "input_train.h5ad")) + +input_test.X = input_test.layers["counts"] +input_test.var["ensembl_id"] = input_test.var["feature_id"] +input_test.obs["n_counts"] = input_test.layers["counts"].sum(axis=1) +input_test.write_h5ad(os.path.join(input_test_dir, "input_test.h5ad")) + 
+print(">>> Tokenizing train data...", flush=True) +special_token = model_details["dataset"] == "95M" +print(f"Input size: {model_details['input_size']}, Special token: {special_token}") +tokenizer = TranscriptomeTokenizer( + custom_attr_name_dict={"celltype": "celltype"}, + nproc=n_processors, + model_input_size=model_details["input_size"], + special_token=special_token, + gene_median_file=dictionary_files["gene_median"], + token_dictionary_file=dictionary_files["token"], + gene_mapping_file=dictionary_files["ensembl_mapping"], +) +tokenizer.tokenize_data(input_train_dir, tokenized_train_dir, "tokenized", file_format="h5ad") + +print(">>> Tokenizing test data...", flush=True) +special_token = model_details["dataset"] == "95M" +print(f"Input size: {model_details['input_size']}, Special token: {special_token}") +tokenizer = TranscriptomeTokenizer( + nproc=n_processors, + model_input_size=model_details["input_size"], + special_token=special_token, + gene_median_file=dictionary_files["gene_median"], + token_dictionary_file=dictionary_files["token"], + gene_mapping_file=dictionary_files["ensembl_mapping"], +) +tokenizer.tokenize_data(input_test_dir, tokenized_test_dir, "tokenized", file_format="h5ad") + +print(">>> Fine-tuning pre-trained geneformer model for cell state classification...", flush=True) +cc = Classifier( + classifier="cell", + cell_state_dict = {"state_key": "celltype", "states": "all"}, + nproc=n_processors, + token_dictionary_file=dictionary_files["token"], + num_crossval_splits=1, + split_sizes={"train": 0.9, "valid": 0.1, "test": 0.0}, +) + +cc.prepare_data( + input_data_file=os.path.join(tokenized_train_dir, "tokenized.dataset"), + output_directory=classifier_train_dir, + output_prefix="classifier", +) + +train_data = datasets.load_from_disk(classifier_train_dir + "/classifier_labeled.dataset") + +cc.train_classifier( + model_directory=model_dir, + num_classes=num_types, + train_data=train_data, + eval_data=None, + output_directory=classifier_fine_tuned_dir, + predict=False +) + +print(">>> Generating predictions...", flush=True) + +# dictionary mapping labels from classifier to cell types +with open(f"{classifier_train_dir}/classifier_id_class_dict.pkl", "rb") as f: + id_class_dict = pickle.load(f) + +with open(dictionary_files["token"], "rb") as f: + token_dict = pickle.load(f) + +# Load fine-tuned model +model = BertForSequenceClassification.from_pretrained(classifier_fine_tuned_dir) + +test_data = datasets.load_from_disk(tokenized_test_dir + "/tokenized.dataset") +test_data = test_data.add_column("label", [0] * len(test_data)) + +# Get predictions +trainer = Trainer(model=model, data_collator=DataCollatorForCellClassification(token_dictionary=token_dict)) +predictions = trainer.predict(test_data) + +# Select the most likely cell type based on the probability vector from the predictions of each cell +predicted_label_ids = np.argmax(predictions.predictions, axis=1) +predicted_logits = [predictions.predictions[i][predicted_label_ids[i]] for i in range(len(predicted_label_ids))] +input_test.obs['label_pred'] = [id_class_dict[p] for p in predicted_label_ids] + +print(">>> Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_test.obs[["label_pred"]], + uns={ + 'method_id': meta['name'], + 'dataset_id': input_test.uns['dataset_id'], + 'normalization_id': input_test.uns['normalization_id'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml 
+print(">>> Generating predictions...", flush=True)
+
+# Dictionary mapping classifier label IDs to cell type names
+with open(f"{classifier_train_dir}/classifier_id_class_dict.pkl", "rb") as f:
+    id_class_dict = pickle.load(f)
+
+with open(dictionary_files["token"], "rb") as f:
+    token_dict = pickle.load(f)
+
+# Load fine-tuned model
+model = BertForSequenceClassification.from_pretrained(classifier_fine_tuned_dir)
+
+test_data = datasets.load_from_disk(tokenized_test_dir + "/tokenized.dataset")
+test_data = test_data.add_column("label", [0] * len(test_data))
+
+# Get predictions
+trainer = Trainer(model=model, data_collator=DataCollatorForCellClassification(token_dictionary=token_dict))
+predictions = trainer.predict(test_data)
+
+# Select the most likely cell type for each cell based on the predicted logits
+predicted_label_ids = np.argmax(predictions.predictions, axis=1)
+input_test.obs['label_pred'] = [id_class_dict[p] for p in predicted_label_ids]
+
+print(">>> Writing output AnnData to file", flush=True)
+output = ad.AnnData(
+    obs=input_test.obs[["label_pred"]],
+    uns={
+        'method_id': meta['name'],
+        'dataset_id': input_test.uns['dataset_id'],
+        'normalization_id': input_test.uns['normalization_id']
+    }
+)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index a1c288e..ddc0359 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -80,6 +80,7 @@ dependencies:
   - name: control_methods/majority_vote
   - name: control_methods/random_labels
   - name: control_methods/true_labels
+  - name: methods/geneformer
   - name: methods/knn
   - name: methods/logistic_regression
   - name: methods/mlp
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 8d8d30a..793e482 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -11,6 +11,7 @@ methods = [
   majority_vote,
   random_labels,
   true_labels,
+  geneformer,
   knn,
   logistic_regression,
   mlp,