From 18cfb5933e6d111b166e7f72c9bcf0c50220cf53 Mon Sep 17 00:00:00 2001 From: Lucille Delisle Date: Wed, 22 Nov 2023 17:32:03 +0100 Subject: [PATCH] my proposed changes --- .travis.yml | 5 ++--- cite_seq_count/__main__.py | 2 ++ cite_seq_count/preprocessing.py | 15 ++++++++------- setup.py | 3 ++- .../correct_R1_with_cell_barcode_mm.fastq.gz | Bin 0 -> 2391 bytes tests/test_data/matrix/.~lock.test_matrix.csv# | 1 - tests/test_data/tags/pass/correct_3.csv | 5 +++++ tests/test_data/whitelist.csv | 3 +++ 8 files changed, 22 insertions(+), 12 deletions(-) create mode 100644 tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz delete mode 100644 tests/test_data/matrix/.~lock.test_matrix.csv# create mode 100644 tests/test_data/tags/pass/correct_3.csv create mode 100644 tests/test_data/whitelist.csv diff --git a/.travis.yml b/.travis.yml index f647f52..8a91330 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,11 +19,10 @@ install: - conda config --add channels defaults - conda config --add channels conda-forge - conda config --add channels bioconda - - conda install -c bioconda -c conda-forge snakemake - - conda create -q -n snakemake snakemake>=5.3.1 python=3.6 + - conda create -q -n snakemake 'snakemake>=5.3.1' 'python>=3.10' script: - - python3.6 setup.py install + - pip install . - pytest after_success: diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index 0e36e13..6684880 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -94,6 +94,8 @@ def main(): print("Skipping cell barcode correction") bcs_corrected = 0 + ###### HERE IT STOPS WORKING ########## + # Create sparse matrices for reads results read_results_matrix = processing.generate_sparse_matrices( final_results=final_results, diff --git a/cite_seq_count/preprocessing.py b/cite_seq_count/preprocessing.py index baff2d1..4462608 100644 --- a/cite_seq_count/preprocessing.py +++ b/cite_seq_count/preprocessing.py @@ -57,7 +57,7 @@ def parse_barcode_reference( barcode_pattern = rf"^[ATGC]{{{barcode_length}}}" header = barcodes_pl.columns - set_dif = set(required_header) - set(header) + set_dif = set([required_header]) - set(header) if len(set_dif) != 0: set_diff_string = ",".join(list(set_dif)) raise SystemExit(f"The header is missing {set_diff_string}. Exiting") @@ -74,7 +74,7 @@ def parse_barcode_reference( else: barcodes_pl = barcodes_pl.with_columns( - reference=pl.col(REFERENCE_COLUMN).str.strip_chars(STRIP_CHARS), + reference=pl.col(required_header).str.strip_chars(STRIP_CHARS), ) check_sequence_pattern( @@ -110,7 +110,7 @@ def parse_tags_csv(file_name: str) -> pl.DataFrame: TTCCGCCTCTCTTTG,Hashtag_3 Args: - file_name (str): file path as a tring + file_name (str): file path as a string Returns: pl.DataFrame: polars dataframe with the csv content @@ -381,16 +381,17 @@ def get_barcode_subset( enable_barcode_correction = True if barcode_whitelist: barcode_subset = parse_barcode_reference( - filename=expected_barcodes, + filename=barcode_whitelist, barcode_length=(chemistry.cell_barcode_end - chemistry.cell_barcode_start), required_header=WHITELIST_COLUMN, ) + n_barcodes = len(barcode_subset) # ??? else: - n_barcodes = barcode_whitelist + n_barcodes = expected_barcodes if barcode_reference is not None: barcode_subset = ( barcodes_df.filter( - pl.col(BARCODE_COLUMN).str.is_in( + pl.col(BARCODE_COLUMN).is_in( barcode_reference[REFERENCE_COLUMN] ) ) @@ -399,7 +400,7 @@ def get_barcode_subset( .sort("count", descending=True) .head(n_barcodes * 1.2) .drop("count") - .rename({SEQUENCE_COLUMN: WHITELIST_COLUMN}) + .rename({BARCODE_COLUMN: WHITELIST_COLUMN}) ) else: raw_barcodes_dict = ( diff --git a/setup.py b/setup.py index 58f6cec..49c715d 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,8 @@ "pyyaml==6.0", "pooch==1.6.0", "six==1.16.0", + "polars" ], - python_requires=">=3.8", + python_requires=">=3.10", package_data={"report_template": ["templates/*.json"]}, ) diff --git a/tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz b/tests/test_data/fastq/correct_R1_with_cell_barcode_mm.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..4dc0f1d5fdc35f1694d987ddff4599d6f794580e GIT binary patch literal 2391 zcmV-d38?lTiwFo#8D3=o17mM;a%E$5Us5qHW?^%5aR9YkOS0oO4BWp{WS>Q>zr@wB zw9t+#ULhyQ+`odREJF4W7!!PxsibUII}kyj(E#cB@$=vRes(VU!yo(M{SkNi&+hhc z;C0-C>;CEfN~o`Q6obD0{@TQUzJB2AJsF`d9d{#s5BMu`BE|ASgpbtFxeD`h^jWEb&guG2& zfGgZ08ez}}MuyVHle?2k-1;;!6gK_>eHg-TMkp`?JV6`3-9q>k#h_**-Q#<#6gK4u zd)O&Nqjy~=?h2!9Yl+7Z=}(}ST`#VMQMSFO_aWspSsBF5XOz)nJN-G33$ByXo>9iM zzj;i_`x=jxX_EdA^ftOqPJBigci+Pk0W8iaoLoa(d_Zelh0=*FM-u1m~C8Mmn?~^m&F}dP1 z%DOMP;`gJnFBK)f#QflMBm{YEl9@lFY;1Ysv3?fWvX#duawyr5*sfQqenwfVKFJa) zQODtWt?XYYGsDn>VryM5=RTusYq>GZ@DSHh#V`dcAwp;or5|qSL@SIkdkpVMKDa^2 z{fshtj9vshBw3N`KBJ5=mR`{Z(G8;Th5 zV@lo6C?k)-84-Hk4RY=?%3Ld{Hwh$!+@R!sMp;!KzMo!>4@LD|DBHf=JT`XXzEpWk z*Zku*Y5tOx)XHIA;Ko6=l2JA*NH}VL={emf*L_AA83q#%ddfiiBIiD%tO}NLjXFA5 zku^V~tjv$H0qNjIx$ZN{xE7DT6rZ_K*8GgJ_LWd30e$7T$!wWX&3bH->wc9UE5;ZK zVLweW#p;Y(%`5vP&rS4~xy-Ny5%F0RZ_ZijPiBC$uU(<<~)`I6m z!x-Eo=RTvFvvN{2e@#}FI2pAmH4biCWlqkhW^FmixnHF%OKU-YjgHZ~S?>24W$j+^ zkf?)hmg_#F%sn!dbGY5jO5M+><~i>y=YG|ox7fV~Ql{wV-7M!mqneq2mg{~i^CJ`R ztk`{7^E1lGV?67d<=k(c^=wE1Gs?P_ zJX2a`Wi1smrBc3NT!cSlMeg?*Wu9GJ8fKCE{VEMpsxLgdSme52HM=M?4Ei}JrBylO z%P1=+Ljt4Zx>cU@Gs;?DFu*<6(ku1lLYXzp)xWGtzhBe8=>7isP5pjx4s%Y}D8^QK z&d(_8X~L-+h+$Sa_Zel){o^|2S+(3h+D>_xVOFK?*Dy@+gcp>KjB~3z>tmGlVnSH+M-~kvBQ_TPt#^gXyfuZQA+%g=)V-qMY$H-tM?D z|E8SpH!}Z$<_954Ru;K`$te5oUy-cz`>k_J?kqNK&R@}4l&tu40R@`Bd&tSpu#e23 zP(%;zOE&|KbI^~;7tC<>zvCG8;r@HY%)Rt`yBRp`m(_%H>P6(=J?>}x@2k0BAI906 znUD@8o_ljI{;xQO+04AGW)jV`uEwVmT!bo_nME@Uvzdvws)@|QWwk{#!>|t{?cHf; zPCc@O%-qHQo6|DPYR1QKIc-zSTrgWer}V&WG+Isf5CfWF<`8)H_GG5hLf{FrIV~pc z0)>&8UUVA6Y^zDaF5P$~GlRGqhP8)4H-1kD6wHt>u9samiDnpPTn*|w)GDQCH_NNZ zFlz`r!xc_jln}UJ*1So?dxpTOB)%mNHE)|>M>-{~}23oy(WY3V`F^z3C8V3^Hm`*Zw;vewI5fMM1Uz)Sp! z!*ws${@z32g4vvwCSx9tOJ+LJX$&(mbn0F3;~Fv}vjD@s!HnDyw3<158jQ!ZH>1MP z7tE@m(~psG!LS?Tj(}mt5J+kQw=c-dEQdgb8ObJ`w-nQA6fKZp?OI^?CM{6XX*Uf$ z$SlyRq0=k|kxF|rDlCv;*1TcWIm#?hr8-N6AmEsnpvj^b#DwZQZcL!e3v6qOAM3AmBW$Q=R0jH`i|fHR2GCYc2oW>uXW(33nD zs0!#}dOiaczsr+ZfMI53gEE5a=H1JEz+YJ42{USe{W{7ZGl~|-Fl#^`=~por+f8yu zz%Z+T!ZYsa_n$9U0^if~g0+uSCwVSVHBv2wK>CeGy06vE%5*Tp+U4M^bObeWuo%!W zU6TUY%vwiq!EBs%AKuTh7N{ED7xRYa0<+u^RLuo`7Xk_k)P%rq#O*B41**hti3N^; zPCTXOtgt|aSv!J&+dnuZ?q;ncxL`)7`Hf(&cd(2J}Esx)+Qn8JIr$ShE` zHzf^?d$PD@1~al2U|4%cu>8S};Kl;WpPLT)BqYvyvnXc-8D<>~?R#F0q6IR{*b$_k z{NN)pqjUrrW)!z+Msl2aBXvc($XbA5?Ng3To(@(`If{G3l4Da?phj{O z8}`H_9z}Xnrh^$~Rh=K1wI)cWj`hCi=X( zmqOq+yx)|Q!p4LA6hw*T$&4}=$T0Km2UIH$hTaqw$T0g69=eJHe>adk7hsrf=6@hC J045Aj007>{sFVNz literal 0 HcmV?d00001 diff --git a/tests/test_data/matrix/.~lock.test_matrix.csv# b/tests/test_data/matrix/.~lock.test_matrix.csv# deleted file mode 100644 index 6f2f611..0000000 --- a/tests/test_data/matrix/.~lock.test_matrix.csv# +++ /dev/null @@ -1 +0,0 @@ -,proelli,proelli-ThinkPad-T470s,23.01.2019 16:02,file:///home/proelli/.config/libreoffice/4; \ No newline at end of file diff --git a/tests/test_data/tags/pass/correct_3.csv b/tests/test_data/tags/pass/correct_3.csv new file mode 100644 index 0000000..dcb4b31 --- /dev/null +++ b/tests/test_data/tags/pass/correct_3.csv @@ -0,0 +1,5 @@ +sequence,feature_name +CGTACGTAGCCTAGC,test1 +CGTAGCTCGAAAAAA,test2 +CGTCGAAGCTGAACG,test3 +CGTCGTAGCTGATCG,test4 diff --git a/tests/test_data/whitelist.csv b/tests/test_data/whitelist.csv new file mode 100644 index 0000000..2f7cd0f --- /dev/null +++ b/tests/test_data/whitelist.csv @@ -0,0 +1,3 @@ +whitelist +TACATATTCTTTACTG +TAGAGGGAAGTCAAGC