Skip to content

Commit

Permalink
Merge branch 'release/5.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
sb43 committed Nov 20, 2018
2 parents 344690c + b89880d commit c19bff5
Show file tree
Hide file tree
Showing 33 changed files with 4,619 additions and 398 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ perl/perltidy.LOG
perl/docs
testci/*
perl/t/testData/genome.fa
.idea/*
.nfs*
8 changes: 7 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
# CHANGES

## v5.3.9
## v5.4.0
* Reduced memory footprint for augment vcf file
* Augmentation can now run per chromosome
* SAM flag filters are now controlled by vafConstantas.pm file
* .bas file now used by default to calculate library size
* added support for cram files

## v5.3.9
* Change max reads read when determining read length to 50k
* Modify _get_bam_header_data to get the max read length of ALL bam files encountered and therefore to produce the correct lib_size
* Converted internal SAM flag constants to point to `Bio::DB::HTS` constants (in the future internal constants could be replaced)
Expand Down
145 changes: 70 additions & 75 deletions perl/bin/cgpVaf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ BEGIN
};

use strict;

use Bio::DB::HTS::Tabix;
use File::Path qw(mkpath);
use FindBin qw($Bin);
Expand All @@ -41,7 +40,7 @@ BEGIN
use Capture::Tiny qw(:all);

use Log::Log4perl;

Log::Log4perl->init("$Bin/../config/log4perl.vaf.conf");
use lib "$Bin/../lib";
use Sanger::CGP::Vaf::Data::ReadVcf;
use Sanger::CGP::Vaf::VafConstants;
Expand All @@ -62,31 +61,31 @@ BEGIN
}
if ($options->{'a'} eq 'indel') {
$tags=$Sanger::CGP::Vaf::VafConstants::INDEL_TAGS;
}
}
my $vcf_obj = Sanger::CGP::Vaf::Data::ReadVcf->new($options);

my $progress_hash;
# progress checked before the processing starts , speed ups concatenation step
my ($chromosomes)=$vcf_obj->getChromosomes($options->{'chr'});
($progress_hash,$chromosomes)=$vcf_obj->getProgress($chromosomes);
$chromosomes=$vcf_obj->getProgress($chromosomes);

# this is called only once to add allSample names to vcf object
$vcf_obj->getAllSampleNames;
my($info_tag_val,$vcf_file_obj)=$vcf_obj->getVcfHeaderData;
my($bam_objects,$bas_files)=$vcf_obj->_get_bam_object;
my($bam_header_data,$lib_size)=$vcf_obj->_get_bam_header_data($bam_objects,$bas_files);
my($info_tag_val,$vcf_file_obj)=$vcf_obj->getVcfHeaderData();
my($bam_objects,$bas_files)=$vcf_obj->get_bam_object();
my $max_lib_size = $vcf_obj->get_lib_n_read_read_length($bam_objects,$bas_files);
# create variant object
my $variant=Sanger::CGP::Vaf::Process::Variant->new(
'location' => undef,
'tmp' =>$options->{'tmp'},
'varLine' => undef,
'varType' => $vcf_obj->{'_a'},
'libSize' => defined $lib_size?$lib_size:100,
'samples' => $vcf_obj->{'allSamples'},
'tumourName' => $vcf_obj->getTumourName,
'normalName' => $vcf_obj->getNormalName,
'vcfStatus' => $vcf_obj->{'vcf'},
'noVcf' => defined $vcf_obj->{'noVcf'}?$vcf_obj->{'noVcf'}:undef,
'lib_size' => $max_lib_size,
'outDir' => $vcf_obj->getOutputDir,
'passedOnly' => $vcf_obj->{'_r'},
'tabix_hdr' => defined $vcf_obj->{'_hdr'}?Bio::DB::HTS::Tabix->new(filename => $vcf_obj->{'_hdr'}):undef,
Expand All @@ -95,8 +94,7 @@ BEGIN
'exp' => $vcf_obj->{'_exp'},
);

foreach my $chr(@$chromosomes) {
my($progress_fhw,$progress_data)=@{$progress_hash->{$chr}};
foreach my $chr(keys %$chromosomes) {
my($data_for_all_samples,$unique_locations)=$vcf_obj->getMergedLocations($chr, $vcf_file_obj);
if(defined $options->{'b'} ){
my ($bed_locations)=$vcf_obj->getBedHash($chr);
Expand All @@ -106,47 +104,40 @@ BEGIN
($data_for_all_samples,$unique_locations)=$vcf_obj->populateBedLocations($data_for_all_samples,$unique_locations,$bed_locations);
}
}
($store_results)=$vcf_obj->processMergedLocations($data_for_all_samples
,$unique_locations
,$variant
,$bam_header_data
,$bam_objects
,$store_results
,$chr
,$tags
,$info_tag_val
,$progress_fhw
,$progress_data);

close $progress_fhw;
$vcf_obj->processMergedLocations($data_for_all_samples ,$unique_locations,$variant,$chromosomes->{$chr} ,$bam_objects,$chr,$tags,$info_tag_val);
}# completed all chromosomes;
# if augmentation option is selected then write augmented vcf file
if(defined $options->{'m'} && $options->{'m'} == 1) {
my($aug_vcf_fh,$aug_vcf_name)=$vcf_obj->WriteAugmentedHeader();
$vcf_obj->writeResults($aug_vcf_fh,$store_results,$aug_vcf_name);
if($options->{'ao'} == 1) {
my($cleaned)=$vcf_obj->check_and_cleanup_dir($options->{'tmp'});
exit(0);
}
}
# run following steps only if chromosome option is empty or user has selected option to concatenate files.
if($options->{'ct'} || @{$options->{'chr'}} == 0 ) {

# run following steps only if chromosome option is empty or user has selected option to concatenate files.
if($options->{'ct'} || @{$options->{'chr'}} == 0 ) {
if($options->{'m'}) {
$log->debug("Completed analysis for all PASS locations, writing non PASS variants");
my($aug_vcf_fhs,$aug_vcf_names)=$vcf_obj->WriteAugmentedHeader();
foreach my $sample (keys %$aug_vcf_names){
if(-e $aug_vcf_names->{$sample}.'.gz') {
$log->logcroak("Output file : $aug_vcf_names->{$sample}.gz exists , skipping concatenation step");
}
$vcf_obj->catFiles($options->{'tmp'},'vcf',$sample,$aug_vcf_names->{$sample});
$log->debug("Compressing and Validating augmented VCF file for sample: $sample");
my($aug_gz,$aug_tabix)=$vcf_obj->gzipAndIndexVcf($aug_vcf_names->{$sample});
}
}

my($outfile_name_no_ext)=$vcf_obj->writeFinalFileHeaders($info_tag_val,$tags);
if(!defined $outfile_name_no_ext) {
$log->logcroak("Output file exists, skipping concatenation step");
}
$vcf_obj->catFiles($options->{'tmp'},'vcf',$outfile_name_no_ext);
$vcf_obj->catFiles($options->{'tmp'},'tsv',$outfile_name_no_ext);
$log->debug("Compressing and Validating VCF file");
my($outfile_gz,$outfile_tabix)=$vcf_obj->gzipAndIndexVcf("$outfile_name_no_ext.vcf");
if ((-e $outfile_gz) && (-e $outfile_tabix)) {
my($cleaned)=$vcf_obj->check_and_cleanup_dir($options->{'tmp'});
}
if ($options->{'dbg'}){
$log->debug("==============================Parameters used===================");
$log->debug(Dumper($options));

if($outfile_name_no_ext){
$vcf_obj->catFiles($options->{'tmp'},'vcf',undef,$outfile_name_no_ext);
$vcf_obj->catFiles($options->{'tmp'},'tsv',undef,$outfile_name_no_ext);
$log->debug("Compressing and Validating VCF file");
my($outfile_gz,$outfile_tabix)=$vcf_obj->gzipAndIndexVcf("$outfile_name_no_ext.vcf");
if ((-e $outfile_gz) && (-e $outfile_tabix) && !$options->{'dbg'} ) {
my($cleaned)=$vcf_obj->check_and_cleanup_dir($options->{'tmp'});
}
}
}
if ($options->{'dbg'}){
$log->debug("==============================Parameters used===================");
$log->debug(Dumper($options));
}
}
}

catch {
Expand All @@ -171,13 +162,14 @@ sub option_builder {
'chr|chromosome=s{,}' => \@{$options{'chr'}},
'ct|concat=i' => \$options{'ct'},
'o|outDir=s' => \$options{'o'},
'm|augment=i' => \$options{'m'},
'ao|augment_only=i' => \$options{'ao'},
'm|augment' => \$options{'m'},
'mq|map_quality=i' => \$options{'mq'},
'bq|base_quality=i' => \$options{'bq'},
# provide at least 1 tumour sample name
'tn|tumour_name=s{1,}' => \@{$options{'tn'}},
'nn|normal_name=s' => \$options{'nn'},
'tb|tumour_bam=s{1,}' => \@{$options{'tb'}},
'nb|normal_bam=s' => \$options{'nb'},
'bo|bed_only=i' => \$options{'bo'},
'oe|output_vcfExtension=s' => \$options{'oe'},
'tmp|tmpdir=s' => \$options{'tmp'},
Expand All @@ -186,9 +178,9 @@ sub option_builder {
'pid|id_int_project=s' => \$options{'pid'},
'exp|exonerate_pct=i' => \$options{'exp'},
'vcf|vcf_files=s{,}' => \@{$options{'vcf'}},
'f|filter_inc=i' => \$options{'finc'},
'F|filter_exc=i' => \$options{'fexc'},
'dbg|debug=i' => \$options{'dbg'},
'finc|filter_inc=i' => \$options{'finc'},
'fexc|filter_exc=i' => \$options{'fexc'},
'dbg|debug' => \$options{'dbg'},
'v|version' => \$options{'v'}
);

Expand All @@ -213,7 +205,7 @@ sub option_builder {
}

if(!defined($options{'fexc'})){
$options{'fexc'} = $Sanger::CGP::Vaf::VafConstants::DEFAULT_READLEN_EXCLUDE;
$options{'fexc'} = $Sanger::CGP::Vaf::VafConstants::DEFAULT_READS_EXCLUDE_PILEUP;
}

if(!defined $options{'bo'}) { $options{'bo'}=0;}
Expand Down Expand Up @@ -251,10 +243,10 @@ sub option_builder {
if($options{'a'} eq 'indel' && !defined $options{'dp'}) {
$options{'dp'}='NR,PR';
}
if($options{'ao'} || $options{'m'}) {
$log->debug("Augmentation option selected, chromosome option will be overidden to all chromosomes");
$options{'chr'}=[];

if($options{'ct'}) {
$log->debug("Concatenation option selected, chromosome option will be set to all chromosomes");
$options{'chr'}=[];
}

if(!defined $options{'s'}) {
Expand All @@ -265,15 +257,11 @@ sub option_builder {
#default exonerate percentage
$options{'exp'}=92;
}
if(!defined $options{'ao'}) {
# augment vcf no merging step
$options{'ao'}=0;
}
if(!defined $options{'oe'}) {
# augment vcf extesnion
$options{'oe'}='.vaf.vcf';
}
if(($options{'ao'} || $options{'m'}) && lc($options{'a'}) eq 'snp') {
if($options{'m'} && lc($options{'a'}) eq 'snp' ) {
$log->logcroak("Warning: VCF augment option is only supported for indels");
}
if(!defined $options{'hdr'} && lc($options{'a'}) eq 'indel') {
Expand All @@ -290,38 +278,45 @@ =head1 NAME
=head1 SYNOPSIS
cgpVaf.pl [-h] -d -a -g -tn -nn -e -o [ -b -t -c -r -m -ao -mq -pid -bo -vcf -v]
cgpVaf.pl [-h] -d -a -g -tn -nn -e -o [ -tb -nb -b -t -c -r -m -ao -mq -pid -bo -vcf -v]
Required Options (inputDir and variant_type must be defined):
--variant_type (-a) variant type (snp or indel) [default snp]
--inputDir (-d) input directory path containing bam and vcf files
--genome (-g) genome fasta file name (default genome.fa)
--tumour_name (-tn) Toumour sample name [ list of space separated sample names ]
--normal_name (-nn) Normal sample name [ single sample used as normal for this analysis ]
--tumour_name (-tn) Toumour sample name [ list of space separated sample names for which (co-located bam/cram, index and bas files are present for each sample)]
--normal_name (-nn) Normal sample name [ single sample used as normal for this analysis for which (co-located bam/cram, index and bas files are present) ]
--outDir (-o) Output folder
--vcfExtension (-e) vcf file extension string after the sample name - INCLUDE's preceding dot (default: .caveman_c.annot.vcf.gz) [ optional if -bo 1 ]
Optional
--tumour_bam (-tb) tumour bam/cram file(s) space separated list [ optional if -tn is specified]
- if not defined will be deduced from tumour_name
- should be specified in same order as tumour sample names
--normal_bam (-nb) normal bam/cram file [optional if -nn is specified]
- if not defined will be deduced from --normal_name
- should be specified in same order as tumour sample names
--infoTags (-t) comma separated list of tags to be included in the tsv output, vcf file by default includes all data
(default: VD,VW,VT,VC for Vagrent annotations)
--bedIntervals (-b) tab separated file containing list of intervals in the form of <chr><pos> <ref><alt> (e.g 1 14000 A C)
--restrict_flag (-r) restrict analysis on (possible values 1 : PASS or 0 : ALL) [default 1 ]
--chromosome (-chr) restrict analysis to a chromosome list [space separated chromosome names] , not applicable if augment option is choosen
--concat (-ct) concat per chromosome results to a single vcf file
--augment (-m) Augment pindel file [ this will add additional fields[ MTR, WTR, AMB] to FORMAT column of NORMAL and TUMOUR samples ] (default 0: don not augment)
--augment_only (-ao) Only augment pindel VCF file (-m must be specified) [ do not merge input files and add non passed varinats to output file ] (default 0: augment and merge )
--chromosome (-chr) restrict analysis to a chromosome list [space separated chromosome names]
--concat (-ct) concat per chromosome results to a single vcf file
--augment (-m) Augment original vcf file (valid for indels)
- this will add additional fields[ MTR, WTR, AMB] to FORMAT column of NORMAL and TUMOUR samples ] (default FALSE: do not augment)
--map_quality (-mq) read mapping quality threshold
--base_quality (-bq) base quality threshold for snp
--exonerate_pct (-exp) report alignment over a percentage of the maximum score attainable by each query (exonerate specific parameter) [default 92]
--bamExtension (-be) Input read file extension
--depth (-dp) comma separated list of field(s) as specified in FORMAT field representing total depth at given location
--high_depth_bed (-hdr) High Depth Region(HDR) bed file (tabix indexed) to mask high depth regions in the genome
--id_int_project (-pid) Internal project id [WTSI only]
--bed_only (-bo) Only analyse bed intervals in the file (default 0: analyse vcf and bed interval)
--vcf (-vcf) user defined input vcf file name(s) , if not defined will be deduced from tumour sample name and vcfExtension [please specify in same order as tumour sample names ]
--filter_inc (-f) Sam flag values to include when checking reads for read length
--filter_exc (-F) Sam flag values to exclude when checking reads for read length
--vcf (-vcf) user defined input vcf file path(s) [ optional if -tn is specified ]
- if not defined will be deduced from --tumour_name and --vcfExtension
- should be specified in same order as tumour sample names
--filter_inc (-finc) Sam flag values to include when checking reads for read length
--filter_exc (-fexc) Sam flag values to exclude when checking reads for read length
--help (-h) Display this help message
--version (-v) provide version information for vaf
Expand Down
2 changes: 1 addition & 1 deletion perl/config/log4perl.vaf.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
log4perl.rootLogger=TRACE, LOGFILE

log4perl.appender.LOGFILE=Log::Log4perl::Appender::File
log4perl.appender.LOGFILE.filename=vcfcommons.log
log4perl.appender.LOGFILE.filename=log_vafcorrect.log
log4perl.appender.LOGFILE.mode=append

log4perl.appender.LOGFILE.layout=PatternLayout
Expand Down
Binary file modified perl/docs.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion perl/lib/Sanger/CGP/Vaf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package Sanger::CGP::Vaf;
use strict;
use Const::Fast qw(const);

our $VERSION = '5.3.9';
our $VERSION = '5.4.0';

const my $LICENSE =>
"#################
Expand Down
Loading

0 comments on commit c19bff5

Please sign in to comment.