Skip to content

Commit

Permalink
v1.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
gwct committed Sep 27, 2019
1 parent 3024a18 commit 3222507
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 21 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# Version 1.1.2: September 27, 2019
01) Added and error message when trying to read .gz compressed FASTA reference file.
02) Added maxmimum input file size of 100GB to input -gl files to prevent users from duplicating large files and having intractable run times.
03) Added the ref_split.sh helper script to aid users that need to split large BAM files by scaffold.
03) Added the bam_split.sh and tab_split.sh helper scripts to aid users that need to split large BAM, pileup, or ANGSD files by scaffold.

# PREVIOUS:
# Version 1.1.1: September 24, 2019
Expand Down
40 changes: 20 additions & 20 deletions helper-scripts/ref_split.sh → helper-scripts/bam_split.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ display_usage() {
# Usage message for -h
echo "Usage:"
echo
echo "ref_split.sh -b <BAM file> -f <reference FASTA file> -r <1> -a -p -h"
echo "bam_split.sh -b <BAM file> -f <reference FASTA file> -r <1> -a -p -h"
echo
echo "Options:"
echo "-b: (required) The full, sorted and indexed BAM file with reads mapped back to the assembly."
Expand All @@ -26,11 +26,11 @@ display_usage() {
echo "03: ANGSD (required with -a; http://popgen.dk/angsd/index.php/Installation)"
echo
echo "This script will create the following directories and files:"
echo "01: refsplit-scaffold-list.txt -> A list of scaffolds in the input BAM file."
echo "02: refsplit-bam/ -> A directory to which the split BAM files will be written."
echo "03: refsplit-pileup/ -> If -p is specified, the split pileup files will be written here."
echo "01: bamsplit-scaffold-list.txt -> A list of scaffolds in the input BAM file."
echo "02: bamsplit-bam/ -> A directory to which the split BAM files will be written."
echo "03: bamsplit-pileup/ -> If -p is specified, the split pileup files will be written here."
echo "04: referee-pileup-input.txt -> If -p is specified, this file will have the paths to the split pileup files for Referee's -i flag."
echo "05: refsplit-angsd/ -> If -a is specified, the ANGSD output files on the split BAMs will be written here."
echo "05: bamsplit-angsd/ -> If -a is specified, the ANGSD output files on the split BAMs will be written here."
echo "06: referee-angsd-input.txt -> If -a is specified, this file will have the paths to the split ANGSD output files for Referee's -i flag."
echo "================================================="
exit 0
Expand Down Expand Up @@ -71,7 +71,7 @@ bamfile=false
fasta=false
pileup=false
angsd=false
scafffile="refsplit-scaffold-list.txt"
scafffile="bamsplit-scaffold-list.txt"
procs=1
# Defaults

Expand All @@ -98,24 +98,24 @@ fi
# fi
# Check to make sure only ANGSD or pileup has been specified as a run mode.

echo "STEP 01: GETTING LIST OF ALL SCAFFOLDS IN BAM FILE: refsplit-scaffold-list.txt"
samtools idxstats $bamfile | cut -f 1 | tail -n10 | grep -v "*" > $scafffile
echo "STEP 01: GETTING LIST OF ALL SCAFFOLDS IN BAM FILE: bamsplit-scaffold-list.txt"
samtools idxstats $bamfile | cut -f 1 | grep -v "*" > $scafffile
# Get a list of all scaffolds in the BAM file

echo
echo "STEP 02: SPLITTING BAM FILE BY SCAFFOLD"
mkdir "refsplit-bam"
mkdir "bamsplit-bam"
if [ $procs == 1 ]; then
while read scaff; do
echo "SPLIT BAM: " $scaff
samtools view $bamfile -b -o refsplit-bam/$scaff.bam $scaff
samtools view $bamfile -b -o bamsplit-bam/$scaff.bam $scaff
done < "$scafffile"
# Serial
else
echo
echo "SPLITTING BAM WITH GNU PARALLEL"
echo
cat $scafffile | parallel --progress --eta -j $procs "samtools view $bamfile -b -o refsplit-bam/{.}.bam {.}"
cat $scafffile | parallel --progress --eta -j $procs "samtools view $bamfile -b -o bamsplit-bam/{.}.bam {.}"
# Parallel
fi
# Split the BAM file by scaffold
Expand All @@ -124,24 +124,24 @@ fi
if [ "$pileup" = true ]; then
echo
echo "STEP 3A: RUNNING PILEUP ON SPLIT BAMS"
mkdir "refsplit-pileup"
mkdir "bamsplit-pileup"
if [ $procs == 1 ]; then
while read scaff; do
echo "PILEUP: " $scaff
samtools mpileup -d 999999999 -f $fasta -Q 0 -s -B -o refsplit-pileup/$scaff.pileup refsplit-bam/$scaff.bam
samtools mpileup -d 999999999 -f $fasta -Q 0 -s -B -o bamsplit-pileup/$scaff.pileup bamsplit-bam/$scaff.bam
done < "$scafffile"
# Serial
else
echo
echo "RUNNING PILEUP WITH GNU PARALLEL"
echo
cat $scafffile | parallel --progress --eta -j $procs "samtools mpileup -d 999999999 -f $fasta -Q 0 -s -B -o refsplit-pileup/{.}.pileup refsplit-bam/{.}.bam"
cat $scafffile | parallel --progress --eta -j $procs "samtools mpileup -d 999999999 -f $fasta -Q 0 -s -B -o bamsplit-pileup/{.}.pileup bamsplit-bam/{.}.bam"
fi
# Parallel
# Run pileup on the split BAMs

echo "-> GENERATIONG referee-pileup-input.txt"
ls -d -1 "$PWD/refsplit-pileup/"*.pileup > referee-pileup-input.txt
echo "-> GENERATING referee-pileup-input.txt"
ls -d -1 "$PWD/bamsplit-pileup/"*.pileup > referee-pileup-input.txt
# Create the referee input file.
fi
# If pileup is specified to run
Expand All @@ -150,24 +150,24 @@ fi
if [ "$angsd" = true ]; then
echo
echo "STEP 3B: RUNNING ANGSD ON SPLIT PILEUPS"
mkdir "refsplit-angsd"
mkdir "bamsplit-angsd"
if [ $procs == 1 ]; then
while read scaff; do
echo "ANGSD " $scaff
angsd -GL 2 -i refsplit-bam/$scaff.bam -ref $fasta -minQ 0 -doGlf 4 -out refsplit-angsd/$scaff-angsd
angsd -GL 2 -i bamsplit-bam/$scaff.bam -ref $fasta -minQ 0 -doGlf 4 -out bamsplit-angsd/$scaff-angsd
done < "$scafffile"
# Serial
else
echo
echo "RUNNING ANGSD WITH GNU PARALLEL"
echo
cat $scafffile | parallel --progress --eta -j $procs "angsd -GL 2 -i refsplit-bam/{.}.bam -ref $fasta -minQ 0 -doGlf 4 -out refsplit-angsd/{.}-angsd"
cat $scafffile | parallel --progress --eta -j $procs "angsd -GL 2 -i bamsplit-bam/{.}.bam -ref $fasta -minQ 0 -doGlf 4 -out bamsplit-angsd/{.}-angsd"
fi
# Parallel
# Run ANGSD on the split BAM files

echo "-> GENERATING referee-angsd-input.txt"
ls -d -1 "$PWD/refsplit-angsd/"*-angsd.glf.gz > referee-angsd-input.txt
ls -d -1 "$PWD/bamsplit-angsd/"*-angsd.glf.gz > referee-angsd-input.txt
# Get the list of ANGSD output files with full paths
fi
# If ANGSD is specified to run
Expand Down
109 changes: 109 additions & 0 deletions helper-scripts/tab_split.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/bin/bash

############################################################
# For Referee, 09.19
# Takes a sorted BAM file and runs through the steps to
# split and run mpileup on the file.
############################################################

display_usage() {
# Usage message for -h
echo "Usage:"
echo
echo "tab_split.sh -f <A bgzipped pileup or ANGSD output file> -r <1> -h"
echo
echo "Options:"
echo "-i: (required) A bgzipped, tab delimited file with scaffold in column 1 and position in column 2. Both pileup and ANGSD output files fit these criteria."
echo "-r: The number of processors to use. If > 1, GNU Parallel (https://www.gnu.org/software/parallel/) is required. Default: 1."
echo "-h: Display this help message."
echo
echo "Dependencies:"
echo "01: tabix (required; https://sourceforge.net/projects/samtools/files/tabix/)"
echo "02: GNU parallel (required for multiprocessing; https://www.gnu.org/software/parallel/)"
echo
echo "This script will create the following directories and files:"
echo "01: tabsplit-scaffold-list.txt -> A list of scaffolds in the input file."
echo "02: tabsplit/ -> A directory to which the split files will be written."
echo "03: referee-input.txt -> This file file will have the paths to the split files for Referee's -i flag."
echo "==========================================================================="
exit 0
}

##########
# Invalid input option message
invalid_opt() {
echo "Error 1: Invalid option: $1"
echo "==========================================================================="
exit 1
}

##########
# Invalid run mode message
invalid_file(){
echo "Error 2: A pileup or ANGSD output file must be provided (-i)"
echo "==========================================================================="
exit 1
}

############################################################

echo
echo "==========================================================================="
echo " Workflow script to split a tabbed pileup or ANGSD output file for Referee"
echo

input=false
scafffile="tabsplit-scaffold-list.txt"
procs=1
# Defaults

while getopts ":i:r:h" arg; do
case $arg in
h) display_usage;;
i) input=$OPTARG;;
r) procs=$OPTARG;;
\?) invalid_opt $OPTARG ;;
esac
done
# Parse input options

if [ "$input" == false ]; then
invalid_file
fi
# Check to make sure a file has been provided.

echo
echo "STEP 01: INDEXING INPUT FILE"
tabix -s 1 -b 2 -e 2 $input
# Index the input file

echo "STEP 02: GETTING LIST OF ALL SCAFFOLDS IN INPUT TAB FILE: tabsplit-scaffold-list.txt"
tabix -l $input | grep -v "*" > $scafffile
# Get a list of all scaffolds in the BAM file

echo "STEP 03: SPLITTING INPUT FILE"
mkdir "tabsplit"
if [ $procs == 1 ]; then
while read scaff; do
echo "SPLIT: " $scaff
tabix -s 1 -b 2 -e 2 $input $scaff > tabsplit/$scaff.tab
done < "$scafffile"
# Serial
else
echo
echo "SPLITTING INPUT FILE WITH GNU PARALLEL"
echo
cat $scafffile | parallel --progress --eta -j $procs "tabix -s 1 -b 2 -e 2 $input {.} > tabsplit/{.}.tab"
fi
# Parallel
# Split the input file

echo "-> GENERATING referee-input.txt"
ls -d -1 "$PWD/tabsplit/"*.tab > referee-input.txt
# Create the referee input file.

echo
echo "DONE! Please confirm that all your output files were created successfully."
echo "You can now use referee-input.txt as your -i input to Referee."
echo "==========================================================================="
echo

0 comments on commit 3222507

Please sign in to comment.