diff options
author | jwb <jwb@FreeBSD.org> | 2018-04-20 09:34:05 +0800 |
---|---|---|
committer | jwb <jwb@FreeBSD.org> | 2018-04-20 09:34:05 +0800 |
commit | b3979e08040d4c0ec53c3221400460e439389b83 (patch) | |
tree | 04c4ff39fdd1b887e6d09adee7877fdc9582f1b6 /biology | |
parent | d15e3a8bba03fd669c4ebade0f57ca8367e825a7 (diff) | |
download | freebsd-ports-gnome-b3979e08040d4c0ec53c3221400460e439389b83.tar.gz freebsd-ports-gnome-b3979e08040d4c0ec53c3221400460e439389b83.tar.zst freebsd-ports-gnome-b3979e08040d4c0ec53c3221400460e439389b83.zip |
biology/ddocent: Bash pipeline for RAD sequencing
Approved by: jrm (mentor)
Differential Revision: https://reviews.freebsd.org/D15139
Diffstat (limited to 'biology')
-rw-r--r-- | biology/Makefile | 1 | ||||
-rw-r--r-- | biology/ddocent/Makefile | 67 | ||||
-rw-r--r-- | biology/ddocent/distinfo | 3 | ||||
-rw-r--r-- | biology/ddocent/files/ddocent-assembly-test | 380 | ||||
-rw-r--r-- | biology/ddocent/files/ddocent-assembly-test-cleanup | 8 | ||||
-rw-r--r-- | biology/ddocent/files/patch-dDocent | 134 | ||||
-rw-r--r-- | biology/ddocent/pkg-descr | 7 | ||||
-rw-r--r-- | biology/ddocent/pkg-plist | 17 |
8 files changed, 617 insertions, 0 deletions
diff --git a/biology/Makefile b/biology/Makefile index 5003301b151a..031e74b9df80 100644 --- a/biology/Makefile +++ b/biology/Makefile @@ -22,6 +22,7 @@ SUBDIR += clustalw SUBDIR += consed SUBDIR += crux + SUBDIR += ddocent SUBDIR += diamond SUBDIR += emboss SUBDIR += fasta diff --git a/biology/ddocent/Makefile b/biology/ddocent/Makefile new file mode 100644 index 000000000000..552701cf244f --- /dev/null +++ b/biology/ddocent/Makefile @@ -0,0 +1,67 @@ +# $FreeBSD$ + +PORTNAME= dDocent +DISTVERSIONPREFIX= v +DISTVERSION= 2.2.25 +CATEGORIES= biology java + +MAINTAINER= jwb@FreeBSD.org +COMMENT= Bash pipeline for RAD sequencing + +LICENSE= MIT + +# ddocent test data do not unpack with FreeBSD 11.1 /usr/bin/unzip +RUN_DEPENDS= unzip>=0:archivers/unzip \ + mawk>=0:lang/mawk \ + gawk>=0:lang/gawk \ + coreutils>=0:sysutils/coreutils \ + gnuplot>=0:math/gnuplot \ + parallel>=0:sysutils/parallel \ + bash:shells/bash \ + bwa>=0.7.13:biology/bwa \ + cd-hit>=0:biology/cd-hit \ + samtools>=1.3:biology/samtools \ + vcftools>=0.1.15:biology/vcftools \ + trimmomatic>=0:biology/trimmomatic \ + bamtools>=0:biology/bamtools \ + stacks>=0:biology/stacks \ + rainbow>=0:biology/rainbow \ + trimadap>=0:biology/trimadap \ + seqtk>=0:biology/seqtk \ + bedtools>=2.26.0:biology/bedtools \ + pear-merger>=0:biology/pear-merger \ + vcflib>=0:biology/vcflib \ + freebayes:biology/freebayes + +USES= perl5 python shebangfix +SHEBANG_FILES= dDocent scripts/*.sh scripts/*.pl scripts/dDocent_filters +USE_JAVA= yes +USE_GITHUB= yes +GH_ACCOUNT= jpuritz + +NO_BUILD= yes +NO_ARCH= yes + +# These are on top of patch-dDocent, so don't apply them within the source +# tree, or they'll get picked up by patch generators, and hard-code PREFIX. +post-install: + ${REINPLACE_CMD} -i '' \ + -e 's|%%PREFIX%%|${PREFIX}|g' \ + -e 's|%%JAVAJARDIR%%|${JAVAJARDIR}|g' \ + -e 's|%%BASH%%|${LOCALBASE}/bin/bash|g' \ + -e 's|python|${PYTHON_CMD}|g' \ + ${STAGEDIR}${PREFIX}/bin/dDocent + +do-install: + ${MKDIR} ${STAGEDIR}${PREFIX}/bin + ${INSTALL_SCRIPT} \ + ${WRKSRC}/dDocent \ + ${WRKSRC}/*.sh \ + ${FILESDIR}/ddocent-assembly-test \ + ${FILESDIR}/ddocent-assembly-test-cleanup \ + ${WRKSRC}/scripts/*.sh \ + ${WRKSRC}/scripts/*.pl \ + ${WRKSRC}/scripts/dDocent_filters \ + ${STAGEDIR}${PREFIX}/bin + +.include <bsd.port.mk> diff --git a/biology/ddocent/distinfo b/biology/ddocent/distinfo new file mode 100644 index 000000000000..8bf1f9a26a16 --- /dev/null +++ b/biology/ddocent/distinfo @@ -0,0 +1,3 @@ +TIMESTAMP = 1520345850 +SHA256 (jpuritz-dDocent-v2.2.25_GH0.tar.gz) = 903c3010b29b2ca95f7fe6099925948e4d3f21655668caff653df97dfa7ecf44 +SIZE (jpuritz-dDocent-v2.2.25_GH0.tar.gz) = 336804 diff --git a/biology/ddocent/files/ddocent-assembly-test b/biology/ddocent/files/ddocent-assembly-test new file mode 100644 index 000000000000..7a159e72ba24 --- /dev/null +++ b/biology/ddocent/files/ddocent-assembly-test @@ -0,0 +1,380 @@ +#!/usr/bin/env bash + +set -e + +########################################################################## +# Function description: +# Pause until user presses return +########################################################################## + +pause() +{ + local junk + + printf "Press return to continue..." + read junk +} + + +########################################################################## +# Not necessary if invoking scripts with "bash script-name" as shown +# in the dDocent tutorial, but supplied for completeness. Tutorial +# scripts have also been patched upstream to usr /usr/bin/env, but +# we won't assume they'll all stay that way. +########################################################################## + +fix_bash_path() +{ + # Don't echo these commands + { set +x; } 2>/dev/null + if [ $# != 1 ]; then + printf "Usage: $0 script\n" + exit 1 + fi + sed -i.bak -e "s|#!/bin/bash|#!/usr/bin/env bash|g" $1 + set -x +} + + +########################################################################## +# Main +########################################################################## + +########################################################################## +# Workarounds for running dDocent from FreeBSD ports and Pkgsrc. +# Include this section in any scripts running dDocent commands. +########################################################################## + +# Hack to allow remake_reference.sh, etc. to find trimmomatic. +# trimmomatic.jar and adapter files are not an executables and should not +# be in PATH according to filesystem hierarchy standards, but the dDocent +# scripts are coded to look for them there, because that's how dDocent +# installs the bundled Trimmomatic. +if [ -e /usr/local/share/java/classes/trimmomatic.jar ]; then + export PATH=${PATH}:/usr/local/share/java/classes:/usr/local/share/trimmomatic/adapters +elif [ -e $PKGSRC/lib/java/trimmomatic.jar ]; then + export PATH=${PATH}:$PKGSRC/lib/java:$PKGSRC/share/trimmomatic/adapters +else + printf "Error: Trimmomatic is not installed in a known location.\n" >> /dev/stderr +fi + +if ! pwd | fgrep dDocent-test; then + mkdir -p dDocent-test + cd dDocent-test +fi + +# remake_reference.sh and some other scripts use GNU extensions in awk +# commands. Trick systems into using gawk to get around this without +# having to patch every script. +mkdir -p ddocent-bin +ln -sf `which gawk` ddocent-bin/awk +ln -sf `which python2.7` ddocent-bin/python +export PATH=`pwd`/ddocent-bin:$PATH + +########################################################################## +# Actual dDocent commands below +########################################################################## + +########################################################################## +# Command sequence from the dDocent tutorial on Github. See link below. +########################################################################## + +cat << EOM + +This script runs the test commands described at + +http://ddocent.com/assembly + +You may want to follow along on this web page as the script runs. It will +pause after each step to allow checking the output. + +EOM +pause + +mkdir -p Data +cd Data + +printf "Downloading and unpacking data.zip...\n" +if [ -e data.zip ]; then + mv data.zip /tmp/dDocent-data.zip + rm -rf * + mv /tmp/dDocent-data.zip data.zip +else + rm -rf * + curl --insecure -L -o data.zip \ + 'https://www.dropbox.com/s/t09xjuudev4de72/data.zip?dl=0' +fi + +# Hack: current data.zip contains data.zip. ??! +if [ ! -e simRRLs2.py ]; then + yes | unzip data.zip || true +fi +ls -l +pause + +set -x +head SimRAD.barcodes +{ set +x; } 2>/dev/null +pause + +set -x +cut -f2 SimRAD.barcodes > barcodes +head barcodes +{ set +x; } 2>/dev/null +pause + +set -x +process_radtags -1 SimRAD_R1.fastq.gz -2 SimRAD_R2.fastq.gz -b barcodes \ + -e ecoRI --renz_2 mspI -r -i gzfastq +ls | fgrep sample_ | head +{ set +x; } 2>/dev/null +pause + +set -x +rm *rem* +{ set +x; } 2>/dev/null +pause + +rm -f Rename_for_dDocent.sh # Always get the latest +set -x +curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/Rename_for_dDocent.sh +more Rename_for_dDocent.sh +{ set +x; } 2>/dev/null +pause + +set -x +bash Rename_for_dDocent.sh SimRAD.barcodes +{ set +x; } 2>/dev/null + +set -x +ls *.fq.gz +{ set +x; } 2>/dev/null +pause + +set -x +ls *.F.fq.gz > namelist +sed -i'' -e 's/.F.fq.gz//g' namelist +AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' +AWK2='!/>/' +AWK3='!/NNN/' +PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' + +cat namelist | parallel --no-notice -j 8 "zcat {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward" +cat namelist | parallel --no-notice -j 8 "zcat {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse" +cat namelist | parallel --no-notice -j 8 "paste -d '-' {}.forward {}.reverse | mawk '$AWK3' | sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs" +{ set +x; } 2>/dev/null +pause + +set -x +cat *.uniq.seqs > uniq.seqs +for i in {2..20}; +do +echo $i >> pfile +done +cat pfile | parallel --no-notice \ + "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniq.seqs | wc -l" \ + | mawk '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.data +rm pfile +{ set +x; } 2>/dev/null +pause + +set -x +more uniqseq.data +{ set +x; } 2>/dev/null +pause + +set -x +gnuplot << \EOF +set terminal dumb size 80, 30 +set autoscale +set xrange [2:20] +unset label +set title "Number of Unique Sequences with More than X Coverage (Counted within individuals)" +set xlabel "Coverage" +set ylabel "Number of Unique Sequences" +plot 'uniqseq.data' with lines notitle +pause -1 +EOF +{ set +x; } 2>/dev/null +pause + +set -x +parallel --no-notice -j 8 mawk -v x=4 \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv +wc -l uniqCperindv +{ set +x; } 2>/dev/null +pause + +set -x +for ((i = 2; i <= 10; i++)); +do +echo $i >> ufile +done + +cat ufile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniqCperindv | wc -l" | mawk '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.peri.data +rm ufile +{ set +x; } 2>/dev/null +pause + +set -x +gnuplot << \EOF +set terminal dumb size 80, 30 +set autoscale +unset label +set title "Number of Unique Sequences present in more than X Individuals" +set xlabel "Number of Individuals" +set ylabel "Number of Unique Sequences" +plot 'uniqseq.peri.data' with lines notitle +pause -1 +EOF +{ set +x; } 2>/dev/null +pause + +set -x +mawk -v x=4 '$1 >= x' uniqCperindv > uniq.k.4.c.4.seqs +wc -l uniq.k.4.c.4.seqs +{ set +x; } 2>/dev/null +pause + +set -x +cut -f2 uniq.k.4.c.4.seqs > totaluniqseq +mawk '{c= c + 1; print ">Contig_" c "\n" $1}' totaluniqseq > uniq.fasta +{ set +x; } 2>/dev/null +pause + +set -x +sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta +{ set +x; } 2>/dev/null +pause + +set -x +cd-hit-est -i uniq.F.fasta -o xxx -c 0.8 -T 0 -M 0 -g 1 +{ set +x; } 2>/dev/null +pause + +set -x +mawk '{if ($1 ~ /Cl/) clus = clus + 1; else print $3 "\t" clus}' xxx.clstr | sed 's/[>Contig_,...]//g' | sort -g -k1 > sort.contig.cluster.ids +paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq +sort -k2,2 -g contig.cluster.totaluniqseq | sed -e 's/NNNNNNNNNN/\t/g' > rcluster +{ set +x; } 2>/dev/null +pause + +set -x +head rcluster +{ set +x; } 2>/dev/null +pause + +set -x +cut -f2 rcluster | uniq | wc -l +{ set +x; } 2>/dev/null +pause + +set -x +rainbow div -i rcluster -o rbdiv.out +{ set +x; } 2>/dev/null +pause + +set -x +rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10 +{ set +x; } 2>/dev/null +pause + +set -x +rainbow merge -o rbasm.out -a -i rbdiv.out +{ set +x; } 2>/dev/null +pause + +set -x +rainbow merge -o rbasm.out -a -i rbdiv.out -r 2 +{ set +x; } 2>/dev/null +pause + +# If missing fdesc mount for bash, <(echo "E") will not work +# cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' { +set -x +echo "E" > endfile +cat rbasm.out endfile |sed 's/[0-9]*:[0-9]*://g' | mawk ' { +if (NR == 1) e=$2; +else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0} +else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0} +else if ($1 ~/C/) clus=$2; +else if ($1 ~/L/) len=$2; +else if ($1 ~/S/) seq=$2; +else if ($1 ~/N/) freq=$2; +else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len} +else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len} +else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq} +}' > rainbow.fasta +{ set +x; } 2>/dev/null +pause + +set -x +cd-hit-est -i rainbow.fasta -o referenceRC.fasta -M 0 -T 0 -c 0.9 +{ set +x; } 2>/dev/null +pause + +rm -f remake_reference.sh +set -x +curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/remake_reference.sh +more remake_reference.sh +#fix_bash_path remake_reference.sh + +bash remake_reference.sh 4 4 0.90 PE 2 +{ set +x; } 2>/dev/null +pause + +rm -f ReferenceOpt.sh +set -x +curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/ReferenceOpt.sh +more ReferenceOpt.sh + +bash ReferenceOpt.sh 4 8 4 8 PE 16 +{ set +x; } 2>/dev/null +pause + +set -x +bash remake_reference.sh 4 4 0.90 PE 2 +head reference.fasta +{ set +x; } 2>/dev/null +pause + +set -x +wc -l reference.fasta +{ set +x; } 2>/dev/null +pause + +set -x +mawk '/>/' reference.fasta | wc -l +{ set +x; } 2>/dev/null +pause + +set -x +mawk '/^NAATT.*N*.*CCGN$/' reference.fasta | wc -l +grep '^NAATT.*N*.*CCGN$' reference.fasta | wc -l +{ set +x; } 2>/dev/null +pause + +printf "Bonus Section: Optimize reference assemblies? (takes a long time) y/[n] " +read bonus +if [ 0$bonus = 0y ]; then + set -x + curl -L -O https://raw.githubusercontent.com/jpuritz/dDocent/master/scripts/RefMapOpt.sh + { set +x; } 2>/dev/null + printf "Running dDocent to trim reads.\n" + pause + set -x + cat << EOM | dDocent || true +yes +8 +10g +yes +no +no +no +bacon@uwm.edu +EOM + bash RefMapOpt.sh 4 8 4 8 0.9 64 PE + { set +x; } 2>/dev/null + pause + more mapping.results + pause +fi diff --git a/biology/ddocent/files/ddocent-assembly-test-cleanup b/biology/ddocent/files/ddocent-assembly-test-cleanup new file mode 100644 index 000000000000..4189b3bb0355 --- /dev/null +++ b/biology/ddocent/files/ddocent-assembly-test-cleanup @@ -0,0 +1,8 @@ +#!/bin/sh -e + +if pwd | fgrep dDocent-test; then + rm -rf .*.bak Data ddocent-bin +else + printf "$0 can only be run in a dDocent-test directory.\n" + exit 1 +fi diff --git a/biology/ddocent/files/patch-dDocent b/biology/ddocent/files/patch-dDocent new file mode 100644 index 000000000000..654a3ed31f7d --- /dev/null +++ b/biology/ddocent/files/patch-dDocent @@ -0,0 +1,134 @@ +--- dDocent.orig 2018-04-20 00:10:34 UTC ++++ dDocent +@@ -1,6 +1,9 @@ + #!/usr/local/bin/bash + export LC_ALL=en_US.UTF-8 + ++# GNU Parallel uses $SHELL and has issues with [t]csh ++export SHELL=%%BASH%% ++ + ##########dDocent########## + VERSION='2.2.25' + #This script serves as an interactive bash wrapper to QC, assemble, map, and call SNPs from double digest RAD (SE or PE), ezRAD (SE or PE) data, or SE RAD data. +@@ -27,15 +30,15 @@ do + fi + done + +-if find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null| grep -q 'trim' ; then +- TRIMMOMATIC=$(find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null | head -1) ++if [ -e %%JAVAJARDIR%%/trimmomatic.jar ]; then ++ TRIMMOMATIC=%%JAVAJARDIR%%/trimmomatic.jar + else + echo "The dependency trimmomatic is not installed or is not in your" '$PATH'"." + NUMDEP=$((NUMDEP + 1)) + fi + +-if find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | grep -q 'Tru' ; then +- ADAPTERS=$(find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | head -1) ++if [ -e %%PREFIX%%/share/trimmomatic/adapters/TruSeq2-PE.fa ]; then ++ ADAPTERS=%%PREFIX%%/share/trimmomatic/adapters/TruSeq2-PE.fa + else + echo "The file listing adapters (included with trimmomatic) is not installed or is not in your" '$PATH'"." + NUMDEP=$((NUMDEP + 1)) +@@ -80,6 +83,7 @@ FREEB=(`freebayes | grep -oh 'v[0-9].*' + exit 1 + fi + VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed 's/[a-z)]//') ++ echo $VCFTV + if [ "$VCFTV" -lt "10" ]; then + echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent." + echo "Please install at least version 0.1.11" +@@ -89,7 +93,7 @@ VCFTV=$(vcftools | grep VCF | grep -oh ' + elif [ "$VCFTV" -ge "12" ]; then + VCFGTFLAG="--max-missing" + fi +-BWAV=$(bwa 2>&1 | mawk '/Versi/' | sed 's/Version: //g' | sed 's/0.7.//g' | sed 's/-.*//g' | cut -c 1-2) ++BWAV=$(bwa 2>&1 | mawk '/Versi/' | sed 's/Version: //g' | sed 's/0.7.//g' | sed 's/a*-.*//g') + if [ "$BWAV" -lt "13" ]; then + echo "The version of bwa installed in your" '$PATH' "is not optimized for dDocent." + echo "Please install at least version 0.7.13" +@@ -107,13 +111,12 @@ BTC=$( bedtools --version | mawk '{print + exit 1 + fi + +-if ! awk --version | fgrep -v GNU &>/dev/null; then ++if ! awk --version | fgrep GNU &>/dev/null; then + awk=gawk + else + awk=awk + fi + +- + if [ $NUMDEP -gt 0 ]; then + echo -e "\nPlease install all required software before running dDocent again." + exit 1 +@@ -291,9 +294,9 @@ echo "Using BWA to map reads." + for i in "${NAMES[@]}" + do + if [ -f "$i.R2.fq.gz" ]; then +- bwa mem reference.fasta $i.R1.fq.gz $i.R2.fq.gz -L 20,5 -I $INSERT,$SD,$INSERTH,$INSERTL -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log ++ bwa mem -L 20,5 -I $INSERT,$SD,$INSERTH,$INSERTL -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" reference.fasta $i.R1.fq.gz $i.R2.fq.gz 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log + else +- bwa mem reference.fasta $i.R1.fq.gz -L 20,5 -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log ++ bwa mem -L 20,5 -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" reference.fasta $i.R1.fq.gz 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log + fi + samtools sort -@$NUMProc $i.bam -o $i.bam + mv $i.bam $i-RG.bam +@@ -388,10 +391,10 @@ if [ "$SNP" != "no" ]; then + } + export -f call_genos + +- ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | shuf | parallel --env call_genos --memfree $MAXMemory -j $NUMProc --no-notice call_genos {} ++ ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | gshuf | parallel --env call_genos --memfree $MAXMemory -j $NUMProc --no-notice call_genos {} + #### +- #ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | shuf | parallel --memfree $MAXMemory -j $FB1 --no-notice --delay 1 freebayes -L bamlist.list -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10 +- #ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | shuf | parallel --memfree $MAXMemory -j $FB1 --no-notice "samtools view -b -L mapped.{}.bed | freebayes -c -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10" ++ #ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | gshuf | parallel --memfree $MAXMemory -j $FB1 --no-notice --delay 1 freebayes -L bamlist.list -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10 ++ #ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | gshuf | parallel --memfree $MAXMemory -j $FB1 --no-notice "samtools view -b -L mapped.{}.bed | freebayes -c -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10" + + + rm mapped.*.bed +@@ -447,8 +450,8 @@ fi + + #Function for trimming reads using trimmomatic + trim_reads(){ +- TRIMMOMATIC=$(find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null | head -1) +- ADAPTERS=$(find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | head -1) ++ TRIMMOMATIC=%%JAVAJARDIR%%/trimmomatic.jar ++ ADAPTERS=%%PREFIX%%/share/trimmomatic/adapters/TruSeq2-PE.fa + + if [ -f $1.R.fq.gz ]; then + java -Xmx2g -jar $TRIMMOMATIC PE -threads 2 -phred33 $1.F.fq.gz $1.R.fq.gz $1.R1.fq.gz $1.unpairedF.fq.gz $1.R2.fq.gz $1.unpairedR.fq.gz ILLUMINACLIP:$ADAPTERS:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:5:10 $TW &> $1.trim.log +@@ -747,7 +750,14 @@ else + fi + + #Tries to get number of processors, if not asks user +-NUMProc=( `grep -c ^processor /proc/cpuinfo 2> /dev/null` ) ++if [ `uname` = Linux ]; then ++ NUMProc=( `grep -c ^processor /proc/cpuinfo 2> /dev/null` ) ++elif [ `uname` = FreeBSD ]; then ++ NUMProc=( `sysctl -n hw.ncpu` ) ++else ++ printf "Unsupported platform: `uname`\n" ++ exit 1 ++fi + NUMProc=$(($NUMProc + 0)) + + echo "dDocent detects $NUMProc processors available on this system." +@@ -764,7 +774,15 @@ if [ $NUMProc -lt 1 ]; then + fi + + #Tries to get maximum system memory, if not asks user +-MAXMemory=$(($(grep -Po '(?<=^MemTotal:)\s*[0-9]+' /proc/meminfo | tr -d " ") / 1048576))G ++if [ `uname` = Linux ]; then ++ MAXMemory=$(($(grep -Po '(?<=^MemTotal:)\s*[0-9]+' /proc/meminfo | tr -d " ") / 1048576))G ++elif [ `uname` = FreeBSD ]; then ++ MAXMemory=`sysctl -n hw.realmem` ++ MAXMemory=$((MAXMemory / 1073741824))G ++else ++ printf "Unsupported platform: `uname`\n" ++ exit 1 ++fi + + echo "dDocent detects $MAXMemory maximum memory available on this system." + echo "Please enter the maximum memory to use for this analysis. The size can be postfixed with diff --git a/biology/ddocent/pkg-descr b/biology/ddocent/pkg-descr new file mode 100644 index 000000000000..84c4e96e7a14 --- /dev/null +++ b/biology/ddocent/pkg-descr @@ -0,0 +1,7 @@ +dDocent is simple bash wrapper to QC, assemble, map, and call SNPs from almost +any kind of RAD sequencing. If you have a reference already, dDocent can be +used to call SNPs from almost any type of NGS data set. It is designed to run +on Linux based machines with large memory capacity and multiple processing +cores, and it can be modified for use on HPC. + +WWW: http://ddocent.com diff --git a/biology/ddocent/pkg-plist b/biology/ddocent/pkg-plist new file mode 100644 index 000000000000..d6f882481366 --- /dev/null +++ b/biology/ddocent/pkg-plist @@ -0,0 +1,17 @@ +bin/ErrorCount.sh +bin/RefMapOpt.sh +bin/ReferenceOpt.hyb.sh +bin/ReferenceOpt.sh +bin/Rename_SequenceFiles.sh +bin/Rename_for_dDocent.sh +bin/Rename_for_dDocent_with-INDEX.sh +bin/dDocent +bin/dDocent_filters +bin/dDocent_ngs.sh +bin/ddocent-assembly-test +bin/ddocent-assembly-test-cleanup +bin/filter_hwe_by_pop.pl +bin/filter_missing_ind.sh +bin/pop_missing_filter.sh +bin/remake_reference.sh +bin/remove.bad.hap.loci.sh |