#!/usr/bin/env python """ Runs BFAST on single-end or paired-end data. TODO: more documentation TODO: - auto-detect gzip or bz2 - split options (?) - queue lengths (?) - assumes reference always has been indexed - main and secondary indexes - scoring matrix file ? - read group file ? usage: bfast_wrapper.py [options] -r, --ref=r: The reference genome to use or index -f, --fastq=f: The fastq file to use for the mapping -F, --output=u: The file to save the output (SAM format) -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) -p, --params=p: Parameter setting to use (pre_set or full) -n, --numThreads=n: The number of threads to use -A, --space=A: The encoding space (0: base 1: color) -o, --offsets=o: The offsets for 'match' -l, --loadAllIndexes=l: Load all indexes into memory -k, --keySize=k: truncate key size in 'match' -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse) -t, --timing=t: output timing information to stderr -u, --ungapped=u: performed ungapped local alignment -U, --unconstrained=U: performed local alignment without mask constraints -O, --offset=O: the number of bases before and after each hit to consider in local alignment -q, --avgMismatchQuality=q: average mismatch quality -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all) -P, --disallowPairing=P: do not choose alignments based on pairing -R, --reverse=R: paired end reads are given on reverse strands -z, --random=z: output a random best scoring alignment -D, --dbkey=D: Dbkey for reference genome -H, --suppressHeader=H: Suppress the sam header """ import optparse import os import shutil import subprocess import sys import tempfile def stop_err(msg): sys.stderr.write("%s\n" % msg) sys.exit() def __main__(): parser = optparse.OptionParser() parser.add_option("-r", "--ref", dest="ref", help="The reference genome to index and use") parser.add_option("-f", "--fastq", dest="fastq", help="The fastq file to use for the mapping") parser.add_option("-F", "--output", dest="output", help="The file to save the output (SAM format)") parser.add_option( "-A", "--space", dest="space", type="choice", default="0", choices=("0", "1"), help="The encoding space (0: base 1: color)", ) parser.add_option( "-H", "--suppressHeader", action="store_true", dest="suppressHeader", default=False, help="Suppress header" ) parser.add_option( "-n", "--numThreads", dest="numThreads", type="int", default="1", help="The number of threads to use" ) parser.add_option( "-t", "--timing", action="store_true", default=False, dest="timing", help="output timming information to stderr" ) parser.add_option( "-l", "--loadAllIndexes", action="store_true", default=False, dest="loadAllIndexes", help="Load all indexes into memory", ) parser.add_option( "-m", "--indexMask", dest="indexMask", help="String containing info on how to build custom indexes" ) parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help="String containing info on how to build custom indexes", ) parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help="Do not index lower case sequences. Such as those created by RepeatMasker", ) parser.add_option( "--indexContigOptions", dest="indexContigOptions", default="", help="The contig range options to use for the indexing", ) parser.add_option( "--indexExonsFileName", dest="indexExonsFileName", default="", help="The exons file to use for the indexing" ) parser.add_option("-o", "--offsets", dest="offsets", default="", help="The offsets for 'match'") parser.add_option("-k", "--keySize", dest="keySize", type="int", default="-1", help="truncate key size in 'match'") parser.add_option( "-K", "--maxKeyMatches", dest="maxKeyMatches", type="int", default="-1", help="the maximum number of matches to allow before a key is ignored", ) parser.add_option( "-M", "--maxNumMatches", dest="maxNumMatches", type="int", default="-1", help="the maximum number of matches to allow bfore the read is discarded", ) parser.add_option( "-w", "--whichStrand", dest="whichStrand", type="choice", default="0", choices=("0", "1", "2"), help="the strands to consider (0: both 1: forward 2: reverse)", ) parser.add_option( "--scoringMatrixFileName", dest="scoringMatrixFileName", help="Scoring Matrix file used to score the alignments" ) parser.add_option( "-u", "--ungapped", dest="ungapped", action="store_true", default=False, help="performed ungapped local alignment", ) parser.add_option( "-U", "--unconstrained", dest="unconstrained", action="store_true", default=False, help="performed local alignment without mask constraints", ) parser.add_option( "-O", "--offset", dest="offset", type="int", default="0", help="the number of bases before and after each hit to consider in local alignment", ) parser.add_option( "-q", "--avgMismatchQuality", type="int", default="-1", dest="avgMismatchQuality", help="average mismatch quality", ) parser.add_option( "-a", "--algorithm", dest="algorithm", default="0", type="choice", choices=("0", "1", "2", "3", "4"), help="post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all", ) parser.add_option( "--unpaired", dest="unpaired", action="store_true", default=False, help="do not choose alignments based on pairing", ) parser.add_option( "--reverseStrand", dest="reverseStrand", action="store_true", default=False, help="paired end reads are given on reverse strands", ) parser.add_option( "--pairedEndInfer", dest="pairedEndInfer", action="store_true", default=False, help="break ties when one end of a paired end read by estimating the insert size distribution", ) parser.add_option( "--randomBest", dest="randomBest", action="store_true", default=False, help="output a random best scoring alignment", ) options, args = parser.parse_args() # output version # of tool try: tmp = tempfile.NamedTemporaryFile().name tmp_stdout = open(tmp, "wb") proc = subprocess.Popen(args="bfast 2>&1", shell=True, stdout=tmp_stdout) tmp_stdout.close() returncode = proc.wait() stdout = None for line in open(tmp_stdout.name, "rb"): if line.lower().find("version") >= 0: stdout = line.strip() break if stdout: sys.stdout.write("%s\n" % stdout) else: raise Exception except Exception: sys.stdout.write("Could not determine BFAST version\n") buffsize = 1048576 # make temp directory for bfast, requires trailing slash tmp_dir = "%s/" % tempfile.mkdtemp() # 'generic' options used in all bfast commands here if options.timing: all_cmd_options = "-t" else: all_cmd_options = "" try: if options.buildIndex: reference_filepath = tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".fa").name # build bfast indexes os.symlink(options.ref, reference_filepath) # bfast fast2brg try: nuc_space = ["0"] if options.space == "1": # color space localalign appears to require nuc space brg nuc_space.append("1") for space in nuc_space: cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % (reference_filepath, space, all_cmd_options) tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stderr = open(tmp, "wb") proc = subprocess.Popen(args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno()) returncode = proc.wait() tmp_stderr.close() # get stderr, allowing for case where it's very large tmp_stderr = open(tmp, "rb") stderr = "" try: while True: stderr += tmp_stderr.read(buffsize) if not stderr or len(stderr) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception(stderr) except Exception as e: raise Exception("Error in 'bfast fasta2brg'.\n" + str(e)) # bfast index try: all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads, ) if options.indexRepeatMasker: all_index_cmds += " -R" if options.indexContigOptions: index_contig_options = [int(_) for _ in options.indexContigOptions.split(",")] if index_contig_options[0] >= 0: all_index_cmds += ' -s "%s"' % index_contig_options[0] if index_contig_options[1] >= 0: all_index_cmds += ' -S "%s"' % index_contig_options[1] if index_contig_options[2] >= 0: all_index_cmds += ' -e "%s"' % index_contig_options[2] if index_contig_options[3] >= 0: all_index_cmds += ' -E "%s"' % index_contig_options[3] elif options.indexExonsFileName: all_index_cmds += ' -x "%s"' % options.indexExonsFileName index_count = 1 for mask, hash_width in [mask.split(":") for mask in options.indexMask.split(",")]: cmd = '%s -m "%s" -w "%s" -i "%i"' % (all_index_cmds, mask, hash_width, index_count) tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stderr = open(tmp, "wb") proc = subprocess.Popen(args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno()) returncode = proc.wait() tmp_stderr.close() # get stderr, allowing for case where it's very large tmp_stderr = open(tmp, "rb") stderr = "" try: while True: stderr += tmp_stderr.read(buffsize) if not stderr or len(stderr) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception(stderr) index_count += 1 except Exception as e: raise Exception("Error in 'bfast index'.\n" + str(e)) else: reference_filepath = options.ref assert reference_filepath and os.path.exists(reference_filepath), "A valid genome reference was not provided." # set up aligning and generate aligning command options # set up temp output files tmp_bmf = tempfile.NamedTemporaryFile(dir=tmp_dir) tmp_bmf_name = tmp_bmf.name tmp_bmf.close() tmp_baf = tempfile.NamedTemporaryFile(dir=tmp_dir) tmp_baf_name = tmp_baf.name tmp_baf.close() bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options, ) bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options, ) bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options, ) if options.offsets: bfast_match_cmd += ' -o "%s"' % options.offsets if options.keySize >= 0: bfast_match_cmd += ' -k "%s"' % options.keySize if options.maxKeyMatches >= 0: bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches if options.maxNumMatches >= 0: bfast_match_cmd += ' -M "%s"' % options.maxNumMatches bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches if options.scoringMatrixFileName: bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName if options.ungapped: bfast_localalign_cmd += " -u" if options.unconstrained: bfast_localalign_cmd += " -U" if options.avgMismatchQuality >= 0: bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality if options.algorithm == 3: if options.pairedEndInfer: bfast_postprocess_cmd += " -P" if options.randomBest: bfast_postprocess_cmd += " -z" if options.unpaired: bfast_postprocess_cmd += " -U" if options.reverseStrand: bfast_postprocess_cmd += " -R" # instead of using temp files, should we stream through pipes? bfast_match_cmd += " > %s" % tmp_bmf_name bfast_localalign_cmd += " > %s" % tmp_baf_name bfast_postprocess_cmd += " > %s" % options.output # need to nest try-except in try-finally to handle 2.4 try: # bfast 'match' try: tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stderr = open(tmp, "wb") proc = subprocess.Popen(args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno()) returncode = proc.wait() tmp_stderr.close() # get stderr, allowing for case where it's very large tmp_stderr = open(tmp, "rb") stderr = "" try: while True: stderr += tmp_stderr.read(buffsize) if not stderr or len(stderr) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception(stderr) except Exception as e: raise Exception("Error in 'bfast match'. \n" + str(e)) # bfast 'localalign' try: tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stderr = open(tmp, "wb") proc = subprocess.Popen(args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno()) returncode = proc.wait() tmp_stderr.close() # get stderr, allowing for case where it's very large tmp_stderr = open(tmp, "rb") stderr = "" try: while True: stderr += tmp_stderr.read(buffsize) if not stderr or len(stderr) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception(stderr) except Exception as e: raise Exception("Error in 'bfast localalign'. \n" + str(e)) # bfast 'postprocess' try: tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name tmp_stderr = open(tmp, "wb") proc = subprocess.Popen(args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno()) returncode = proc.wait() tmp_stderr.close() # get stderr, allowing for case where it's very large tmp_stderr = open(tmp, "rb") stderr = "" try: while True: stderr += tmp_stderr.read(buffsize) if not stderr or len(stderr) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception(stderr) except Exception as e: raise Exception("Error in 'bfast postprocess'. \n" + str(e)) # remove header if necessary if options.suppressHeader: tmp_out = tempfile.NamedTemporaryFile(dir=tmp_dir) tmp_out_name = tmp_out.name tmp_out.close() try: shutil.move(options.output, tmp_out_name) except Exception as e: raise Exception("Error moving output file before removing headers. \n" + str(e)) fout = open(options.output, "w") for line in open(tmp_out.name): if len(line) < 3 or line[0:3] not in ["@HD", "@SQ", "@RG", "@PG", "@CO"]: fout.write(line) fout.close() # check that there are results in the output file if os.path.getsize(options.output) > 0: if "0" == options.space: sys.stdout.write("BFAST run on Base Space data") else: sys.stdout.write("BFAST run on Color Space data") else: raise Exception( "The output file is empty. You may simply have no matches, or there may be an error with your input file or settings." ) except Exception as e: stop_err("The alignment failed.\n" + str(e)) finally: # clean up temp dir if os.path.exists(tmp_dir): shutil.rmtree(tmp_dir) if __name__ == "__main__": __main__()