#!/usr/bin/env python """ Runs Ben's simulation. usage: %prog [options] -i, --input=i: Input genome (FASTA format) -g, --genome=g: If built-in, the genome being used -l, --read_len=l: Read length -c, --avg_coverage=c: Average coverage -e, --error_rate=e: Error rate (0-1) -n, --num_sims=n: Number of simulations to run -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1) -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1) -p, --output_png=p: Plot output -s, --summary_out=s: Whether or not to output a file with summary of all simulations -m, --output_summary=m: File name for output summary of all simulations -f, --new_file_path=f: Directory for summary output files """ # removed output of all simulation results on request (not working) # -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds) # -o, --output=o: Base name for summary output for each run from __future__ import print_function import itertools import os import random import sys import tempfile from bx.cookbook import doc_optparse from rpy import r def stop_err(msg): sys.stderr.write("%s\n" % msg) sys.exit() def __main__(): # Parse Command Line options, args = doc_optparse.parse(__doc__) # validate parameters error = "" try: read_len = int(options.read_len) if read_len <= 0: raise Exception(" greater than 0") except TypeError as e: error = ": %s" % str(e) if error: stop_err("Make sure your number of reads is an integer value%s" % error) error = "" try: avg_coverage = int(options.avg_coverage) if avg_coverage <= 0: raise Exception(" greater than 0") except Exception as e: error = ": %s" % str(e) if error: stop_err("Make sure your average coverage is an integer value%s" % error) error = "" try: error_rate = float(options.error_rate) if error_rate >= 1.0: error_rate = 10 ** (-error_rate / 10.0) elif error_rate < 0: raise Exception(" between 0 and 1") except Exception as e: error = ": %s" % str(e) if error: stop_err("Make sure the error rate is a decimal value%s or the quality score is at least 1" % error) try: num_sims = int(options.num_sims) except TypeError as e: stop_err("Make sure the number of simulations is an integer value: %s" % str(e)) if options.polymorphism != "None": polymorphisms = [float(p) for p in options.polymorphism.split(",")] else: stop_err("Select at least one polymorphism value to use") if options.detection_thresh != "None": detection_threshes = [float(dt) for dt in options.detection_thresh.split(",")] else: stop_err("Select at least one detection threshold to use") # mutation dictionaries hp_dict = {"A": "G", "G": "A", "C": "T", "T": "C", "N": "N"} # heteroplasmy dictionary mt_dict = {"A": "C", "C": "A", "G": "T", "T": "G", "N": "N"} # misread dictionary # read fasta file to seq string all_lines = open(options.input, "rb").readlines() seq = "" for line in all_lines: line = line.rstrip() if line.startswith(">"): pass else: seq += line.upper() seq_len = len(seq) # output file name template # removed output of all simulation results on request (not working) # if options.sim_results == "true": # out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' ) # else: # out_name_template = tempfile.NamedTemporaryFile().name + '_%s' out_name_template = tempfile.NamedTemporaryFile().name + "_%s" print("out_name_template:", out_name_template) # set up output files outputs = {} i = 1 for p in polymorphisms: outputs[p] = {} for d in detection_threshes: outputs[p][d] = out_name_template % i i += 1 # run sims for polymorphism in polymorphisms: for detection_thresh in detection_threshes: output = open(outputs[polymorphism][detection_thresh], "wb") output.write("FP\tFN\tGENOMESIZE=%s\n" % seq_len) sim_count = 0 while sim_count < num_sims: # randomly pick heteroplasmic base index hbase = random.randrange(seq_len) # hbase = seq_len/2#random.randrange( 0, seq_len ) # create 2D quasispecies list qspec = [[] for _ in range(seq_len)] # simulate read indices and assign to quasispecies i = 0 while i < (avg_coverage * (seq_len / read_len)): # number of reads (approximates coverage) start = random.randrange(seq_len) if random.random() < 0.5: # positive sense read end = start + read_len # assign read end if end > seq_len: # overshooting origin read = itertools.chain(range(start, seq_len), range(0, end - seq_len)) else: # regular read read = range(start, end) else: # negative sense read end = start - read_len # assign read end if end < -1: # overshooting origin read = itertools.chain(range(start, -1, -1), range(seq_len - 1, seq_len + end, -1)) else: # regular read read = range(start, end, -1) # assign read to quasispecies list by index for j in read: if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het ref = hp_dict[seq[j]] else: # ref is the verbatim reference nucleotide (all positions) ref = seq[j] if random.random() < error_rate: # base in read is misread with p = err qspec[j].append(mt_dict[ref]) else: # otherwise we carry ref through to the end qspec[j].append(ref) # last but not least i += 1 bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later for i, nuc in enumerate(seq): cov = len(qspec[i]) bases["A"] = qspec[i].count("A") bases["C"] = qspec[i].count("C") bases["G"] = qspec[i].count("G") bases["T"] = qspec[i].count("T") # calculate max NON-REF deviation del bases[nuc] maxdev = float(max(bases.values())) / cov # deal with non-het sites if i != hbase: if maxdev >= detection_thresh: # greater than detection threshold = false positive fpos += 1 # deal with het sites if i == hbase: hnuc = hp_dict[nuc] # let's recover het variant if ( float(bases[hnuc]) / cov ) < detection_thresh: # less than detection threshold = false negative fneg += 1 del bases[hnuc] # ignore het variant maxdev = float(max(bases.values())) / cov # check other non-ref bases at het site if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible) fpos += 1 # output error sums and genome size to summary file output.write("%d\t%d\n" % (fpos, fneg)) sim_count += 1 # close output up output.close() # Parameters (heteroplasmy, error threshold, colours) r(""" het=c(%s) err=c(%s) grade = (0:32)/32 hues = rev(gray(grade)) """ % (",".join(str(p) for p in polymorphisms), ",".join(str(d) for d in detection_threshes))) # Suppress warnings r("options(warn=-1)") # Create allsum (for FP) and allneg (for FN) objects r("allsum <- data.frame()") for polymorphism in polymorphisms: for detection_thresh in detection_threshes: output = outputs[polymorphism][detection_thresh] cmd = """ ngsum = read.delim('%s', header=T) ngsum$fprate <- ngsum$FP/%s ngsum$hetcol <- %s ngsum$errcol <- %s allsum <- rbind(allsum, ngsum) """ % ( output, seq_len, polymorphism, detection_thresh, ) r(cmd) if os.path.getsize(output) == 0: for p in outputs.keys(): for d in outputs[p].keys(): sys.stderr.write(outputs[p][d] + " " + str(os.path.getsize(outputs[p][d])) + "\n") if options.summary_out == "true": r('write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary) # Summary objects (these could be printed) r(""" tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean) tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean) cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t') write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t') cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t') write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t') """ % tuple([options.output_summary] * 4)) # Setup graphs r(""" png('%s', width=800, height=500, units='px', res=250) layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10)) """ % options.output_png) # Main title genome = "" if options.genome: genome = "%s: " % options.genome r(""" par(mar=c(0,0,0,0)) plot(1, type='n', axes=F, xlab='', ylab='') text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7) """ % (genome, options.num_sims, options.avg_coverage)) # False positive boxplot r(""" par(mar=c(5,4,2,2), las=1, cex=0.35) boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85) title(main='False Positives', xlab='false positive rate', ylab='') """) # False negative heatmap (note zlim command!) num_polys = len(polymorphisms) num_dets = len(detection_threshes) r(""" par(mar=c(5,4,2,1), las=1, cex=0.35) image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1) axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i') axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85) title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold') """ % (num_polys, num_dets, num_polys, num_dets)) # Scale alongside r(""" par(mar=c(2,2,2,3), las=1) image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85) title(main='Key', cex=0.35) mtext('false negative rate', side=1, cex=0.35) """) # Close graphics r(""" layout(1) dev.off() """) if __name__ == "__main__": __main__()