#!/usr/bin/env python # Guruprasad Ananda # MAQ mapper for SOLiD colourspace-reads from __future__ import print_function import os import subprocess import sys import tempfile def stop_err(msg): sys.stderr.write("%s\n" % msg) sys.exit() def __main__(): out_fname = sys.argv[1].strip() out_f2 = open(sys.argv[2].strip(), "r+") ref_fname = sys.argv[3].strip() f3_read_fname = sys.argv[4].strip() f3_qual_fname = sys.argv[5].strip() paired = sys.argv[6] if paired == "yes": r3_read_fname = sys.argv[7].strip() r3_qual_fname = sys.argv[8].strip() min_mapqual = int(sys.argv[9].strip()) max_mismatch = int(sys.argv[10].strip()) out_f3name = sys.argv[11].strip() subprocess_dict = {} ref_csfa = tempfile.NamedTemporaryFile() ref_bfa = tempfile.NamedTemporaryFile() ref_csbfa = tempfile.NamedTemporaryFile() cmd2_1 = "maq fasta2csfa %s > %s 2>&1" % (ref_fname, ref_csfa.name) cmd2_2 = "maq fasta2bfa %s %s 2>&1" % (ref_csfa.name, ref_csbfa.name) cmd2_3 = "maq fasta2bfa %s %s 2>&1" % (ref_fname, ref_bfa.name) try: os.system(cmd2_1) os.system(cmd2_2) os.system(cmd2_3) except Exception as erf: stop_err(str(erf) + "Error processing reference sequence") if paired == "yes": # paired end reads tmpf = tempfile.NamedTemporaryFile() # forward reads tmpr = tempfile.NamedTemporaryFile() # reverse reads tmps = tempfile.NamedTemporaryFile() # single reads tmpffastq = tempfile.NamedTemporaryFile() tmprfastq = tempfile.NamedTemporaryFile() tmpsfastq = tempfile.NamedTemporaryFile() cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" % ( tmpf.name, tmpr.name, tmps.name, f3_read_fname, f3_qual_fname, r3_read_fname, r3_qual_fname, ) try: os.system(cmd1) os.system("gunzip -c %s >> %s" % (tmpf.name, tmpffastq.name)) os.system("gunzip -c %s >> %s" % (tmpr.name, tmprfastq.name)) os.system("gunzip -c %s >> %s" % (tmps.name, tmpsfastq.name)) except Exception as eq: stop_err("Error converting data to fastq format." + str(eq)) # Make a temp directory where the split fastq files will be stored try: split_dir = tempfile.mkdtemp() split_file_prefix_f = tempfile.mktemp(dir=split_dir) split_file_prefix_r = tempfile.mktemp(dir=split_dir) splitcmd_f = "split -a 2 -l %d %s %s" % ( 32000000, tmpffastq.name, split_file_prefix_f, ) # 32M lines correspond to 8M reads splitcmd_r = "split -a 2 -l %d %s %s" % ( 32000000, tmprfastq.name, split_file_prefix_r, ) # 32M lines correspond to 8M reads os.system(splitcmd_f) os.system(splitcmd_r) os.chdir(split_dir) ii = 0 for fastq in os.listdir(split_dir): if not fastq.startswith(split_file_prefix_f.split("/")[-1]): continue fastq_r = ( split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] ) # find the reverse strand fastq corresponding to forward strand fastq tmpbfq_f = tempfile.NamedTemporaryFile() tmpbfq_r = tempfile.NamedTemporaryFile() cmd3 = ( "maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt" % ( fastq, tmpbfq_f.name, fastq_r, tmpbfq_r.name, fastq, ref_csbfa.name, tmpbfq_f.name, tmpbfq_r.name, fastq, fastq, ) ) subprocess_dict["sp" + str(ii + 1)] = subprocess.Popen([cmd3], shell=True, stdout=subprocess.PIPE) ii += 1 while True: all_done = True for j in range(len(subprocess_dict)): if subprocess_dict["sp" + str(j + 1)].wait() != 0: err = subprocess_dict["sp" + str(j + 1)].communicate()[1] if err is not None: stop_err("Mapping error: %s" % err) all_done = False if all_done: break cmdout = "for map in *.txt; do cat $map >> %s; done" % (out_fname) os.system(cmdout) tmpcsmap = tempfile.NamedTemporaryFile() cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" % (tmpcsmap.name) os.system(cmd_cat_csmap) tmppileup = tempfile.NamedTemporaryFile() cmdpileup = "maq pileup -m %s -q %s %s %s > %s" % ( max_mismatch, min_mapqual, ref_bfa.name, tmpcsmap.name, tmppileup.name, ) os.system(cmdpileup) tmppileup.seek(0) print("#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count", file=out_f2) for line in open(tmppileup.name): elems = line.strip().split() ref_nt = elems[2].capitalize() read_nt = elems[4] coverage = int(elems[3]) a, t, g, c = 0, 0, 0, 0 ref_nt_count = 0 for ch in read_nt: ch = ch.capitalize() if ch not in ["A", "T", "G", "C", ",", "."]: continue if ch in [",", "."]: ch = ref_nt ref_nt_count += 1 try: nt_ind = ["A", "T", "G", "C"].index(ch) if nt_ind == 0: a += 1 elif nt_ind == 1: t += 1 elif nt_ind == 2: g += 1 else: c += 1 except ValueError as we: print(we, file=sys.stderr) print( "%s\t%s\t%s\t%s\t%s\t%s" % ("\t".join(elems[:4]), coverage - ref_nt_count, a, t, g, c), file=out_f2 ) except Exception as er2: stop_err("Encountered error while mapping: %s" % (str(er2))) else: # single end reads tmpf = tempfile.NamedTemporaryFile() tmpfastq = tempfile.NamedTemporaryFile() cmd1 = "solid2fastq_modified.pl 'no' %s %s %s %s %s %s %s 2>&1" % ( tmpf.name, None, None, f3_read_fname, f3_qual_fname, None, None, ) try: os.system(cmd1) os.system("gunzip -c %s >> %s" % (tmpf.name, tmpfastq.name)) tmpf.close() except Exception: stop_err("Error converting data to fastq format.") # Make a temp directory where the split fastq files will be stored try: split_dir = tempfile.mkdtemp() split_file_prefix = tempfile.mktemp(dir=split_dir) splitcmd = "split -a 2 -l %d %s %s" % ( 32000000, tmpfastq.name, split_file_prefix, ) # 32M lines correspond to 8M reads os.system(splitcmd) os.chdir(split_dir) for i, fastq in enumerate(os.listdir(split_dir)): tmpbfq = tempfile.NamedTemporaryFile() cmd3 = ( "maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt" % (fastq, tmpbfq.name, fastq, ref_csbfa.name, tmpbfq.name, fastq, fastq) ) subprocess_dict["sp" + str(i + 1)] = subprocess.Popen([cmd3], shell=True, stdout=subprocess.PIPE) while True: all_done = True for j in range(len(subprocess_dict)): if subprocess_dict["sp" + str(j + 1)].wait() != 0: err = subprocess_dict["sp" + str(j + 1)].communicate()[1] if err is not None: stop_err("Mapping error: %s" % err) all_done = False if all_done: break cmdout = "for map in *.txt; do cat $map >> %s; done" % (out_fname) os.system(cmdout) tmpcsmap = tempfile.NamedTemporaryFile() cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" % (tmpcsmap.name) os.system(cmd_cat_csmap) tmppileup = tempfile.NamedTemporaryFile() cmdpileup = "maq pileup -m %s -q %s %s %s > %s" % ( max_mismatch, min_mapqual, ref_bfa.name, tmpcsmap.name, tmppileup.name, ) os.system(cmdpileup) tmppileup.seek(0) print("#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count", file=out_f2) for line in open(tmppileup.name): elems = line.strip().split() ref_nt = elems[2].capitalize() read_nt = elems[4] coverage = int(elems[3]) a, t, g, c = 0, 0, 0, 0 ref_nt_count = 0 for ch in read_nt: ch = ch.capitalize() if ch not in ["A", "T", "G", "C", ",", "."]: continue if ch in [",", "."]: ch = ref_nt ref_nt_count += 1 try: nt_ind = ["A", "T", "G", "C"].index(ch) if nt_ind == 0: a += 1 elif nt_ind == 1: t += 1 elif nt_ind == 2: g += 1 else: c += 1 except Exception: pass print( "%s\t%s\t%s\t%s\t%s\t%s" % ("\t".join(elems[:4]), coverage - ref_nt_count, a, t, g, c), file=out_f2 ) except Exception as er2: stop_err("Encountered error while mapping: %s" % (str(er2))) # Build custom track from pileup chr_list = [] out_f2.seek(0) fcov = tempfile.NamedTemporaryFile() fout_a = tempfile.NamedTemporaryFile() fout_t = tempfile.NamedTemporaryFile() fout_g = tempfile.NamedTemporaryFile() fout_c = tempfile.NamedTemporaryFile() fcov.write( """track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n""" ) fout_a.write( """track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n""" ) fout_t.write( """track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n""" ) fout_g.write( """track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n""" ) fout_c.write( """track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n""" ) for line in out_f2: if line.startswith("#"): continue elems = line.split() chr = elems[0] if chr not in chr_list: chr_list.append(chr) if not (chr.startswith("chr") or chr.startswith("scaffold")): chr = "chr" header = "variableStep chrom=%s" % (chr) fcov.write("%s\n" % (header)) fout_a.write("%s\n" % (header)) fout_t.write("%s\n" % (header)) fout_g.write("%s\n" % (header)) fout_c.write("%s\n" % (header)) try: pos = int(elems[1]) cov = int(elems[3]) a = int(elems[5]) t = int(elems[6]) g = int(elems[7]) c = int(elems[8]) except Exception: continue fcov.write("%s\t%s\n" % (pos, cov)) try: a_freq = a * 100.0 / cov t_freq = t * 100.0 / cov g_freq = g * 100.0 / cov c_freq = c * 100.0 / cov except ZeroDivisionError: a_freq = t_freq = g_freq = c_freq = 0 fout_a.write("%s\t%s\n" % (pos, a_freq)) fout_t.write("%s\t%s\n" % (pos, t_freq)) fout_g.write("%s\t%s\n" % (pos, g_freq)) fout_c.write("%s\t%s\n" % (pos, c_freq)) fcov.seek(0) fout_a.seek(0) fout_g.seek(0) fout_t.seek(0) fout_c.seek(0) os.system( "cat %s %s %s %s %s | cat > %s" % (fcov.name, fout_a.name, fout_t.name, fout_g.name, fout_c.name, out_f3name) ) if __name__ == "__main__": __main__()