#!/usr/bin/env python
# Dan Blankenberg
"""Print tab-separated ORG/CHR/DATA records for a tree of ``.info`` files.

Usage: python <this script> [base_dir]

``base_dir`` defaults to ``<current working directory>/bacteria``. The tree is
walked recursively; every ``*.info`` file is read as ``key=value`` lines and
the resulting records are written to stdout.
"""
import os
import sys

assert sys.version_info[:2] >= (2, 6)

# Prefix that the "refseq" value of a chromosome is appended to in CHR records.
_NCBI_VIEWER_URL = "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val="
# Features that always get a DATA record (whether or not the .bed file exists).
_ANNOTATION_FEATURES = ("CDS", "tRNA", "rRNA")
# Features that get a DATA record only when their .bed file exists on disk.
_OPTIONAL_FEATURES = ("GeneMark", "GeneMarkHMM", "Glimmer3")


def _parse_info_file(path):
    """Return the ``key=value`` lines of *path* as a dict of strings.

    Only the first ``=`` on a line separates key from value, so values may
    themselves contain ``=``. A line without ``=`` maps its text to ``""``.
    Empty lines are ignored.
    """
    info = {}
    # Context manager so the handle is closed even if reading fails.
    with open(path) as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if not line:
                continue
            key, _sep, value = line.partition("=")
            info[key] = value
    return info


def _collect_organisms(base_dir):
    """Walk *base_dir* and group the contents of all ``.info`` files.

    Returns a dict mapping an organism key to a dict holding:

    * ``"chrs"``: chromosome name -> parsed chromosome info dict,
    * ``"base_dir"``: directory of the first info file seen for that key,
    * every key/value of the organism-level info file, if one was found.

    A file containing ``genome project id`` is organism-level and is keyed by
    its ``build`` value when present, otherwise by the project id. Any other
    file is treated as a chromosome and filed under its ``organism`` value.
    Files that fit neither shape are skipped with a warning on stderr.
    """
    organisms = {}
    for this_base_dir, _sub_dirs, file_names in os.walk(base_dir):
        for file_name in file_names:
            if not file_name.endswith(".info"):
                continue
            path = os.path.join(this_base_dir, file_name)
            info = _parse_info_file(path)
            if "genome project id" in info:
                key = info.get("build", info["genome project id"])
                entry = organisms.setdefault(key, {"chrs": {}, "base_dir": this_base_dir})
                entry.update(info)
            elif "organism" in info and "chromosome" in info:
                entry = organisms.setdefault(info["organism"], {"chrs": {}, "base_dir": this_base_dir})
                entry["chrs"][info["chromosome"]] = info
            else:
                # Empty or truncated file: report it and keep going rather
                # than aborting the whole run with an anonymous KeyError.
                print(
                    f"WARNING: skipping {path}: no 'genome project id' and no 'organism'/'chromosome' keys",
                    file=sys.stderr,
                )
    return organisms


def _emit(*fields):
    """Write one tab-separated record to stdout."""
    print("\t".join(fields))


def _emit_data(build, chromosome, tag, feature, file_format, path):
    """Write one DATA record; *tag* is the suffix of the record identifier."""
    _emit("DATA", f"{build}_{chromosome}_{tag}", build, chromosome, feature, file_format, path)


def _emit_organism(org):
    """Write the ORG record of *org* followed by its CHR and DATA records.

    Raises KeyError if the organism or one of its chromosomes lacks a field
    that is written out (e.g. ``name``, ``kingdom``, ``refseq``).
    """
    # if no gpi, then must be a ncbi chr which corresponds to a UCSC org, w/o matching UCSC designation
    if "genome project id" not in org:
        return
    has_build = "build" in org
    build = org["build"] if has_build else org["genome project id"]
    _emit(
        "ORG",
        build,
        org["name"],
        org["kingdom"],
        org["group"],
        org["chromosomes"],
        org["info url"],
        "UCSC" if has_build else "None",
    )

    base_dir = org["base_dir"]
    for chrom in org["chrs"].values():
        chrom_name = chrom["chromosome"]
        _emit(
            "CHR",
            build,
            chrom_name,
            chrom["name"],
            chrom["length"],
            chrom["gi"],
            chrom["gb"],
            _NCBI_VIEWER_URL + chrom["refseq"],
        )
        for feature in _ANNOTATION_FEATURES:
            bed_path = os.path.join(base_dir, f"{chrom_name}.{feature}.bed")
            _emit_data(build, chrom_name, feature, feature, "bed", bed_path)
        # FASTA
        _emit_data(build, chrom_name, "seq", "sequence", "fasta", os.path.join(base_dir, f"{chrom_name}.fna"))
        # GeneMark / GeneMarkHMM / Glimmer3: listed only when the file exists
        for feature in _OPTIONAL_FEATURES:
            bed_path = os.path.join(base_dir, f"{chrom_name}.{feature}.bed")
            if os.path.exists(bed_path):
                _emit_data(build, chrom_name, feature, feature, "bed", bed_path)


def __main__():
    """Script entry point: read ``sys.argv[1]`` (optional) and print all records."""
    if len(sys.argv) > 1:
        base_dir = sys.argv[1]
    else:
        base_dir = os.path.join(os.getcwd(), "bacteria")
    for org in _collect_organisms(base_dir).values():
        _emit_organism(org)


if __name__ == "__main__":
    __main__()