# Filename: uniq.py
# Author: Ian N. Schenck
# Version: 19/12/2005
#
# This script accepts an input file, an output file, a column
# delimiter, and a list of columns.  The script then grabs unique
# lines based on the columns, and returns those records with a count
# of occurrences of each unique column (ignoring trailing spaces),
# inserted before the columns.
#
# This executes the command pipeline:
#     cut -f $fields | sort | uniq -c
#
# -i Input file
# -o Output file
# -d Delimiter
# -c Column list (comma separated)
# -s Sorting: value (default), largest, or smallest

from __future__ import print_function

import re
import subprocess
import sys

# Maps the -d option codes to the character handed to `cut -d`.
# "T" (tab) is deliberately absent: tab is cut's default delimiter, so no
# -d flag is emitted for it (nor for any unrecognised code).
_DELIMITERS = {
    "C": ",",
    "D": "-",
    "U": "_",
    "P": "|",
    "Dt": ".",
    "Sp": " ",
}

_SORTING_CHOICES = ("value", "largest", "smallest")

# Both patterns guard text that is pasted into a shell command line, so they
# must cover the WHOLE string.  \Z (unlike $) also rejects a trailing newline.
_FILE_RE = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
# Digit groups separated by single commas, optional trailing comma.
_COLUMNS_RE = re.compile(r"^[0-9]+(?:,[0-9]+)*,?\Z")


# This function is exceedingly useful, perhaps package for reuse?
def getopts(argv):
    """Collect ``-flag value`` pairs from an argument list.

    Every argument starting with "-" is taken as an option name and the
    argument following it as its value; any other argument is skipped.
    When an option is repeated, the last value wins.

    Args:
        argv: List of command-line arguments (without the program name).

    Returns:
        Dict mapping each option (including its leading "-") to its value.

    Raises:
        IndexError: If the final argument is an option with no value.
    """
    opts = {}
    while argv:
        # startswith() rather than argv[0][0]: an empty-string argument
        # must be skipped, not raise IndexError.
        if argv[0].startswith("-"):
            opts[argv[0]] = argv[1]
            argv = argv[2:]
        else:
            argv = argv[1:]
    return opts


def _print_usage():
    """Print the command-line usage summary."""
    print("Usage:")
    print(" -i Input file")
    print(" -o Output file")
    print(" -c Column list (comma seperated)")
    print(" -d Delimiter:")
    print(" T Tab")
    print(" C Comma")
    print(" D Dash")
    print(" U Underscore")
    print(" P Pipe")
    print(" Dt Dot")
    print(" Sp Space")
    print(" -s Sorting: value (default), largest, or smallest")


def _build_command(inputfile, outputfile, delim, columns, sorting):
    """Assemble the cut | sed | sort | uniq shell pipeline as one string.

    All arguments must already have been validated; they are inserted
    into the command line verbatim.
    """
    commandline = "cut "

    # Set delimiter (no flag means cut's default, which is tab).
    delim_char = _DELIMITERS.get(delim)
    if delim_char is not None:
        commandline += '-d "%s" ' % delim_char

    # Set columns.
    commandline += "-f " + columns

    # We want to remove *trailing* spaces from each field, so look for
    # spaces then tab (for first and middle selected columns) and replace
    # with just tab -- the g flag is needed so that every such column is
    # handled, not only the first -- and remove any spaces at the end of
    # the line (for the final selected column).
    # NOTE(review): only tab-separated fields are trimmed here; fields cut
    # with another delimiter keep their inner trailing spaces.
    commandline += " " + inputfile + r" | sed 's/\ *\t/\t/g' | sed 's/\ *$//'"
    commandline += " | sort | uniq -c"

    # uniq -c puts counts at the start, so we can sort lines numerically.
    if sorting == "largest":
        commandline += " | sort -n -r"
    elif sorting == "smallest":
        commandline += " | sort -n"

    # uniq -c produces lines with leading spaces, use sed to remove them.
    # uniq -c puts a space between the count and the field, want a tab:
    # replace just the first space, using sed with 1 as the index.
    commandline += r" | sed 's/^\ *//' | sed 's/ /\t/1' > " + outputfile
    return commandline


def main(argv=None):
    """Count unique values of the selected columns of a delimited file.

    Args:
        argv: Optional list of arguments (without the program name).
            Defaults to ``sys.argv[1:]``.

    Returns:
        0 after printing usage (an option was given without a value);
        -1 no output file; -2 no input file; -3 no delimiter;
        -4 missing or illegal column list; -5 unknown sorting option or
        illegal output file name; -6 illegal input file name; otherwise
        the return code of the shell pipeline.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    try:
        opts = getopts(args)
    except IndexError:
        _print_usage()
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == "None":
        print("Columns not specified.")
        return -4

    sorting = opts.get("-s")
    if sorting is None:
        sorting = "value"
    if sorting not in _SORTING_CHOICES:
        print("Unknown sorting option %r" % sorting)
        return -5

    # All inputs have been specified at this point, now validate.
    # The values below end up in a shell command line, so anything that
    # is not fully matched by its whitelist pattern is rejected.
    if not _COLUMNS_RE.match(columns):
        print("Illegal column specification.")
        return -4
    if not _FILE_RE.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not _FILE_RE.match(inputfile):
        print("Illegal input filename.")
        return -6

    column_list = columns.split(",")
    columns_for_display = "c" + ", c".join(column_list)

    commandline = _build_command(inputfile, outputfile, delim, columns, sorting)
    errorcode = subprocess.call(commandline, shell=True)
    print("Count of unique values in " + columns_for_display)
    return errorcode


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    sys.exit(main())