import os
import csv
import sys
import argparse

# Usage: compare.py -f1 file1.csv -f2 file2.csv -k key1,key2
#
# Compares two CSV files record by record. Records are paired through an
# "alignment key" built from the columns named with -k; paired records are
# then compared field by field (positionally).


class InputFileError(ValueError):
    """Raised when an input CSV cannot be used for the comparison."""


def _open_csv(path):
    """Open *path* in the mode the csv module expects on the running Python.

    Python 2's csv module wants a binary file; Python 3's wants a text file
    opened with ``newline=''`` so that quoted embedded newlines survive.
    """
    if sys.version_info[0] < 3:
        return open(path, 'rb')
    return open(path, 'r', newline='')


def _read_header_index(reader, path):
    """Consume the header row of *reader* and return ``{column name: position}``.

    Raises:
        InputFileError: if the file has no header row at all.
    """
    header = next(reader, None)
    if header is None:
        raise InputFileError("%s is empty: a header row is required" % path)
    # For a duplicated column name the last occurrence wins.
    return {name: position for position, name in enumerate(header)}


def build_alignment_key(row, header_index, key_columns):
    """Return the alignment key of *row*.

    The key is the upper-cased values of *key_columns*, in the given order,
    joined with ``|``. Empty values are skipped entirely (no separator is
    emitted for them), so rows that differ only in *which* key column is
    empty produce the same key.

    Args:
        row: list of field values for one record.
        header_index: mapping of column name to position within *row*.
        key_columns: names of the columns that make up the key.
    """
    values = (str(row[header_index[column]]).upper() for column in key_columns)
    return "|".join(value for value in values if value)


def compare_files(path1, path2, key_columns):
    """Compare the records of two CSV files and print per-record results.

    Every record of *path1* is loaded into memory keyed by its alignment key
    (a later record with the same key replaces an earlier one); *path2* is
    then streamed against it. Key columns are located through each file's own
    header, so the two files may order their columns differently; a key column
    absent from file2's header falls back to its position in file1. Paired
    rows are still compared positionally. Blank lines are ignored.

    Args:
        path1: path of the first CSV file.
        path2: path of the second CSV file.
        key_columns: list of column names forming the alignment key.

    Returns:
        A tuple ``(matched, mismatched, missed_in_file1, missed_in_file2)``:
        the number of identical records, the keys whose records differ, the
        keys present only in file2, and the keys present only in file1.

    Raises:
        InputFileError: if either file is empty or file1's header lacks a
            key column.
    """
    file1_rows = {}
    with _open_csv(path1) as handle1:
        reader1 = csv.reader(handle1, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
        index1 = _read_header_index(reader1, path1)
        missing = [column for column in key_columns if column not in index1]
        if missing:
            raise InputFileError("key column(s) %s not found in header of %s"
                                 % (", ".join(missing), path1))
        for row1 in reader1:
            if row1:  # csv.reader yields [] for a blank line
                key = build_alignment_key(row1, index1, key_columns)
                file1_rows[key] = row1

    matched = 0
    mismatched = []
    missed_in_file1 = []
    with _open_csv(path2) as handle2:
        reader2 = csv.reader(handle2, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
        # Prefer file2's own column positions; names that only exist in
        # file1's header keep file1's position (the historical behaviour).
        index2 = dict(index1)
        index2.update(_read_header_index(reader2, path2))
        for row2 in reader2:
            if not row2:
                continue
            key = build_alignment_key(row2, index2, key_columns)
            # pop() also removes the entry, so whatever is left afterwards
            # is exactly the set of file1 records never seen in file2.
            row1 = file1_rows.pop(key, None)
            if row1 is None:
                print(row2)
                missed_in_file1.append(key)
            elif row2 == row1:
                print("%s matched" % key)
                matched += 1
            else:
                print(row2)
                print(row1)
                mismatched.append(key)

    missed_in_file2 = list(file1_rows)
    return matched, mismatched, missed_in_file1, missed_in_file2


def main(argv=None):
    """Parse the command line, run the comparison and print the summary.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='compare two files to check if they contain same records')
    parser.add_argument('-f1', '--file1', required=True)
    parser.add_argument('-f2', '--file2', required=True)
    parser.add_argument('-k', '--key', required=True,
                        help='used to get the alignment key')
    args = parser.parse_args(argv)

    try:
        matched, mismatched, missed_in_file1, missed_in_file2 = compare_files(
            args.file1, args.file2, args.key.split(','))
    except InputFileError as err:
        sys.exit("error: %s" % err)

    print("matched = %s" % (matched))
    print("Mismatched trades[%d]:\n%s" % (len(mismatched), mismatched))
    print("Missed in [%s] trades[%d]:\n%s"
          % (args.file1, len(missed_in_file1), missed_in_file1))
    print("Missed in [%s] trades[%d]:\n%s"
          % (args.file2, len(missed_in_file2), missed_in_file2))


if __name__ == "__main__":
    main()