#!/usr/bin/env python
#
# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program.  If not, see
# <http://www.gnu.org/licenses/>.

"""Search two directory trees for duplicate files.

The command line script can optionally remove duplicates from the
second tree (``dir_b``).
"""

import os
import os.path
from hashlib import sha1
import sys


def hash_file(filename):
    """Return the hexadecimal SHA-1 digest of the contents of `filename`.

    A progress line naming the file is written to stderr before the
    file is read.

    The file is opened in binary mode so the digest is computed over
    the raw bytes (no newline translation or text decoding), and it is
    read in fixed-size chunks so large files need not fit in memory.
    The file handle is closed before returning.

    Raises whatever `open` / `read` raise (e.g. IOError/OSError) if the
    file cannot be read.
    """
    sys.stderr.write('    hashing %s\n' % (filename,))
    digest = sha1()
    with open(filename, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (EOF).
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()

def _walk_files(root):
    """Yield the path of every file that os.walk finds beneath `root`."""
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            yield os.path.join(dirpath, filename)


def duplicates(dir_a, dir_b):
    """Find files with identical content inside dir_a and across both trees.

    Every file under `dir_a` and then every file under `dir_b` is
    hashed with `hash_file`.

    Returns a tuple ``(aa_duplicates, ab_duplicates)`` of dictionaries
    keyed by content hash:

    * ``aa_duplicates[h]`` lists the paths in `dir_a` sharing hash
      ``h`` (only hashes seen more than once in `dir_a` appear).
    * ``ab_duplicates[h]`` starts with the first `dir_a` path that has
      hash ``h``, followed by every `dir_b` path with the same hash.

    A `dir_b` path that resolves to the very file recorded for `dir_a`
    (which happens when the two trees overlap or are the same
    directory) is not reported, so a file is never listed as a
    duplicate of itself.
    """
    hashes = {}         # first occurrence of each hash in dir_a
    aa_duplicates = {}  # hash found multiple times in dir_a
    ab_duplicates = {}  # hash found in dir_a and 1+ times in dir_b
    for path in _walk_files(dir_a):
        h = hash_file(path)
        if h not in hashes:
            hashes[h] = path
            continue
        aa_duplicates.setdefault(h, [hashes[h]]).append(path)
    for path in _walk_files(dir_b):
        h = hash_file(path)
        if h not in hashes:
            continue
        first = hashes[h]
        if os.path.realpath(path) == os.path.realpath(first):
            # Overlapping trees: this is the dir_a file itself, not a
            # copy.  Reporting it would let callers delete the only copy.
            continue
        ab_duplicates.setdefault(h, [first]).append(path)
    return (aa_duplicates, ab_duplicates)


if __name__ == '__main__':
    # Command line entry point: report duplicate files found by
    # duplicates() and, with --remove, delete the dir_b copies.
    from optparse import OptionParser

    p = OptionParser(usage='%prog [options] dir_a dir_b')
    p.add_option('-r', '--remove', action='store_true',
                 help='remove duplicates from dir_b')
    p.add_option('--one-line', action='store_true',
                 help=('print tab-delimited duplicates on a single line '
                       '(for easier post-processing)'))

    options, arguments = p.parse_args()

    # Exactly two positional arguments are required.  Report misuse
    # through the parser (usage message, exit status 2) instead of an
    # unpacking traceback.
    if len(arguments) != 2:
        p.error('expected exactly two arguments (dir_a dir_b), got %d'
                % len(arguments))
    dir_a, dir_b = arguments

    aa_duplicates, ab_duplicates = duplicates(dir_a, dir_b)

    # Groups within dir_a are listed first, then groups spanning both trees.
    path_groups = list(aa_duplicates.values())
    path_groups.extend(ab_duplicates.values())
    for path_group in path_groups:
        if options.one_line:
            print('\t'.join(path_group))
        else:
            print(path_group[0])
            for dup in path_group[1:]:
                print('   ' + dup)
    if options.remove:
        print('')
        for path_group in ab_duplicates.values():
            # path_group[0] is the dir_a file, which is kept; the
            # remaining entries are the dir_b copies to delete.
            print('removing duplicates of ' + path_group[0])
            for dup in path_group[1:]:
                print('   ' + dup)
                os.remove(dup)
