Review or Delete Duplicate Files with Python.

My music and photos were getting out of hand. Thought I would share.

'''
Author: C. Nichols, mohawke@gmail.com
Release: Dec. 2018

Simple script to review or remove duplicate files in a folder.
** Does not delete unless specified.

TO RUN: See config:
    * Set dir        - path to check.
    * Set do_deletes - delete or review log mode.
    * Set log_name   - Log is unique to each run with datetime.
    * Run in a terminal: cd to the folder you saved the script to and type:
        python3 dupe_killer.py
        OR
        python dupe_killer.py
'''
import os
import datetime
import hashlib

def chunk_reader(fobj, chunk_size=2048):
    """Yield successive pieces read from fobj until a read comes back empty.

    fobj       - any object with a read(size) method.
    chunk_size - size passed to each read() call.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)

def get_hash(_file, chunk_size=2048, hash=hashlib.sha512):
    """Return the raw digest of the complete contents of a file.

    The file is opened in binary mode and read chunk_size bytes at a time,
    so only one chunk is held in memory at once.  Every chunk is fed to the
    hash object: digesting just the first chunk would give the same result
    for different files that merely start with the same chunk_size bytes.

    _file      - path of the file to digest.
    chunk_size - number of bytes requested per read.
    hash       - zero-argument callable returning an object with update()
                 and digest() methods; defaults to hashlib.sha512.

    Returns whatever hashobj.digest() returns (bytes for hashlib hashes).
    Errors raised by open() or read() propagate to the caller.
    """
    hashobj = hash()
    with open(_file, 'rb') as file_object:
        while True:
            chunk = file_object.read(chunk_size)
            if not chunk:
                # An empty read means end of file.
                break
            hashobj.update(chunk)
    return hashobj.digest()

def get_ctime(_file):
    """Return the st_ctime timestamp that os.stat reports for _file.

    NOTE: the meaning of st_ctime is platform-dependent (see the os.stat
    documentation); the log written by this script labels it 'Created'.
    """
    info = os.stat(_file)
    return info.st_ctime

def get_size(_file):
    """Return the size of _file in bytes, as reported by os.stat."""
    info = os.stat(_file)
    return info.st_size

def collect_files(path, delete=False):
    """Walk path and yield one CSV row per regular file, flagging duplicates.

    Two files are treated as duplicates when they share both the digest
    returned by get_hash and the size returned by get_size.  The first file
    met with a given (digest, size) pair is reported as 'Unique'; every
    later one is reported as 'Removed' and, only when delete is true, is
    deleted with os.remove.  The 'Removed' label is also used when delete
    is false, where it marks what a deleting run would remove.

    Symbolic links are skipped entirely.

    path   - root folder; sub-folders are visited through os.walk.
    delete - remove duplicates from disk when true, only report otherwise.

    Yields newline-terminated strings laid out as
        Status,"file name","created",size,"folder"
    with any double quote inside the name or folder doubled.

    This is a generator: nothing is scanned or deleted until it is iterated.
    Errors from the file-system calls propagate to the caller.
    """
    seen = set()
    for root, _dirs, files in os.walk(path):
        # Double embedded quotes so a name containing '"' cannot break
        # out of its quoted CSV field.
        quoted_root = root.replace('"', '""')

        for _file in files:
            file_path = os.path.join(root, _file)

            # os.path.isfile follows symbolic links.  If a link were walked
            # before its target, the target would be deleted as the
            # "duplicate", leaving a dangling link and no data.
            if os.path.islink(file_path) or not os.path.isfile(file_path):
                continue

            _sz = get_size(file_path)
            _ct = get_ctime(file_path)
            # A tuple keeps digest and size as separate values instead of
            # depending on the text form of a bytes digest.
            key = (get_hash(file_path), _sz)

            if key in seen:
                status = 'Removed'
                if delete:
                    os.remove(file_path)
            else:
                seen.add(key)
                status = 'Unique'

            yield '%s,"%s","%s",%s,"%s"\n' % (
                status,
                _file.replace('"', '""'),
                datetime.datetime.fromtimestamp(_ct),
                _sz,
                quoted_root,
            )

if __name__ == '__main__':

    # ===================================
    # Config
    # ===================================
    # Examples:
    #     dir = r'C:\Music'
    #     dir = 'C:/Music'
    #     dir = '/Music'

    dir = r'/home/mohawke/Music' # Set the path to the folder you wish to check.

    do_deletes = False # False will let you review the log before committing to removal.

    log_name = 'dupe_kill'

    # ====================================
    # No changes needed below this line.
    # ====================================

    # Fail fast on a mistyped path: os.walk ignores directory-listing errors
    # by default, so the run would otherwise finish "successfully" with a
    # log that holds nothing but the header.
    if not os.path.isdir(dir):
        raise SystemExit('Not a folder: %s' % dir)

    # The timestamp in the name gives each run its own log file.
    log_file = '%s_%s.csv' % (log_name, datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))

    with open(log_file, 'w') as log:
        log.write('Stat,File,Created,Size,Path\n')
        # collect_files is a generator; rows are written as files are scanned.
        log.writelines(collect_files(dir, delete=do_deletes))

~ by mohawke on December 22, 2018.