Review or Delete Duplicate Files with Python.
My music and photo collections were getting out of hand, so I wrote this script and thought I would share it.
'''
Author: C. Nichols, mohawke@gmail.com
Release: Dec. 2018

Simple script to review or remove possible duplicate files in a folder.

** Does not delete unless specified. Back up first; example code only.

TO RUN -- see Config below:
  * Set root_dir   - path to check.
  * Set do_deletes - delete mode vs. review-only (log) mode.
  * Set log_name   - log is unique to each run (datetime suffix).
  * In a terminal, cd to the folder holding this script and run:
        python3 dupe_killer.py   OR   python dupe_killer.py
'''
import os
import datetime
import hashlib


def chunk_reader(fobj, chunk_size=2048):
    """Generator that reads a file object in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(_file, chunk_size=2048, hash=hashlib.sha512):
    """Return the digest of the ENTIRE file's contents.

    Bug fix: the original hashed only the first ``chunk_size`` bytes
    (the chunked loop was commented out), so two different files that
    merely shared a 2 KB prefix and had equal size were treated as
    duplicates -- and could be deleted by mistake.
    """
    hashobj = hash()
    with open(_file, 'rb') as file_object:
        for chunk in chunk_reader(file_object, chunk_size=chunk_size):
            hashobj.update(chunk)
    return hashobj.digest()


def get_ctime(_file):
    """Return the file's ctime (platform-dependent: creation time on
    Windows, last metadata change on Unix)."""
    return os.path.getctime(_file)


def get_size(_file):
    """Return the file size in bytes."""
    return os.path.getsize(_file)


def collect_files(path, delete=False):
    """Walk *path* and yield one CSV row per regular file examined.

    A file is a duplicate when its (full-content hash, size) pair has
    already been seen. Rows are tagged 'Unique', 'Duplicate' (review
    mode), or 'Removed' (delete mode). With ``delete=True`` duplicate
    files are deleted from disk; the first occurrence is always kept.
    """
    unique = {}
    for root, dirs, files in os.walk(path):
        for _file in files:
            file_path = os.path.join(root, _file)
            if not os.path.isfile(file_path):
                continue
            _sz = get_size(file_path)
            _ct = get_ctime(file_path)
            _hd = get_hash(file_path)
            key = (_hd, _sz)
            created = datetime.datetime.fromtimestamp(_ct)
            if key not in unique:
                unique[key] = file_path
                yield 'Unique,"%s","%s",%s,"%s"\n' % (_file, created, _sz, root)
            elif delete:
                os.remove(file_path)
                yield 'Removed,"%s","%s",%s,"%s"\n' % (_file, created, _sz, root)
            else:
                # Review mode: log the duplicate instead of silently
                # skipping it, so the log can actually be reviewed
                # before re-running with do_deletes = True.
                yield 'Duplicate,"%s","%s",%s,"%s"\n' % (_file, created, _sz, root)


if __name__ == '__main__':
    # ===================================
    # Config
    # ===================================
    '''
    Examples:
        root_dir = r'C:\Music'
        root_dir = 'C:/Music'
        root_dir = '/Music'
    '''
    root_dir = r'/home/mohawke/Music'  # Path to the folder you wish to check.
    do_deletes = False  # False lets you review the log before committing to removal.
    log_name = 'dupe_kill'

    # ====================================
    # No changes needed below this line.
    # ====================================
    log_file = '%s_%s.csv' % (log_name,
                              datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    with open(log_file, 'w') as log:
        log.write('Stat,File,Created,Size,Path\n')
        for dupe in collect_files(root_dir, delete=do_deletes):
            log.write(dupe)