feat: quick and dirty duplicate finder

This commit is contained in:
Marcin 2020-05-14 22:01:38 +02:00
parent 04c67ed73a
commit c495a4aa44
3 changed files with 109 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
pickles/

View file

@ -0,0 +1,5 @@
# Clea
## Caveats
* This method only works if you compare relatively different files (i.e. photos)

103
fast_duplicates.py Executable file
View file

@ -0,0 +1,103 @@
#!/usr/bin/python
import os
import sys
import pickle
import md5
import argparse
PICKLES_DIR = './pickles'
LOGS_DIR = './logs'
def get_files_hashmap(directory, force):
pickle_filename = md5.new(directory).hexdigest() + '.pickle'
if os.path.isfile(os.path.join(PICKLES_DIR, pickle_filename)) and not force:
print('Reading existing hashmap for {} from {}'.format(directory, pickle_filename))
with open(os.path.join(PICKLES_DIR, pickle_filename), 'rb') as f:
files_hashmap = pickle.load(f)
else:
print('Creating new hashmap for {} in {}'.format(directory, pickle_filename))
files_hashmap = create_files_hashmap(directory)
with open(os.path.join(PICKLES_DIR, pickle_filename), 'wb') as f:
pickle.dump(files_hashmap, f)
return files_hashmap
def create_files_hashmap(directory):
list_of_files = []
for dirpath, dirnames, filenames in os.walk(directory):
list_of_files += [(filename, os.path.join(dirpath, filename)) for filename in filenames]
files_hashmap = {}
for file_name, file_path in list_of_files:
file_size = os.path.getsize(file_path)
if (file_name, file_size) in files_hashmap.keys():
files_hashmap[(file_name, file_size)].append(file_path)
else:
files_hashmap[(file_name, file_size)] = [file_path]
return files_hashmap
parser = argparse.ArgumentParser(description='Check if all files from one directory are present in another.')
parser.add_argument(dest='needles', help='Directory that will be used as reference')
parser.add_argument(dest='haystack', help='Directory in which we look for duplicates')
parser.add_argument('-f', '--force', action='store_true', help='Forces program to re-hash the directory')
parser.add_argument('-l', '--log', action='store_true', help='Generate log file with details per-file')
args = parser.parse_args()
def check_dir(dirname):
if not os.path.isdir(dirname):
print(CRED + 'directory "{}" does not exist!'.format(dirname) + CEND)
sys.exit(1)
check_dir(args.needles)
check_dir(args.haystack)
if not os.path.isdir(PICKLES_DIR):
os.mkdir(PICKLES_DIR)
if not os.path.isdir(LOGS_DIR):
os.mkdir(LOGS_DIR)
if args.log:
log_filename = '{}_{}.log'.format(os.path.basename(args.needles), os.path.basename(args.haystack))
log_file = open(os.path.join(LOGS_DIR, log_filename), 'w')
print('Getting hashmaps...')
needle_files = get_files_hashmap(args.needles, args.force)
haystack_files = get_files_hashmap(args.haystack, args.force)
print('Comparing hashmaps...')
mismatched = 0
count_all = len(needle_files.keys())
for i, (file_name, file_size) in enumerate(needle_files.keys()):
progress = int(round((i+1)*100.0/count_all, 0))
pre = needle_files[file_name, file_size]
if (file_name, file_size) in haystack_files.keys():
post = haystack_files[file_name, file_size]
status = 'matched'
else:
post = 'None'
status = 'mismatched'
mismatched += 1
print('({}%): {} -> {}'.format(progress,pre , post))
if args.log:
log_file.write("{};{};{}\n".format(status, pre, post))
if mismatched == 0:
print('All matched!')
else:
print( 'Not all matched!')
if args.log:
log_file.close()
print('Saved to log: {}'.format(log_filename))