#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder
# Site:
#

package main;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Tolerate very old Perls
use 5.006;

# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Digest;
use English '-no_match_vars';
use Fcntl ':mode';
use File::Find;

# Version number to make Perl::Critic happy
our $VERSION = 2.18;

# Complain if there are no arguments
if ( !@ARGV ) {
    printf {*STDERR} "Need at least one file or directory\n";
    exit 2;
}

# Convenience keys into lstat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# Use either the specified algorithm or a default list
my @algs =
    exists $ENV{CHECKEM_ALG}
  ? $ENV{CHECKEM_ALG}
  : qw(SHA-256 SHA-1 MD5);

# Build digest object or give up
my $dig;
for (@algs) {
    last if eval { $dig = Digest->new($_) };
}
defined $dig
  or die "Could not create a useable Digest object\n";

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {

        # Start a hash to represent this file
        my %f = ( name => $File::Find::name );

        # Pull in the stat values we care about
        if ( @f{ keys %STATS } = ( lstat $f{name} )[ values %STATS ] ) {

            # Check it's a plain old file
            return if not S_ISREG( $f{mode} );

            # Check its size is non-zero
            return if not $f{size};

            # Push the file hash into its size's bucket
            return push @{ $sizes{ $f{size} } }, \%f;
        }

        # Complain that we couldn't stat
        else {
            warn "Could not stat $f{name}: $ERRNO\n";
        }

        # Return if we got to here
        return;
    },
}, @ARGV;

# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my %sums;
SIZE: for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
  FILE: for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev and $ino ) {
            next FILE if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Files still the same size and not hard linked, group by digest
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
            push @{ $sums{ $f->{hexdigest} = $dig->hexdigest() } }, $f;
            close $fh
              or warn "Could not close $f->{name}: $ERRNO\n";
        }
        else {
            warn "Could not open $f->{name}: $ERRNO\n";
        }
    }
}

# Print the groups of matched files (more than one share a checksum in the
# final table); sort the blocks by the filesize, and the files within each
# block by name
GROUP: for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
  )
{
    printf "%s\n\n", join "\n",
      sort map { $_->{name} } @{$group};
}
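
__END__

Usage sketch (illustrative only; it assumes the script is installed on the
PATH as "checkem" and that the named directories exist):

    # Report groups of duplicate files under one or more trees
    checkem ~/Pictures ~/Backups/Pictures

    # Force a specific Digest algorithm via the CHECKEM_ALG environment variable
    CHECKEM_ALG=SHA-512 checkem /srv/media

Each group of duplicates is printed one filename per line, with a blank line
separating groups; groups are ordered by file size, and filenames are sorted
within each group.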