#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder <tom@sanctum.geek.nz>
# Site: <https://sanctum.geek.nz/cgit/checkem.git>
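#
# Usage: checkem FILE|DIR [FILE|DIR ...]
#
# The CHECKEM_ALG environment variable can name a specific Digest algorithm
# (e.g. SHA-256) to use instead of the default list.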
#
package main;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Tolerate very old Perls
use 5.006;

# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Digest;
use English '-no_match_vars';
use Fcntl ':mode';
use File::Find;

# Version number to make Perl::Critic happy
our $VERSION = 2.18;

# Complain if there are no arguments
if ( !@ARGV ) {
    printf {*STDERR} "Need at least one file or directory\n";
    exit 2;
}

# Convenience keys into lstat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);
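
# keys() and values() return their entries in the same order for an
# unmodified hash, which the hash slice assignment in the wanted subroutine
# below relies on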

# Use either the specified algorithm or a default list
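# (in list context, the ternary below yields either the one algorithm named
# in CHECKEM_ALG or the whole default list)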
my @algs =
  exists $ENV{CHECKEM_ALG}
  ? $ENV{CHECKEM_ALG}
  : qw(SHA-256 SHA-1 MD5);

# Build digest object or give up
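# (Digest->new() dies if the named algorithm isn't available, so try each in
# an eval and keep the first that works)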
my $dig;
for (@algs) {
    last if eval { $dig = Digest->new($_) };
}
defined $dig
  or die "Could not create a useable Digest object\n";

# Start a hash mapping file sizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
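    # no_chdir stops File::Find from chdir()ing into each directory, so
    # $File::Find::name is always a path we can lstat and open from here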
    wanted   => sub {

        # Start a hash to represent this file
        my %f = ( name => $File::Find::name );

        # Pull in the stat values we care about
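        # (lstat returns an empty list on failure, and a slice of an empty
        # list is still empty, so the assignment below is false and we fall
        # through to the warning)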
        if ( @f{ keys %STATS } = ( lstat $f{name} )[ values %STATS ] ) {

            # Check it's a plain old file
            return if not S_ISREG( $f{mode} );

            # Check its size is non-zero
            return if not $f{size};

            # Push the file hash into its size's bucket
            return push @{ $sizes{ $f{size} } }, \%f;
        }

        # Complain that we couldn't stat
        else {
            warn "Could not stat $f{name}: $ERRNO\n";
        }

        # Return if we got to here
        return;
    },
}, @ARGV;

# For each size with more than one filename, skip hard links, checksum the
# rest, and push them into a sums table
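# (the sums table maps each hex digest to the list of file hashes that
# share it)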
my %sums;
SIZE: for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
  FILE: for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
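        # (hard links of a file we've already seen are skipped entirely
        # rather than reported as duplicates)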
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev and $ino ) {
            next if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Same size as the others and not an already-seen hard link, so
        # group the file by its digest
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
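            # hexdigest() also resets the Digest object, ready for the next
            # file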
            push @{ $sums{ $f->{hexdigest} = $dig->hexdigest() } }, $f;
            close $fh
              or warn "Could not close $f->{name}: $ERRNO\n";
        }
        else {
            warn "Could not open $f->{name}: $ERRNO\n";
        }
    }
}

# Print the groups of matched files (those where more than one file shares a
# checksum in the final table); sort the groups by file size, and the files
# within each group by name
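# (each group is followed by a blank line)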
GROUP:
for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
  )
{
    printf "%s\n\n", join "\n", sort map { $_->{name} } @{$group};
}