#!/usr/bin/env perl
#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder <tom@sanctum.geek.nz>
# Site: <https://sanctum.geek.nz/cgit/checkem.git>
#
# Package name
package File::Duplicates::Checkem;
# Force me to write this properly
use strict;
use warnings;
use utf8;
# Tolerate very old Perls
use 5.006;
# Import required modules
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest;
# Version number to make Perl::Critic happy
our $VERSION = 2.9;
# Refuse to run with no arguments at all; unlike many Unix tools we do
# not default to the current directory, to avoid surprise scans
unless (@ARGV) {
    printf {*STDERR} "%s\n", 'Need at least one file or directory';
    exit 2;
}
# Named indexes into the 13-element list returned by stat(), so the
# code below reads as field names rather than magic numbers (which
# also keeps Perl::Critic quiet)
my %STATS = (
    dev  => 0,    # device number of filesystem
    ino  => 1,    # inode number
    mode => 2,    # file type and permission bits
    size => 7,    # total size in bytes
);
# Choose the checksum algorithm -- CHECKEM_ALG from the environment
# when set, SHA-256 otherwise -- and build a single Digest object to
# reuse for every file (digest() resets it after each use)
my $alg = exists $ENV{CHECKEM_ALG}
    ? $ENV{CHECKEM_ALG}
    : 'SHA-256';
my $dig = Digest->new($alg);
# Map each observed file size to a list of file records...
my %sizes;
# ...and fill it with File::Find: one record (name + stat fields) per
# regular, non-empty file found under the given arguments.
find {
    no_chdir => 1,
    wanted   => sub {

        # Start a record for this file with its full path
        my %f = (
            name => $File::Find::name,
        );

        # Pull in the stat fields named in %STATS; a failed stat
        # (broken symlink, permission error, file removed mid-walk)
        # yields an empty list, so the assignment count is 0 and we
        # skip the file rather than die
        @f{ keys %STATS } = ( stat $f{name} )[ values %STATS ]
            or return;

        # Only consider regular files; S_ISREG tests the complete
        # file-type field, whereas the raw bitwise "& S_IFREG" also
        # matches sockets (S_IFSOCK shares the S_IFREG bit)
        return if not S_ISREG( $f{mode} );

        # Skip empty files: all zero-length files are trivially
        # identical, and digesting them tells us nothing useful
        return if not $f{size};

        # Bucket the record under its size
        return push @{ $sizes{ $f{size} } }, \%f;
    },
}, @ARGV;
# For every size shared by more than one file, weed out hard links and
# group the remaining files by content digest in a sums table
my %sums;
for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Device/inode pairs already digested in this bucket, so that
    # POSIX hard links (same underlying file) are only counted once
    my %inos;

    # Iterate through each file in the size bucket
    for my $f ( @{$fs} ) {

        # Skip any file whose dev/inode pair we have already seen;
        # a hard link would always "match" its twin and is not a
        # genuine duplicate. Missing dev/ino (non-POSIX stat) just
        # disables this shortcut.
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev && $ino ) {
            next if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Digest the file's raw bytes with the shared $dig object;
        # Digest's digest() call resets the object, so reuse is safe.
        # Include the filename and $! in errors so failures can be
        # diagnosed.
        open my $fh, '<', $f->{name}
            or croak "Failed to open $f->{name}: $!";
        binmode $fh;
        $dig->addfile($fh);
        close $fh
            or croak "Failed to close $f->{name}: $!";
        push @{ $sums{ $dig->digest() } }, $f;
    }
}
# Report every digest shared by more than one file: each group is the
# member filenames one per line, with a blank line separating groups
for my $group ( grep { @{$_} > 1 } values %sums ) {
    my @names = map { $_->{name} } @{$group};
    printf "%s\n\n", join "\n", @names;
}