#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder
# Site:
#

package main;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Tolerate very old Perls
use 5.006;

# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Digest;
use English '-no_match_vars';
use Fcntl ':mode';
use File::Find;

# Version number to make Perl::Critic happy
our $VERSION = 2.18;

# Complain if there are no arguments
if ( !@ARGV ) {
    printf {*STDERR} "Need at least one file or directory\n";
    exit 2;
}

# Convenience keys into lstat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# Use either the specified algorithm or a default list
my @algs =
    exists $ENV{CHECKEM_ALG}
  ? $ENV{CHECKEM_ALG}
  : qw(SHA-256 SHA-1 MD5);

# Build digest object or give up
my $dig;
for (@algs) {
    last if eval { $dig = Digest->new($_) };
}
defined $dig
  or die "Could not create a useable Digest object\n";

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {

        # Start a hash to represent this file
        my %f = ( name => $File::Find::name );

        # Pull in the stat values we care about
        if ( @f{ keys %STATS } = ( lstat $f{name} )[ values %STATS ] ) {

            # Check it's a plain old file
            return if not S_ISREG( $f{mode} );

            # Check its size is non-zero
            return if not $f{size};

            # Push the file hash into its size's bucket
            return push @{ $sizes{ $f{size} } }, \%f;
        }

        # Complain that we couldn't stat
        else {
            warn "Could not stat $f{name}: $ERRNO\n";
        }

        # Return if we got to here
        return;
    },
}, @ARGV;

# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my %sums;
SIZE: for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
  FILE: for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev and $ino ) {
            next FILE if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Files still the same size and not hard linked, group by digest
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
            push @{ $sums{ $f->{hexdigest} = $dig->hexdigest() } }, $f;
            close $fh
              or warn "Could not close $f->{name}: $ERRNO\n";
        }
        else {
            warn "Could not open $f->{name}: $ERRNO\n";
        }
    }
}

# Print the groups of matched files (more than one share a checksum in the
# final table); sort the blocks by the filesize, and the files within each
# block by name
GROUP: for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
  )
{
    printf "%s\n\n", join "\n",
      sort map { $_->{name} } @{$group};
}
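
__END__

Usage sketch (illustrative only; it assumes the script is installed on the
PATH as "checkem" and that the named directories exist):

    # Report groups of duplicate files under one or more trees
    checkem ~/Pictures ~/Backups/Pictures

    # Force a specific Digest algorithm via the CHECKEM_ALG environment variable
    CHECKEM_ALG=SHA-512 checkem /srv/media

Each group of duplicates is printed one filename per line, with a blank line
separating groups; groups are ordered by file size, and filenames are sorted
within each group.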