#!/bin/bash

# yet another fugly script by Erik Jan "alphageek" Tromp

# this time, it's a pathetic attempt to recover some space on my server.
# sizable portions of the slackware-8.1/ & slackware-current/ trees were
# virtually identical at one point. by hardlinking the common files, only
# single file 'bodies' need reside on disk, thereby freeing up space.

# NOTE: absolutely no sanity checking is performed for unlinking
#       hardlinks. if you have 100 gigs worth of files hardlinked on a 40
#       gig drive, you're going to be in for a nasty surprise.

# known to work with the following versions of utilities under linux. ymmv
# basename - sh-utils 2.0
# bash     - 2.05.0
# cmp      - diffutils 2.7
# date     - sh-utils 2.0
# dirname  - sh-utils 2.0
# echo     - sh-utils 2.0
# find     - 4.1
# ln       - fileutils 4.1
# ls       - fileutils 4.1
# sort     - textutils 2.0
# touch    - fileutils 4.1

# Print the usage summary on stdout and terminate the script.
#
# Arguments: none (the program name is taken from $0)
# Outputs:   usage text on stdout
# Returns:   never returns; always exits with status 1, also for -h/--help
function _show_help() {
  # ${0##*/} strips the directory part without word-splitting, so a script
  # path that contains blanks still yields the complete file name
  printf '%s\n' \
    "Usage: ${0##*/} <options> [path] [...] <options>" \
    "" \
    "  -d, --dry-run  show what would have been (un)linked" \
    "  -i, --ignore   ignore file timestamps" \
    "  -l, --link     link duplicate files" \
    "  -u, --unlink   unlink duplicate files" \
    "  -q, --quiet    be vewwy vewwy quiet" \
    "  -h, --help     show this help text"
  exit 1
}

# nothing to do without arguments, so print the usage text and bail out
if (( $# == 0 )) ; then
  _show_help
fi

# defaults (each of them can be overridden from the command line)
PATHS=""
MODE=""
DRYRUN="false"
IGNORE="false"
QUIET="false"

# consume the command line one word at a time (minimal sanity checking).
# options may appear before, between and after the paths; every word that
# does not start with a dash is collected as a path.
while (( $# > 0 )) ; do
  case "$1" in
    -h|--help)
      _show_help
      ;;
    -d|--dry-run)
      DRYRUN="true"
      ;;
    -i|--ignore)
      IGNORE="true"
      ;;
    -q|--quiet)
      QUIET="true"
      ;;
    -l|--link)
      # --link and --unlink are mutually exclusive
      if [[ "$MODE" == "unlink" ]] ; then
        echo "choose ONE mode"
        _show_help
      fi
      MODE="link"
      ;;
    -u|--unlink)
      # --link and --unlink are mutually exclusive
      if [[ "$MODE" == "link" ]] ; then
        echo "choose ONE mode"
        _show_help
      fi
      MODE="unlink"
      ;;
    -*)
      echo "unknown option: $1"
      _show_help
      ;;
    *)
      # PATHS is one space-separated string, so a path that itself
      # contains whitespace will be split again when it is used
      PATHS="$PATHS $1"
      ;;
  esac

  # _show_help terminates the script, so only accepted words get here
  shift
done

# a mode is mandatory. there is no default
if [[ -z "$MODE" ]] ; then
  echo "choose a mode"
  _show_help
  exit 1
fi

# linking & unlinking files are 2 fundamentally different processes. as
# such, we have a separate section for each.

# Print the modification time of "$1" as '@<seconds>.<nanoseconds>'.
# unlike the default output of date, this form does not depend on the locale
# or on timezone abbreviations, and 'touch -d' accepts it verbatim.
_get_mtime() {
  date -r "$1" '+@%s.%N'
}

# Hardlink duplicate regular files found below $PATHS.
#
# Globals:   PATHS  (read) whitespace-separated list of start directories
#            IGNORE (read) "true" to ignore differing timestamps
#            DRYRUN (read) "false" to really link; anything else only reports
#            QUIET  (read) "false" to list every linked pair with ls
# Outputs:   ls listing of each linked pair on stdout (unless quiet)
# Note:      only neighbours in the sorted list are compared, and file names
#            containing a newline are not supported.
_link_duplicates() {
  local size1="" time1="" uid1="" gid1="" perm1="" node1="" path1=""
  local size2 time2 uid2 gid2 perm2 node2 path2
  local parent stamp linked

  # order by smallest, oldest, etc etc
  # note: size & time are _not_ arbitrary.. all others are
  # %s  - size
  # %T@ - time (seconds since 1970)
  # %U  - uid
  # %G  - gid
  # %m  - permissions
  # %i  - inode
  # %p  - filename (including relative path)
  # \t  - horiz tab (field separator, also easy to read when troubleshooting)
  # \n  - newline
  #
  # PATHS is a whitespace-joined string, so it is split on purpose here.
  # IFS is limited to the tab and read gets -r so that blanks and
  # backslashes inside a file name survive untouched.
  # shellcheck disable=SC2086
  find $PATHS -xdev -type f -printf '%s\t%T@\t%U\t%G\t%m\t%i\t%p\n' \
    | sort -nr \
    | while IFS=$'\t' read -r size2 time2 uid2 gid2 perm2 node2 path2 ; do

      # here's where the real work gets done: the 2 files have to look the
      # same externally (but must not be the same inode already) and then
      # have to be byte-for-byte identical as well
      if [[ "$size1" = "$size2" ]] \
        && [[ "$time1" = "$time2" || "$IGNORE" = "true" ]] \
        && [[ "$uid1" = "$uid2" && "$gid1" = "$gid2" ]] \
        && [[ "$perm1" = "$perm2" ]] \
        && [[ "$node1" != "$node2" && "$path1" != "$path2" ]] \
        && cmp -s -- "$path1" "$path2" ; then

        # note: we are going to perform 'blind' hardlinks on the
        #       assumption that multiple sets of hardlinked duplicates (if
        #       they exist) will be traversed & combined into a single
        #       set.
        #       this could be a fatal assumption, so "if you break it, you
        #       get to keep both halves". consider yourself duly warned!
        linked="true"
        if [ "$DRYRUN" = "false" ] ; then

          # preserve parent directory's timestamp
          parent="$(dirname -- "$path2")"
          stamp="$(_get_mtime "$parent")"

          # make the link. ln reports its own error on stderr
          ln -f -- "$path1" "$path2" || linked="false"

          # restore parent directory's timestamp
          if [ -n "$stamp" ] ; then
            touch -d "$stamp" -- "$parent"
          fi
        fi

        # a failed link is neither reported nor carried over as linked
        if [ "$linked" = "true" ] ; then

          # path2 now shares path1's inode (in a dry run: would share)
          node2="$node1"

          if [ "$QUIET" = "false" ] ; then

            # be verbose (great for logging the changes)
            ls -li --full-time -- "$path1"
            ls -li --full-time -- "$path2"
            echo
          fi
        fi
      fi

      # carry over data for next iteration
      size1="$size2"
      time1="$time2"
      uid1="$uid2"
      gid1="$gid2"
      perm1="$perm2"
      node1="$node2"
      path1="$path2"
    done
}

# Give every hardlinked regular file below $PATHS a body of its own again.
#
# Globals:   PATHS  (read) whitespace-separated list of start directories
#            DRYRUN (read) "false" to really unlink; anything else only reports
#            QUIET  (read) "false" to list every processed file with ls
# Outputs:   ls listing of each processed file on stdout (unless quiet),
#            a diagnostic on stderr for every file that could not be copied
# Note:      file names containing a newline are not supported.
_unlink_duplicates() {
  local path links parent stamp tmp copied

  # no ordering necessary. first come, first served
  # shellcheck disable=SC2086
  find $PATHS -xdev -type f \
    | while IFS= read -r path ; do

      # we grab the link count (%n) 'on the fly' to avoid processing the
      # last file of a hardlinked set. why unlink a file that doesn't need it?
      links="$(find "$path" -maxdepth 0 -printf '%n' 2>/dev/null)"

      # the file may have vanished in the meantime. nothing to do then
      case "$links" in
        ''|*[!0-9]*) continue ;;
      esac

      # here's where the real work gets done
      if [ "$links" -gt 1 ] ; then

        if [ "$DRYRUN" = "false" ] ; then

          # preserve parent directory's timestamp
          parent="$(dirname -- "$path")"
          stamp="$(_get_mtime "$parent")"

          # copy to a unique tempfile next to the original and only replace
          # the link once the copy is known to be complete. a failed copy
          # (disk full!) must never end up in place of the original
          copied="false"
          if tmp="$(mktemp -- "$path.XXXXXX")" ; then
            if cp -a -- "$path" "$tmp" && mv -f -- "$tmp" "$path" ; then
              copied="true"
            else
              rm -f -- "$tmp"
            fi
          fi

          # restore parent directory's timestamp
          if [ -n "$stamp" ] ; then
            touch -d "$stamp" -- "$parent"
          fi

          if [ "$copied" != "true" ] ; then
            echo "cannot unlink (left untouched): $path" >&2
            continue
          fi
        fi

        if [ "$QUIET" = "false" ] ; then

          # be verbose (great for logging the changes)
          ls -li --full-time -- "$path"
        fi
      fi
    done
}

if [ "$MODE" = "link" ] ; then # link
  _link_duplicates
else # unlink
  _unlink_duplicates
fi

# eof
