#!/usr/local/bin/bash
#/ Usage: ghe-backup-repositories-cluster
#/ Take an online, incremental snapshot of all Git repository data.
#/
#/ Note: This command typically isn't called directly. It's invoked by
#/ ghe-backup when the cluster strategy is used.
set -e

# This command is designed to allow for transferring active Git repository data
# from a GitHub instance to a backup site in a way that ensures data is
# captured in a consistent state even when being written to.
#
# - All Git GC operations are disabled on the GitHub instance for the duration of
#   the backup. This removes the possibly of objects or packs being removed
#   while the backup is in progress.
#
# - In progress Git GC operations are given a cooldown window to complete. The
#   script will sleep for up to 60 seconds waiting for GC operations to finish.
#
# - Git repository data is transferred in a specific order: auxiliary files,
#   packed refs, loose refs, reflogs, and finally objects and pack files in that
#   order. This ensures that all referenced objects are captured.
#
# - Git GC operations are re-enabled on the GitHub instance.
#
# The script uses multiple runs of rsync to transfer repository files. Each run
# includes a list of filter rules that ensure only specific types of files are
# transferred.
#
# See the "FILTER RULES" and "INCLUDE/EXCLUDE PATTERN RULES" sections of the
# rsync(1) manual for more information:
#      <http://rsync.samba.org/ftp/rsync/rsync.html>

# Bring in the backup configuration
. $( dirname "${BASH_SOURCE[0]}" )/ghe-backup-config

bm_start "$(basename $0)"

# Split host:port into parts
port=$(ssh_port_part "$GHE_HOSTNAME")
host=$(ssh_host_part "$GHE_HOSTNAME")

# Add user / -l option
user="${host%@*}"
[ "$user" = "$host" ] && user="admin"

backup_dir="$GHE_SNAPSHOT_DIR/repositories"

# Location of last good backup for rsync --link-dest
backup_current="$GHE_DATA_DIR/current/repositories"

# Verify rsync is available.
if ! rsync --version 1>/dev/null 2>&1; then
  echo "Error: rsync not found." 1>&2
  exit 1
fi

# Perform a host-check and establish GHE_REMOTE_XXX variables.
ghe_remote_version_required "$host"

# Generate SSH config for forwarding

config=""

# Split host:port into parts
port=$(ssh_port_part "$GHE_HOSTNAME")
host=$(ssh_host_part "$GHE_HOSTNAME")

# Add user / -l option
user="${host%@*}"
[ "$user" = "$host" ] && user="admin"

# git server hostnames
hostnames=$(ghe_cluster_online_nodes "git-server")
for hostname in $hostnames; do
  config="$config
Host $hostname
  ProxyCommand ssh -q $GHE_EXTRA_SSH_OPTS -p $port $user@$host nc.openbsd %h %p
  StrictHostKeyChecking=no
"
done

config_file=$(mktemp -t cluster-backup-restore-XXXXXX)
echo "$config" > "$config_file"

opts="$GHE_EXTRA_SSH_OPTS -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PasswordAuthentication=no"

# Make sure root backup dir exists if this is the first run
mkdir -p "$backup_dir"

# Removes the remote sync-in-progress file on exit, re-enabling GC operations
# on the remote instance.
cleanup() {
  # Enable remote GC operations
  for hostname in $hostnames; do
    ghe-gc-enable -F $config_file $hostname:$port
  done
  rm -f $config_file
}
trap 'cleanup' EXIT
trap 'exit $?' INT # ^C always terminate

# Disable remote GC operations
for hostname in $hostnames; do
  ghe-gc-disable -F $config_file $hostname:$port
done

# If we have a previous increment, avoid transferring existing files via rsync's
# --link-dest support. This also decreases physical space usage considerably.
if [ -d "$backup_current" ]; then
  link_dest="--link-dest=../../current/repositories"
fi

# Transfer repository data from a GitHub instance to the current snapshot
# directory, using a previous snapshot to avoid transferring files that have
# already been transferred. A set of rsync filter rules are provided on stdin
# for each invocation.
rsync_repository_data () {
  port=$(ssh_port_part "$1")
  host=$(ssh_host_part "$1")

  shift
  ghe-rsync -av \
    -e "ssh -q $opts -p $port -F $config_file -l $user" \
    $link_dest "$@" \
    --rsync-path='sudo -u git rsync' \
    --include-from=- --exclude=\* \
    "$host:$GHE_REMOTE_DATA_USER_DIR/repositories/" \
    "$backup_dir" 1>&3
}

for hostname in $hostnames; do
  bm_start "$(basename $0) - $hostname"
  echo 1>&3
  echo "* Starting backup for host: $hostname"
  # Sync all auxiliary repository data. This includes files and directories like
  # HEAD, audit_log, config, description, info/, etc. No refs or object data
  # should be transferred here.
  echo 1>&3
  echo "* Transferring auxiliary files ..." 1>&3
  rsync_repository_data $hostname:122 -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
- /*/*.git/objects
- /*/*.git/refs
- /*/*.git/packed-refs
- /*/*.git/logs
+ /*/*.git/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
- /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/refs
- /*/??/??/??/gist/*.git/packed-refs
- /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/**

+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
- /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/refs
- /*/nw/??/??/??/*/*.git/packed-refs
- /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/**
RULES

  # Sync packed refs files. This is performed before sync'ing loose refs since
  # loose refs trump packed-refs information.
  echo 1>&3
  echo "* Transferring packed-refs files ..." 1>&3
  rsync_repository_data $hostname:122 -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/packed-refs

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/packed-refs

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/packed-refs
RULES

  # Sync loose refs and reflogs. This must be performed before object data is
  # transferred to ensure that all referenced objects are included.
  echo 1>&3
  echo "* Transferring refs and reflogs ..."  1>&3
  rsync_repository_data $hostname:122 -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/refs
+ /*/*.git/refs/**
+ /*/*.git/logs
+ /*/*.git/logs/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/refs
+ /*/??/??/??/gist/*.git/refs/**
+ /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/logs/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/refs
+ /*/nw/??/??/??/*/*.git/refs/**
+ /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/logs/**
RULES

  # Sync git objects and pack files. Compression is disabled during this phase
  # since these files are already well compressed.
  echo 1>&3
  echo "* Transferring objects and packs ..." 1>&3
  rsync_repository_data $hostname:122 -H <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/objects
- /*/*.git/objects/**/tmp_*
+ /*/*.git/objects/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/objects/**/tmp_*
+ /*/??/??/??/gist/*.git/objects/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/objects/**/tmp_*
+ /*/nw/??/??/??/*/*.git/objects/**
RULES

  # Sync __special__ data directories, including the __alambic_assets__,
  # __hookshot__, and __purgatory__ directories. The __nodeload_archives__,
  # __gitmon__, and __render__ directories are excludes since they act only as
  # caches.
  #
  # Under v2.x and greater, only the special __purgatory__ directory remains under
  # /data/repositories. All other special user data directories have been moved under
  # the /data/user directory.
  echo 1>&3
  echo "* Transferring special data directories ..." 1>&3
  rsync_repository_data $hostname:122 <<RULES
- /__nodeload_archives__/
- /__gitmon__/
- /__render__/
+ /__*__/
+ /__*__/**
+ /info/
- /info/lost+found/
+ /info/*
RULES
  echo 1>&3
  bm_end "$(basename $0) - $hostname"
done

bm_end "$(basename $0)"
