#!/usr/local/bin/bash
#/ Usage: ghe-backup-repositories-rsync
#/ Take an online, incremental snapshot of all Git repository data.
#/
#/ Note: This command typically isn't called directly. It's invoked by
#/ ghe-backup when the rsync strategy is used.
set -e

# This command is designed to allow for transferring active Git repository data
# from a GitHub instance to a backup site in a way that ensures data is
# captured in a consistent state even when being written to.
#
# - All Git GC operations are disabled on the GitHub instance for the duration of
#   the backup. This removes the possibly of objects or packs being removed
#   while the backup is in progress.
#
# - In progress Git GC operations are given a cooldown window to complete. The
#   script will sleep for up to 60 seconds waiting for GC operations to finish.
#
# - Git repository data is transferred in a specific order: auxiliary files,
#   packed refs, loose refs, reflogs, and finally objects and pack files in that
#   order. This ensures that all referenced objects are captured.
#
# - Git GC operations are re-enabled on the GitHub instance.
#
# The script uses multiple runs of rsync to transfer repository files. Each run
# includes a list of filter rules that ensure only specific types of files are
# transferred.
#
# See the "FILTER RULES" and "INCLUDE/EXCLUDE PATTERN RULES" sections of the
# rsync(1) manual for more information:
#      <http://rsync.samba.org/ftp/rsync/rsync.html>

# Bring in the backup configuration
. $( dirname "${BASH_SOURCE[0]}" )/ghe-backup-config

bm_start "$(basename $0)"

# Set up remote host and root backup snapshot directory based on config
host="$GHE_HOSTNAME"
backup_dir="$GHE_SNAPSHOT_DIR/repositories"

# Location of last good backup for rsync --link-dest
backup_current="$GHE_DATA_DIR/current/repositories"

# Verify rsync is available.
if ! rsync --version 1>/dev/null 2>&1; then
    echo "Error: rsync not found." 1>&2
    exit 1
fi

# Perform a host-check and establish GHE_REMOTE_XXX variables.
ghe_remote_version_required "$host"

# Make sure root backup dir exists if this is the first run
mkdir -p "$backup_dir"

# Removes the remote sync-in-progress file on exit, re-enabling GC operations
# on the remote instance.
cleanup() {
  # Enable remote GC operations
  ghe-gc-enable $host
}
trap 'cleanup' EXIT
trap 'exit $?' INT # ^C always terminate

# Disable remote GC operations
ghe-gc-disable $host

# Transfer repository data from a GitHub instance to the current snapshot
# directory, using a previous snapshot to avoid transferring files that have
# already been transferred. A set of rsync filter rules are provided on stdin
# for each invocation.
rsync_repository_data () {
    ghe-rsync -av \
        -e "ghe-ssh -p $(ssh_port_part "$host")" \
        $link_dest "$@" \
        --rsync-path='sudo -u git rsync' \
        --include-from=- --exclude=\* \
        "$(ssh_host_part "$host"):$GHE_REMOTE_DATA_USER_DIR/repositories/" \
        "$backup_dir" 1>&3
}

# If we have a previous increment, avoid transferring existing files via rsync's
# --link-dest support. This also decreases physical space usage considerably.
if [ -d "$backup_current" ]; then
    link_dest="--link-dest=../../current/repositories"
fi

# Sync all auxiliary repository data. This includes files and directories like
# HEAD, audit_log, config, description, info/, etc. No refs or object data
# should be transferred here.
echo 1>&3
echo "* Transferring auxiliary files ..." 1>&3
rsync_repository_data -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
- /*/*.git/objects
- /*/*.git/refs
- /*/*.git/packed-refs
- /*/*.git/logs
+ /*/*.git/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
- /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/refs
- /*/??/??/??/gist/*.git/packed-refs
- /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/**

+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
- /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/refs
- /*/nw/??/??/??/*/*.git/packed-refs
- /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/**
RULES

# Sync packed refs files. This is performed before sync'ing loose refs since
# loose refs trump packed-refs information.
echo 1>&3
echo "* Transferring packed-refs files ..." 1>&3
rsync_repository_data -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/packed-refs

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/packed-refs

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/packed-refs
RULES

# Sync loose refs and reflogs. This must be performed before object data is
# transferred to ensure that all referenced objects are included.
echo 1>&3
echo "* Transferring refs and reflogs ..."  1>&3
rsync_repository_data -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/refs
+ /*/*.git/refs/**
+ /*/*.git/logs
+ /*/*.git/logs/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/refs
+ /*/??/??/??/gist/*.git/refs/**
+ /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/logs/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/refs
+ /*/nw/??/??/??/*/*.git/refs/**
+ /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/logs/**
RULES

# Sync git objects and pack files. Compression is disabled during this phase
# since these files are already well compressed.
echo 1>&3
echo "* Transferring objects and packs ..." 1>&3
rsync_repository_data -H <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/objects
- /*/*.git/objects/**/tmp_*
+ /*/*.git/objects/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/objects/**/tmp_*
+ /*/??/??/??/gist/*.git/objects/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/objects/**/tmp_*
+ /*/nw/??/??/??/*/*.git/objects/**
RULES

# Sync __special__ data directories, including the __alambic_assets__,
# __hookshot__, and __purgatory__ directories. The __nodeload_archives__,
# __gitmon__, and __render__ directories are excludes since they act only as
# caches.
#
# Under v2.x and greater, only the special __purgatory__ directory remains under
# /data/repositories. All other special user data directories have been moved under
# the /data/user directory.
echo 1>&3
echo "* Transferring special data directories ..." 1>&3
rsync_repository_data <<RULES
- /__nodeload_archives__/
- /__gitmon__/
- /__render__/
+ /__*__/
+ /__*__/**
+ /info/
- /info/lost+found/
+ /info/*
RULES
echo 1>&3

bm_end "$(basename $0)"
