#!/bin/bash

#
# Script that cleans up the local cache of Images
# The logic is to cleanup images to make sure the Images cache does not take more than
# a fixed amount of disk space.
#
# IMPORTANT1: this script will not ask for any confirmation, as it is meant to be part of a CI process.
# IMPORTANT2: this script assumes that no docker container is needed anymore when it runs.
#             In other words, this script cannot run from WITHIN a docker container!

# Constants

# Initial age threshold, in hours, for the prune passes: the cleanup starts by
# removing docker images older than 14 days.
MAX_AGE_HOURS=$(( 14 * 24 ))
# Percentage of the file system hosting /var/lib/docker that docker may occupy.
DOCKER_LIB_MAX_SIZE_PERC=80


# Helper functions

#
# Deletes the "atf" kind cluster, if the kind tool is available.
#
# Some EVA/ECC pipelines employ "kind" to perform Kubernetes-based integration tests;
# kind uses docker containers to emulate Kubernetes worker nodes, so erasing the "atf"
# cluster also removes all of its worker-node containers.
#
# Returns: 0 when kind is not installed (nothing to do), otherwise the exit status
#          of "kind delete cluster".
#
function stop_kind_clusters()
{
    # Not every build agent has kind installed: skip quietly instead of failing with
    # a "command not found" error.
    if ! command -v kind >/dev/null 2>&1; then
        echo ">> 'kind' is not installed on this machine: no kind cluster to delete"
        return 0
    fi

    kind delete cluster --name=atf
}

#
# Stops every running docker container except those that look like Rancher/Kubernetes
# system containers.
#
# This filter is useful when a Rancher cluster coexists on a TC build agent: its
# containers MUST NOT be stopped, so their IDs are excluded from the list of containers
# to stop. Everything else still running is typically a leftover from failed or
# interrupted automated tests and is stopped before images are cleaned up.
#
# Outputs: progress messages on stdout.
# Returns: exit status of "docker stop", or 0 when nothing had to be stopped.
#
function stop_non_critical_containers()
{
    local protected_ids running_ids container_id
    local -a ids_to_stop=()

    # IDs of the containers that must be left alone (one ID per line)
    protected_ids="$(docker ps --quiet --filter 'name=k8s_*' --filter 'name=kube-*' --filter 'name=kubelet' --filter 'name=etcd*')"
    running_ids="$(docker ps -q)"

    # word splitting is intended here: one container ID per word
    for container_id in ${running_ids}; do
        # compare whole lines, so an ID that is merely a substring of a protected ID
        # is not skipped by mistake
        if [[ $'\n'"${protected_ids}"$'\n' == *$'\n'"${container_id}"$'\n'* ]]; then
            echo "Skipping stop of docker with ID=$container_id since it looks like a Rancher Kubernetes docker"
        else
            ids_to_stop+=("${container_id}")
        fi
    done

    if (( ${#ids_to_stop[@]} > 0 )); then
        echo ">> Stopping docker containers still running"
        docker stop "${ids_to_stop[@]}"
    else
        echo ">> No docker container selected to be stopped"
    fi
}

#
# Computes how much disk space docker is allowed to use on this machine.
#
# Some build agents have a lot of disk (TBs) while some have only 250GB, so the
# threshold that triggers docker cleanups is derived from the size of the file system
# hosting /var/lib/docker/.
#
# Globals read:    DOCKER_LIB_MAX_SIZE_PERC
# Globals written: MEMORY_UNIT, FILESYSTEM_SIZE_GB, DOCKER_SYSTEM_MAX_DISK_USAGE_GB
# Exits with status 1 when df does not report a plain integer size.
#
function compute_available_space_for_docker()
{
    MEMORY_UNIT="GB"

    # df right-aligns the value under its column header, so strip the padding to get a
    # clean number (otherwise the report below contains stray blanks)
    FILESYSTEM_SIZE_GB="$(df --output=size --block-size=1G /var/lib/docker/ | tail -1 | tr -d '[:space:]')"
    if [[ ! "$FILESYSTEM_SIZE_GB" =~ ^[0-9]+$ ]]; then
        # covers both empty output (df failed) and anything that is not a number, which
        # would otherwise silently evaluate to 0 in the arithmetic below
        echo "Df command return unexpected output, aborting cleanup"
        exit 1
    fi

    # dedicate up to DOCKER_LIB_MAX_SIZE_PERC percent of the disk space to docker
    DOCKER_SYSTEM_MAX_DISK_USAGE_GB=$(( FILESYSTEM_SIZE_GB * DOCKER_LIB_MAX_SIZE_PERC / 100 ))
    if (( DOCKER_SYSTEM_MAX_DISK_USAGE_GB < 100 )); then
        # minimal threshold
        DOCKER_SYSTEM_MAX_DISK_USAGE_GB=100
    fi
    if (( DOCKER_SYSTEM_MAX_DISK_USAGE_GB > 1000 )); then
        # max threshold
        DOCKER_SYSTEM_MAX_DISK_USAGE_GB=1000
    fi

    echo ">> The file system hosting /var/lib/docker/ has a total size of ${FILESYSTEM_SIZE_GB}GBs; dedicating max ${DOCKER_LIB_MAX_SIZE_PERC}% of that space for docker: ${DOCKER_SYSTEM_MAX_DISK_USAGE_GB}GBs"
}

#
# IMPORTANT: get_docker_image_disk_usage() is now unused because apparently the "docker system df"
# command it used does not fully reflect docker system usage, see e.g. following situation:
#
#    [root@bilcentos7-build21-3 /]# du -hs /var/lib/docker/
#    246G    /var/lib/docker/
#
#    [root@bilcentos7-build21-3 ~]# docker system df
#    TYPE                TOTAL               ACTIVE              SIZE                RECLAIMABLE
#    Images              404                 0                   98.13GB             98.13GB (100%)
#    Containers          0                   0                   0B                  0B
#    Local Volumes       0                   0                   0B                  0B
#    Build Cache         2700                0                   64.24GB             64.24GB
#                                                              ----------
#                                          REPORTED TOTAL SIZE:   160GB
#
#  In this case it turns out that "docker system df" reports 160GB used while /var/lib/docker is around 246GB;
#  in particular it was /var/lib/docker/overlay2 that took up so much space despite there being zero docker containers...
#  For this reason now the get_docker_real_disk_usage() function is in use instead
#
#
# Reads the size of one docker cache as reported by "docker system df".
#
# Arguments: $1 - which cache to inspect: "image" or "builder"
# Globals written:
#   DOCKER_SYSTEMDF      - the reported sizes, joined on one line
#   DOCKER_CACHE_SIZE_GB - integer part of the selected size, unit removed
#   MEMORY_UNIT          - last two characters of the selected size (its unit)
# Exits with status 1 when docker prints nothing, 199 on an unknown cache type.
#
function get_docker_image_disk_usage()
{
    local cache_kind="$1"
    local size_column raw_size

    DOCKER_SYSTEMDF="$(docker system df --format '{{.Size}}' | tr '\n' ' ')"
    if [[ -z "$DOCKER_SYSTEMDF" ]]; then
        echo "Docker system command return unexpected output, aborting cleanup"
        exit 1
    fi

    # pick the word of the report that holds the requested cache size
    case "$cache_kind" in
        builder)
            # 4th word is the BUILDER CACHE
            size_column=4
            ;;
        image)
            # first word is the IMAGE CACHE
            size_column=1
            ;;
        *)
            echo "Invalid $cache_kind"
            exit 199
            ;;
    esac
    raw_size="$(echo $DOCKER_SYSTEMDF | awk -v col="$size_column" '{print $col;}')"

    # the trailing 2 chars are the measurement unit
    MEMORY_UNIT="${raw_size: -2}"

    # drop the unit, then the decimal part (the dot and whatever follows it)
    raw_size="${raw_size%??}"
    DOCKER_CACHE_SIZE_GB="${raw_size%.*}"
}

#
# Measures the disk space taken by /var/lib/docker/ using du, in 1G blocks.
#
# Globals written: MEMORY_UNIT (always "GB"), DOCKER_CACHE_SIZE_GB
# Exits with status 1 when du produces no size.
#
function get_docker_real_disk_usage()
{
    local du_report size_field ignored_rest

    MEMORY_UNIT="GB"

    # the du summary line is "<size> <path>": keep only the leading size field
    du_report="$(du --summarize --block-size=1G /var/lib/docker/)"
    read -r size_field ignored_rest <<< "$du_report"
    DOCKER_CACHE_SIZE_GB="$size_field"

    if [[ -z "$DOCKER_CACHE_SIZE_GB" ]]; then
        echo "Du command return unexpected output, aborting cleanup"
        exit 1
    fi
}

#
# Last-resort recovery of disk space: stops every running container, prunes the whole
# docker system, stops the docker daemon, deletes /var/lib/docker/overlay2 and starts
# the daemon again.
#
# The disk status (df -Th) before and after the operation is appended, together with a
# timestamp, to /var/log/docker_emergency_cleanup.log.
#
# Takes no arguments. The exit status of the individual commands is not checked.
# NOTE(review): if "systemctl stop docker" fails, the rm -rf below still runs - confirm
#               that this is acceptable.
#
function emergency_cleanup_stale_layers()
{
    local EMERGENCY_CLEAN_LOG="/var/log/docker_emergency_cleanup.log"

    # Purpose: clean the disk from stale docker layers (seems to be a bug impacting at least docker 19.03.15)
    echo "Stopping all dockers and pruning everything (images, containers, networks) that can be pruned."
    echo
    echo "WARNING: this operation will destroy any Rancher setup eventually present on this machine."
    echo "         If you need Rancher (or some other docker-based application), please restore it"
    echo "         manually."
    echo

    # unlike stop_non_critical_containers(), no container is spared here: every running
    # container is stopped
    DOCKERS_STILL_RUNNING="$(docker ps -q)"
    if [ ! -z "$DOCKERS_STILL_RUNNING" ]; then
        echo ">> Stopping docker containers still running"
        docker stop ${DOCKERS_STILL_RUNNING}
        sleep 3
    else
        echo ">> No docker container selected to be stopped"
    fi

    # record the disk status before cleaning in the emergency log
    echo "$(date): starting new emergency cleaning of docker stale layers; disk status before cleaning:" >>$EMERGENCY_CLEAN_LOG
    df -Th >>$EMERGENCY_CLEAN_LOG

    # try a system prune once more (-a: all unused images, -f: no confirmation prompt)
    docker system prune -af
    sleep 3

    # before hitting hard on the docker daemon layer organization on disk, let's stop the daemon
    echo "Stopping docker daemon"
    systemctl stop docker
    sleep 3

    # NOTE(review): only overlay2 is removed; presumably any image metadata stored elsewhere
    # under /var/lib/docker would then reference missing layers - confirm
    echo "Now deleting all stale layers to free up disks and workaround docker bugs."
    rm -rf /var/lib/docker/overlay2
    sleep 3

    # put docker back up in a functional status:
    echo "Restarting docker daemon"
    systemctl start docker
    
    # record the disk status after cleaning in the emergency log
    echo "$(date): finished emergency cleaning; disk status after cleaning:" >>$EMERGENCY_CLEAN_LOG
    df -Th >>$EMERGENCY_CLEAN_LOG
}

#
# Prunes the docker builder cache and unused docker images until /var/lib/docker takes
# no more than DOCKER_SYSTEM_MAX_DISK_USAGE_GB.
#
# Pruning starts with entries older than MAX_AGE_HOURS; after every pass the age
# threshold is lowered (by 24 hours while above 25 hours, then by 2 hours) until the
# directory fits or the threshold turns negative. If the directory is still too big,
# all unused images are pruned without any age filter and, as a last resort,
# emergency_cleanup_stale_layers is invoked.
#
# Globals read:    MAX_AGE_HOURS (left unmodified), DOCKER_SYSTEM_MAX_DISK_USAGE_GB
# Globals written: MEMORY_UNIT and DOCKER_CACHE_SIZE_GB (via get_docker_real_disk_usage)
# Exits with status 1 when DOCKER_SYSTEM_MAX_DISK_USAGE_GB has not been computed.
#
function cleanup_image_or_builder_cache()
{
    local DESC="/var/lib/docker directory"
    # work on a copy, so that the configured threshold is not altered for later use
    local age_hours="${MAX_AGE_HOURS:-336}"

    # an unset threshold would compare as 0 below and trigger the most aggressive cleanup
    if [[ -z "${DOCKER_SYSTEM_MAX_DISK_USAGE_GB:-}" ]]; then
        echo "DOCKER_SYSTEM_MAX_DISK_USAGE_GB is not set, aborting cleanup"
        exit 1
    fi

    echo ">> Cleaning the docker $DESC till we reach max usage of ${DOCKER_SYSTEM_MAX_DISK_USAGE_GB}GB"

    get_docker_real_disk_usage
    if [[ "$MEMORY_UNIT" == "GB" ]]; then
        while (( DOCKER_CACHE_SIZE_GB > DOCKER_SYSTEM_MAX_DISK_USAGE_GB )); do
            echo "Current docker $DESC has size=${DOCKER_CACHE_SIZE_GB}GB, max allowed size=${DOCKER_SYSTEM_MAX_DISK_USAGE_GB}GB."
            echo "Starting removal of all docker images older than ${age_hours}hours (if any)..."

            docker builder prune --all --force --filter "until=${age_hours}h"
            docker image prune --all --force --filter "until=${age_hours}h"

            # now recalculate how much space is still taken by docker images:
            get_docker_real_disk_usage

            if [[ "$MEMORY_UNIT" != "GB" ]]; then
                echo "Docker system is reporting less than 1GB used, aborting cleanup"
                break
            fi

            # decrease threshold for deleting unused images:
            if (( age_hours > 25 )); then
                age_hours=$(( age_hours - 24 ))
            else
                age_hours=$(( age_hours - 2 ))
            fi
            if (( age_hours < 0 )); then
                echo "Age threshold has been decreased till reaching a negative value... aborting cleanup."
                break
            fi
        done
    fi

    # the loop above stops as soon as size <= max, so exactly hitting the limit counts as
    # a success too (and must not trigger the unfiltered prune below)
    if (( DOCKER_CACHE_SIZE_GB <= DOCKER_SYSTEM_MAX_DISK_USAGE_GB )); then
        echo "Docker $DESC cleanup successfully completed: $DESC has size=${DOCKER_CACHE_SIZE_GB}GB."
    else
        echo "Docker $DESC cleanup failed: $DESC has size=${DOCKER_CACHE_SIZE_GB}GB. Will try a cleanup of all images without any --filter option"
        docker image prune --all --force

        # now recalculate how much space is still taken by docker images:
        get_docker_real_disk_usage
        echo "Docker $DESC has now size=${DOCKER_CACHE_SIZE_GB}GB"

        if (( DOCKER_CACHE_SIZE_GB > DOCKER_SYSTEM_MAX_DISK_USAGE_GB )); then
            # if the "docker image prune" done above was still not enough... then trigger our
            # "emergency" recovery of disk space:
            echo ">> Docker images still take too much space... EMERGENCY CLEANUP OF STALE DOCKER IMAGE LAYERS ACTIVATED"
            emergency_cleanup_stale_layers
        fi
    fi
}

#
# Removes every docker container with status=exited, together with its anonymous volumes.
#
# Outputs: progress messages on stdout; removal is reported as successful only when
#          "docker rm" itself succeeded.
# Returns: 0 (best effort: a failed removal is reported but does not abort the script).
#
function cleanup_containers()
{
    local -a exited_ids=()
    local container_id

    echo ">> Cleaning all docker containers with status=exited"

    # collect the IDs one per line; errors from "docker ps" are deliberately discarded,
    # which simply results in an empty list
    while IFS= read -r container_id; do
        if [[ -n "$container_id" ]]; then
            exited_ids+=("$container_id")
        fi
    done < <(docker ps --filter status=exited -q 2>/dev/null)

    if (( ${#exited_ids[@]} == 0 )); then
        echo "No docker containers to cleanup !"
        return 0
    fi

    if docker rm --volumes "${exited_ids[@]}"; then
        echo "Removed a total of ${#exited_ids[@]} containers"
    else
        echo "Failed to remove one or more of the ${#exited_ids[@]} exited containers"
    fi
    return 0
}

#
# Prunes all unused docker volumes, without asking for confirmation.
#
# Returns: exit status of "docker volume prune".
#
function cleanup_volumes()
{
    printf '%s\n' ">> Cleaning all unused docker volumes"
    docker volume prune --force
}


# MAIN
#
# As of docker v19.03.15, the 'docker system df' command identifies 4 possible types of disk use by docker:
#
#   $ docker system df
#   TYPE                TOTAL               ACTIVE              SIZE                RECLAIMABLE
#   Images              355                 4                   145.6GB             145.4GB (99%)     a)
#   Containers          11                  0                   0B                  0B                b)
#   Local Volumes       0                   0                   0B                  0B                c)
#   Build Cache         3645                0                   107.3GB             107.3GB           d)
#
#
# Entry point: runs the whole cleanup sequence and prints the final disk usage reports.
# The letters a) to d) refer to the "docker system df" entries listed above.
#
function main()
{
    # work out how much disk space docker is allowed to take on this machine
    compute_available_space_for_docker

    # before any actual cleanup, stop and remove any non-critical container
    stop_kind_clusters
    stop_non_critical_containers
    cleanup_containers              # entry b)

    # then go with the bulk cleanup
    cleanup_image_or_builder_cache  # entries a) and d)
    cleanup_volumes                 # entry c)

    printf '%s\n' ">> Final docker disk space report:"
    docker system df

    printf '%s\n' ">> Final /var/lib/docker disk space usage:"
    du -hs /var/lib/docker

    printf '%s\n' ">> Complete disk usage view:"
    df -Th
}

main
