#!/bin/sh

##
## This script tunes Linux kernel parameters useful for scaling HTCondor.
## Some values may be inappropriate for machines not dedicated to HTCondor.
## This script must be run as root, and we assume that (and support only)
## HTCondor is running as root for large installations.
##
## The next '##' section marks the beginning of the implementation section
## of this script, and you should not need to change anything past that mark.
##

#
# Command that every log message is piped into.  By default this sends the
# script's actions to syslog.  If you'd rather it didn't, change the line
# below to something like '/bin/true'.
#
LOGGER='/usr/bin/logger -t htcondor'

#
# Increase the global number of file descriptors.  For example, each
# dynamically-linked shadow uses approximately 13 just to open its
# libraries.
#
# We don't set the per-process maximum, because HTCondor will do that
# appropriately for each different subsystem (daemon) when run as root.
# If necessary, you can change those values using the configuration variables
# MAX_FILE_DESCRIPTORS and <SUBSYS>_MAX_FILE_DESCRIPTORS.
#
# This should match LimitNOFILE in the condor.service file for systemd.
#
# Applied to /proc/sys/fs/file-max, and only if it is larger than the
# value already there.
#
GLOBAL_MAX_FDS=32768

#
# Increase the maximum process ID.  By default, Linux process IDs wrap at
# 32768, which isn't a lot of processes when you have one per running job.
#
# This should match the TasksMax in the condor.service file for systemd.
#
# Applied to /proc/sys/kernel/pid_max on schedd machines only, and only if
# it is larger than the value already there.
#
GLOBAL_MAX_PROCESSES=4194303

#
# Unless otherwise specified, an outbound connection uses a port within
# this range.  On some systems, this defaults to 1024-4999, which may not
# be enough.  You can also force HTCondor to specify a particular
# outbound port by setting the configuration variables OUT_LOWPORT and
# OUT_HIGHPORT appropriately, but this will be slower under load, as
# HTCondor has to search for an open port.
#
# Applied to /proc/sys/net/ipv4/ip_local_port_range on schedd machines
# only.  The value is written as-is; it is not compared with the range
# already in effect.
#
LOCAL_PORT_RANGE="1024 65535"

#
# Increase the length of the TCP listen queue.  This allows more connections
# to pile up while HTCondor is otherwise busy.  Linux kernels before 5.4 set
# this to 128, which is way too low.  5.4 changed this to 4096.
#
# Applied to /proc/sys/net/core/somaxconn, and only if it is larger than
# the value already there.
#
TCP_LISTEN_QUEUE=4096

#
# The central manager (collector) needs to have large UDP buffers.
# Increase the maximum allowed size of network receive buffers.
#
# Applied to /proc/sys/net/core/rmem_max, and only if it is larger than
# the value already there.  Note that this script currently applies it on
# every machine, not just on the collector.
#
MAX_RECEIVE_BUFFER=10485760

# Maximum amount of dirty filesystem bytes to buffer in the kernel
# before processes writing to the filesystem are blocked, and made
# to do their own I/O synchronously.  Set to 0 to undo.
# If we are running with cgroups on, there's a Linux kernel bug
# that causes spurious OOM events sent to a cgroup with a hard memory limit
# if it writes a lot of data to the filesystem quickly.  Limiting
# the buffering to 100M works around the problem.
#
# Applied to /proc/sys/vm/dirty_bytes, and only when
# 'condor_config_val BASE_CGROUP' succeeds.  The value is written as-is;
# it is not compared with the value already in effect.
DIRTY_BYTES=100000000

# Set the root user's quota for the maximum number of kernel session keys
# and bytes.  Kernel session keys are used by Kerberos, AFS, ecryptfs, and
# other services.  The condor_master creates a new session keyring by
# default on startup (via config knob DISCARD_SESSION_KEYRING_ON_STARTUP)
# so that Kerberos tokens and the like are not leaked to user jobs, and
# also to support encrypted execute directories.  The default quota
# starting with RHEL 6.4 is 1M keys and 25M bytes, which is plenty
# big.  But older kernels (like what shipped in RHEL 6.0) have a crazy
# small quota of 200 keys.  So here we just set these params to what
# RHEL 6.4+ use by default just in case people try to use HTCondor on
# an earlier kernel.
#
# Applied to /proc/sys/kernel/keys/root_maxkeys and
# /proc/sys/kernel/keys/root_maxbytes respectively, and only if larger
# than the values already there.
ROOT_MAXKEYS=1000000
ROOT_MAXKEYS_BYTES=25000000

# Linux kernels allocate 16 pages (4 kbytes each) for each pipe created
# until a user has used /proc/sys/fs/pipe-user-pages-soft worth of
# pages, then just one page.  The Linux schedd uses a pipe to send the
# job classad from the schedd to a newly-forked shadow, and the job ad
# is usually more than one page large.  If the kernel is in the one-page
# per-pipe mode, this means the single-threaded schedd blocks until the
# shadow can pull the job ad out of the pipe, which limits scalability.

# This pipe itself is short-lived, so we are happy for it to consume 16
# pages, but there's another pipe, which HTCondor keeps open for
# the duration of the shadow (and every daemon, for that matter).
# This other pipe, the async_signal pipe, is used by HTCondor to turn
# a signal into a select(2)-able event.

# This means that with the default values, after spawning about 16k jobs,
# the schedd will slow down.  Let's raise this value to 128k,
# which should be more than the number of concurrently
# running jobs per user we claim to support.

PIPE_USER_PAGES_SOFT=131072

##
## Implementation.  You shouldn't need to change anything below here.
##

# Record of what this script did.  When /etc/sysctl.d exists, the record is
# kept in 99-htcondor.conf there, and that file is started afresh with an
# explanatory header on every run.  Otherwise the record is discarded.
LOG=/dev/null
if [ -d /etc/sysctl.d ]; then
	LOG=/etc/sysctl.d/99-htcondor.conf
	# Unquoted delimiter: $0 below is expanded to this script's name.
	cat > "${LOG}" <<EOF
#
# This file was written by $0
# when the condor_master started up.
#
# This script tunes kernel parameters to support HTCondor at
# larger scales.  A list of the changes follows.  You can set
# ENABLE_KERNEL_TUNING = FALSE
# in your HTCondor configuration to disable this entirely, or
# set LINUX_KERNEL_TUNING_SCRIPT to some other file to change
# which script is run when the condor_master daemon starts.
#
EOF
fi

#
# increaseKernelParameter PARAMETER FILE NEW
#
# Raise the kernel parameter stored in FILE to NEW, but only when NEW is
# strictly greater than the value currently in FILE; a value is never
# lowered.  PARAMETER is a human-readable name used only in messages.
# NEW is expected to be an integer.
#
# The outcome is reported in three places: standard output, the file named
# by ${LOG} (as a '#' comment line), and the command in ${LOGGER}.
#
# Nothing is done, and nothing is reported, when NEW is empty.  When FILE
# is missing or unreadable, or does not hold an integer, the parameter is
# left alone and that reason is reported, instead of comparing NEW against
# an empty value.
#
# Returns 0, except that it returns 1 when writing NEW to FILE failed.
#
increaseKernelParameter() {
	PARAMETER=$1
	FILE=$2
	NEW=$3

	# An empty NEW means there is nothing to apply.
	if [ -z "${NEW}" ]; then
		return 0
	fi

	MESSAGE=
	STATUS=0
	if [ ! -r "${FILE}" ]; then
		# Not every kernel provides every tunable.
		MESSAGE="Not changing ${PARAMETER} (${FILE}): file does not exist or is not readable."
	else
		OLD=$(cat "${FILE}")
		# Drop one leading '-' so that negative integers are accepted;
		# whatever remains must be non-empty and all digits.
		case "${OLD#-}" in
			'' | *[!0-9]*)
				MESSAGE="Not changing ${PARAMETER} (${FILE}): current value (${OLD}) is not an integer."
				;;
		esac
	fi

	# MESSAGE is still empty only if OLD is a usable integer.
	if [ -z "${MESSAGE}" ]; then
		if [ "${NEW}" -gt "${OLD}" ]; then
			# Write first, so that the report reflects what happened.
			if echo "${NEW}" > "${FILE}"; then
				MESSAGE="Changing ${PARAMETER} (${FILE}) from ${OLD} to ${NEW}"
			else
				MESSAGE="Failed to change ${PARAMETER} (${FILE}) from ${OLD} to ${NEW}"
				STATUS=1
			fi
		else
			MESSAGE="Not changing ${PARAMETER} (${FILE}): new value (${NEW}) <= old value (${OLD})."
		fi
	fi

	echo "${MESSAGE}"
	echo "# ${MESSAGE}" >> "${LOG}"
	# ${LOGGER} is deliberately unquoted: it is a command plus arguments.
	echo "${MESSAGE}" | ${LOGGER}
	return ${STATUS}
}

#
# setKernelParameter PARAMETER FILE NEW
#
# Write NEW to the kernel parameter stored in FILE, whatever the current
# value is; the current value is read only so that it can be reported.
# Unlike increaseKernelParameter, NEW need not be a number.  PARAMETER is
# a human-readable name used only in messages.
#
# The outcome is reported in three places: standard output, the file named
# by ${LOG} (as a '#' comment line), and the command in ${LOGGER}.
#
# Nothing is done, and nothing is reported, when NEW is empty.  When FILE
# does not exist the parameter is skipped and that is reported, instead of
# attempting to create FILE.
#
# Returns 0, except that it returns 1 when writing NEW to FILE failed.
#
setKernelParameter() {
	PARAMETER=$1
	FILE=$2
	NEW=$3

	# An empty NEW means there is nothing to apply.
	if [ -z "${NEW}" ]; then
		return 0
	fi

	STATUS=0
	if [ ! -e "${FILE}" ]; then
		# Not every kernel provides every tunable.
		MESSAGE="Not changing ${PARAMETER} (${FILE}): file does not exist."
	else
		# FILE is quoted so that a path containing whitespace is read
		# as one file.
		OLD=$(cat "${FILE}")
		# Write first, so that the report reflects what happened.
		if echo "${NEW}" > "${FILE}"; then
			MESSAGE="Changing ${PARAMETER} (${FILE}) from ${OLD} to ${NEW}"
		else
			MESSAGE="Failed to change ${PARAMETER} (${FILE}) from ${OLD} to ${NEW}"
			STATUS=1
		fi
	fi

	echo "${MESSAGE}"
	echo "# ${MESSAGE}" >> "${LOG}"
	# ${LOGGER} is deliberately unquoted: it is a command plus arguments.
	echo "${MESSAGE}" | ${LOGGER}
	return ${STATUS}
}

#
# Apply the tunings.  The script always exits 0, whatever the individual
# calls report.
#
increaseKernelParameter "GLOBAL_MAX_FDS" "/proc/sys/fs/file-max" "${GLOBAL_MAX_FDS}"

# Set GLOBAL_MAX_PROCESSES and LOCAL_PORT_RANGE only for schedd machines:
# look up the binary configured for each DAEMON_LIST entry and check whether
# it is condor_schedd.
daemonList=$(condor_config_val DAEMON_LIST | sed -e 's/,/ /g')
# ${daemonList} is deliberately unquoted so that it splits into daemon names.
for daemon in ${daemonList}; do
	daemonFile=$(condor_config_val "${daemon}")
	if [ -n "${daemonFile}" ]; then
		# Quoted, so that an install path containing whitespace still
		# yields the right base name.
		baseName=$(basename "${daemonFile}")
		if [ "${baseName}" = "condor_schedd" ]; then
			increaseKernelParameter "GLOBAL_MAX_PROCESSES" "/proc/sys/kernel/pid_max" "${GLOBAL_MAX_PROCESSES}"
			setKernelParameter "LOCAL_PORT_RANGE" "/proc/sys/net/ipv4/ip_local_port_range" "${LOCAL_PORT_RANGE}"
			# These are machine-wide settings; applying them once is
			# enough even if several entries resolve to condor_schedd.
			break
		fi
	fi
done

increaseKernelParameter "TCP_LISTEN_QUEUE" "/proc/sys/net/core/somaxconn" "${TCP_LISTEN_QUEUE}"
increaseKernelParameter "ROOT_MAXKEYS" "/proc/sys/kernel/keys/root_maxkeys" "${ROOT_MAXKEYS}"
increaseKernelParameter "ROOT_MAXKEYS_BYTES" "/proc/sys/kernel/keys/root_maxbytes" "${ROOT_MAXKEYS_BYTES}"
increaseKernelParameter "PIPE_USER_PAGES_SOFT" "/proc/sys/fs/pipe-user-pages-soft" "${PIPE_USER_PAGES_SOFT}"

# Limit dirty-page buffering only when condor_config_val reports success for
# BASE_CGROUP; its output is discarded, only the exit status matters.
if condor_config_val BASE_CGROUP > /dev/null 2>&1; then
	setKernelParameter "FS_CACHE_DIRTY_BYTES" "/proc/sys/vm/dirty_bytes" "${DIRTY_BYTES}"
fi

# FIXME: Only on the collector.
increaseKernelParameter "MAX_RECEIVE_BUFFER" "/proc/sys/net/core/rmem_max" "${MAX_RECEIVE_BUFFER}"

exit 0
