#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
# Copyright (c) 2017-2018 Western Digital Corporation or its affiliates.
#
# Functions and global variables used by the srp tests.

. common/shellcheck
. common/null_blk

# A non-empty value makes teardown() skip teardown_uncond().
debug=
# Filesystem type used by create_filesystem().
filesystem_type=ext4
# Directory that run_fio() passes to fio via --aux-path.
fio_aux_path=/tmp/fio-state-files
# Keep a value of use_siw that was set by the caller; default to empty.
use_siw=${use_siw:-""}
# MemTotal value from /proc/meminfo, in kB.
memtotal=$(sed -n 's/^MemTotal:[[:blank:]]*\([0-9]*\)[[:blank:]]*kB$/\1/p' /proc/meminfo)
# Upper bound for ramdisk_size: 1<<25 bytes.
max_ramdisk_size=$((1<<25))
# Ramdisk size in bytes: 1/16th of MemTotal, capped at max_ramdisk_size.
ramdisk_size=$((memtotal * 1024 / 16))
ramdisk_size=$((ramdisk_size > max_ramdisk_size ? max_ramdisk_size : ramdisk_size))

# Return the status of _have_kernel_config_file if that check fails. Otherwise
# return 0 if _check_kernel_option accepts DM_MQ_DEFAULT; if it does not, add
# a reason to SKIP_REASONS and return 1.
_have_legacy_dm() {
	_have_kernel_config_file || return
	_check_kernel_option DM_MQ_DEFAULT && return 0
	SKIP_REASONS+=("legacy device mapper support is missing")
	return 1
}

# Compare the dotted version numbers $1 and $2 component by component.
# Returns 0 if $1 is less than or equal to $2 and 1 if $1 is greater than $2.
# If the two numbers have a different number of components, an error is
# written to stderr and 1 is returned.
version_le() {
	local idx
	local -a lhs rhs

	IFS=. read -r -a lhs <<<"$1"
	IFS=. read -r -a rhs <<<"$2"
	if [ "${#lhs[@]}" != "${#rhs[@]}" ]; then
		echo "Error: version number mismatch $1 <> $2" >&2
		return 1
	fi
	# The first component that differs decides the outcome.
	for idx in "${!lhs[@]}"; do
		if [ "${lhs[idx]}" -lt "${rhs[idx]}" ]; then
			return 0
		elif [ "${lhs[idx]}" -gt "${rhs[idx]}" ]; then
			return 1
		fi
	done
	return 0
}

# Return 0 if the multipath-tools version reported by "multipath -k" is at
# least $1, a version number with three components separated by dots.
# Otherwise add a reason to SKIP_REASONS and return 1.
_multipathd_version_ge() {
	local found required=$1

	found=$(multipath -k 2>&1 |
		sed -n 's/^multipath-tools v\([0-9]*\.[0-9]*\.[0-9]*\).*/\1/p')
	if ! version_le "$required" "$found"; then
		SKIP_REASONS+=("Need multipathd version $required; found multipathd version $found.")
		return 1
	fi
	return 0
}

# Print the IPv4 address(es) of network interface $1, one per line and without
# the prefix length. If /sys/class/net/$1 does not exist, report that on
# stderr and return 1 without printing anything on stdout.
get_ipv4_addr() {
	if [ ! -e "/sys/class/net/$1" ]; then
		echo "get_ipv4_addr(): $1 is not a network interface" 1>&2
		# Fail instead of querying a device that does not exist.
		return 1
	fi
	ip -4 -o addr show dev "$1" |
		sed -n 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
}

# Read IPv6 addresses from stdin, one per line, and print each of them in its
# fully expanded form, e.g. ::1 becomes
# 0000:0000:0000:0000:0000:0000:0000:0001.
expand_ipv6_addr() {
	awk -F : '{
		# awk variables persist across input lines, hence reset the
		# state for every address.
		pre = ""
		suf = ""
		left = 1
		for (i = 1; i <= NF; i++) {
			# Pad the group with leading zeroes up to four digits.
			a = substr("0000", 1 + length($i)) $i
			if ($i == "") {
				# An empty field marks the "::" abbreviation.
				left = 0
			} else if (left) {
				pre = pre ":" a
			} else {
				suf = suf ":" a
			}
		}
		# Take as many zero groups as needed to fill the gap between
		# the groups before and after the "::".
		mid = substr("0000:0000:0000:0000:0000:0000:0000:0000",
			     (pre == "") + length(pre) + length(suf))
		print substr(pre, 2) mid suf
	}'
}

# Print the IPv6 address(es) of network interface $1, one per line and without
# the prefix length.
get_ipv6_addr() {
	local extract='s/.*[[:blank:]]inet6[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'

	ip -6 -o addr show dev "$1" | sed -n "$extract"
}

# Print the link-local IPv6 address of network interface $1, followed by the
# zone suffix "%$1". If /sys/class/net/$1 does not exist, report that on
# stderr and return 1. Nothing is printed if the interface has no link-local
# address.
get_ipv6_ll_addr() {
	local ll_addr

	if [[ ! -e "/sys/class/net/$1" ]]; then
		echo "get_ipv6_ll_addr(): $1 is not a network interface" 1>&2
		return 1
	fi
	ll_addr=$(ip -6 -o addr show dev "$1" | grep "scope link" |
		sed -n 's/.*[[:blank:]]inet6[[:blank:]]*\([^[:blank:]/]*\).*/\1/p')
	# A bare "%$1" is not an address; callers that test the output for
	# being non-empty would mistake it for one.
	if [ -n "$ll_addr" ]; then
		echo "$ll_addr%$1"
	fi
}

# Succeed if $1 is accepted as an integer operand by the shell's numeric
# comparison. The comparison of $1 with "0$1" rejects the empty string and
# signed values; error messages of the comparison are discarded.
is_number() {
	{ test "$1" -eq "0$1"; } 2>/dev/null
}

# Print the sorted, unique network interface names that "rdma link show"
# reports after the "netdev" keyword, e.g. ib0 ib1, one per line.
rdma_network_interfaces() {
	local netdev_re='s/^.*[[:blank:]]netdev[[:blank:]]\+\([^[:blank:]]*\)[[:blank:]]*/\1/p'

	rdma link show | sed -n "$netdev_re" | sort -u
}

# Resolve $1 through symbolic links and strip a leading /dev/ from the result.
# For every /sys/class/block/<X>/holders/<Y> entry whose name <Y> equals that
# result, print /dev/<X>.
held_by() {
	local entry owner target=$1

	# Follow symbolic links until something else than a link is reached.
	while [ -L "$target" ]; do
		target=$(realpath "$target")
	done
	target=${target#/dev/}
	for entry in /sys/class/block/*/holders/*; do
		# Skips the literal pattern if the glob did not match anything.
		[ -e "$entry" ] || continue
		[ "${entry##*/}" = "$target" ] || continue
		owner=${entry%/holders/*}
		echo "/dev/${owner##*/}"
	done
}

# Sleep for $1 seconds and return 0 if doing so does not go beyond deadline $2,
# where the deadline is compared against the output of _uptime_s. Otherwise
# sleep only for the time that is left until the deadline, if any, and
# return 1.
sleep_until() {
	local deadline=$2 duration=$1 now

	now=$(_uptime_s)
	if [ $((now + duration)) -gt "$deadline" ]; then
		if [ "$deadline" -gt "$now" ]; then
			sleep $((deadline - now))
		fi
		return 1
	fi
	sleep "$duration"
}

# Send SIGKILL to every process that lsof reports for block device $1 and
# append " (pid <n>)" to $FULL for each of them. Returns 1 if $1 is empty.
stop_bdev_users() {
	[ -n "$1" ] || return 1
	lsof -F p "$1" 2>/dev/null |
		while read -r field; do
			# Only the fields that start with "p" carry a process ID.
			case "$field" in
				p*)
					pid=${field#p}
					printf ' (pid %s)' "$pid" >>"$FULL"
					kill -9 "$pid"
					;;
			esac
		done
}

# RHEL 6 dmsetup accepts mpath<n> but not /dev/dm-<n> as its first argument.
# Hence this function, which prints the mpath<n> name for device node $1:
# - For a /dev/mapper/<name> path, <name> is printed.
# - Otherwise the /dev/mapper/mpath* node with the same device numbers as $1
#   is looked up and its name is printed.
# Returns non-zero if $1 does not exist or if no matching node has been found.
dev_to_mpath() {
	local candidate name node want

	name=${1#/dev/mapper/}
	if [ "$name" != "$1" ]; then
		echo "$name"
		return 0
	fi

	[ -e "$1" ] || return $?

	node=$1
	if [ -L "$1" ]; then
		node=$(readlink -f "$1")
	fi
	# Major and minor device number of $1 as printed by stat.
	if ! want=$(stat -c %t:%T "$node"); then
		echo "stat $1 -> $node failed"
		return 1
	fi

	for candidate in /dev/mapper/mpath*; do
		if [ -L "$candidate" ]; then
			node=$(readlink -f "$candidate")
		elif [ -e "$candidate" ]; then
			node=$candidate
		else
			continue
		fi
		if [ "$(stat -c %t:%T "$node")" = "$want" ]; then
			basename "$candidate"
			return 0
		fi
	done
	return 1
}

# Walk through the output of "dmsetup table" and call remove_mpath_dev for
# every multipath device for which is_qinp_def or mpath_has_stale_dev
# succeeds. If removing a device fails and $debug is not empty, the remaining
# devices are skipped.
remove_stale_mpath_devs() {
	echo "Examining all multipaths"
	dmsetup table |
		while read -r mpdev _ _ type def; do
			[ "$type" = multipath ] || continue
			# $def is not quoted in the second call such that each
			# word of the table is passed as a separate argument.
			# shellcheck disable=SC2086
			if is_qinp_def "$def" || mpath_has_stale_dev $def; then
				echo "${mpdev%:}"
			fi
		done |
		sort -u |
		while read -r name; do
			mpdev="/dev/mapper/$name"
			echo -n "removing $mpdev: "
			remove_mpath_dev "$mpdev" && continue
			echo "failed"
			[ -z "$debug" ] || return 1
		done
	echo "Finished examining multipaths"
}

# Modify mpath device $1 to fail_if_no_path mode, unmount the filesystem on top
# of it and remove the mpath device. Up to ten attempts are made. All output
# of this function is appended to $FULL. Returns 1 if the attempt counter
# reaches zero and 0 otherwise.
remove_mpath_dev() {
	local cmd dm i output t1 t2

	{
		for ((i=10;i>0;i--)); do
			# The commands are kept in $cmd such that the same text
			# can be executed with eval and be logged upon failure.
			cmd="dm=\$(dev_to_mpath \"$1\")"
			if ! eval "$cmd"; then
				echo "$cmd: failed"
			else
				# Table before the mode change, only used for logging.
				t1=$(dmsetup table "$dm")
				cmd="dmsetup message $dm 0 fail_if_no_path"
				if ! eval "$cmd"; then
					echo "$cmd: failed"
				else
					t2=$(dmsetup table "$dm")
					# Log both tables if queue_if_no_path is still
					# present after the mode change.
					if echo "$t2" | grep -qw queue_if_no_path; then
						echo "$dm: $t1 -> $t2"
					fi
					echo "Attempting to unmount /dev/mapper/$dm"
					umount "/dev/mapper/$dm"
					cmd="dmsetup remove $dm"
					if ! output=$(eval "$cmd" 2>&1); then
						echo "$cmd: $output; retrying"
					else
						echo "done"
						break
					fi
				fi
			fi
			# Nothing is left to be removed if $1 no longer exists.
			if [ ! -e "$1" ]; then
				break
			fi
			# Before the next attempt, kill the processes that still
			# have the device open.
			ls -l "$1"
			stop_bdev_users "$(readlink -f "$1")"
			sleep .5
		done
		# $i is only zero if the loop has not been left through "break".
		if [[ $i = 0 ]]; then
			echo "failed"
			return 1
		fi
	} &>>"$FULL"
}

# Return 0 if at least one argument that contains a colon does not occur as a
# word in any /sys/class/block/*/dev file. Return 1 otherwise. Arguments
# without a colon are ignored.
mpath_has_stale_dev() {
	local arg

	for arg in "$@"; do
		case "$arg" in
			*:*)
				if ! grep -qw "$arg" /sys/class/block/*/dev 2>/dev/null; then
					return 0
				fi
				;;
		esac
	done

	return 1
}

# Return 0 if multipath definition $1 contains either
# " 3 queue_if_no_path queue_mode mq " or " 1 queue_if_no_path " and return 1
# otherwise.
is_qinp_def() {
	[[ $1 == *" 3 queue_if_no_path queue_mode mq "* ||
		$1 == *" 1 queue_if_no_path "* ]]
}

# Load the configfs kernel module unless /sys/module/configfs exists and mount
# configfs at /sys/kernel/config unless the output of "mount" mentions
# configfs. Returns the status of the first step that fails.
mount_configfs() {
	[ -e /sys/module/configfs ] || modprobe configfs || return $?
	mount | grep -qw configfs ||
		mount -t configfs none /sys/kernel/config || return $?
}

# Write I/O scheduler $2 into /sys/class/block/$1/queue/scheduler. The
# scheduler name is first translated depending on whether /sys/block/$1/mq
# exists: noop and deadline become none and mq-deadline if it does; none,
# mq-deadline and bfq-mq become noop, deadline and bfq if it does not.
# Prints a message and returns 1 if the write fails.
set_scheduler() {
	local dev=$1 sched=$2 sysfs_attr

	sysfs_attr=/sys/class/block/$dev/queue/scheduler
	if [ -e "/sys/block/$dev/mq" ]; then
		case "$sched" in
			noop)        sched=none;;
			deadline)    sched=mq-deadline;;
		esac
	else
		case "$sched" in
			none)        sched=noop;;
			mq-deadline) sched=deadline;;
			bfq-mq)      sched=bfq;;
		esac
	fi
	echo "$sched" > "$sysfs_attr" && return 0
	echo "Changing scheduler of $dev from $(<"$sysfs_attr") into $sched failed"
	return 1
}

# Print a /dev/... path that points at dm device number $1. The I/O scheduler
# of the link target and of every device <X> for which
# /sys/class/block/<X>/holders/<target> exists is set to $2; $3 is written into
# the device/timeout attribute of each such <X> that has one. The shell script
# that includes this file must define a function get_bdev_path() that
# translates device number $1 into a /dev/disk/... path.
#
# Both the lookup of the path and the block size query are retried up to 50
# times with a multipathd reconfigure request between the attempts. Returns
# non-zero if $1 is not a number, if no path has been found, if the path is
# not a soft link or if querying the block size fails.
get_bdev_n() {
	local attempt holder link name nr=$1 sched=$2 slot target tmo=$3

	is_number "$nr" || return $?
	link=""
	for ((attempt = 0; attempt < 50; attempt++)); do
		link=$(get_bdev_path "$nr") && [ -n "$link" ] && [ -e "$link" ] &&
			break
		echo reconfigure | multipathd -k >/dev/null 2>&1
		sleep .1
	done
	if [ -z "$link" ] || [ ! -e "$link" ]; then
		echo "Could not find device $nr -> $link"
		return 1
	fi
	if [ ! -L "$link" ]; then
		echo "$link: not a soft link"
		return 1
	fi
	target=$(readlink "$link" 2>/dev/null || echo "?")
	echo "Using $link -> ${target}" >>"$FULL"
	# Wait until the block device can be queried.
	for ((attempt = 0; attempt < 50; attempt++)); do
		blockdev --getbsz "$link" >/dev/null 2>&1 && break
		echo reconfigure | multipathd -k >/dev/null 2>&1
		sleep .1
	done
	if ! blockdev --getbsz "$link" >/dev/null 2>&1; then
		echo "$link: querying block size failed"
		return 1
	fi
	name=$(basename "$target")
	set_scheduler "$name" "$sched"
	for slot in /sys/class/block/*"/holders/$name"; do
		[ -e "$slot" ] || continue
		holder=${slot%/holders/*}
		holder=${holder##*/}
		set_scheduler "$holder" "${sched}"
		if [ -e "/sys/class/block/$holder/device/timeout" ]; then
			echo "$tmo" > "/sys/class/block/$holder/device/timeout"
		fi
	done
	echo "get_bdev_n() returned $link" >>"$FULL"
	echo "$link"
}

# Print the full path of mountpoint $1, namely $TMPDIR/mnt$1. fio will be run
# on top of the filesystem mounted at the printed mountpoint. The shell exits
# with status 1 if $TMPDIR or $1 is empty.
mountpoint() {
	[ -n "$TMPDIR" ] || {
		echo "Error: \$TMPDIR has not been set." 1>&2
		exit 1
	}
	[ -n "$1" ] || {
		echo "Error: missing argument" 1>&2
		exit 1
	}
	echo "$TMPDIR/mnt$1"
}

# Print GID 0 of every port of every device below /sys/class/infiniband, one
# per line. Empty GIDs and the all-zero GID are skipped. Always returns 0.
all_primary_gids() {
	local gid p

	for p in /sys/class/infiniband/*/ports/*/gids/0; do
		# If there are no RDMA devices, the glob does not match and $p
		# is the literal pattern, which must not be read.
		[ -e "$p" ] || continue
		gid="$(<"$p")"
		if [ -n "$gid" ] &&
			   [ "$gid" != 0000:0000:0000:0000:0000:0000:0000:0000 ]; then
			echo "$gid"
		fi
	done
	# Do not let the last GID that has been examined decide the status.
	return 0
}

# Succeed if a line of the "rdma link" output ends with " netdev $1",
# optionally followed by blanks, i.e. if an RDMA link has been associated with
# network interface $1.
has_soft_rdma() {
	local pattern=" netdev $1[[:blank:]]*\$"

	rdma link | grep -q "$pattern"
}

# Load the siw kernel module if $use_siw is not empty and the rdma_rxe kernel
# module otherwise. Next, add an RDMA link of the matching type for every
# entry in /sys/class/net other than "lo" that has an address length of 6, has
# a carrier and has no soft RDMA link yet. Standard output is appended to
# $FULL. Returns the modprobe status if loading the module fails.
start_soft_rdma() {
	local module type

	{
	if [ -n "$use_siw" ]; then
		module=siw
		type=siw
	else
		module=rdma_rxe
		type=rxe
	fi
	modprobe "$module" || return $?
	(
		cd /sys/class/net &&
			for netdev in *; do
				if [ -e "$netdev" ] && [ "$netdev" != "lo" ] &&
					   [ "$(<"$netdev/addr_len")" = 6 ] &&
					   [ "$(<"$netdev/carrier")" = 1 ] &&
					   ! has_soft_rdma "$netdev"; then
					rdma link add "${netdev}_$type" type "$type" netdev "$netdev" ||
						echo "Failed to bind the $type driver to $netdev"
				fi
			done
	)
	} >>"$FULL"
}

# Delete every RDMA link that "rdma link" reports together with a netdev and
# unload the rdma_rxe and siw kernel modules. Standard output is appended to
# $FULL. Returns 1 if unloading one of the modules fails.
stop_soft_rdma() {
	local module

	{
	rdma link |
		sed -n 's,^link[[:blank:]]*\([^/]*\)/.* netdev .*,\1,p' |
		while read -r i; do
		      echo "$i ..."
		      rdma link del "${i}" || echo "Failed to remove ${i}"
		done
	for module in rdma_rxe siw; do
		if ! _unload_module "$module" 10; then
			echo "Unloading $module failed"
			return 1
		fi
	done
	} >>"$FULL"
}

# Print the first field of the second line of the "df $1" output, which is the
# block device below the filesystem that holds directory $1.
block_dev_of_dir() {
	df "$1" | {
		# The first line is the header.
		read -r _
		read -r source _
		echo "$source"
	}
}

# Create a filesystem of type "$filesystem_type" on block device $1. The mkfs
# output is appended to $FULL. Only ext4 (created without journal) and xfs are
# supported; 1 is returned for any other type.
create_filesystem() {
	local target=$1

	if [ "$filesystem_type" = ext4 ]; then
		mkfs.ext4 -F -O ^has_journal "$target" >>"$FULL" 2>&1
	elif [ "$filesystem_type" = xfs ]; then
		mkfs.xfs -f "$target" >>"$FULL" 2>&1
	else
		return 1
	fi
}

# Succeed if path "$1" is a directory and block_dev_of_dir reports another
# block device for it than for its parent directory.
is_mountpoint() {
	local here parent

	[ -n "$1" ] || return 1
	[ -d "$1" ] || return 1
	here=$(block_dev_of_dir "$1")
	parent=$(block_dev_of_dir "$(dirname "$1")")
	[ "$here" != "$parent" ]
}

# Run mount "$@" and verify afterwards that the last argument is a mountpoint.
# Prints an error message and returns 1 if it is not.
mount_and_check() {
	local dir

	# After this loop $dir holds the last argument, if there is one.
	for dir in "$@"; do :; done
	mount "$@"
	is_mountpoint "$dir" && return 0
	echo "Error: mount $* failed"
	return 1
}

# Unmount the filesystem mounted at mountpoint $1. In contrast with the umount
# command, this function does not accept a block device as argument. If the
# filesystem resides on a multipath device, that device is switched to
# fail_if_no_path mode first. Processes that use the block device are killed
# and a lazy unmount is attempted if the regular unmount fails. Returns 1 if
# $1 is still a mountpoint afterwards.
unmount_and_check() {
	local bd m=$1 mp

	if is_mountpoint "$m"; then
		bd=$(block_dev_of_dir "$m")
		# The redirection is inside the command substitution because a
		# redirection that follows a plain assignment is not guaranteed
		# to apply to command substitutions in that assignment.
		mp=$(dev_to_mpath "$bd" 2>/dev/null)
		if [ -n "$mp" ]; then
			dmsetup message "$mp" 0 fail_if_no_path
		fi
		stop_bdev_users "$bd"
		echo "Unmounting $m from $bd" >> "$FULL"
		umount "$m" || umount --lazy "$m"
	fi
	if is_mountpoint "$m"; then
		echo "Error: unmounting $m failed"
		return 1
	fi
}

# Succeed if fio supports all command-line options "$@". An option is
# considered supported if "<option>=" occurs in the output of "fio --help" or
# if the output of "fio --cmdhelp=all" has a line that starts with the option
# name without leading dashes followed by a blank.
test_fio_opt() {
	local arg name

	for arg in "$@"; do
		# Strip the value, if any.
		name=${arg%%=*}
		if fio --help 2>&1 | grep -q -- "${name}="; then
			continue
		fi
		name=${name#--}
		if fio --cmdhelp=all 2>&1 | grep -q "^${name}[[:blank:]]"; then
			continue
		fi
		return 1
	done
}

# Run fio with arguments "$@" plus the following additions:
# - If --directory= or --filename= has been specified, a --size= argument of
#   70% of the available space divided by the --numjobs= value (default 1),
#   rounded down to a multiple of 4096. The available space is taken from the
#   df output for the directory or from "blockdev --getsz" for the file name.
# - Those of --exitall_on_error=1, --gtod_reduce=1 and --aux-path= that
#   test_fio_opt reports as supported.
# The fio command line and the exit code are logged in $FULL. If fio succeeds
# and --output= has been specified, the exit code is 1 if that file does not
# show that I/O has been performed.
run_fio() {
	local avail_kb="" bdev="" dir="" fio_args jobs=1 opt output rc

	fio_args=("$@")
	for opt in "$@"; do
		case "$opt" in
			--directory=*) dir="${opt#--directory=}";;
			--filename=*)  bdev="${opt#--filename=}";;
			--numjobs=*)   jobs="${opt#--numjobs=}";;
			--output=*)    output="${opt#--output=}";;
		esac
	done
	if [ -n "$dir" ]; then
		# The fourth df column is the available space.
		avail_kb=$(df "$dir" | grep "^/" |
				   {
					   if read -r _ _ _ avail _; then
						   echo "$avail"
					   fi
				   }
			)
	fi
	if [ -n "$bdev" ]; then
		avail_kb=$(("$(blockdev --getsz "$bdev")" / 2))
	fi
	if [ -n "$avail_kb" ]; then
		fio_args+=("--size=$(((avail_kb * 1024 * 7 / 10) / jobs & ~4095))")
	fi
	for opt in --exitall_on_error=1 --gtod_reduce=1 --aux-path=${fio_aux_path}
	do
		test_fio_opt "$opt" && fio_args+=("$opt")
	done
	mkdir -p "${fio_aux_path}"
	echo "fio ${fio_args[*]}" >>"$FULL"
	fio "${fio_args[@]}" 2>&1
	rc=$?
	if [ "$rc" = 0 ] && [ -n "$output" ]; then
		# Return exit code 1 if no I/O has been performed.
		grep -q ', io=[0-9].*, run=[0-9]' "$output"
		rc=$?
	fi
	echo "run_fio exit code: $rc" >>"$FULL"
	return "$rc"
}

# Configure the two memory-backed null_blk instances nullb0 and nullb1, each
# with a block size of 512 bytes and a size of $ramdisk_size bytes expressed
# in units of 1<<20 bytes. A listing of /dev/nullb* is appended to $FULL.
# Returns the status of _configure_null_blk if that function fails.
configure_null_blk() {
	local dev size_mb=$((ramdisk_size>>20))

	for dev in nullb0 nullb1; do
		_configure_null_blk "$dev" completion_nsec=0 blocksize=512 \
				    size="$size_mb" memory_backed=1 \
				    power=1 || return $?
	done
	ls -l /dev/nullb* >>"$FULL" 2>&1
}

# Call start_soft_rdma and append a list with GID 0 of every port of every
# device below /sys/class/infiniband to $FULL.
setup_rdma() {
	start_soft_rdma
	(
		echo "RDMA interfaces:"
		cd /sys/class/infiniband &&
			for hca in *; do
				if [ -e "$hca" ]; then
					for port in "$hca/ports/"*; do
						echo "$hca, port ${port##*/}: $(<"$port/gids/0")"
					done
				fi
			done
	) >>"$FULL" 2>&1
}

# Undo setup_test(): shut down the client, kill multipathd, remove
# /etc/multipath.conf, stop the target, stop soft RDMA and tear down null_blk.
# The return status is that of the last step, _exit_null_blk.
teardown_uncond() {
	shutdown_client
	# Neither a missing multipathd process nor killall output matters.
	killall -9 multipathd >/dev/null 2>&1
	rm -f /etc/multipath.conf
	stop_target
	stop_soft_rdma
	_exit_null_blk
}

# Run teardown_uncond unless $debug is not empty, in which case nothing is
# torn down and 1 is returned.
teardown() {
	if [ -n "$debug" ]; then
		return 1
	fi
	teardown_uncond
}

# Set up the test configuration with "$1" as multipath configuration file:
# undo any previous setup, load the required kernel modules, configure the
# null_blk devices, start multipathd, load the I/O scheduler kernel modules
# and set up RDMA and the target. Returns non-zero as soon as a required step
# fails.
setup_test() {
	local kmod

	shutdown_client || {
		echo "failed to shutdown client"
		return 1
	}
	teardown_uncond || {
		echo "teardown() failed"
		return 1
	}

	for kmod in configfs dm-multipath dm_mod scsi_dh_alua scsi_dh_emc \
		    scsi_dh_rdac scsi_mod; do
		[ -e "/sys/module/$kmod" ] || modprobe "$kmod" || return $?
	done

	_init_null_blk nr_devices=0 || return $?

	configure_null_blk || return $?

	# Unless /etc/multipath.conf exists, create a symbolic link to $1 in
	# /etc.
	[ -e /etc/multipath.conf ] || (cd /etc && ln -s "$1" .)
	multipathd

	# Load the I/O scheduler kernel modules
	(
		sched_dir="/lib/modules/$(uname -r)/kernel/block"
		[[ -d $sched_dir ]] || exit 0
		cd "$sched_dir" &&
			for ko in *.ko; do
				[ -e "$ko" ] && modprobe "${ko%.ko}"
			done
	)

	setup_rdma || return $?
	start_target || return $?
	echo "Test setup finished" >>"$FULL"
}

# Run these unit tests as follows:
# bash -c '. ./common/multipath-over-rdma && unit_tests'
unit_tests() {
	local t tests pass=1

	tests=("version_le 0.1.2 1.2.3"
	       "version_le 0.1.2 0.1.3"
	       "version_le 0.1.2 0.1.2"
	       "! version_le 3 2"
	       "! version_le 3.1 2.3")
	for t in "${tests[@]}"; do
		if ! eval "$t"; then
			echo "FAILED: $t"
			pass=0
		fi
	done
	[ $pass = 1 ] && echo PASS
}
