#!/bin/bash
#
# hsci - Tool to manage HiperSockets Converged Interfaces (HSCI)
#
# Copyright IBM Corp. 2020
#
# s390-tools is free software; you can redistribute it and/or modify
# it under the terms of the MIT license. See LICENSE for details.
#

hsdev=""
ndev=""
hsci=""
hscibr=""
hscibp=""
hsci_mac=""
hsif_pnetid=""
netif_pnetid=""
hsci_pnetid=""

##############################################################################
#  Concept:
#	     --------
#	     | hsci |
#	     --------
#	     /       \
#	--------    --------
#	| ndev |    | hsdev |
#	--------    --------
#
# Detail (bridge w/ bridgeports):
#	           --------
#	           | hsci |
#	           --------
#	               |
#	----------------------
#	|          |hsci-bp| |
#	| hsci-br  --------- |
#	|                    |
#	--------   --------  |
#	| ndev |   | hsdev | |
#	----------------------

# Print the command-line help (available commands and options) on stdout.
function usage {
	# Quoted delimiter: the help text is emitted literally, nothing in it
	# is subject to expansion.
	cat <<-'EOD'
Usage: hsci COMMAND [OPTION]

This tool is designed to control and show HSCI (HiperSockets Converged
Interfaces) settings. A HiperSockets interface and an external network
interface are converged into an HSCI interface.

COMMANDS
        add HIPERSOCKETS_DEV NET_DEV    Adds an HSCI interface
        del HSCI_NAME                   Deletes an HSCI interface
        show                            Lists the configured HSCI interfaces

OPTIONS:
        -v, --version                   Prints the version number of the hsci tool and exits
        -h, --help                      Displays the help information for the command
EOD
}

# Check that the 'ip' command is available as an executable.
#
# Returns: 0 if 'ip' was found, 1 (with a message on stderr) otherwise
function prereqs_check {
	local ip_path

	ip_path="$(command -v ip)"
	[ -x "$ip_path" ] && return 0

	echo "Error: No iproute2 installed on this system" >&2
	return 1
}

# Print the PNETID of a network interface on stdout (an empty line if the
# interface has none).
#
# Arguments: $1 - network interface name
# Outputs:   the PNETID with all blanks removed, followed by a newline
#
# The utility string is taken from the device's own util_string attribute
# if it has one (RoCE), otherwise from the util_string of the channel path
# named by the device's chpid attribute (OSA / HiperSockets). It is
# converted from IBM-1047 to ASCII and treated as a sequence of
# 16-character slots; the slot is selected by the interface's dev_port.
function get_pnetid {
	local netdev=$1
	local sysdir="/sys/class/net/$netdev"
	# Keep all scratch variables local so that values inherited from the
	# caller can never be mistaken for data read from sysfs.
	local pnetids="" pnetid="" chpid="" port_if=""

	if [ -e "$sysdir/device/util_string" ]; then
		#### ROCE:
		pnetids="$(tr -d '\000' < "$sysdir/device/util_string" | iconv -f IBM-1047 -t ASCII)"
	elif [ -e "$sysdir/device/chpid" ]; then
		#### OSA / HiperSockets:
		# Character classes are quoted so the shell cannot glob them.
		chpid="$(tr '[:upper:]' '[:lower:]' < "$sysdir/device/chpid")"
		pnetids="$(tr -d '\000' < "/sys/devices/css0/chp0.$chpid/util_string" | iconv -f IBM-1047 -t ASCII)"
	fi

	if [ -n "$pnetids" ]; then
		port_if="$(cat "$sysdir/dev_port" 2>/dev/null)"
		# Use the first slot if dev_port is missing or not a number
		[[ "$port_if" =~ ^[0-9]+$ ]] || port_if=0
		# 10# forces base 10 so that a leading zero is not read as octal
		pnetid="${pnetids:$(( 16 * 10#$port_if )):16}"
		pnetid="${pnetid// /}"
	fi

	printf '%s\n' "$pnetid"
}

# Look up the PNETIDs of $hsdev and $ndev and derive the PNETID of the HSCI.
#
# Globals:  hsdev, ndev (read)
#           hsif_pnetid, netif_pnetid, hsci_pnetid (written)
# Returns:  0 if the PNETIDs are equal or at least one of them is empty,
#           1 (with a message on stderr) if both are set and differ; in
#           that case hsci_pnetid is left unchanged
function check_pnetids {
	hsif_pnetid=""
	netif_pnetid=""
	if [ -n "$hsdev" ]; then
		hsif_pnetid="$(get_pnetid "$hsdev")"
	fi
	if [ -n "$ndev" ]; then
		netif_pnetid="$(get_pnetid "$ndev")"
	fi

	#Check PNETIDs
	if [ -n "$hsif_pnetid" ] && [ -n "$netif_pnetid" ] && [ "$netif_pnetid" != "$hsif_pnetid" ]; then
		echo "Error: $hsdev and $ndev have different PNETIDs! They are $hsif_pnetid and $netif_pnetid respectively" >&2
		return 1
	fi

	# The HiperSockets PNETID wins; otherwise use the external one,
	# which may be empty as well.
	hsci_pnetid="${hsif_pnetid:-$netif_pnetid}"
	return 0
}

# Verify that $hsdev and $ndev can be converged into an HSCI interface.
#
# Globals:  hsdev, ndev, hsci (read); the PNETID globals are written by
#           check_pnetids
# Outputs:  a progress line on stdout; the first failed check on stderr
# Returns:  0 if all preconditions hold, 1 otherwise
function verify_precon {
	local hs_sys="/sys/class/net/$hsdev"

	echo "Verifying net dev $ndev and HiperSockets dev $hsdev"

	if [ ! -e "$hs_sys" ]; then
		echo "Error: $hsdev does not exist" >&2
		return 1
	fi
	if [ "$(cat "$hs_sys/device/card_type" 2>/dev/null)" != "HiperSockets" ]; then
		echo "Error: $hsdev is not a HiperSockets device" >&2
		return 1
	fi
	if [ "$(cat "$hs_sys/device/layer2" 2>/dev/null)" != "1" ]; then
		echo "Error: $hsdev is not in layer 2 mode" >&2
		return 1
	fi
	if [ ! -e "$hs_sys/device/vnicc/bridge_invisible" ]; then
		echo "Error: Missing vnic-characteristics support" >&2
		return 1
	fi
	if [ "$(cat "$hs_sys/device/vnicc/bridge_invisible" 2>/dev/null)" == "n/a" ]; then
		echo "Error: $hsdev does not support vnicc" >&2
		return 1
	fi
	if ! ip link show "$hsdev" | grep -q UP; then
		echo "Error: $hsdev is not in state UP" >&2
		return 1
	fi
	if ! bridge -d link show dev "$hsdev" self | grep -q learning_sync; then
		echo "Error: $hsdev does not support attribute learning_sync" >&2
		return 1
	fi
	# An existing master is only accepted if its name starts with the name
	# of the HSCI that is about to be (re-)created.
	if ip link show "$hsdev" | grep -q master &&
	   ! ip link show "$hsdev" | grep -q "master $hsci"; then
		echo "Error: $hsdev is already a subordinate to another master" >&2
		return 1
	fi

	#Pre-verify net_dev
	if [ ! -e "/sys/class/net/$ndev" ]; then
		echo "Error: $ndev does not exist" >&2
		return 1
	fi
	if ! ip link show "$ndev" | grep -q UP; then
		echo "Error: $ndev is not in state UP" >&2
		return 1
	fi
	if ip link show "$ndev" | grep -q master &&
	   ! ip link show "$ndev" | grep -q "master $hsci"; then
		echo "Error: $ndev is already a subordinate to another master" >&2
		return 1
	fi

	#Check PNETIDs
	check_pnetids || return 1

	return 0
}

# Best-effort rollback of a partially created HSCI interface. Every step is
# attempted regardless of earlier failures and all output is discarded.
#
# Globals: hsci, hscibr, hsdev, ndev, hsci_mac (read)
function clean_up {
	ip link del "$hsci" >/dev/null 2>&1
	bridge link set dev "$hsdev" learning_sync off self >/dev/null 2>&1
	# Only stderr is discarded here. A second stdout redirection on the
	# echo itself would win over the first one, and the "0" would end up
	# in /dev/null instead of the sysfs attribute.
	{ echo 0 > "/sys/class/net/$hsdev/device/vnicc/bridge_invisible"; } 2>/dev/null
	# Without a MAC address there is no fdb entry that could be removed
	if [ -n "$hsci_mac" ]; then
		bridge fdb del "$hsci_mac" dev "$hsdev" self local >/dev/null 2>&1
		bridge fdb del "$hsci_mac" dev "$ndev" self local >/dev/null 2>&1
	fi
	ip link del "$hscibr" >/dev/null 2>&1
}

##############################################################################
##	add a new HSCI interface
##############################################################################
# Create an HSCI interface from a HiperSockets device and an external
# network device.
#
# Arguments: $1 - HiperSockets network interface
#            $2 - external network interface
# Globals:   hsdev, ndev, hsci, hscibr, hscibp, hsci_mac (written)
# Outputs:   progress messages on stdout, warnings and errors on stderr
# Returns:   0 on success, 1 on error. Errors that occur after the bridge
#            has been set up call clean_up before returning.
function add_hsci {
	local hsci_postfix

	if [ $# != 2 ]; then
		echo "hsci: Invalid parameters" >&2
		echo "Use 'hsci --help' for more information" >&2
		return 1
	fi
	hsdev=$1
	ndev=$2

	# The interface names are derived from the end of the cdev0 link
	# target of the HiperSockets device (tail -c5 includes the newline
	# printed by readlink, leaving four characters).
	hsci_postfix="$(readlink "/sys/class/net/$hsdev/device/cdev0" | tail -c5)"
	hscibr="hsci${hsci_postfix}-br"
	hscibp="hsci${hsci_postfix}-bp"
	hsci="hsci${hsci_postfix}"

	#### Verify preconditions
	verify_precon || return 1

	echo "Adding $hsci with a HiperSockets dev $hsdev and an external dev $ndev"

	#### Create bridge (idempotent)
	if [ ! -e "/sys/class/net/$hscibr" ]; then
		# ageing_time of $hscibr defaults to 30000 (300 secs)
		if ! ip link add name "$hscibr" type bridge stp_state 0 >/dev/null 2>&1; then
			echo "Error: Could not create a bridge" >&2
			return 1
		fi
	elif ! ip link set dev "$hscibr" type bridge stp_state 0 >/dev/null 2>&1; then
		echo "Error: $hscibr is not a bridge" >&2
		return 1
	fi

	#### Prepare hsdev
	# Set VNICC of hsdev to invisible
	#(mandatory for co-existence with HS-OSA bridges!)
	# Because the setting is mandatory, a failed write is an error.
	if ! echo 1 > "/sys/class/net/$hsdev/device/vnicc/bridge_invisible"; then
		echo "Error: Failed to set bridge_invisible on $hsdev" >&2
		clean_up
		return 1
	fi

	#### Create bridge ports
	if ! ip link set dev "$ndev" master "$hscibr" >/dev/null 2>&1; then
		echo "Error: Could not set master for $ndev" >&2
		clean_up
		return 1
	fi
	if ! ip link set dev "$hsdev" master "$hscibr" >/dev/null 2>&1; then
		echo "Error: Could not set master for $hsdev" >&2
		clean_up
		return 1
	fi

	# Do not learn from ndev, but do learn from hsci-bp:
	#  - First define hsdev and ndev as _isolated_ bridgeports
	#  - Then turn on learning_sync on self on hsdev
	#  - Then define hsci-bp as non-isolated veth bridgeport

	# no forwarding between ndev and hsdev -> isolated on
	# ndev is default for outgoing unknown targets -> flood on
	# no need to learn external LAN targets into fdb -> learning off
	if ! bridge link set dev "$ndev" isolated on learning off flood on mcast_flood on >/dev/null 2>&1; then
		echo "Error: Failed to set bridge attributes on $ndev" >&2
		clean_up
		return 1
	fi

	# no forwarding between ndev and hsdev -> isolated on
	# fdb will be populated by dev-to-bridge-notification, no need to learn
	#	-> learning off
	# only send to hsdev, if listed in fdb -> flood off
	# don't send MC/BC on hsdev -> mcast_flood off
	if ! bridge link set dev "$hsdev" isolated on learning off flood off mcast_flood off >/dev/null 2>&1; then
		echo "Error: Failed to set bridge attributes on $hsdev" >&2
		clean_up
		return 1
	fi
	# NOTE: Although not required, BCs will be sent out on hsdev.
	# NOTE: We need to receive BCs on hsdev, as z/OS HSCI does ARP requests on HS.

	if ! ip link set dev "$hscibr" up >/dev/null 2>&1; then
		echo "Error: Failed to set $hscibr up" >&2
		clean_up
		return 1
	fi

	#### Turn on device for bridge notification
	####  Toggle is required to learn full list of HS targets,
	####   not only future changes.
	bridge link set dev "$hsdev" learning_sync off self >/dev/null 2>&1
	if ! bridge link set dev "$hsdev" learning_sync on self >/dev/null 2>&1; then
		echo "Error: Failed to turn on device for bridge notification" >&2
		clean_up
		return 1
	fi

	# define veth pair for hsci (idempotent)
	if [ ! -e "/sys/class/net/$hsci" ]; then
		if ! ip link add dev "$hsci" type veth peer name "$hscibp" >/dev/null 2>&1; then
			echo "Error: Could not create veth pair $hsci - $hscibp " >&2
			clean_up
			return 1
		fi
	elif [ "$hsci@$hscibp:" != "$(ip -o -d link show dev "$hsci" | awk '/veth/ {print $2}')" ]; then
		echo "Error: $hsci@$hscibp is not a veth" >&2
		clean_up
		return 1
	fi

	if ! ip link set dev "$hscibp" master "$hscibr" >/dev/null 2>&1; then
		echo "Error: Failed to add $hscibp to $hscibr" >&2
		clean_up
		return 1
	fi
	if ! bridge link set dev "$hscibp" isolated off learning on flood on mcast_flood on >/dev/null 2>&1; then
		echo "Error: Failed to set bridge parameters for $hscibp" >&2
		clean_up
		return 1
	fi

	#### Set a static forwarding rule for hsci MAC, so hsci can be used as a
	####  single-MAC network interface without being subject to
	####  ageing and re-learning
	#### Wait for systemd to change the MAC of hsci, if it wants to:
	sleep 1
	hsci_mac="$(cat "/sys/class/net/$hsci/address")"
	####  (idempotent)
	if ! bridge fdb show dev "$hscibp" | grep -q "$hsci_mac master $hscibr static"; then
		if ! bridge fdb add "$hsci_mac" dev "$hscibp" master static; then
			echo "Error: Failed to set $hsci_mac to $hscibr fdb" >&2
			clean_up
			return 1
		fi
	fi

	# Bridge-to-device learning will set this MAC on hsdev and ndev.
	# Old kernel code doesn't do hsci bridge-to-device learning.
	#  In this case: Set hsci_mac as local MAC of hsdev and ndev,
	#  so at least the single-MAC scenario works.
	if ! bridge fdb show dev "$hsdev" | grep -q "$hsci_mac self permanent"; then
		echo "Warning: $hsci will support only its current static MAC address. Please upgrade your kernel to the latest level." >&2
		if ! bridge fdb add "$hsci_mac" dev "$hsdev" self local; then
			echo "Error: Failed to add $hsci_mac to $hsdev" >&2
			clean_up
			return 1
		fi
	fi
	if ! bridge fdb show dev "$ndev" | grep -q "$hsci_mac self permanent"; then
		if ! bridge fdb add "$hsci_mac" dev "$ndev" self local; then
			echo "Error: Failed to add $hsci_mac to $ndev" >&2
			clean_up
			return 1
		fi
	fi

	#### Set veth pair to UP
	if ! ip link set dev "$hscibp" up >/dev/null 2>&1; then
		echo "Error: Failed to set $hscibp up" >&2
		clean_up
		return 1
	fi
	if ! ip link set dev "$hsci" up >/dev/null 2>&1; then
		echo "Error: Failed to set $hsci up" >&2
		clean_up
		return 1
	fi

	echo "Added HSCI interface $hsci"
	return 0
}

##############################################################################
##	Delete HSCI
##############################################################################

# Delete an HSCI interface together with its bridge and reset the settings
# of its HiperSockets and external ports.
#
# Arguments: $1 - name of the HSCI interface
# Globals:   hsci, hsci_mac, hscibp, hscibr, hsdev, ndev, mvp (written)
# Outputs:   progress messages on stdout; warnings and errors on stderr
# Returns:   1 for wrong usage or an unknown interface, otherwise 0.
#            Failures of individual teardown steps are reported on stderr
#            but do not change the return value.
function del_hsci {
	local bports bport

	if [ $# != 1 ]; then
		echo "hsci: invalid parameters" >&2
		echo "Use 'hsci --help' for more information" >&2
		return 1
	fi
	hsci=$1
	if [ -z "$(ip link show dev "$hsci")" ]; then
		echo "Error: $hsci does not exist" >&2
		return 1
	fi
	hsci_mac="$(cat "/sys/class/net/$hsci/address")"

	#### Find hscibp and hscibr
	hscibp="$(ip -o link show dev "$hsci" | awk '{print $2}')"
	if [[ "$hscibp" != *@* ]]; then
		#  $hsci has no HSCI veth peer
		echo "Warning: $hsci may have been created by an older version of hsci" >&2
		mvp=1
		hscibp=""
		hscibr=$hsci
	else
		mvp=0
		hscibp=${hscibp##*@}
		hscibp=${hscibp%:}
		echo "$hsci is paired with $hscibp" >&2

		hscibr="$(ip link show dev "$hscibp" | awk '{for(x=1;x<NF;x++) if($x=="master") print $(x+1)}')"
	fi

	#### Find hsdev and ndev
	hsdev=""
	ndev=""
	bports=""
	# Only ports whose master is exactly $hscibr are candidates. Without
	# a known bridge there are no candidates at all; a substring match
	# on "master " would select the ports of every bridge on the system.
	if [ -n "$hscibr" ]; then
		bports="$(ip link show | awk -v br="$hscibr" '{for(x=1;x<NF;x++) if($x=="master" && $(x+1)==br) print $2}')"
	fi
	# Unquoted on purpose: one interface name per word
	for bport in $bports; do
		bport=${bport%:}
		bport=${bport%@*}
		if bridge -d link show dev "$bport" | grep -q "isolated on"; then
			if bridge -d link show dev "$bport" | grep -q "learning_sync on"; then
				hsdev=$bport
			else
				ndev=$bport
			fi
		fi
	done
	if [ -z "$hsdev" ]; then
		echo "Error: $hsci has no active HiperSockets port" >&2
	fi
	if [ -z "$ndev" ]; then
		echo "Error: $hsci has no active external port" >&2
	fi
	echo "Deleting HSCI interface $hsci with HiperSockets interface $hsdev and external interface $ndev"

	#### Delete veth before resetting learning_sync and deleting bridge,
	####  so fdb entries are cleaned up (synced)!
	if ! ip link del "$hsci" >/dev/null 2>&1; then
		echo "Error: Failed to delete $hsci" >&2
	fi
	# Bridge-to-device learning will remove learned MACs from hsdev and ndev.
	# Old kernel code doesn't do hsci bridge-to-device learning.
	#  In this case: Remove the hsci_mac that was added during 'add' from
	#  hsdev and ndev.
	if [ -n "$hsdev" ] && [ $mvp -eq 0 ]; then
		if bridge fdb show dev "$hsdev" | grep -q "$hsci_mac self permanent"; then
			echo "Warning: It seems your kernel does not support all hsci features, please upgrade." >&2
			if ! bridge fdb del "$hsci_mac" dev "$hsdev" self local; then
				echo "Error: Failed to delete $hsci_mac from $hsdev" >&2
			fi
		fi
	fi
	# In the mvp case hsci_mac == hs_mac and was only set on ndev.
	if [ -n "$ndev" ]; then
		if bridge fdb show dev "$ndev" | grep -q "$hsci_mac self permanent"; then
			if ! bridge fdb del "$hsci_mac" dev "$ndev" self local; then
				echo "Error: Failed to delete $hsci_mac from $ndev" >&2
			fi
		fi
	fi

	#### Reset learning_sync
	if [ -n "$hsdev" ]; then
		if ! bridge link set dev "$hsdev" learning_sync off self >/dev/null 2>&1; then
			echo "Error: Failed to turn off learning_sync on $hsdev" >&2
		fi
	fi
	#### Delete bridge
	if [ -n "$hscibr" ]; then
		if ! ip link del "$hscibr" >/dev/null 2>&1; then
			echo "Error: Failed to delete $hscibr" >&2
		fi
	fi

	if [ -n "$hsdev" ]; then
		echo 0 > "/sys/class/net/$hsdev/device/vnicc/bridge_invisible"
	fi

	echo "Deleted $hsci"
	return 0
}

##############################################################################
##	Show HSCI
##############################################################################

# Print one table row for the HSCI described by the globals hsci,
# hsci_pnetid, hsdev and ndev. Rows are tagged with "(v1)" unless mvp is 0.
function print_row {
	local v1_mark='    (v1)'

	if [ $mvp -eq 0 ]; then
		v1_mark=''
	fi
	printf '%-8s  %-16s  %-15s  %-15s%s\n' "$hsci" "$hsci_pnetid" "$hsdev" "$ndev" "$v1_mark"
}

# Print the table row for the HSCI that a HiperSockets interface belongs to.
#
# Arguments: $1 - HiperSockets network interface
# Globals:   hsci, hsdev, ndev, hscibp, hscibr, hsci_pnetid, mvp (written)
# Returns:   0 after printing a row; 1 if the non-isolated bridge port has
#            no veth peer (the row is still printed)
function list_active {
	local bports bport

	hsci=""
	hsdev=$1
	ndev=""
	hscibp=""
	mvp=1

	hsci_pnetid="$(get_pnetid "$hsdev")"

	hscibr="$(ip link show dev "$hsdev" | awk '{for(x=1;x<NF;x++) if($x=="master") print $(x+1)}')"
	if [ -z "$hscibr" ]; then
		print_row
		return 0
	fi

	#### find ndev (the other isolated subordinate)
	# Match the master name exactly, so that ports of a bridge whose name
	# merely starts with $hscibr are not picked up.
	bports="$(ip link show | awk -v br="$hscibr" '{for(x=1;x<NF;x++) if($x=="master" && $(x+1)==br) print $2}')"
	# Unquoted on purpose: one interface name per word
	for bport in $bports; do
		bport=${bport%:}
		bport=${bport%@*}
		if ip -d link show dev "$bport" | grep -q "isolated on"; then
			if [ "$bport" != "$hsdev" ]; then
				ndev=$bport
			fi
		else
			# A non-isolated port marks a veth based HSCI
			hscibp=$bport
			mvp=0
		fi
	done

	check_pnetids

	if [ $mvp -ne 0 ]; then
		# No veth pair: the bridge itself is listed as the HSCI
		hsci=$hscibr
		print_row
		return 0
	fi

	#### find hsci (veth-peer of non-isolated subordinate of hscibr)
	hsci="$(ip -o link show dev "$hscibp" | awk '{print $2}')"
	if [[ "$hsci" != *@* ]]; then
		echo "Error: $hscibp has no HSCI veth peer" >&2
		print_row
		return 1
	fi
	hsci=${hsci##*@}
	hsci=${hsci%:}

	print_row
	return 0
}

# Print the table header once: nothing is printed unless the global
# 'header' is 0, and printing sets it to 1.
function print_header {
	if [ $header -eq 0 ]; then
		printf '%s\n' \
			"HSCI      PNET_ID           HiperSockets     External       " \
			"------------------------------------------------------------"
		header=1
	fi
}

# List a network interface if its bridge link details report
# "learning_sync on"; all other interfaces are skipped silently.
#
# Arguments: $1 - network interface name
# Returns:   always 0
function list_one {
	local hsnetdev=$1

	if bridge -d link show dev "$hsnetdev" 2>/dev/null | grep -q "learning_sync on"; then
		print_header
		list_active "$hsnetdev"
	fi

	return 0
}

# List all configured HSCI interfaces as a table.
#
# Arguments: none
# Globals:   header (written; reset so that the table header is printed
#            before the first row)
# Returns:   1 for wrong usage, otherwise 0
function show_hsci {
	local hs_net_path

	if [ $# != 0 ]; then
		echo "hsci: invalid parameters" >&2
		echo "Use 'hsci --help' for more information" >&2
		return 1
	fi
	header=0

	# Let the shell enumerate the entries instead of parsing ls output
	for hs_net_path in /sys/class/net/*; do
		# An unmatched pattern is left as is; skip it
		[ -e "$hs_net_path" ] || continue
		list_one "${hs_net_path##*/}"
	done

	return 0
}

#==============================================================================

# Print the tool version and the copyright notice on stdout.
function print_version()
{
	printf '%s\n' \
		"hsci utility: version 2.29.0-build-20240423" \
		"Copyright IBM Corp. 2020"
}

##############################################################################
##### Main
##############################################################################
# Let getopt(1) check the command line against the supported options
# (-h/--help, -v/--version); exit with status 2 if it rejects it.
args="$(getopt -u -o hv -l help,version -- "$@")" || exit 2
# Unquoted on purpose: 'getopt -u' prints a plain, unquoted word list
set -- $args
while true; do
	case "$1" in
		-v | --version)
			print_version
			exit 0
			;;
		-h | --help)
			usage
			exit 0
			;;
		--)
			;;
		# The commands below need iproute2; help and version output do
		# not, so the prerequisites are only checked here.
		add)	shift
			prereqs_check || exit 1
			add_hsci "$@"
			exit $?
			;;
		del)	shift
			prereqs_check || exit 1
			del_hsci "$@"
			exit $?
			;;
		show)	shift
			prereqs_check || exit 1
			show_hsci "$@"
			exit $?
			;;
		*)	echo "hsci: Please specify a valid command or option" >&2
			echo "Use 'hsci --help' for more information" >&2
			exit 1
			;;
	esac
	shift
done
