summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Dominique Martinet <asmadeus@codewreck.org> 2016-01-27 18:42:29 +0100
committer: Dominique Martinet <dominique.martinet@cea.fr> 2016-02-02 15:44:08 +0100
commit: 7ab0d94ee10d1f8126abd84e4fdd8e087a1ff925 (patch)
tree: 9db80a2774f7106b14452a44a77b376f3910b644
parent: 89853188054e6e23182d8a6c26d1e1d2cb703e2e (diff)
Big net rewrite for sriov + user mode
-rwxr-xr-x kvm-wrapper.sh 190
1 files changed, 124 insertions, 66 deletions
diff --git a/kvm-wrapper.sh b/kvm-wrapper.sh
index 125593f..0234287 100755
--- a/kvm-wrapper.sh
+++ b/kvm-wrapper.sh
@@ -294,8 +294,10 @@ function qcow_create_disk()
local KVM_IMG_DISKNAME="$1"
local KVM_IMG_DISKSIZE="$2"
"$KVM_IMG_BIN" create -f "$KVM_IMG_FORMAT" "$KVM_IMG_DISKNAME" "$KVM_IMG_DISKSIZE"
+
if [[ "x$?" == "x0" ]]
then
+ [[ -n "$KVM_USER" ]] && chown $KVM_USER: "$KVM_IMG_DISKNAME"
desc_update_setting "KVM_DISK[0]" "$KVM_IMG_DISKNAME"
else
echo "Failed creating disk. Continuing anyway"
@@ -406,80 +408,100 @@ function pci_unvfiofy()
}
# helper for sriov
-function ib_sriov()
+# for mlx4 cards, we can force pkeys just like we'd force vlan at host level
+# Assign partition keys to a VF through the iov pkey_idx sysfs table.
+# Arguments:
+#   $1 - infiniband device name (as found under /sys/class/infiniband)
+#   $2 - PCI address of the virtual function
+#   $3 - whitespace-separated list of pkeys (without the leading 0x8);
+#        may be empty, in which case only the 0x7fff entry is written
+# Exits through fail_exit if a requested pkey is not in the device's
+# pkey table or if a slot cannot be written.
+function ib_sriov_pkeys()
+{
+ local DEV="$1"
+ local PCIDOMAIN="$2"
+ local PKEYS="$3"
+ local PKEY PKEY_IDX
+ local PKIDX=0
+ local PKEYDIR="/sys/class/infiniband/$DEV/ports/1/pkeys"
+ local IDXDIR="/sys/class/infiniband/$DEV/iov/$PCIDOMAIN/ports/1/pkey_idx"
+
+ # set pkey if able
+ [[ -e "$IDXDIR" ]] || return 0
+
+ # $PKEYS is deliberately left unquoted: quoted, the loop ran a single
+ # time on the whole string and never filled more than slot 0.
+ for PKEY in $PKEYS; do
+  PKEY_IDX=$(basename "$(grep -lZw "0x8$PKEY" "$PKEYDIR"/*)")
+  [[ -z "$PKEY_IDX" ]] && fail_exit "pkey $PKEY not found for $DEV"
+  echo "$PKEY_IDX" > "$IDXDIR/$PKIDX" || fail_exit "couldn't set pkey"
+  PKIDX=$((PKIDX+1))
+ done
+
+ # the slot after the requested pkeys gets the index of the 0x7fff entry
+ PKEY_IDX=$(basename "$(grep -lZw 0x7fff "$PKEYDIR"/*)")
+ echo "$PKEY_IDX" > "$IDXDIR/$PKIDX"
+}
+
+# Undo ib_sriov_pkeys: write "none" into every pkey_idx entry of the VF
+# that currently holds something else.
+# Arguments:
+#   $1 - infiniband device name (as found under /sys/class/infiniband)
+#   $2 - PCI address of the virtual function
+function ib_unsriov_pkeys()
+{
+ local DEV="$1"
+ local PCIDOMAIN="$2"
+
+ # Nothing to reset when the VF has no iov directory. This was "continue",
+ # but the function body is not inside a loop, so return instead.
+ test -e "/sys/class/infiniband/$DEV/iov/$PCIDOMAIN" || return 0
+ grep -lZv none "/sys/class/infiniband/$DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/"* | \
+ tr '\0' '\n' | \
+ while read -r file; do
+  echo none > "$file"
+ done
+}
+
+# for mlx5, we don't force pkeys but we force node guid to let opensm handle it
+function ib_sriov_guid()
{
- local PKEYS="$1"
- local IB_DEV="$2"
+ local DEV="$1"
+ local VIRTFN_IDX="$2"
local CHILD_GUID="$3"
- # pick one compatible card at random
+ if [[ "$DEV" =~ "mlx5_"* ]]; then
+ local GUID=$(sed -e 's/\([a-f0-9]\{2\}\)\([a-f0-9]\{2\}\)/\1:\2/g' < /sys/class/infiniband/$DEV/node_guid)
+ echo Follow > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/policy
+ echo $GUID > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/node
+ [[ -n "$CHILD_GUID" ]] && \
+ echo "$CHILD_GUID" > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/port || \
+ printf "%s:%x\n" "${GUID%:*}" $((0x${GUID##*:}+$VIRTFN_IDX+1)) > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/port
+ fi
+}
- [[ -z "$IB_DEV" ]] && IB_DEV=$(ls -d /sys/class/infiniband/*/device/virtfn0 | sort -r | awk -F/ '{print $5; exit}')
- [[ -z "$IB_DEV" ]] && fail_exit "No VFIO compatible IB adapter"
- local DEVDIR="/sys/class/infiniband/${IB_DEV}/device"
- local PCIDOMAIN=inval
+
+# main sriov function - returns two values in PCIDOMAIN and VFNUM variables
+function sriov()
+{
+ local BR="$1"
+ local DEVDIR=inval
+ local FLOCK
local VIRTFN
+ PCIDOMAIN=inval
+
+ [[ -e "/sys/class/net/${BR}/device" ]] && DEVDIR="/sys/class/net/${BR}/device"
+ [[ -e "/sys/class/infiniband/${BR}/device" ]] && DEVDIR="/sys/class/infiniband/${BR}/device"
+ [[ "$DEVDIR" = "inval" ]] && fail_exit "could not find device for $BR"
- if [[ "$IB_DEV" =~ "mlx5_"* ]]; then
+ # following is racy, lock through.
+ exec {FLOCK}<>/tmp/kvm-wrapper-$BR.lock
+ flock ${FLOCK}
+
+ if [[ -e "$DEVDIR/sriov_numvfs" && ! -e "$DEVDIR/virtfn0" ]]; then
# arbitrary number: create all vfs available
- [[ $(cat /sys/class/infiniband/$IB_DEV/device/mlx5_num_vfs) == "0" ]] && \
- cat /sys/class/infiniband/$IB_DEV/device/sriov_totalvfs > /sys/class/infiniband/$IB_DEV/device/mlx5_num_vfs
+ cat "$DEVDIR/sriov_totalvfs" > "$DEVDIR/sriov_numvfs"
fi
-
- # pick a virtfn - this is totally racy, but sod it.
+ # pick a virtfn
for VIRTFN in $DEVDIR/virtfn*; do
PCIDOMAIN=$(basename $(readlink $VIRTFN))
[[ -e /sys/bus/pci/drivers/vfio-pci/$PCIDOMAIN ]] || break
PCIDOMAIN=inval
done
- [[ $PCIDOMAIN = inval ]] && fail_exit "no vfio virtfn available for $IB_DEV"
+ [[ $PCIDOMAIN = inval ]] && fail_exit "no vfio virtfn available for $BR"
+ VFNUM=${VIRTFN##*virtfn}
# hack - register everything to pci-stub first so we can tell apart which are used or not
local PCIVENDOR="$(cat "/sys/bus/pci/devices/$PCIDOMAIN/vendor" |sed 's/^0x//')"
PCIVENDOR+=" $(cat "/sys/bus/pci/devices/$PCIDOMAIN/device" |sed 's/^0x//')"
echo "$PCIVENDOR" > "/sys/bus/pci/drivers/pci-stub/new_id" || fail_exit "couldn't add new_id (pci-stub)"
- # set pkey if able
- if [[ -e /sys/class/infiniband/$IB_DEV/iov/$PCIDOMAIN/ports/1/pkey_idx ]]; then
- VFIDX=0
- [[ -z "$PKEYS" ]] || for PKEY in "$PKEYS"; do
- local PKEY_IDX=$(basename "$(grep -lZw 0x8$PKEY /sys/class/infiniband/$IB_DEV/ports/1/pkeys/*)")
- [[ -z "$PKEY_IDX" ]] && fail_exit "pkey $PKEY not found for $IB_DEV"
- echo $PKEY_IDX > /sys/class/infiniband/$IB_DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/$VFIDX || fail_exit "couldn't set pkey"
- VFIDX=$((VFIDX+1))
- done
- PKEY_IDX=$(basename "$(grep -lZw 0x7fff /sys/class/infiniband/$IB_DEV/ports/1/pkeys/*)")
- echo $PKEY_IDX > /sys/class/infiniband/$IB_DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/$VFIDX
- fi
+ pci_vfiofy "$PCIDOMAIN"
- # set guid and policy for mlx5
- if [[ "$IB_DEV" =~ "mlx5_"* ]]; then
- local GUID=$(sed -e 's/\([a-f0-9]\{2\}\)\([a-f0-9]\{2\}\)/\1:\2/g' < /sys/class/infiniband/$IB_DEV/node_guid)
- local VIRTFN_IDX=${VIRTFN#*virtfn}
- echo Follow > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/policy
- echo $GUID > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/node
- [[ -n "$CHILD_GUID" ]] && \
- echo "$CHILD_GUID" > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/port || \
- printf "%s:%x\n" "${GUID%:*}" $((0x${GUID##*:}+$VIRTFN_IDX+1)) > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/port
- fi
-
- KVM_VFIO_DOMAIN+=("$PCIDOMAIN")
-}
-
-function ib_unsriov()
-{
- for PCIDOMAIN in ${KVM_VFIO_DOMAIN}; do
- test -e /sys/class/infiniband/*/iov/"$PCIDOMAIN" || continue
- grep -lZv none /sys/class/infiniband/*/iov/"$PCIDOMAIN"/ports/1/pkey_idx/* | \
- tr '\0' '\n' | \
- while read file; do
- echo none > $file
- done
- done
+ # unlock, can't unlink file though..
+ exec {FLOCK}<&-
}
-
# Change perms. Meant to run forked.
function serial_perms_forked()
{
@@ -701,23 +723,62 @@ function kvm_start_vm ()
[[ -z "${KVM_BR[@]:0:1}" ]] && fail_exit "No KVM_BR defined"
[[ -z "${KVM_IF[@]:0:1}" ]] && fail_exit "No KVM_IF defined"
for i in ${!KVM_MACADDR[@]}; do
- local TAPDEV
-
KVM_BR[$i]="${KVM_BR[i]:-${KVM_BR[@]:0:1}}"
KVM_IF[$i]="${KVM_IF[i]:-${KVM_IF[@]:0:1}}"
- KVM_NETDEV_OPT[$i]="${KVM_NET_OPT[i]-${KVM_NET_OPT[@]:0:1}}"
+ KVM_NET_OPT[$i]="${KVM_NET_OPT[i]-${KVM_NET_OPT[@]:0:1}}"
KVM_NET_MTU[$i]="${KVM_NET_MTU[i]-${KVM_NET_MTU[@]:0:1}}"
+ # handle special cases first
+ case "${KVM_IF[i]}" in
+ sriov)
+ local PCIDOMAIN
+ local VFNUM
+
+ sriov "${KVM_BR[i]}"
+ CLEANUP+=("pci_unvfiofy \"$PCIDOMAIN\"")
+
+ ip link set "${KVM_BR[i]}" vf "$VFNUM" mac "${KVM_MACADDR[i]}" || fail_exit "Could not set MAC address for vf $VFNUM device ${KVM_BR[i]}"
+
+ KVM_NET+="-device vfio-pci,id=net${i},host=${PCIDOMAIN} "
+
+ continue
+ ;;
+
+ mlx*-sriov)
+ local PCIDOMAIN
+ local VFNUM
+
+ sriov "${KVM_BR[i]}"
+ CLEANUP+=("pci_unvfiofy \"$PCIDOMAIN\"")
+
+ if [[ "${KVM_NETSTYLE[i]}" == "mlx4-sriov" ]]; then
+ # KVM_MACADDR reused for pkeys as we can't (?) set guid
+ ib_sriov_pkeys "${KVM_BR[i]}" "$PCIDOMAIN" "${KVM_MACADDR[i]}"
+ CLEANUP+=("ib_unsriov_pkeys \"${KVM_BR[i]}\" \"$PCIDOMAIN\"")
+ else
+ # KVM_MACADDR relates to child guid
+ ib_sriov_guid "${KVM_BR[i]}" "$VFNUM" "${KVM_MACADDR[i]}"
+ fi
+
+ KVM_NET+="-device vfio-pci,id=net${i},host=${PCIDOMAIN} "
+
+ continue
+ ;;
+ esac
+
+ local TAPDEV=""
+ local VHOSTFD_OPT=""
+
# tapdev can only be 15 chars long, if VM_NAME is too long keep start + last 2 chars
# and pray for no collision...
[[ $(( ${#VM_NAME} + ${#i} )) -le 10 ]] && \
TAPDEV=tap-${VM_NAME:0:$((10-${#i}))}-${i} || \
TAPDEV=tap-${VM_NAME:0:$((8-${#i}))}${VM_NAME:(-2)}-${i}
- if [[ ${KVM_NETDEV_OPT[i]} == *"vhost=on"* ]] && [[ -n "$KVM_USER" ]]; then
+ if [[ "${KVM_NET_OPT[i]}" == *"vhost=on"* ]] && [[ -n "$KVM_USER" ]]; then
exec {VHOSTFD}<>/dev/vhost-net
- KVM_NETDEV_OPT[$i]="${KVM_NETDEV_OPT[$i]},vhostfd=${VHOSTFD}"
- CLEANUP+=("exec ${VHOSTFD}<>-")
+ VHOSTFD_OPT=",vhostfd=${VHOSTFD}"
+ CLEANUP+=("exec ${VHOSTFD}<&-")
fi
if [[ -n "$KVM_MACVTAP" ]]; then
@@ -726,23 +787,25 @@ function kvm_start_vm ()
ip link add link ${KVM_BR[i]} name $TAPDEV address ${KVM_MACADDR[i]} type macvtap mode bridge
ip link set $TAPDEV ${KVM_NET_MTU[$i]:+mtu ${KVM_NET_MTU[$i]}} up
+ CLEANUP+=("ip link del $TAPDEV")
[[ -e "/sys/class/net/$TAPDEV/ifindex" ]] || fail_exit "Could not create interface $TAPDEV"
TAPFILE=/dev/tap$(cat "/sys/class/net/$TAPDEV/ifindex")
exec {TAPFD}<>${TAPFILE}
- CLEANUP+=("exec ${TAPFD}<>-")
- CLEANUP+=("ip link del $TAPDEV")
- KVM_NET+="-netdev type=tap,id=guest${i},fd=${TAPFD}${KVM_NETDEV_OPT[i]} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} "
+ CLEANUP+=("exec ${TAPFD}<&-")
+ KVM_NET+="-netdev type=tap,id=guest${i},fd=${TAPFD}${KVM_NET_OPT[i]}${VHOSTFD_OPT} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} "
else
ip tuntap add dev $TAPDEV mode tap ${KVM_USER+user $KVM_USER}
ip link set $TAPDEV ${KVM_NET_MTU[$i]:+mtu ${KVM_NET_MTU[$i]}} up
brctl addif ${KVM_BR[i]} $TAPDEV
CLEANUP+=("ip tuntap del dev $TAPDEV mode tap")
- KVM_NET+="-netdev type=tap,id=guest${i},ifname=${TAPDEV},script=no,downscript=no${KVM_NETDEV_OPT[i]} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} "
+ KVM_NET+="-netdev type=tap,id=guest${i},ifname=${TAPDEV},script=no,downscript=no${KVM_NET_OPT[i]}${VHOSTFD_OPT} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} "
fi
done
+
+ [[ "$KVM_NET" == *"-net"* ]] || KVM_NET+="-net none "
}
# PCI passthrough assignment
@@ -753,11 +816,6 @@ function kvm_start_vm ()
KVM_PCIASSIGN+="-device pci-assign,id=${KVM_PCIASSIGN_ID[$i]:-pciassign${i}},host=${KVM_PCIASSIGN_DOMAIN[$i]} "
done
- # IB sriov
- [[ -n "$KVM_SRIOV_PKEYS$KVM_SRIOV_DEV$KVM_SRIOV_GUID" ]] && \
- ib_sriov "$KVM_SRIOV_PKEYS" "$KVM_SRIOV_DEV" "$KVM_SRIOV_GUID" && \
- CLEANUP+=("ib_unsriov")
-
# vfio assignment
local KVM_VFIO=""
for i in ${!KVM_VFIO_DOMAIN[@]}; do