diff options
author | Dominique Martinet <asmadeus@codewreck.org> | 2016-01-27 18:42:29 +0100 |
---|---|---|
committer | Dominique Martinet <dominique.martinet@cea.fr> | 2016-02-02 15:44:08 +0100 |
commit | 7ab0d94ee10d1f8126abd84e4fdd8e087a1ff925 (patch) | |
tree | 9db80a2774f7106b14452a44a77b376f3910b644 | |
parent | 89853188054e6e23182d8a6c26d1e1d2cb703e2e (diff) |
Big net rewrite for sriov + user mode
-rwxr-xr-x | kvm-wrapper.sh | 190 |
1 files changed, 124 insertions, 66 deletions
diff --git a/kvm-wrapper.sh b/kvm-wrapper.sh index 125593f..0234287 100755 --- a/kvm-wrapper.sh +++ b/kvm-wrapper.sh @@ -294,8 +294,10 @@ function qcow_create_disk() local KVM_IMG_DISKNAME="$1" local KVM_IMG_DISKSIZE="$2" "$KVM_IMG_BIN" create -f "$KVM_IMG_FORMAT" "$KVM_IMG_DISKNAME" "$KVM_IMG_DISKSIZE" + if [[ "x$?" == "x0" ]] then + [[ -n "$KVM_USER" ]] && chown $KVM_USER: "$KVM_IMG_DISKNAME" desc_update_setting "KVM_DISK[0]" "$KVM_IMG_DISKNAME" else echo "Failed creating disk. Continuing anyway" @@ -406,80 +408,100 @@ function pci_unvfiofy() } # helper for sriov -function ib_sriov() +# for mlx4 cards, we can force pkeys just like we'd force vlan at host level +function ib_sriov_pkeys() +{ + local DEV="$1" + local PCIDOMAIN="$2" + local PKEYS="$3" + + # set pkey if able + if [[ -e /sys/class/infiniband/$DEV/iov/$PCIDOMAIN/ports/1/pkey_idx ]]; then + PKIDX=0 + [[ -z "$PKEYS" ]] || for PKEY in "$PKEYS"; do + local PKEY_IDX=$(basename "$(grep -lZw 0x8$PKEY /sys/class/infiniband/$DEV/ports/1/pkeys/*)") + [[ -z "$PKEY_IDX" ]] && fail_exit "pkey $PKEY not found for $DEV" + echo $PKEY_IDX > /sys/class/infiniband/$DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/$PKIDX || fail_exit "couldn't set pkey" + PKIDX=$((PKIDX+1)) + done + PKEY_IDX=$(basename "$(grep -lZw 0x7fff /sys/class/infiniband/$DEV/ports/1/pkeys/*)") + echo $PKEY_IDX > /sys/class/infiniband/$DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/$PKIDX + fi +} + +function ib_unsriov_pkeys() +{ + local DEV="$1" + local PCIDOMAIN="$2" + + test -e "/sys/class/infiniband/$DEV/iov/$PCIDOMAIN" || continue + grep -lZv none "/sys/class/infiniband/$DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/"* | \ + tr '\0' '\n' | \ + while read file; do + echo none > $file + done +} + +# for mlx5, we don't force pkeys but we force node guid to let opensm handle it +function ib_sriov_guid() { - local PKEYS="$1" - local IB_DEV="$2" + local DEV="$1" + local VIRTFN_IDX="$2" local CHILD_GUID="$3" - # pick one compatible card at random + if [[ "$DEV" =~ "mlx5_"* ]]; then + local GUID=$(sed -e 's/\([a-f0-9]\{2\}\)\([a-f0-9]\{2\}\)/\1:\2/g' < /sys/class/infiniband/$DEV/node_guid) + echo Follow > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/policy + echo $GUID > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/node + [[ -n "$CHILD_GUID" ]] && \ + echo "$CHILD_GUID" > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/port || \ + printf "%s:%x\n" "${GUID%:*}" $((0x${GUID##*:}+$VIRTFN_IDX+1)) > /sys/class/infiniband/$DEV/device/sriov/$VIRTFN_IDX/port + fi +} - [[ -z "$IB_DEV" ]] && IB_DEV=$(ls -d /sys/class/infiniband/*/device/virtfn0 | sort -r | awk -F/ '{print $5; exit}') - [[ -z "$IB_DEV" ]] && fail_exit "No VFIO compatible IB adapter" - local DEVDIR="/sys/class/infiniband/${IB_DEV}/device" - local PCIDOMAIN=inval + +# main sriov function - returns two values in PCIDOMAIN and VFNUM variables +function sriov() +{ + local BR="$1" + local DEVDIR=inval + local FLOCK local VIRTFN + PCIDOMAIN=inval + + [[ -e "/sys/class/net/${BR}/device" ]] && DEVDIR="/sys/class/net/${BR}/device" + [[ -e "/sys/class/infiniband/${BR}/device" ]] && DEVDIR="/sys/class/infiniband/${BR}/device" + [[ "$DEVDIR" = "inval" ]] && fail_exit "could not find device for $BR" - if [[ "$IB_DEV" =~ "mlx5_"* ]]; then + # following is racy, lock through. + exec {FLOCK}<>/tmp/kvm-wrapper-$BR.lock + flock ${FLOCK} + + if [[ -e "$DEVDIR/sriov_numvfs" && ! -e "$DEVDIR/virtfn0" ]]; then # arbitrary number: create all vfs available - [[ $(cat /sys/class/infiniband/$IB_DEV/device/mlx5_num_vfs) == "0" ]] && \ - cat /sys/class/infiniband/$IB_DEV/device/sriov_totalvfs > /sys/class/infiniband/$IB_DEV/device/mlx5_num_vfs + cat "$DEVDIR/sriov_totalvfs" > "$DEVDIR/sriov_numvfs" fi - - # pick a virtfn - this is totally racy, but sod it. + # pick a virtfn for VIRTFN in $DEVDIR/virtfn*; do PCIDOMAIN=$(basename $(readlink $VIRTFN)) [[ -e /sys/bus/pci/drivers/vfio-pci/$PCIDOMAIN ]] || break PCIDOMAIN=inval done - [[ $PCIDOMAIN = inval ]] && fail_exit "no vfio virtfn available for $IB_DEV" + [[ $PCIDOMAIN = inval ]] && fail_exit "no vfio virtfn available for $BR" + VFNUM=${VIRTFN##*virtfn} # hack - register everything to pci-stub first so we can tell appart which are used or not local PCIVENDOR="$(cat "/sys/bus/pci/devices/$PCIDOMAIN/vendor" |sed 's/^0x//')" PCIVENDOR+=" $(cat "/sys/bus/pci/devices/$PCIDOMAIN/device" |sed 's/^0x//')" echo "$PCIVENDOR" > "/sys/bus/pci/drivers/pci-stub/new_id" || fail_exit "couldn't add new_id (pci-stub)" - # set pkey if able - if [[ -e /sys/class/infiniband/$IB_DEV/iov/$PCIDOMAIN/ports/1/pkey_idx ]]; then - VFIDX=0 - [[ -z "$PKEYS" ]] || for PKEY in "$PKEYS"; do - local PKEY_IDX=$(basename "$(grep -lZw 0x8$PKEY /sys/class/infiniband/$IB_DEV/ports/1/pkeys/*)") - [[ -z "$PKEY_IDX" ]] && fail_exit "pkey $PKEY not found for $IB_DEV" - echo $PKEY_IDX > /sys/class/infiniband/$IB_DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/$VFIDX || fail_exit "couldn't set pkey" - VFIDX=$((VFIDX+1)) - done - PKEY_IDX=$(basename "$(grep -lZw 0x7fff /sys/class/infiniband/$IB_DEV/ports/1/pkeys/*)") - echo $PKEY_IDX > /sys/class/infiniband/$IB_DEV/iov/$PCIDOMAIN/ports/1/pkey_idx/$VFIDX - fi + pci_vfiofy "$PCIDOMAIN" - # set guid and policy for mlx5 - if [[ "$IB_DEV" =~ "mlx5_"* ]]; then - local GUID=$(sed -e 's/\([a-f0-9]\{2\}\)\([a-f0-9]\{2\}\)/\1:\2/g' < /sys/class/infiniband/$IB_DEV/node_guid) - local VIRTFN_IDX=${VIRTFN#*virtfn} - echo Follow > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/policy - echo $GUID > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/node - [[ -n "$CHILD_GUID" ]] && \ - echo "$CHILD_GUID" > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/port || \ - printf "%s:%x\n" "${GUID%:*}" $((0x${GUID##*:}+$VIRTFN_IDX+1)) > /sys/class/infiniband/$IB_DEV/device/sriov/$VIRTFN_IDX/port - fi - - KVM_VFIO_DOMAIN+=("$PCIDOMAIN") -} - -function ib_unsriov() -{ - for PCIDOMAIN in ${KVM_VFIO_DOMAIN}; do - test -e /sys/class/infiniband/*/iov/"$PCIDOMAIN" || continue - grep -lZv none /sys/class/infiniband/*/iov/"$PCIDOMAIN"/ports/1/pkey_idx/* | \ - tr '\0' '\n' | \ - while read file; do - echo none > $file - done - done + # unlock, can't unlink file though.. + exec {FLOCK}<&- } - # Change perms. Meant to run forked. function serial_perms_forked() { @@ -701,23 +723,62 @@ function kvm_start_vm () [[ -z "${KVM_BR[@]:0:1}" ]] && fail_exit "No KVM_BR defined" [[ -z "${KVM_IF[@]:0:1}" ]] && fail_exit "No KVM_IF defined" for i in ${!KVM_MACADDR[@]}; do - local TAPDEV - KVM_BR[$i]="${KVM_BR[i]:-${KVM_BR[@]:0:1}}" KVM_IF[$i]="${KVM_IF[i]:-${KVM_IF[@]:0:1}}" - KVM_NETDEV_OPT[$i]="${KVM_NET_OPT[i]-${KVM_NET_OPT[@]:0:1}}" + KVM_NET_OPT[$i]="${KVM_NET_OPT[i]-${KVM_NET_OPT[@]:0:1}}" KVM_NET_MTU[$i]="${KVM_NET_MTU[i]-${KVM_NET_MTU[@]:0:1}}" + # handle special cases first + case "${KVM_IF[i]}" in + sriov) + local PCIDOMAIN + local VFNUM + + sriov "${KVM_BR[i]}" + CLEANUP+=("pci_unvfiofy \"$PCIDOMAIN\"") + + ip link set "${KVM_BR[i]}" vf "$VFNUM" mac "${KVM_MACADDR[i]}" || fail_exit "Could not set MAC address for vf $VFNUM device ${KVM_BR[i]}" + + KVM_NET+="-device vfio-pci,id=net${i},host=${PCIDOMAIN} " + + continue + ;; + + mlx*-sriov) + local PCIDOMAIN + local VFNUM + + sriov "${KVM_BR[i]}" + CLEANUP+=("pci_unvfiofy \"$PCIDOMAIN\"") + + if [[ "${KVM_NETSTYLE[i]}" == "mlx4-sriov" ]]; then + # KVM_MACADDR reused for pkeys as we can't (?) set guid + ib_sriov_pkeys "${KVM_BR[i]}" "$PCIDOMAIN" "${KVM_MACADDR[i]}" + CLEANUP+=("ib_unsriov_pkeys \"${KVM_BR[i]}\" \"$PCIDOMAIN\"") + else + # KVM_MACADDR relates to child guid + ib_sriov_guid "${KVM_BR[i]}" "$VFNUM" "${KVM_MACADDR[i]}" + fi + + KVM_NET+="-device vfio-pci,id=net${i},host=${PCIDOMAIN} " + + continue + ;; + esac + + local TAPDEV="" + local VHOSTFD_OPT="" + # tapdev can only be 15 chars long, if VM_NAME is too long keep start + last 2 chars # and pray for no collision... [[ $(( ${#VM_NAME} + ${#i} )) -le 10 ]] && \ TAPDEV=tap-${VM_NAME:0:$((10-${#i}))}-${i} || \ TAPDEV=tap-${VM_NAME:0:$((8-${#i}))}${VM_NAME:(-2)}-${i} - if [[ ${KVM_NETDEV_OPT[i]} == *"vhost=on"* ]] && [[ -n "$KVM_USER" ]]; then + if [[ "${KVM_NET_OPT[i]}" == *"vhost=on"* ]] && [[ -n "$KVM_USER" ]]; then exec {VHOSTFD}<>/dev/vhost-net - KVM_NETDEV_OPT[$i]="${KVM_NETDEV_OPT[$i]},vhostfd=${VHOSTFD}" - CLEANUP+=("exec ${VHOSTFD}<>-") + VHOSTFD_OPT=",vhostfd=${VHOSTFD}" + CLEANUP+=("exec ${VHOSTFD}<&-") fi if [[ -n "$KVM_MACVTAP" ]]; then @@ -726,23 +787,25 @@ function kvm_start_vm () ip link add link ${KVM_BR[i]} name $TAPDEV address ${KVM_MACADDR[i]} type macvtap mode bridge ip link set $TAPDEV ${KVM_NET_MTU[$i]:+mtu ${KVM_NET_MTU[$i]}} up + CLEANUP+=("ip link del $TAPDEV") [[ -e "/sys/class/net/$TAPDEV/ifindex" ]] || fail_exit "Could not create interface $TAPDEV" TAPFILE=/dev/tap$(cat "/sys/class/net/$TAPDEV/ifindex") exec {TAPFD}<>${TAPFILE} - CLEANUP+=("exec ${TAPFD}<>-") - CLEANUP+=("ip link del $TAPDEV") - KVM_NET+="-netdev type=tap,id=guest${i},fd=${TAPFD}${KVM_NETDEV_OPT[i]} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} " + CLEANUP+=("exec ${TAPFD}<&-") + KVM_NET+="-netdev type=tap,id=guest${i},fd=${TAPFD}${KVM_NET_OPT[i]}${VHOSTFD_OPT} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} " else ip tuntap add dev $TAPDEV mode tap ${KVM_USER+user $KVM_USER} ip link set $TAPDEV ${KVM_NET_MTU[$i]:+mtu ${KVM_NET_MTU[$i]}} up brctl addif ${KVM_BR[i]} $TAPDEV CLEANUP+=("ip tuntap del dev $TAPDEV mode tap") - KVM_NET+="-netdev type=tap,id=guest${i},ifname=${TAPDEV},script=no,downscript=no${KVM_NETDEV_OPT[i]} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} " + KVM_NET+="-netdev type=tap,id=guest${i},ifname=${TAPDEV},script=no,downscript=no${KVM_NET_OPT[i]}${VHOSTFD_OPT} -device ${KVM_IF[i]},netdev=guest${i},mac=${KVM_MACADDR[i]} " fi done + + [[ "$KVM_NET" == *"-net"* ]] || KVM_NET+="-net none " } # PCI passthrough assignement @@ -753,11 +816,6 @@ function kvm_start_vm () KVM_PCIASSIGN+="-device pci-assign,id=${KVM_PCIASSIGN_ID[$i]:-pciassign${i}},host=${KVM_PCIASSIGN_DOMAIN[$i]} " done - # IB sriov - [[ -n "$KVM_SRIOV_PKEYS$KVM_SRIOV_DEV$KVM_SRIOV_GUID" ]] && \ - ib_sriov "$KVM_SRIOV_PKEYS" "$KVM_SRIOV_DEV" "$KVM_SRIOV_GUID" && \ - CLEANUP+=("ib_unsriov") - # vfio assignement local KVM_VFIO="" for i in ${!KVM_VFIO_DOMAIN[@]}; do |