/
sbin
/
Upload File
HOME
#!/bin/bash
# This is the default setting of networking multiqueue and irq affinity
# 1. enable multiqueue if available
# 2. irq affinity optimization
# 3. stop irqbalance service
# 4. adapted to the logic of the new cpu topo
# 5. add cases for the irq and cpu ratio
# 6. tune the smp affinity based on interface
# 7. add cpus topo and bitmap relation
#
# Usage: <script> [old|new] [-h] [-v]
#   old | new   select the irq affinity algorithm (default: new)

# DEBUG
# set -x

# declare some global variables
version="2.0.2"
ecs_network_log=/var/log/ecs_network_optimization.log
node_dir=/sys/devices/system/node
# bit_array holds the affinity mask as group_size words of chunk_size bits,
# least significant word first.
bit_array=()
cpu_num=0
chunk_size=32
group_size=0
input_para=${1:-new}

# ---------------------------------------------------------------------------
# bitmap relation functions
# ---------------------------------------------------------------------------

# Reset bit_array to group_size zeroed words.
# Globals: bit_array (written), group_size (read)
function init_bit_array() {
  local i
  # Drop stale words left over from a previous (possibly wider) mask.
  bit_array=()
  for ((i = 0; i < group_size; ++i)); do
    bit_array[i]=0
  done
}

# Set the bit for one cpu id in bit_array.
# Arguments: $1 - cpu id (non-negative integer)
# Returns:   1 if $1 is not a non-negative integer (nothing is set)
function set_bit() {
  local location=$1
  local arr_index bit_index
  # An empty/garbage id would otherwise evaluate to 0 and silently pin to CPU0.
  [[ "$location" =~ ^[0-9]+$ ]] || return 1
  arr_index=$((location / chunk_size))
  bit_index=$((location % chunk_size))
  bit_array[arr_index]=$((bit_array[arr_index] | (1 << bit_index)))
}

# Print bit_array as comma separated 8-digit hex words, most significant
# word first (the format written to /proc/irq/<n>/smp_affinity below).
# Globals: bit_array (read), group_size (read)
function print_bit_array() {
  local i bit_string
  bit_string=$(printf "%08x" "${bit_array[0]:-0}")
  for ((i = 1; i < group_size; i++)); do
    bit_string=$(printf "%08x,%s" "${bit_array[i]:-0}" "$bit_string")
  done
  echo "${bit_string}"
}

# ---------------------------------------------------------------------------
# set and check multiqueue
# ---------------------------------------------------------------------------

# Set the combined queue count of a NIC to its pre-set maximum and verify it.
# Arguments: $1 - interface name
# Outputs:   result messages appended to $ecs_network_log
# Returns:   0 if multiqueue is enabled with the maximum queue count,
#            1 if it could not be queried/set or only one queue is supported
function set_check_multiqueue() {
  local eth=$1
  local log_file=$ecs_network_log
  local queue_num cur_q_num

  queue_num=$(ethtool -l "$eth" | grep -iA5 'pre-set' | grep -i combined | awk '{print $2}')
  # ethtool may fail or report a non-numeric value (e.g. "n/a"); do not feed
  # that into an integer comparison.
  if ! [[ "$queue_num" =~ ^[0-9]+$ ]]; then
    echo "can not get the pre-set combined queue num of $eth (got: '$queue_num')" >>"$log_file"
    return 1
  fi

  if [ "$queue_num" -gt 1 ]; then
    # set multiqueue
    ethtool -L "$eth" combined "$queue_num"
    # check multiqueue setting
    cur_q_num=$(ethtool -l "$eth" | grep -iA5 current | grep -i combined | awk '{print $2}')
    if [ "X$queue_num" != "X$cur_q_num" ]; then
      echo "Failed to set $eth queue size to $queue_num" >>"$log_file"
      echo "after setting, pre-set queue num: $queue_num , current: $cur_q_num" >>"$log_file"
      return 1
    else
      echo "OK. set $eth queue size to $queue_num" >>"$log_file"
      return 0
    fi
  else
    echo "only support $queue_num queue; no need to enable multiqueue on $eth" >>"$log_file"
    return 1
  fi
}

# check the cpu topology is old or new, to put it simply:
# old topo is like one core with two cpus which are next to each other
# new topo one core with two cpus which are split with the number of cores per socket
#
# Prints the second comma separated field of cpu0's thread_siblings_list
# (empty when the list has no comma, e.g. "0-1").
function core_cpu_sibings() {
  local cpu_topo_thread_sibling=/sys/devices/system/cpu/cpu0/topology/thread_siblings_list
  [ -r "$cpu_topo_thread_sibling" ] || return 0
  awk -F, '{ print $2 }' "$cpu_topo_thread_sibling"
}

# get all virtio_net irqs
# Arguments: $1 - virtio device name as it appears in /proc/interrupts
# Outputs:   one irq number per line for lines matching "<device>-...put"
function get_irqs() {
  local driver="$1"
  # The dash after the device name keeps "virtio1" from matching "virtio10-*".
  awk -F: -v pattern="${driver}-.*put" '$0 ~ pattern {print $1}' /proc/interrupts | tr -d ' '
}

# get device PCIe device location on NUMA0 or NUMA1
# Arguments: $1 - interface name
# Outputs:   content of the numa_node file next to the device directory,
#            or -1 when that file does not exist
function get_device_numa_location() {
  local numa_node_location=-1
  local numa_node_file
  numa_node_file=$(dirname "$(readlink -f /sys/class/net/"$1"/device)")/numa_node
  if [ -f "$numa_node_file" ]; then
    numa_node_location=$(cat "$numa_node_file")
  fi
  echo "$numa_node_location"
}

# print logs of the tuned result
# Arguments: $1 - interface name, $2 - virtio device name
# The irq list is treated as alternating rx (even index) / tx (odd index).
function show_nic_smp_xps_rps_info() {
  local ethX=$1
  local driver=$2
  local -a irqs
  local i rx_queue rx_irq tx_queue tx_irq aff_cpu rps_cpu rps_flow_cnt xps_cpu

  irqs=($(get_irqs "$driver"))

  echo "$ethX rx queue:" >>"$ecs_network_log"
  for ((i = 0; i < ${#irqs[@]}; i += 2)); do
    rx_queue=$((i / 2))
    rx_irq=${irqs[i]}
    aff_cpu=$(cat /proc/irq/"$rx_irq"/smp_affinity_list)
    rps_cpu=$(cat /sys/class/net/"$ethX"/queues/rx-"$rx_queue"/rps_cpus)
    rps_flow_cnt=$(cat /sys/class/net/"$ethX"/queues/rx-"$rx_queue"/rps_flow_cnt)
    printf "rx_queue: %-3s irq: %-3s irq_affinity_cpu: %-12s rps_cpu: %s rps_flow_cnt: %s\n" \
      "$rx_queue" "$rx_irq" "$aff_cpu" "$rps_cpu" "$rps_flow_cnt" >>"$ecs_network_log"
  done

  echo "$ethX tx queue:" >>"$ecs_network_log"
  for ((i = 1; i < ${#irqs[@]}; i += 2)); do
    tx_queue=$((i / 2))
    tx_irq=${irqs[i]}
    aff_cpu=$(cat /proc/irq/"$tx_irq"/smp_affinity_list)
    xps_cpu=$(cat /sys/class/net/"$ethX"/queues/tx-"$tx_queue"/xps_cpus)
    printf "tx_queue: %-3s irq: %-3s irq_affinity_cpu: %-12s xps_cpu: %s\n" \
      "$tx_queue" "$tx_irq" "$aff_cpu" "$xps_cpu" >>"$ecs_network_log"
  done
}

# Expand a cpulist string such as "0-3,8,10-11" into one cpu id per line.
# Single ids ("8") are handled as a one element range.
# Arguments: $1 - cpulist string
function _expand_cpulist() {
  local list=$1
  local -a items=()
  local item first last j
  IFS=',' read -ra items <<<"$list"
  for item in "${items[@]}"; do
    first=${item%%-*}
    last=${item##*-}
    [[ "$first" =~ ^[0-9]+$ && "$last" =~ ^[0-9]+$ ]] || continue
    for ((j = first; j <= last; j++)); do
      echo "$j"
    done
  done
}

# get cpus sorted based on NUMA nodes
# The cpus of the NUMA node the device sits on come first, then the cpus of
# every other node. Falls back to the online cpu list when no node provides
# any cpu.
# Arguments: $1 - interface name
# Outputs:   space separated cpu ids on one line
function get_numa_node_cpus() {
  local -a cpu_list=()
  local device_numa_location node_path node_id cpulist_file

  device_numa_location=$(get_device_numa_location "$1")
  cpulist_file="${node_dir}/node${device_numa_location}/cpulist"
  if [ "$device_numa_location" != -1 ] && [ -r "$cpulist_file" ]; then
    cpu_list+=($(_expand_cpulist "$(cat "$cpulist_file")"))
  fi

  for node_path in "$node_dir"/node*; do
    # an unmatched glob stays literal; skip it
    [ -d "$node_path" ] || continue
    node_id=${node_path##*node}
    if [ "$device_numa_location" == "$node_id" ]; then
      continue
    fi
    cpulist_file="${node_path}/cpulist"
    [ -r "$cpulist_file" ] || continue
    cpu_list+=($(_expand_cpulist "$(cat "$cpulist_file")"))
  done

  if [ "${#cpu_list[@]}" -eq 0 ] && [ -r /sys/devices/system/cpu/online ]; then
    cpu_list+=($(_expand_cpulist "$(cat /sys/devices/system/cpu/online)"))
  fi

  echo "${cpu_list[@]}"
}

# stop irqbalance service
# Stops and disables irqbalance when a matching process is found.
# Returns: 0 on success or when irqbalance is not running, 1 when stopping or
#          disabling it failed
function stop_irqblance() {
  local log_file=$ecs_network_log
  local ret=0

  # '[i]rqbalance' keeps the grep process itself out of the match
  if ps -ef | grep -q '[i]rqbalance'; then
    if command -v systemctl >/dev/null 2>&1; then
      systemctl stop irqbalance || ret=1
      systemctl disable irqbalance || ret=1
    else
      service irqbalance stop || ret=1
      chkconfig irqbalance off || ret=1
    fi
    if [ "$ret" -ne 0 ]; then
      echo "Failed to stop irqbalance" >>"$log_file"
    else
      echo "OK. irqbalance stoped." >>"$log_file"
    fi
  else
    echo "OK. irqbalance stoped." >>"$log_file"
  fi
  return "$ret"
}

# Spread the irqs of one virtio device over the cpus ("new" algorithm).
# Arguments: $1 - interface name, $2 - virtio device name
# Globals:   cpu_num, group_size, bit_array (written); chunk_size (read)
# Returns:   1 when no irq pair or no cpu was found, 0 otherwise
function set_irq_smpaffinity_new() {
  local driver="$2"
  local -a irqs=() cpus=() positions=()
  local irqs_num queue_pairs ratio remains cpu_sibling
  local irq cpu max_cpu position irq_mask

  irqs=($(get_irqs "$driver"))
  irqs_num=${#irqs[@]}
  read -ra cpus <<<"$(get_numa_node_cpus "$1")"
  cpu_num=${#cpus[@]}
  queue_pairs=$((irqs_num / 2))

  # Nothing to tune, and the ratio below would divide by zero.
  if [ "$queue_pairs" -eq 0 ] || [ "$cpu_num" -eq 0 ]; then
    echo "skip smp affinity on $1: found $irqs_num irqs of $driver and $cpu_num cpus" >>"$ecs_network_log"
    return 1
  fi

  # Size the mask by the highest cpu id rather than the cpu count, so sparse
  # cpu ids still fit into the printed mask.
  max_cpu=0
  for cpu in "${cpus[@]}"; do
    if ((cpu > max_cpu)); then
      max_cpu=$cpu
    fi
  done
  group_size=$(((max_cpu + chunk_size) / chunk_size))

  ratio=$((cpu_num / queue_pairs))
  # cpus[remains..cpu_num-1] are the cpus left over after ratio * queue_pairs
  remains=$((ratio * queue_pairs))
  cpu_sibling=$(core_cpu_sibings)
  # it is used as an array offset below; ignore anything that is not a number
  [[ "$cpu_sibling" =~ ^[0-9]+$ ]] || cpu_sibling=""

  for ((irq = 0; irq < irqs_num; )); do
    init_bit_array
    positions=()

    case "$ratio" in
    1)
      # rx and tx will fill all the cpu bitmask
      cpu=${cpus[irq / 2]}
      ;;
    2 | 4)
      # if the ratio is 2, then the tx and rx will take one cpu bit in one core siblings
      # if the ratio is 4, mostly there are NUMA nodes, tx and rx take one cpu bit in one
      # core and to fill up the NUMA0 as much as possible, then to NUMA1
      cpu=${cpus[irq]}
      if [ -n "$cpu_sibling" ]; then
        # new topo; the index is wrapped so it can never leave the cpu list
        cpu=$((irq % 2 == 0 ? cpus[irq / 2] : cpus[((irq / 2) + cpu_sibling) % cpu_num]))
      fi
      ;;
    8)
      # per cpu per tx per rx is far enough, this is mostly AMD instance
      # just filled it to every 4 cpus, based on the experiment
      cpu=${cpus[irq * 4]}
      ;;
    *)
      # other cases like ratio 3 leave it as the default we do.
      # if need to deal with it, just do it based on the cases
      # (wrapped: with more queue pairs than cpus the index exceeds the list)
      cpu=${cpus[irq % cpu_num]}
      ;;
    esac
    positions+=("$cpu")

    # according to the virtio_net driver the "stragglers" are processed too.
    # https://github.com/torvalds/linux/commit/2ca653d607ce59f2729173a7ea56dbfa6330ec88
    if ((remains < cpu_num)); then
      positions+=("${cpus[remains]}")
      ((remains += 1))
    fi

    for position in "${positions[@]}"; do
      set_bit "$position"
    done
    irq_mask=$(print_bit_array)

    echo "$irq_mask" >"/proc/irq/${irqs[irq]}/smp_affinity"
    if [ "$ratio" -eq 1 ]; then
      # rx and tx of one queue pair share the same bit mask
      if ((irq + 1 < irqs_num)); then
        echo "$irq_mask" >"/proc/irq/${irqs[irq + 1]}/smp_affinity"
      fi
      ((irq += 2))
    else
      # every irq gets its own bit mask
      ((irq += 1))
    fi
  done
  return 0
}

# Assign the irqs of one virtio device round-robin to cpu 0..node_cpumax,
# one cpu per irq ("old" algorithm). node_cpumax is the last cpu id listed by
# the first NUMA node whose value is greater than 0.
# Arguments: $1 - interface name (unused), $2 - virtio device name
function set_irq_smpaffinity_old() {
  local driver="$2"
  local -a irqs=()
  local node_cpumax=0 node_path node_id=""
  local irq core vec idx k mask mask_fill

  for node_path in "$node_dir"/node*; do
    [ -d "$node_path" ] || continue
    node_id=${node_path##*node}
    # last number of the cpulist, whether it ends in a range or a single id
    node_cpumax=$(awk -F'[-,]' '{print $NF}' "$node_path/cpulist")
    [[ $node_cpumax -gt 0 ]] && break
  done
  echo "max node :$node_id" >>"$ecs_network_log"

  irqs=($(get_irqs "$driver"))
  core=0
  for irq in "${irqs[@]}"; do
    vec=$core
    if ((vec >= 32)); then
      # one ",00000000" word per full 32 cpus below the target cpu
      idx=$((vec / 32))
      mask_fill=""
      for ((k = 1; k <= idx; k++)); do
        mask_fill="${mask_fill},00000000"
      done
      vec=$((vec - 32 * idx))
      mask=$(printf "%X%s" $((1 << vec)) "$mask_fill")
    else
      mask=$(printf "%X" $((1 << vec)))
    fi
    echo "$mask" >"/proc/irq/$irq/smp_affinity"
    echo "mask:$mask, irq:$irq" >>"$ecs_network_log"
    core=$(((core + 1) % (node_cpumax + 1)))
  done
}

# set irq affinity based on the instance
# reference some logic from virtio_net
# Arguments: $1 - interface name, $2 - virtio device name
function set_irq_smpaffinity() {
  if [ "$input_para" == "new" ]; then
    set_irq_smpaffinity_new "$@"
  else
    set_irq_smpaffinity_old "$@"
  fi
}

# main logic
# Stops irqbalance, then for every /sys/class/net/eth* device backed by a
# virtio device enables multiqueue and tunes the irq affinity.
# Returns: 1 when no eth* device exists, otherwise the multiqueue status of
#          the last virtio device processed.
# NOTE(review): a failure on an earlier device is overwritten by a later
# success; confirm whether callers expect a sticky failure instead.
function main() {
  local ret_value=0
  local -a eth_dirs=()
  local eth_dir cur_eth driver

  echo "running $0" >"$ecs_network_log"
  echo "======== ECS network setting starts $(date +'%Y-%m-%d %H:%M:%S') ========" >>"$ecs_network_log"

  stop_irqblance

  # we assume your NIC interface(s) is/are like eth*
  for eth_dir in /sys/class/net/eth*; do
    [ -e "$eth_dir" ] && eth_dirs+=("$eth_dir")
  done
  if [ "${#eth_dirs[@]}" -eq 0 ]; then
    echo "ERROR! can not find any ethX in /sys/class/net/ dir." >>"$ecs_network_log"
    ret_value=1
  fi

  for eth_dir in "${eth_dirs[@]}"; do
    cur_eth=$(basename "$eth_dir")
    echo "optimize network performance: current device $cur_eth" >>"$ecs_network_log"

    # only optimize virtio_net device
    driver=$(basename "$(readlink "$eth_dir"/device)")
    if [[ "$driver" != *virtio* ]]; then
      echo "ignore device $cur_eth with driver $driver" >>"$ecs_network_log"
      continue
    fi

    echo "set and check multiqueue on $cur_eth" >>"$ecs_network_log"
    set_check_multiqueue "$cur_eth" "$ecs_network_log"
    ret_value=$?
    if [ "$ret_value" -ne 0 ]; then
      echo "Failed to set multiqueue on $cur_eth or the queue len is 1. ret_value $ret_value" >>"$ecs_network_log"
      continue
    fi

    echo "Current ========== $cur_eth $driver smp affinity:" >>"$ecs_network_log"
    show_nic_smp_xps_rps_info "$cur_eth" "$driver"
    set_irq_smpaffinity "$cur_eth" "$driver"
    echo "After ==========tuned $cur_eth $driver smp affinity:" >>"$ecs_network_log"
    show_nic_smp_xps_rps_info "$cur_eth" "$driver"
  done

  echo "======== ECS network setting END $(date +'%Y-%m-%d %H:%M:%S') ========" >>"$ecs_network_log"
  return "$ret_value"
}

# Print the usage text to stdout.
function show_help() {
  echo "Usage: $0 [old|new] [-h] [-v]"
  echo "Options:"
  echo " -h, --help Display this help and exit"
  echo " -v, --version Display the version info"
}

# program starts here
if [ -n "$1" ]; then
  case "$1" in
  -v | --version)
    echo "$version"
    exit 0
    ;;
  -h | --help)
    # asking for help is not an error
    show_help
    exit 0
    ;;
  old | new) ;;
  *)
    show_help >&2
    exit 1
    ;;
  esac
fi

main
exit $?