前言

本文介绍了高速度视觉变换器EfficientViT在YOLOv11中的结合应用。现有视觉变换器计算成本高，不适合实时应用；EfficientViT通过采用夹心布局的内存高效模块和级联组注意力操作，减少了多头自注意力中的计算冗余，提高了内存效率和通道间通信。我们将EfficientViT集成进YOLOv11，通过一系列代码修改与配置，经实验验证取得了一定的效果，在速度和准确性之间达成了较好的平衡。

文章目录

YOLOv11改进大全：卷积层、轻量化、注意力机制、损失函数、Backbone、SPPF、Neck、检测头全方位优化汇总

专栏链接: YOLOv11改进专栏

介绍

摘要

视觉变换器凭借其卓越的模型表征能力，已在计算机视觉领域取得显著成功，然而其优异的性能表现往往伴随着高昂的计算开销，限制了其在实时应用场景中的部署。本文提出了一种高效视觉变换器系列架构——EfficientViT，旨在解决现有变换器模型的计算效率瓶颈。通过深入分析发现，当前变换器模型的速度主要受限于内存效率低下的操作，特别是在多头自注意力机制（MHSA）中的张量重塑和逐元素运算过程。为此，我们设计了一种新颖的夹心式布局模块，该模块在高效前馈神经网络（FFN）层之间嵌入单一内存受限的MHSA层，不仅显著提升了内存利用效率，同时增强了通道间的信息交互能力。进一步研究发现，不同注意力头之间的注意力图存在高度相似性，导致计算冗余问题。针对此问题，我们提出了级联组注意力模块，采用不同分割的完整特征输入至各注意力头，有效降低了计算复杂度并提升了注意力机制的多样性。综合实验结果表明，EfficientViT在推理速度与模型精度之间实现了优异平衡，性能超越现有高效模型。具体而言，EfficientViT-M5在精度上超越MobileNetV3-Large达1.9个百分点，同时在Nvidia V100 GPU和Intel Xeon CPU上的吞吐量分别提升40.4%和45.2%；相较于近期高效模型MobileViT-XXS，EfficientViT-M2精度提升1.8%，GPU/CPU运行速度分别加快5.8倍和3.7倍，ONNX格式转换后速度提升达7.4倍。相关代码与模型均已开源提供。

文章链接

论文地址：论文地址

代码地址：代码地址

基本原理

EfficientViT 是一系列高速度的视觉变换器。它采用了一种新的内存高效模块，采用夹心布局，并结合了高效的级联组注意力操作，从而减少了注意力计算的冗余。

YOLO11引入代码

在根目录下的ultralytics/nn/目录，新建一个backbone目录，然后新建一个以EfficientViT为文件名的py文件， 把代码拷贝进去。

# --------------------------------------------------------
# EfficientViT Model Architecture for Downstream Tasks
# Copyright (c) 2022 Microsoft
# Written by: Xinyu Liu
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
import itertools
from timm.models.layers import SqueezeExcite
import numpy as np
import itertools


class Conv2d_BN(torch.nn.Sequential):
    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1, resolution=-10000):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        self.add_module('bn', torch.nn.BatchNorm2d(b))
        torch.nn.init.constant_(self.bn.weight, bn_weight_init)
        torch.nn.init.constant_(self.bn.bias, 0)

    @torch.no_grad()
    def switch_to_deploy(self):
        c, bn = self._modules.values()
        w = bn.weight / \
            (bn.running_var + bn.eps)**0.5
        w = c.weight * w[:, None, None, None]
        b = bn.bias - bn.running_mean * bn.weight / \
            (bn.running_var + bn.eps)**0.5
        m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
            0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m


def replace_batchnorm(net):
    for child_name, child in net.named_children():
        if hasattr(child, 'fuse'):
            setattr(net, child_name, child.fuse())
        elif isinstance(child, torch.nn.BatchNorm2d):
            setattr(net, child_name, torch.nn.Identity())
        else:
            replace_batchnorm(child)


class PatchMerging(torch.nn.Module):
    def __init__(self, dim, out_dim, input_resolution):
        super().__init__()
        hid_dim = int(dim * 4)
        self.conv1 = Conv2d_BN(dim, hid_dim, 1, 1, 0, resolution=input_resolution)
        self.act = torch.nn.ReLU()
        self.conv2 = Conv2d_BN(hid_dim, hid_dim, 3, 2, 1, groups=hid_dim, resolution=input_resolution)
        self.se = SqueezeExcite(hid_dim, .25)
        self.conv3 = Conv2d_BN(hid_dim, out_dim, 1, 1, 0, resolution=input_resolution // 2)

    def forward(self, x):
        x = self.conv3(self.se(self.act(self.conv2(self.act(self.conv1(x))))))
        return x


class Residual(torch.nn.Module):
    def __init__(self, m, drop=0.):
        super().__init__()
        self.m = m
        self.drop = drop

    def forward(self, x):
        if self.training and self.drop > 0:
            return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
                                              device=x.device).ge_(self.drop).div(1 - self.drop).detach()
        else:
            return x + self.m(x)


class FFN(torch.nn.Module):
    def __init__(self, ed, h, resolution):
        super().__init__()
        self.pw1 = Conv2d_BN(ed, h, resolution=resolution)
        self.act = torch.nn.ReLU()
        self.pw2 = Conv2d_BN(h, ed, bn_weight_init=0, resolution=resolution)

    def forward(self, x):
        x = self.pw2(self.act(self.pw1(x)))
        return x


class CascadedGroupAttention(torch.nn.Module):
    r""" Cascaded Group Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution, correspond to the window size.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14,
                 kernels=[5, 5, 5, 5],):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.d = int(attn_ratio * key_dim)
        self.attn_ratio = attn_ratio

        qkvs = []
        dws = []
        for i in range(num_heads):
            qkvs.append(Conv2d_BN(dim // (num_heads), self.key_dim * 2 + self.d, resolution=resolution))
            dws.append(Conv2d_BN(self.key_dim, self.key_dim, kernels[i], 1, kernels[i]//2, groups=self.key_dim, resolution=resolution))
        self.qkvs = torch.nn.ModuleList(qkvs)
        self.dws = torch.nn.ModuleList(dws)
        self.proj = torch.nn.Sequential(torch.nn.ReLU(), Conv2d_BN(
            self.d * num_heads, dim, bn_weight_init=0, resolution=resolution))

        points = list(itertools.product(range(resolution), range(resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs',
                             torch.LongTensor(idxs).view(N, N))

    @torch.no_grad()
    def train(self, mode=True):
        super().train(mode)
        if mode and hasattr(self, 'ab'):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

    def forward(self, x):  # x (B,C,H,W)
        B, C, H, W = x.shape
        trainingab = self.attention_biases[:, self.attention_bias_idxs]
        feats_in = x.chunk(len(self.qkvs), dim=1)
        feats_out = []
        feat = feats_in[0]
        for i, qkv in enumerate(self.qkvs):
            if i > 0:  # add the previous output to the input
                feat = feat + feats_in[i]
            feat = qkv(feat)
            q, k, v = feat.view(B, -1, H, W).split([self.key_dim, self.key_dim, self.d], dim=1)  # B, C/h, H, W
            q = self.dws[i](q)
            q, k, v = q.flatten(2), \
                k.flatten(2), v.flatten(2)  # B, C/h, N
            attn = (
                (q.transpose(-2, -1) @ k) * self.scale
                +
                (trainingab[i] if self.training else self.ab[i])
            )
            attn = attn.softmax(dim=-1)  # BNN
            feat = (v @ attn.transpose(-2, -1)).view(B, self.d, H, W)  # BCHW
            feats_out.append(feat)
        x = self.proj(torch.cat(feats_out, 1))
        return x


class LocalWindowAttention(torch.nn.Module):
    r""" Local Window Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14,
                 window_resolution=7,
                 kernels=[5, 5, 5, 5],):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.resolution = resolution
        assert window_resolution > 0, 'window_size must be greater than 0'
        self.window_resolution = window_resolution

        self.attn = CascadedGroupAttention(dim, key_dim, num_heads,
                                           attn_ratio=attn_ratio,
                                           resolution=window_resolution,
                                           kernels=kernels,)

    def forward(self, x):
        B, C, H, W = x.shape

        if H <= self.window_resolution and W <= self.window_resolution:
            x = self.attn(x)
        else:
            x = x.permute(0, 2, 3, 1)
            pad_b = (self.window_resolution - H %
                     self.window_resolution) % self.window_resolution
            pad_r = (self.window_resolution - W %
                     self.window_resolution) % self.window_resolution
            padding = pad_b > 0 or pad_r > 0

            if padding:
                x = torch.nn.functional.pad(x, (0, 0, 0, pad_r, 0, pad_b))

            pH, pW = H + pad_b, W + pad_r
            nH = pH // self.window_resolution
            nW = pW // self.window_resolution
            # window partition, BHWC -> B(nHh)(nWw)C -> BnHnWhwC -> (BnHnW)hwC -> (BnHnW)Chw
            x = x.view(B, nH, self.window_resolution, nW, self.window_resolution, C).transpose(2, 3).reshape(
                B * nH * nW, self.window_resolution, self.window_resolution, C
            ).permute(0, 3, 1, 2)
            x = self.attn(x)
            # window reverse, (BnHnW)Chw -> (BnHnW)hwC -> BnHnWhwC -> B(nHh)(nWw)C -> BHWC
            x = \
                x.permute(0, 2, 3, 1).view(B, nH, nW, self.window_resolution, self.window_resolution, C).transpose(2, 3).reshape(B, pH, pW, C)
            if padding:
                x = x[:, :H, :W].contiguous()
            x = x.permute(0, 3, 1, 2)
        return x


class EfficientViTBlock(torch.nn.Module):
    """ A basic EfficientViT building block.

    Args:
        type (str): Type for token mixer. Default: 's' for self-attention.
        ed (int): Number of input channels.
        kd (int): Dimension for query and key in the token mixer.
        nh (int): Number of attention heads.
        ar (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, type,
                 ed, kd, nh=8,
                 ar=4,
                 resolution=14,
                 window_resolution=7,
                 kernels=[5, 5, 5, 5],):
        super().__init__()

        self.dw0 = Residual(Conv2d_BN(ed, ed, 3, 1, 1, groups=ed, bn_weight_init=0., resolution=resolution))
        self.ffn0 = Residual(FFN(ed, int(ed * 2), resolution))

        if type == 's':
            self.mixer = Residual(LocalWindowAttention(ed, kd, nh, attn_ratio=ar, \
                    resolution=resolution, window_resolution=window_resolution, kernels=kernels))

        self.dw1 = Residual(Conv2d_BN(ed, ed, 3, 1, 1, groups=ed, bn_weight_init=0., resolution=resolution))
        self.ffn1 = Residual(FFN(ed, int(ed * 2), resolution))

    def forward(self, x):
        return self.ffn1(self.dw1(self.mixer(self.ffn0(self.dw0(x)))))


class EfficientViT(torch.nn.Module):
    def __init__(self, img_size=400,
                 patch_size=16,
                 frozen_stages=0,
                 in_chans=3,
                 stages=['s', 's', 's'],
                 embed_dim=[64, 128, 192],
                 key_dim=[16, 16, 16],
                 depth=[1, 2, 3],
                 num_heads=[4, 4, 4],
                 window_size=[7, 7, 7],
                 kernels=[5, 5, 5, 5],
                 down_ops=[['subsample', 2], ['subsample', 2], ['']],
                 pretrained=None,
                 distillation=False,):
        super().__init__()

        resolution = img_size
        self.patch_embed = torch.nn.Sequential(Conv2d_BN(in_chans, embed_dim[0] // 8, 3, 2, 1, resolution=resolution), torch.nn.ReLU(),
                           Conv2d_BN(embed_dim[0] // 8, embed_dim[0] // 4, 3, 2, 1, resolution=resolution // 2), torch.nn.ReLU(),
                           Conv2d_BN(embed_dim[0] // 4, embed_dim[0] // 2, 3,
                                     2, 1, resolution=resolution // 4), torch.nn.ReLU(),
                           Conv2d_BN(embed_dim[0] // 2, embed_dim[0], 3, 1, 1, resolution=resolution // 8))

        resolution = img_size // patch_size
        attn_ratio = [embed_dim[i] / (key_dim[i] * num_heads[i]) for i in range(len(embed_dim))]
        self.blocks1 = []
        self.blocks2 = []
        self.blocks3 = []
        for i, (stg, ed, kd, dpth, nh, ar, wd, do) in enumerate(
                zip(stages, embed_dim, key_dim, depth, num_heads, attn_ratio, window_size, down_ops)):
            for d in range(dpth):
                eval('self.blocks' + str(i+1)).append(EfficientViTBlock(stg, ed, kd, nh, ar, resolution, wd, kernels))
            if do[0] == 'subsample':
                #('Subsample' stride)
                blk = eval('self.blocks' + str(i+2))
                resolution_ = (resolution - 1) // do[1] + 1
                blk.append(torch.nn.Sequential(Residual(Conv2d_BN(embed_dim[i], embed_dim[i], 3, 1, 1, groups=embed_dim[i], resolution=resolution)),
                                    Residual(FFN(embed_dim[i], int(embed_dim[i] * 2), resolution)),))
                blk.append(PatchMerging(*embed_dim[i:i + 2], resolution))
                resolution = resolution_
                blk.append(torch.nn.Sequential(Residual(Conv2d_BN(embed_dim[i + 1], embed_dim[i + 1], 3, 1, 1, groups=embed_dim[i + 1], resolution=resolution)),
                                    Residual(FFN(embed_dim[i + 1], int(embed_dim[i + 1] * 2), resolution)),))
        self.blocks1 = torch.nn.Sequential(*self.blocks1)
        self.blocks2 = torch.nn.Sequential(*self.blocks2)
        self.blocks3 = torch.nn.Sequential(*self.blocks3)

        self.channel = [i.size(1) for i in self.forward(torch.randn(1, 3, 640, 640))]

    def forward(self, x):
        outs = []
        x = self.patch_embed(x)
        x = self.blocks1(x)
        outs.append(x)
        x = self.blocks2(x)
        outs.append(x)
        x = self.blocks3(x)
        outs.append(x)
        return outs


EfficientViT_m0 = {
        'img_size': 224,
        'patch_size': 16,
        'embed_dim': [64, 128, 192],
        'depth': [1, 2, 3],
        'num_heads': [4, 4, 4],
        'window_size': [7, 7, 7],
        'kernels': [7, 5, 3, 3],
    }

EfficientViT_m1 = {
        'img_size': 224,
        'patch_size': 16,
        'embed_dim': [128, 144, 192],
        'depth': [1, 2, 3],
        'num_heads': [2, 3, 3],
        'window_size': [7, 7, 7],
        'kernels': [7, 5, 3, 3],
    }

EfficientViT_m2 = {
        'img_size': 224,
        'patch_size': 16,
        'embed_dim': [128, 192, 224],
        'depth': [1, 2, 3],
        'num_heads': [4, 3, 2],
        'window_size': [7, 7, 7],
        'kernels': [7, 5, 3, 3],
    }

EfficientViT_m3 = {
        'img_size': 224,
        'patch_size': 16,
        'embed_dim': [128, 240, 320],
        'depth': [1, 2, 3],
        'num_heads': [4, 3, 4],
        'window_size': [7, 7, 7],
        'kernels': [5, 5, 5, 5],
    }

EfficientViT_m4 = {
        'img_size': 224,
        'patch_size': 16,
        'embed_dim': [128, 256, 384],
        'depth': [1, 2, 3],
        'num_heads': [4, 4, 4],
        'window_size': [7, 7, 7],
        'kernels': [7, 5, 3, 3],
    }

EfficientViT_m5 = {
        'img_size': 224,
        'patch_size': 16,
        'embed_dim': [192, 288, 384],
        'depth': [1, 3, 4],
        'num_heads': [3, 3, 4],
        'window_size': [7, 7, 7],
        'kernels': [7, 5, 3, 3],
    }


def EfficientViT_M0(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m0):
    model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg)
    if pretrained:
        model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model']))
    if fuse:
        replace_batchnorm(model)
    return model


def EfficientViT_M1(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m1):
    model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg)
    if pretrained:
        model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model']))
    if fuse:
        replace_batchnorm(model)
    return model


def EfficientViT_M2(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m2):
    model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg)
    if pretrained:
        model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model']))
    if fuse:
        replace_batchnorm(model)
    return model


def EfficientViT_M3(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m3):
    model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg)
    if pretrained:
        model.load_state_dict(update_weight(model.state_dict(),
                              torch.load(pretrained)['model']))
    if fuse:
        replace_batchnorm(model)
    return model


def EfficientViT_M4(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m4):
    model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg)
    if pretrained:
        model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model']))
    if fuse:
        replace_batchnorm(model)
    return model


def EfficientViT_M5(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m5):
    model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg)
    if pretrained:
        model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model']))
    if fuse:
        replace_batchnorm(model)
    return model


def update_weight(model_dict, weight_dict):
    idx, temp_dict = 0, {}
    for k, v in weight_dict.items():
        # k = k[9:]
        if k in model_dict.keys() and np.shape(model_dict[k]) == np.shape(v):
            temp_dict[k] = v
            idx += 1
    model_dict.update(temp_dict)
    print(f'loading weights... {idx}/{len(model_dict)} items')
    return model_dict


if __name__ == '__main__':
    model = EfficientViT_M0('efficientvit_m0.pth')
    inputs = torch.randn((1, 3, 640, 640))
    res = model(inputs)
    for i in res:
        print(i.size())

tasks注册

在ultralytics/nn/tasks.py中进行如下操作：

步骤1:

from ultralytics.nn.backbone.EfficientViT import EfficientViT_M0, EfficientViT_M1, EfficientViT_M2, EfficientViT_M3, EfficientViT_M4, EfficientViT_M5

步骤2

修改def parse_model(d, ch, verbose=True):

        elif m in {EfficientViT_M0, EfficientViT_M1, EfficientViT_M2, EfficientViT_M3, EfficientViT_M4, EfficientViT_M5}:
            m = m(*args)
            c2 = m.channel

增加代码1

在def parse_model(d, ch, verbose=True):搜索下面的代码

for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"])

增加is_backbone = False和t = m

    is_backbone = False
    for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):  # from, number, module, args
        t = m
        m = getattr(torch.nn, m[3:]) if "nn." \
            in m else globals()[m]  # get module

修改代码

修改def parse_model(d, ch, verbose=True)中的

        # m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        # t = str(m)[8:-2].replace('__main__.', '')  # module type
        # m.np = sum(x.numel() for x in m_.parameters())  # number params
        # m_.i, m_.f, m_.type = i, f, t  # attach index, 'from' index, type
        # if verbose:
        #     LOGGER.info(f'{i:>3}{str(f):>20}{n_:>3}{m.np:10.0f}  {t:<45}{str(args):<30}')  # print
        # save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        # layers.append(m_)
        # if i == 0:
        #     ch = []
        # ch.append(c2)

修改为

        if isinstance(c2, list):
            is_backbone = True
            m_ = m
            m_.backbone = True
        else:
            m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
            t = str(m)[8:-2].replace('__main__.', '')  # module type
        m_.np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type = i + 4 if is_backbone else i, f, t  # attach index, 'from' index, type
        if verbose:
            LOGGER.info(f'{i:>3}{str(f):>20}{n_:>3}{m_.np:10.0f}  {t:<45}{str(args):<30}')  # print
        save.extend(x % (i + 4 if is_backbone else i) for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        if isinstance(c2, list):
            ch.extend(c2)
            for _ in range(5 - len(ch)):
                ch.insert(0, 0)
        else:
            ch.append(c2)

替换_predict_once方法

把_predict_once替换为下面的代码

    def _predict_once(self, x, profile=False, visualize=False, embed=None):
        y, dt, embeddings = [], [], []  # outputs
        for idx, m in enumerate(self.model):
            if m.f != \
                    -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
            if profile:
                self._profile_one_layer(m, x, dt)
            if hasattr(m, 'backbone'):
                x = m(x)
                for _ in range(5 - len(x)):
                    x.insert(0, None)
                for i_idx, i in enumerate(x):
                    if i_idx in self.save:
                        y.append(i)
                    else:
                        y.append(None)
                x = x[-1]
            else:
                x = m(x)  # run
                y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
            if embed and m.i in embed:
                embeddings.append(nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
                if m.i == max(embed):
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
        return x

配置yolov11-EfficientViT.yaml

ultralytics/cfg/models/11/yolov11-EfficientViT.yaml

# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e.
  # 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.50, 0.25, 1024] # summary: 319 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
  s: [0.50, 0.50, 1024] # summary: 319 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
  m: [0.50, 1.00, 512] # summary: 409 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
  l: [1.00, 1.00, 512] # summary: 631 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
  x: [1.00, 1.50, 512] # summary: 631 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs

# YOLO11n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, EfficientViT_M0, []] # 4 可选 EfficientViT_M0, EfficientViT_M1, EfficientViT_M2, EfficientViT_M3, EfficientViT_M4, EfficientViT_M5
  - [-1, 1, SPPF, [1024, 5]] # 5
  - [-1, 2, C2PSA, [1024]] # 6

# YOLO11n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 3], 1, Concat, [1]] # cat backbone P4
  - [-1, 2, C3k2, [512, False]] # 9

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 2], 1, Concat, [1]] # cat backbone P3
  - [-1, 2, C3k2, [256, False]] # 12 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P4
  - [-1, 2, C3k2, [512, False]] # 15 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 6], 1, Concat, [1]] # cat head P5
  - [-1, 2, C3k2, [1024, True]] # 18 (P5/32-large)

  - [[12, 15, 18], 1, Detect, [nc]] # Detect(P3, P4, P5)

实验

脚本

import warnings
warnings.filterwarnings('ignore')
from ultralytics import YOLO

if __name__ == '__main__':
    # 修改为自己的配置文件地址
    model = YOLO('/root/ultralytics-main/ultralytics/cfg/models/11/yolov11-EfficientViT.yaml')
    # 修改为自己的数据集地址
    model.train(data='/root/ultralytics-main/ultralytics/cfg/datasets/coco8.yaml',
                cache=False,
                imgsz=640,
                epochs=10,
                single_cls=False,  # 是否是单类别检测
                batch=8,
                close_mosaic=10,
                workers=0,
                optimizer='SGD',
                amp=True,
                project='runs/train',
                name='EfficientViT',
                )

结果