class Layer(torch.nn.Module):
    """Repeated building block: Linear -> ReLU -> sigmoid -> Linear -> ReLU.

    Both linear layers map 10 features to 10 features, so the block can be
    stacked any number of times without changing the feature dimension.
    """

    def __init__(self):
        super().__init__()
        # First stage: linear projection followed by its ReLU.
        self.linear1 = torch.nn.Linear(10, 10)
        self.relu1 = torch.nn.ReLU()
        # Second stage: linear projection followed by its ReLU.
        self.linear2 = torch.nn.Linear(10, 10)
        self.relu2 = torch.nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both stages and return the final activation."""
        # First stage, with a sigmoid applied on top of the ReLU output.
        hidden = torch.sigmoid(self.relu1(self.linear1(x)))
        # Second stage produces the block's output.
        return self.relu2(self.linear2(hidden))


class Model(torch.nn.Module):
    """A linear stem followed by a stack of identical ``Layer`` blocks.

    Args:
        apply_regional_compilation: If true, each ``Layer`` is wrapped with
            ``torch.compile`` individually and the model as a whole is left
            uncompiled. If false, plain ``Layer`` modules are used, so the
            caller can compile the whole model instead.
        num_layers: Number of repeated ``Layer`` blocks. Defaults to 64.
    """

    def __init__(self, apply_regional_compilation, num_layers=64):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)
        # Build the repeated layers once, so the layer count lives in one place.
        layers = [Layer() for _ in range(num_layers)]
        # Apply compile only to the repeated layers.
        if apply_regional_compilation:
            layers = [torch.compile(layer) for layer in layers]
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, x):
        """Apply the linear stem, then every layer in sequence."""
        # In regional compilation, the self.linear is outside of the scope of `torch.compile`.
        x = self.linear(x)
        for layer in self.layers:
            x = layer(x)
        return x

接下來，讓我們回顧一下完整模型與區域編譯之間的區別。

在完整模型編譯中，整個模型被作為一個整體進行編譯。這是大多數使用者使用 torch.compile 的常用方法。在此示例中，我們將 torch.compile 應用於 Model 物件。這實際上會將 64 個層內聯，生成一個大的圖進行編譯。您可以透過執行此教程並設定 TORCH_LOGS=graph_code 來檢視完整的圖。

# Build the model with plain (uncompiled) layers and move it to the GPU.
model = Model(apply_regional_compilation=False).cuda()
# Wrap the whole model with torch.compile (full model compilation).
full_compiled_model = torch.compile(model)

另一方面，區域編譯只編譯模型的某個區域。透過有策略地選擇編譯模型中的重複區域，我們可以編譯一個小得多的圖，然後將編譯後的結果重用於所有這些區域。在此示例中，torch.compile 僅應用於 layers，而不是整個模型。

# Build a model whose repeated layers are individually wrapped with
# torch.compile (the model itself is not compiled), then move it to the GPU.
regional_compiled_model = Model(apply_regional_compilation=True).cuda()

將編譯應用於重複區域而不是完整模型，可以大大節省編譯時間。在這裡，我們將只編譯一個層例項，然後在 Model 物件中重用它 64 次。

請注意，對於重複區域，模型的某些部分可能不會被編譯。例如，Model 中的 self.linear 超出了區域編譯的範圍。

另外，請注意效能加速與編譯時間之間存在權衡。完整模型編譯涉及更大的圖，理論上提供了更多的最佳化空間。然而在實務上，視模型而定，我們觀察到在許多情況下，完整模型編譯與區域編譯之間的加速差異很小。

接下來，讓我們測量完整模型與區域編譯的編譯時間。

torch.compile 是一個 JIT 編譯器，這意味著它在第一次呼叫時進行編譯。在下面的程式碼中，我們測量了第一次呼叫所花費的總時間。雖然這種方法不精確，但它提供了良好的估計，因為大部分時間都花在編譯上。

def measure_latency(fn, input):
    """Return the wall-clock seconds taken by a single call ``fn(input)``.

    Compiler state is reset first and the call runs inside a fresh inductor
    cache context, so one measurement does not reuse artifacts from another.
    """
    # Reset the compiler caches to ensure no reuse between different runs
    torch.compiler.reset()
    with torch._inductor.utils.fresh_inductor_cache():
        started_at = perf_counter()
        fn(input)
        # Synchronize with the GPU before reading the clock again.
        torch.cuda.synchronize()
        elapsed = perf_counter() - started_at
    return elapsed


# Random 10x10 input on the GPU; the last dimension matches the 10 input
# features expected by the model's linear layers.
input = torch.randn(10, 10, device="cuda")
# Time the first call of the fully compiled model.
full_model_compilation_latency = measure_latency(full_compiled_model, input)
print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds")

# Time the first call of the regionally compiled model the same way.
regional_compilation_latency = measure_latency(regional_compiled_model, input)
print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds")

# Check that regional compilation had the lower first-call latency.
assert regional_compilation_latency < full_model_compilation_latency

/usr/local/lib/python3.10/dist-packages/torch/backends/cuda/__init__.py:131: UserWarning:

Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)

/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py:312: UserWarning:

TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.

Full model compilation time = 11.40 seconds
Regional compilation time = 0.87 seconds

結論#

本教程展示了在模型包含重複區域時，如何控制其冷啟動編譯時間。這種方法需要使用者修改程式碼，將 torch.compile 應用於重複區域，而不是採用更常見的完整模型編譯。我們一直在努力減少冷啟動編譯時間。

指令碼總執行時間： (0 分鐘 13.798 秒)

透過區域編譯減少 torch.compile 的冷啟動編譯時間#

先決條件#

設定#

步驟#

結論#

文件

教程

資源