# Обучение нейросетей: Базовые приемы

In [None]:
!curl -O 'https://courses.cv-gml.ru/storage/seminars/nn-training-basics/dataset.zip'

In [None]:
!unzip -qo dataset.zip

In [None]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from torch import nn
from tqdm.auto import tqdm

# Default figure size in inches (width, height); increase if figures
# appear small.
fx, fy = 14.08, 6.40
plt.rcParams["figure.figsize"] = (fx, fy)

# Stay at "high" float32 matmul precision (no `bfloat16` matmul) so results
# are consistent. Switching to "medium" may be faster on some devices
# without much precision loss.
torch.set_float32_matmul_precision("high")

In [None]:
def show_images(images, titles=None):
    """Display ``images`` side by side in a single row.

    Args:
        images: Sequence of image-like objects accepted by ``Axes.imshow``.
        titles: Optional sequence with one title per image. ``None`` or an
            empty sequence means that no titles are drawn.
    """
    num = len(images)
    # Avoid a mutable default argument and element-wise `!=` comparisons
    # (e.g. when `titles` is a tuple or an array rather than a list).
    has_titles = titles is not None and len(titles) > 0

    # squeeze=False always yields a 2-D array of axes, so the single-image
    # case needs no special handling.
    fig, axs = plt.subplots(nrows=1, ncols=num, squeeze=False, layout="constrained")
    for i, ax in enumerate(axs[0]):
        ax.imshow(images[i])
        ax.axis("off")
        if has_titles:
            ax.set_title(titles[i])

    plt.show(fig)
    plt.close(fig)

## Обучение простого классификатора на PyTorch

In [None]:
# Maps every weather class name (also the name of its sub-directory inside
# the dataset folder) to the integer label used by the classifier.
CLASSES = {"cloudy": 0, "foggy": 1, "rainy": 2, "shine": 3, "sunrise": 4}

In [None]:
# Preview one sample per class: the first file in sorted order
titles = list(CLASSES.keys())
weather_images = [
    PIL.Image.open(sorted(glob.glob(f"./dataset/{weather}/*"))[0])
    for weather in titles
]
show_images(weather_images, titles)

Научимся использовать свои данные при обучении, для этого необходимо создать свой собственный подкласс **`torch.utils.data.Dataset`**.

Также для подтверждения успешности обучения модели понадобится валидационная выборка, которая не должна пересекаться с обучающей.

In [None]:
# Spatial size (height, width) every image is resized to before it is fed
# to the network, and the number of samples per batch.
NETWORK_SIZE = (64, 64)
BATCH_SIZE = 16

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (The status emoji were mis-encoded in the original messages.)
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print("Using the GPU 😊")
else:
    DEVICE = torch.device("cpu")
    print("Using the CPU 😞")

In [None]:
import torchvision.transforms.v2 as T

# Per-channel (R, G, B) mean and standard deviation used for normalization
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
# Default preprocessing, applied in order: convert to a tensor image, cast
# to float32 with value scaling, resize to NETWORK_SIZE, then normalize
# every channel with the statistics above.
DEFAULT_TRANSFORM = T.Compose(
    [
        T.ToImage(),
        T.ToDtype(torch.float32, scale=True),
        T.Resize(size=NETWORK_SIZE),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

In [None]:
import random

from torch.utils import data


class MyCustomDataset(data.Dataset):
    """Image-classification dataset that reads ``root_dir/<class>/*`` lazily.

    Only file paths and integer labels are kept in memory; an image is read
    from disk when it is indexed. Every class is shuffled with a seeded RNG
    and cut at ``train_fraction``, so a "train" and a "valid" instance built
    with the same ``root_dir``, ``train_fraction`` and ``split_seed`` expose
    complementary, non-overlapping subsets.

    Args:
        mode: ``"train"`` or ``"valid"`` -- which part of the split to expose.
        root_dir: Directory with one sub-directory per key of ``CLASSES``.
        train_fraction: Fraction of each class assigned to the train split.
        split_seed: Seed of the RNG used to shuffle the files of each class.
        transform: Callable applied to the RGB image array returned by
            ``np.array``; defaults to ``DEFAULT_TRANSFORM``.

    Raises:
        RuntimeError: If ``mode`` is neither ``"train"`` nor ``"valid"``.
    """

    def __init__(
        self,
        mode,
        root_dir="./dataset",
        train_fraction=0.8,
        split_seed=42,
        transform=None,
    ):
        # Validate once, up front: inside the per-class loop the check would
        # silently be skipped whenever CLASSES is empty.
        if mode not in ("train", "valid"):
            raise RuntimeError(f"Invalid mode: {mode!r}")

        # We can't store all the images in memory at the same time,
        # because sometimes we have to work with very large datasets
        # so we will only store data paths.
        paths = []
        labels = []
        rng = random.Random(split_seed)
        for cls_name, cls_idx in CLASSES.items():
            # Sort before shuffling so that train and validation splits
            # use the same (random) order of samples
            cls_paths = sorted(glob.glob(f"{root_dir}/{cls_name}/*"))
            split = int(train_fraction * len(cls_paths))
            rng.shuffle(cls_paths)

            if mode == "train":
                cls_paths = cls_paths[:split]
            else:
                cls_paths = cls_paths[split:]

            paths.extend(cls_paths)
            labels.extend([cls_idx] * len(cls_paths))

        self._len = len(paths)
        self._paths = paths
        # Explicit int64: the default integer dtype is platform-dependent
        # (and float64 for an empty list), while classification losses
        # expect 64-bit integer targets.
        self._labels = np.array(labels, dtype=np.int64)

        if transform is None:
            transform = DEFAULT_TRANSFORM
        self._transform = transform

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self._paths)

    def __getitem__(self, index):
        """Return the ``(transformed_image, label)`` pair stored at ``index``."""
        img_path = self._paths[index]
        label = self._labels[index]

        # Read the image from disk; the context manager closes the file
        # handle as soon as the pixels have been copied into the array.
        with PIL.Image.open(img_path) as img:
            image = np.array(img.convert("RGB"))

        # Apply the desired transformations
        image = self._transform(image)

        return image, label

In [None]:
import os

# Prepare the data
ds_train = MyCustomDataset(mode="train")
ds_valid = MyCustomDataset(mode="valid")

# Training batches are reshuffled every epoch and the incomplete last batch
# is dropped; validation keeps a fixed order and every sample.
# os.cpu_count() may return None when the number of CPUs cannot be
# determined, so fall back to loading in the main process (0 workers).
dl_train = data.DataLoader(
    ds_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=os.cpu_count() or 0,
)
dl_valid = data.DataLoader(
    ds_valid,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=os.cpu_count() or 0,
)

Будем использовать простую сверточную архитектуру

In [None]:
class MyModel(nn.Sequential):
    """Small LeNet-style CNN: three conv/ReLU/max-pool stages and three linear layers.

    The first linear layer expects 5 * 7 * 7 features, which is what the
    convolutional part produces for 64x64 inputs.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes):
        super().__init__()

        # (name, layer) pairs in execution order. Registering them one by
        # one keeps the sub-module names (and therefore the state-dict
        # keys) identical to plain attribute assignment.
        layers = [
            ("conv1", nn.Conv2d(3, 6, kernel_size=5)),
            ("relu1", nn.ReLU()),
            ("pool1", nn.MaxPool2d(kernel_size=2)),
            ("conv2", nn.Conv2d(6, 16, kernel_size=3)),
            ("relu2", nn.ReLU()),
            ("pool2", nn.MaxPool2d(kernel_size=2)),
            ("conv3", nn.Conv2d(16, 5, kernel_size=3, padding=1)),
            ("relu3", nn.ReLU()),
            ("pool3", nn.MaxPool2d(kernel_size=2)),
            # (N, 5, 7, 7) -> (N, 5 * 7 * 7 == 245)
            ("flatten", nn.Flatten(start_dim=1, end_dim=-1)),
            ("fc1", nn.Linear(5 * 7 * 7, 120)),
            ("relu4", nn.ReLU()),
            ("fc2", nn.Linear(120, 84)),
            ("relu5", nn.ReLU()),
            ("fc3", nn.Linear(84, num_classes)),
        ]
        for layer_name, layer in layers:
            self.add_module(layer_name, layer)

#### Дизайн / выбор архитектуры

<table style="font-size: 2rem;"><thead>
  <tr>
    <td style="border-right: solid 1px;"></td>
    <th style="text-align: center;">inference (production)</th>
    <th style="text-align: center;">training (development)</th>
  </tr></thead>
<tbody>
  <tr>
    <th style="border-right: solid 1px;">скорость работы<br/>(в FLOPs, FPS или seconds/sample)</th>
    <td style="text-align: center;">на целевом девайсе, с использованием<br/>inference-specific оптимизаций</td>
    <td style="text-align: center;">на девайсе для обучения, время,<br/>необходимое на проведение экспериментов</td>
  </tr>
  <tr>
    <th style="border-right: solid 1px;">количество параметров<br/>(в штуках или байтах)</th>
    <td style="text-align: center;">размер приложения на диске,<br/>скорость загрузки на девайс</td>
    <td style="text-align: center;">репрезентативная способность модели,<br/>склонность к переобучению</td>
  </tr>
  <tr>
    <th style="border-right: solid 1px;">пиковое потребление памяти<br/>(в байтах или % от максимума)</th>
    <td style="text-align: center;">возможность "уместиться" на целевом девайсе,<br/>возможность параллельной обработки</td>
    <td style="text-align: center;">максимальный размер батча,<br/>требования к девайсу для обучения</td>
  </tr>
</tbody>
</table>

Все 6 пунктов важны. Каждый из этих пунктов нетривиально зависит от выбора гиперпараметров и элементов дизайна архитектуры. В частности, обратите внимание, что все 6 пунктов в общем случае независимы и не обязаны коррелировать друг с другом (то есть они могут увеличиваться/уменьшаться независимо друг от друга).

---

Тривиальный цикл для обучения на PyTorch (см. прошлый семинар)

In [None]:
def train(num_epochs):
    """Train a fresh ``MyModel`` and report metrics after every epoch.

    Each epoch runs one optimization pass over ``dl_train`` (Adam, lr=1e-3,
    cross-entropy loss) and prints the mean batch loss, then one pass over
    ``dl_valid`` without gradients and prints the sample-wise accuracy.

    Args:
        num_epochs: Number of train + validation passes to run.

    Returns:
        The trained model, left in evaluation mode by the last validation
        pass (when at least one epoch has been run).
    """
    # Prepare model and optimization stuff
    net = MyModel(num_classes=len(CLASSES)).to(DEVICE)
    criterion = torch.nn.CrossEntropyLoss().to(DEVICE)
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        # ---- Optimization pass over the training data ----
        net = net.train()
        batch_losses = []
        bar = tqdm(total=len(dl_train), desc=f"Epoch {epoch}", leave=False)
        for inputs, targets in dl_train:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

            loss = criterion(net(inputs), targets)
            # Keep a detached copy so the stored value holds no graph
            batch_losses.append(loss.detach())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            bar.update()
        bar.close()

        mean_loss = torch.stack(batch_losses).mean().item()
        print(
            f"Epoch {epoch},",
            f"train_loss: {mean_loss:.8f}",
        )

        # ---- Metrics on the validation data ----
        net = net.eval()
        hits = []
        bar = tqdm(total=len(dl_valid), desc=f"Epoch {epoch}", leave=False)
        for inputs, targets in dl_valid:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

            with torch.no_grad():
                logits = net(inputs)

            # One boolean per sample: was the top-scoring class correct?
            hits.append(logits.argmax(axis=-1) == targets)

            bar.update()
        bar.close()

        accuracy = torch.concat(hits).float().mean().item()
        print(
            f"Epoch {epoch},",
            f"valid_accs: {accuracy:.8f}",
        )

    return net

In [None]:
# Run the plain PyTorch loop for three epochs and keep the trained model
model = train(num_epochs=3)

Сохранение и загрузка параметров

In [None]:
# Get all of the current model parameters
sd = model.state_dict()
print(list(sd.keys()))

In [None]:
# The same bias, once read from the state dict and once from the module
# attribute; both printouts are expected to show the same values
print(sd["conv1.bias"])
print()
print(model.conv1.bias)

In [None]:
# Save the parameters to disk
torch.save(sd, "model.pt")

In [None]:
# This is WRONG! (kept behind `if False` so that it never actually runs)
if False:
    torch.save(model, "dont_do_this.pt")

# This "kind of" works, but it pickles the whole MyModel
# object itself, instead of only saving the parameters.
# This is almost never what you actually want.

In [None]:
# Re-create the model object from scratch
model = MyModel(num_classes=len(CLASSES))

# Load the saved parameters
#
# WARNING!
# By default, torch.load uses pickle which
# can cause arbitrary code execution.
#
# NEVER use torch.load without explicitly setting
# weights_only=True. Alternatively, consider using
# safetensors when loading parameters from
# unknown/untrusted sources.
sd = torch.load(
    "model.pt",
    map_location=DEVICE,
    weights_only=True,
)
model.load_state_dict(sd);

---

## PyTorch Lightning

<img src="https://courses.cv-gml.ru/storage/seminars/nn-training-basics/lightning_logo.png" width="50%">

### Почему PyTorch Lightning?

#### Много реализованного функционала

* Обучение на GPU/нескольких GPU/TPU

* Multi-node training

* Автоматическое логирование

* Gradient accumulation

* …

#### Меньше boilerplate кода

Любое исследование можно начать с простого кода и быстро добавить более сложные (уже реализованные) вещи, такие как: обучение на GPU, 16-bit, сохранение чекпоинтов, логирование, …

PyTorch Lightning сделал и протестировал много всего за Вас, чтобы Вы могли сосредоточиться на исследовании других идей.

Меньше инженерного кода означает:

* меньше багов

* более быстрое итерирование и прототипирование

#### Стандартная структура кода

PyTorch Lightning задает дефолтную "opinionated" схему организации кода. Переходя в новый проект, не нужно с нуля разбираться в том, как авторы переизобрели велосипед. Callback функции имеют предсказуемые имена, меньше глобальных объектов, меньше магических констант и т. п.

In [None]:
import lightning as L


# Direct translation of the previous train function
class MyTrainingModule(L.LightningModule):
    """Lightning counterpart of the hand-written ``train`` loop.

    Per-batch results are buffered in plain lists and reduced, printed and
    cleared in the matching ``on_*_epoch_end`` hook.
    """

    def __init__(self):
        super().__init__()
        self.model = MyModel(num_classes=len(CLASSES))
        # Buffers: detached batch losses / per-sample "prediction is
        # correct" flags collected during the current epoch
        self.train_loss = []
        self.valid_accs = []

    def configure_optimizers(self):
        """Return the Adam optimizer (lr=1e-3) over the model parameters."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, batch):
        """Compute the cross-entropy loss of one batch and remember it."""
        inputs, targets = batch
        loss = F.cross_entropy(self.model(inputs), targets)
        self.train_loss.append(loss.detach())
        return loss

    def validation_step(self, batch):
        """Record which samples of the batch were classified correctly."""
        inputs, targets = batch
        predicted = self.model(inputs).argmax(dim=-1)
        self.valid_accs.append(predicted == targets)
        return {}

    def on_train_epoch_end(self):
        """Print the mean training loss of the epoch and reset the buffer."""
        mean_loss = torch.stack(self.train_loss).mean().item()
        # The buffer must be emptied, otherwise it leaks into the next epoch
        self.train_loss.clear()
        print(
            f"Epoch {self.trainer.current_epoch},",
            f"train_loss: {mean_loss:.8f}",
        )

    def on_validation_epoch_end(self):
        """Print the validation accuracy of the epoch and reset the buffer."""
        accuracy = torch.cat(self.valid_accs).float().mean().item()
        # The buffer must be emptied, otherwise it leaks into the next epoch
        self.valid_accs.clear()
        print(
            f"Epoch {self.trainer.current_epoch},",
            f"valid_accs: {accuracy:.8f}",
        )

In [None]:
# Init our training module
training_module = MyTrainingModule()

# Initialize a trainer
trainer = L.Trainer(accelerator="auto", max_epochs=3)

# Train the model ‚ö°
trainer.fit(training_module, dl_train, dl_valid)

### Возобновление обучения

Как уже было сказано выше, в PyTorch Lightning реализовано автоматическое логирование.

Сохранены все запуски:

In [None]:
!tree lightning_logs/

Как видите, Lightning автоматически сохранил чекпоинт последней эпохи. Если прервать обучение, то все равно будут сохранены результаты с последней успешно завершившейся эпохи.

Обратите внимание, что чекпоинт — это не просто веса обученной модели. Например, в чекпоинт также включаются статистики оптимизатора. Таким образом, используя чекпоинт, можно возобновить прерванный процесс обучения.

In [None]:
# Prefer the checkpoint path recorded by the trainer that has just been
# fitted. The literal path below only exists for the very first run
# (version_0) that stopped at exactly step 225, so it is kept purely as a
# fallback for when the trainer has no checkpoint recorded.
last_ckpt = (
    getattr(trainer.checkpoint_callback, "best_model_path", "")
    or "lightning_logs/version_0/checkpoints/epoch=2-step=225.ckpt"
)
# A checkpoint is a dictionary; list its top-level entries
ckpt = torch.load(last_ckpt, weights_only=True, map_location=DEVICE)
print(list(ckpt.keys()))

In [None]:
# Continue training the model for 2 more epochs
training_module = MyTrainingModule()
trainer = L.Trainer(accelerator="auto", max_epochs=5)
trainer.fit(training_module, dl_train, dl_valid, ckpt_path=last_ckpt)

In [None]:
# Build a training module directly from the checkpoint file
training_module = MyTrainingModule.load_from_checkpoint(last_ckpt, map_location=DEVICE)

In [None]:
# Echo the wrapped network of the restored module in the notebook
training_module.model

### Идиоматичный подсчет метрик

In [None]:
import torchmetrics


class MyTrainingModule(L.LightningModule):
    """Training module that reports loss and accuracy through Lightning logging.

    Logged names are ``train_loss`` / ``train_accs`` (per step and per
    epoch) and ``valid_loss`` / ``valid_accs`` (per epoch only).

    Args:
        num_classes: Number of target classes (size of the model output).
    """

    def __init__(self, num_classes=len(CLASSES)):
        super().__init__()
        self.model = MyModel(num_classes=num_classes)
        # One metric object per stage: a single shared instance would mix
        # training and validation batches in the same accumulated state.
        self.train_accuracy = torchmetrics.classification.Accuracy(
            task="multiclass",
            num_classes=num_classes,
        )
        self.valid_accuracy = torchmetrics.classification.Accuracy(
            task="multiclass",
            num_classes=num_classes,
        )

    def configure_optimizers(self):
        """Return the Adam optimizer (lr=1e-3) over the model parameters."""
        return torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def training_step(self, batch):
        """Run the shared step and log it under the ``train_`` prefix."""
        return self._step(batch, "train")

    def validation_step(self, batch):
        """Run the shared step and log it under the ``valid_`` prefix."""
        return self._step(batch, "valid")

    def _step(self, batch, kind):
        """Compute loss and accuracy of one batch and log both.

        Args:
            batch: ``(inputs, targets)`` pair as produced by the data loader.
            kind: ``"train"`` or ``"valid"``; selects the metric object and
                the prefix of the logged names.

        Returns:
            The cross-entropy loss of the batch.
        """
        x, y = batch
        p = self.model(x)
        loss = F.cross_entropy(p, y)

        metric = self.train_accuracy if kind == "train" else self.valid_accuracy
        # Calling the metric folds this batch into its accumulated state
        metric(p.argmax(dim=-1), y)

        # Logging the metric object itself (rather than the per-batch value)
        # lets Lightning compute the epoch-level value from the accumulated
        # state and reset the metric once the epoch is over.
        metrics = {
            f"{kind}_accs": metric,
            f"{kind}_loss": loss,
        }
        self.log_dict(
            metrics,
            prog_bar=True,
            logger=True,
            on_step=kind == "train",
            on_epoch=True,
        )

        return loss

In [None]:
def my_progress_bar():
    """Create a new tqdm progress-bar callback configured with ``leave=True``."""
    progress_bar = L.pytorch.callbacks.TQDMProgressBar(leave=True)
    return progress_bar

In [None]:
# Train the metric-logging module for three epochs with the custom
# progress bar
training_module = MyTrainingModule()
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=3,
    callbacks=[my_progress_bar()],
    # Make on_step log *literally* every step
    # (since our dataset is so small)
    log_every_n_steps=1,  # for demonstration purposes only
)
trainer.fit(training_module, dl_train, dl_valid)

### Tensorboard

Tensorboard — популярный инструментарий для визуализации различных объектов, получаемых в процессе обучения. В Lightning по умолчанию включено автоматическое сохранение логов в формате Tensorboard. Кроме того, в Lightning есть поддержка для ряда других библиотек логирования (WandB, Comet, MLFlow, Neptune, простой CSV или даже свой кастомный логгер).

Tensorboard можно запустить, выполнив команду `tensorboard --logdir lightning_logs` в терминале и открыв распечатанную ссылку в браузере (обычно http://localhost:6006/).

Либо, используя специальное расширение для Jupyter, можно открыть Tensorboard как embedded окошечко прямо в ноутбуке:

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir lightning_logs

### Расписание learning rate

При обучении нейронной сети полезно постепенно снижать темп обучения для того, чтобы более точно попасть в локальный минимум.

Снижают learning rate по определенным правилам. Достаточно часто используют один из следующих подходов:

- **time-based decay**
  $$\alpha(step) = \alpha_0\ {\cdot}\ \frac{1}{1 + \gamma{\cdot}step}$$
  <br/>

- **smooth exponential decay**
  $$\alpha(step) = \alpha_0\ {\cdot}\ {\gamma}^{step} \qquad (0 < \gamma < 1)$$
  <br/>

- **stepwise exponential decay**
  $$\alpha(step) = \alpha_0\ {\cdot}\ {\gamma}^{\left\lfloor\frac{step}{interval}\right\rfloor} \qquad (\text{e.g. }\ \gamma = 0.1,\ \ interval \propto epoch)$$
  <br/>

- **plateau decay**
  $$
      \alpha(0) = \alpha_0\\
      \alpha(epoch) = \begin{cases}
          \alpha(epoch - 1)\ {\cdot}\ \gamma\quad\text{if the metric has not improved for }P\text{ epochs}\\
          \alpha(epoch - 1)\quad\quad\text{otherwise}
      \end{cases}
  $$

<br/>
<img src="https://courses.cv-gml.ru/storage/seminars/nn-training-basics/lr_schedulers.png" width="50%">

In [None]:
class MyTrainingModule_WithLRScheduler(MyTrainingModule):
    """``MyTrainingModule`` whose optimizer is paired with a ``StepLR`` schedule."""

    def configure_optimizers(self):
        """Return the parent optimizer plus a step-decay scheduler config.

        Returns:
            A pair ``([optimizer], [scheduler_config])``.
        """
        base_optimizer = super().configure_optimizers()

        # Multiply the LR by 0.1 every five scheduler steps, i.e. every
        # five epochs with the interval/frequency configured below
        step_decay = torch.optim.lr_scheduler.StepLR(
            base_optimizer,
            step_size=5,
            gamma=0.1,
        )

        scheduler_config = dict(
            scheduler=step_decay,
            # Unit of the scheduler's step size: "epoch" updates the
            # scheduler at the end of an epoch, "step" after an optimizer
            # update
            interval="epoch",
            # Number of intervals (epochs or steps) between consecutive
            # calls to `scheduler.step()`; 1 means after every interval
            frequency=1,
            ## Metric to monitor for schedulers like `ReduceLROnPlateau`
            # monitor="valid_accs",
        )

        return [base_optimizer], [scheduler_config]

In [None]:
def my_lr_monitor():
    """Create a new learning-rate monitor callback with default settings."""
    lr_monitor = L.pytorch.callbacks.LearningRateMonitor()
    return lr_monitor

In [None]:
# Train for ten epochs with the step-decay schedule; the LR monitor
# callback is added alongside the progress bar
training_module = MyTrainingModule_WithLRScheduler()
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=10,
    callbacks=[my_progress_bar(), my_lr_monitor()],
    log_every_n_steps=1,  # for demonstration purposes only
)
trainer.fit(training_module, dl_train, dl_valid)

### Кастомизация чекпоинтов

Мы уже видели, что PyTorch Lightning сохраняет веса модели самостоятельно. Однако сохраняются веса только последней эпохи, метрики на которой могут быть не максимальны. Чтобы это исправить, самостоятельно настроим callback **`ModelCheckpoint`**.

Также воспользуемся **`EarlyStopping`** для того, чтобы обучение останавливалось при отсутствии роста метрик в течение нескольких эпох.

In [None]:
def my_training_checkpoint():
    """Create a checkpoint callback that tracks the best ``valid_accs``.

    It is configured to keep the single best checkpoint (highest
    ``valid_accs``) and, additionally, the most recent one.
    """
    checkpoint_options = dict(
        filename="{epoch}-{valid_accs:.3f}",
        monitor="valid_accs",
        mode="max",
        save_top_k=1,
        save_last=True,
    )
    return L.pytorch.callbacks.ModelCheckpoint(**checkpoint_options)


def my_early_stopping():
    """Create an early-stopping callback that watches ``valid_accs``.

    It is configured with ``mode="max"`` (higher is better), a patience of
    4 and verbose reporting.
    """
    stopping_options = dict(
        monitor="valid_accs",
        mode="max",
        patience=4,
        verbose=True,
    )
    return L.pytorch.callbacks.EarlyStopping(**stopping_options)

In [None]:
# Up to 100 epochs; the early-stopping callback may end the run sooner and
# the checkpoint callback tracks the best `valid_accs`
training_module = MyTrainingModule_WithLRScheduler()
trainer = L.Trainer(
    max_epochs=100,
    accelerator="auto",
    callbacks=[
        my_progress_bar(),
        my_lr_monitor(),
        my_training_checkpoint(),
        my_early_stopping(),
    ],
    log_every_n_steps=1,  # for demonstration purposes only
)
trainer.fit(training_module, dl_train, dl_valid)

In [None]:
!tree lightning_logs/

------

<img src="https://courses.cv-gml.ru/storage/seminars/nn-training-basics/albumentations_logo.png" width="50%">

## Почему Albumentations?
Albumentations — это Python-библиотека для быстрого и гибкого аугментирования изображений. Albumentations эффективно реализует широкий спектр операций преобразования изображений и обеспечивает при этом лаконичный, но мощный интерфейс для различных задач компьютерного зрения, включая классификацию, сегментацию и детекцию.

### Пример аугментаций из Albumentations:

![](https://courses.cv-gml.ru/storage/seminars/nn-training-basics/albumentations_1.png)

### Пример использования:

In [None]:
import albumentations as A

In [None]:
# First demo pipeline. Every step is applied with its own probability `p`:
# random resized crop to 128x128, rotation (limit=30), horizontal flip and
# a brightness/contrast change.
MyTransform = A.Compose(
    [
        A.RandomResizedCrop(size=(128, 128), p=0.3),
        A.Rotate(limit=30),
        A.HorizontalFlip(p=0.3),
        A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.5),
    ]
)

In [None]:
# Originals first, then four rows of independently augmented copies
show_images(weather_images, titles)
for _ in range(4):
    transformed = []
    for image in weather_images:
        # Albumentations takes and returns named arrays ("image" here)
        augmented = MyTransform(image=np.array(image))
        transformed.append(augmented["image"])
    show_images(transformed)

In [None]:
# Second demo pipeline: resize so that the smallest side is 160, optional
# small affine jitter, a random 128x128 crop, then optional color and
# flip augmentations.
MyTransform = A.Compose(
    [
        A.SmallestMaxSize(max_size=160),
        A.Affine(translate_percent=0.05, scale=(1.0, 1.05), rotate=15, p=0.5),
        A.RandomCrop(height=128, width=128),
        A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.5),
        A.HorizontalFlip(p=0.3),
    ]
)

In [None]:
# Originals first, then four rows of independently augmented copies
show_images(weather_images, titles)
for _ in range(4):
    transformed = []
    for image in weather_images:
        # Albumentations takes and returns named arrays ("image" here)
        augmented = MyTransform(image=np.array(image))
        transformed.append(augmented["image"])
    show_images(transformed)

#### Ковариантное преобразование изображений и меток

![](https://courses.cv-gml.ru/storage/seminars/nn-training-basics/albumentations_2.png)

------

### Добавим аугментации в обучение

In [None]:
import albumentations.pytorch.transforms

# Random augmentations, used for training only
augmentations = [
    A.SmallestMaxSize(max_size=160),
    A.Affine(translate_percent=0.05, scale=(1.0, 1.05), rotate=15, p=0.5),
    A.RandomCrop(height=128, width=128),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.5),
    A.HorizontalFlip(p=0.3),
]
# Don't forget to prepare the image data for consumption by PyTorch:
# resize to the network input size, convert to float (dividing by 255),
# normalize with the ImageNet statistics and convert to a tensor
common_transforms = [
    A.Resize(*NETWORK_SIZE),
    A.ToFloat(max_value=255),
    A.Normalize(max_pixel_value=1.0, mean=IMAGENET_MEAN, std=IMAGENET_STD),
    A.pytorch.transforms.ToTensorV2(),
]

# Training = augmentations + common preprocessing; validation = common
# preprocessing only
MyTrainTransform = A.Compose(augmentations + common_transforms)
MyValidTransform = A.Compose(common_transforms)

In [None]:
def my_train_transform(image):
    """Apply ``MyTrainTransform`` to ``image`` and return the resulting image."""
    result = MyTrainTransform(image=image)
    return result["image"]


def my_valid_transform(image):
    """Apply ``MyValidTransform`` to ``image`` and return the resulting image."""
    result = MyValidTransform(image=image)
    return result["image"]


# Same train/validation split as before, now with the Albumentations
# pipelines plugged in as the dataset transforms
ds_train_alb = MyCustomDataset(
    mode="train",
    transform=my_train_transform,
)
ds_valid_alb = MyCustomDataset(
    mode="valid",
    transform=my_valid_transform,
)

# os.cpu_count() may return None when the number of CPUs cannot be
# determined, so fall back to loading in the main process (0 workers).
dl_train_alb = data.DataLoader(
    ds_train_alb,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=os.cpu_count() or 0,
)
dl_valid_alb = data.DataLoader(
    ds_valid_alb,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=os.cpu_count() or 0,
)

In [None]:
# Same training setup as the previous run, but fed by the loaders that
# apply the Albumentations pipelines
training_module = MyTrainingModule_WithLRScheduler()
trainer = L.Trainer(
    max_epochs=100,
    accelerator="auto",
    callbacks=[
        my_progress_bar(),
        my_lr_monitor(),
        my_training_checkpoint(),
        my_early_stopping(),
    ],
    log_every_n_steps=1,  # for demonstration purposes only
)
trainer.fit(training_module, dl_train_alb, dl_valid_alb)

---