코딩/PyTorch
[Introduction to PyTorch] Building Models
guungyul
2025. 1. 11. 23:14
Building Models in PyTorch
torch.nn.Module and torch.nn.Parameter
Module class는 모델과 모델 구성 요소들을 포함
Parameter class는 learning weights들을 표현
두 개의 linear layer와 하나의 activation function, 그리고 softmax로 구성된 모델
import torch
class TinyModel(torch.nn.Module):
    """Minimal feed-forward classifier: Linear -> ReLU -> Linear -> Softmax.

    Maps 100 input features to 10 outputs through one 200-unit hidden layer.
    Every sub-module assigned as an attribute in ``__init__`` is registered
    with the parent ``Module``, which is why ``parameters()`` can find the
    weights of both linear layers.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(100, 200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(200, 10)
        # NOTE(review): no ``dim`` is passed, so the softmax axis is inferred
        # at call time and PyTorch emits an "implicit dimension" deprecation
        # warning. Kept as-is so the printed repr stays ``Softmax(dim=None)``;
        # pass ``dim=-1`` explicitly in real code.
        self.softmax = torch.nn.Softmax()

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension has 100 features.

        Returns:
            Tensor with 10 values in the last dimension, normalized by softmax.
        """
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x
# Instantiate the model and inspect its structure and learnable parameters.
tinymodel = TinyModel()

print('The model:')
print(tinymodel)

# Sub-modules are plain attributes, so a single layer can be printed directly.
print('\n\nJust one layer:')
print(tinymodel.linear2)

# parameters() on the model yields the parameters of every registered layer.
print('\n\nModel params:')
for param in tinymodel.parameters():
    print(param)

# parameters() on one layer yields only that layer's weight and bias.
print('\n\nLayer params:')
for param in tinymodel.linear2.parameters():
    print(param)
- torch.nn.Module의 subclass임
- __init__에서 모델 구조를 정의
- forward 메소드에서 실제 계산을 정의
- parameters() 메소드로 parameter 확인 가능
- Output
The model:
TinyModel(
(linear1): Linear(in_features=100, out_features=200, bias=True)
(activation): ReLU()
(linear2): Linear(in_features=200, out_features=10, bias=True)
(softmax): Softmax(dim=None)
)
Just one layer:
Linear(in_features=200, out_features=10, bias=True)
Model params:
Parameter containing:
tensor([[ 0.0765, 0.0830, -0.0234, ..., -0.0337, -0.0355, -0.0968],
[-0.0573, 0.0250, -0.0132, ..., -0.0060, 0.0240, 0.0280],
[-0.0908, -0.0369, 0.0842, ..., -0.0078, -0.0333, -0.0324],
...,
[-0.0273, -0.0162, -0.0878, ..., 0.0451, 0.0297, -0.0722],
[ 0.0833, -0.0874, -0.0020, ..., -0.0215, 0.0356, 0.0405],
[-0.0637, 0.0190, -0.0571, ..., -0.0874, 0.0176, 0.0712]],
requires_grad=True)
Parameter containing:
tensor([ 0.0304, -0.0758, -0.0549, -0.0893, -0.0809, -0.0804, -0.0079, -0.0413,
-0.0968, 0.0888, 0.0239, -0.0659, -0.0560, -0.0060, 0.0660, -0.0319,
-0.0370, 0.0633, -0.0143, -0.0360, 0.0670, -0.0804, 0.0265, -0.0870,
0.0039, -0.0174, -0.0680, -0.0531, 0.0643, 0.0794, 0.0209, 0.0419,
0.0562, -0.0173, -0.0055, 0.0813, 0.0613, -0.0379, 0.0228, 0.0304,
-0.0354, 0.0609, -0.0398, 0.0410, 0.0564, -0.0101, -0.0790, -0.0824,
-0.0126, 0.0557, 0.0900, 0.0597, 0.0062, -0.0108, 0.0112, -0.0358,
-0.0203, 0.0566, -0.0816, -0.0633, -0.0266, -0.0624, -0.0746, 0.0492,
0.0450, 0.0530, -0.0706, 0.0308, 0.0533, 0.0202, -0.0469, -0.0448,
0.0548, 0.0331, 0.0257, -0.0764, -0.0892, 0.0783, 0.0062, 0.0844,
-0.0959, -0.0468, -0.0926, 0.0925, 0.0147, 0.0391, 0.0765, 0.0059,
0.0216, -0.0724, 0.0108, 0.0701, -0.0147, -0.0693, -0.0517, 0.0029,
0.0661, 0.0086, -0.0574, 0.0084, -0.0324, 0.0056, 0.0626, -0.0833,
-0.0271, -0.0526, 0.0842, -0.0840, -0.0234, -0.0898, -0.0710, -0.0399,
0.0183, -0.0883, -0.0102, -0.0545, 0.0706, -0.0646, -0.0841, -0.0095,
-0.0823, -0.0385, 0.0327, -0.0810, -0.0404, 0.0570, 0.0740, 0.0829,
0.0845, 0.0817, -0.0239, -0.0444, -0.0221, 0.0216, 0.0103, -0.0631,
0.0831, -0.0273, 0.0756, 0.0022, 0.0407, 0.0072, 0.0374, -0.0608,
0.0424, -0.0585, 0.0505, -0.0455, 0.0268, -0.0950, -0.0642, 0.0843,
0.0760, -0.0889, -0.0617, -0.0916, 0.0102, -0.0269, -0.0011, 0.0318,
0.0278, -0.0160, 0.0159, -0.0817, 0.0768, -0.0876, -0.0524, -0.0332,
-0.0583, 0.0053, 0.0503, -0.0342, -0.0319, -0.0562, 0.0376, -0.0696,
0.0735, 0.0222, -0.0775, -0.0072, 0.0294, 0.0994, -0.0355, -0.0809,
-0.0539, 0.0245, 0.0670, 0.0032, 0.0891, -0.0694, -0.0994, 0.0126,
0.0629, 0.0936, 0.0058, -0.0073, 0.0498, 0.0616, -0.0912, -0.0490],
requires_grad=True)
Parameter containing:
tensor([[ 0.0504, -0.0203, -0.0573, ..., 0.0253, 0.0642, -0.0088],
[-0.0078, -0.0608, -0.0626, ..., -0.0350, -0.0028, -0.0634],
[-0.0317, -0.0202, -0.0593, ..., -0.0280, 0.0571, -0.0114],
...,
[ 0.0582, -0.0471, -0.0236, ..., 0.0273, 0.0673, 0.0555],
[ 0.0258, -0.0706, 0.0315, ..., -0.0663, -0.0133, 0.0078],
[-0.0062, 0.0544, -0.0280, ..., -0.0303, -0.0326, -0.0462]],
requires_grad=True)
Parameter containing:
tensor([ 0.0385, -0.0116, 0.0703, 0.0407, -0.0346, -0.0178, 0.0308, -0.0502,
0.0616, 0.0114], requires_grad=True)
Layer params:
Parameter containing:
tensor([[ 0.0504, -0.0203, -0.0573, ..., 0.0253, 0.0642, -0.0088],
[-0.0078, -0.0608, -0.0626, ..., -0.0350, -0.0028, -0.0634],
[-0.0317, -0.0202, -0.0593, ..., -0.0280, 0.0571, -0.0114],
...,
[ 0.0582, -0.0471, -0.0236, ..., 0.0273, 0.0673, 0.0555],
[ 0.0258, -0.0706, 0.0315, ..., -0.0663, -0.0133, 0.0078],
[-0.0062, 0.0544, -0.0280, ..., -0.0303, -0.0326, -0.0462]],
requires_grad=True)
Parameter containing:
tensor([ 0.0385, -0.0116, 0.0703, 0.0407, -0.0346, -0.0178, 0.0308, -0.0502,
0.0616, 0.0114], requires_grad=True)
Common Layer Types
Linear Layers
Linear / fully connected layer임. m input과 n output을 가지면 weight는 m * n개의 원소를 가지는 matrix이며, PyTorch에서는 (n, m) shape으로 저장됨 (예: Linear(3, 2)의 weight는 2 x 3).
# A linear layer with 3 input features and 2 output features.
lin = torch.nn.Linear(3, 2)
x = torch.rand(1, 3)
print('Input:')
print(x)

# The layer owns two parameters: the weight matrix and the bias vector.
print('\n\nWeight and Bias parameters:')
for param in lin.parameters():
    print(param)

# Calling the layer computes x @ weight.T + bias.
y = lin(x)
print('\n\nOutput:')
print(y)
- Output
Input:
tensor([[0.8790, 0.9774, 0.2547]])
Weight and Bias parameters:
Parameter containing:
tensor([[ 0.1656, 0.4969, -0.4972],
[-0.2035, -0.2579, -0.3780]], requires_grad=True)
Parameter containing:
tensor([0.3768, 0.3781], requires_grad=True)
Output:
tensor([[ 0.8814, -0.1492]], grad_fn=<AddmmBackward0>)
Convolution Layers
공간적 correlation이 중요한 데이터를 다룰 때 사용됨
import torch.nn.functional as F


class LeNet(torch.nn.Module):
    """LeNet-style CNN: two conv + max-pool stages, then three linear layers.

    The first linear layer expects 16 * 6 * 6 flattened features, which is
    what a 1-channel 32x32 input produces (32 -> 28 -> 14 -> 12 -> 6 per side).
    """

    def __init__(self):
        super().__init__()
        # 1 input image channel (black & white), 6 output channels,
        # 5x5 square convolution kernel
        self.conv1 = torch.nn.Conv2d(1, 6, 5)
        # 6 input channels, 16 output channels, 3x3 square convolution kernel
        self.conv2 = torch.nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        # 16 channels * 6 * 6 spatial positions left after the second pooling
        self.fc1 = torch.nn.Linear(16 * 6 * 6, 120)
        self.fc2 = torch.nn.Linear(120, 84)
        self.fc3 = torch.nn.Linear(84, 10)

    def forward(self, x):
        """Compute the 10 output scores for a batch of images.

        Args:
            x: Image batch laid out as (batch, 1, height, width); height and
                width must reduce to 6x6 after both conv/pool stages.

        Returns:
            Tensor of shape (batch, 10) holding the raw outputs of ``fc3``.
        """
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the window is square, a single number is enough
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension for the linear layers
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample (product of non-batch dims)."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
- Conv2d(1, 6, 5)는 각각 input channel, output channel, convolution kernel 크기를 나타냄
- PyTorch는 2D 뿐 아니라 1D, 3D tensor를 위한 convolutional layer도 제공
- 추가 parameter를 넣을 수 있음 (padding, 등)
Recurrent Layers
- one-hot encoded 단어가 주어지면 embedding 함
- 그 후 LSTM은 embedding sequence를 순회하면서 hidden_dim 길이의 vector를 출력
- 마지막 layer에서 출력된 vector를 log-softmax에 넣어 word가 각 tag에 맵핑될 (log) 확률 예측
class LSTMTagger(torch.nn.Module):
    """Sequence tagger: Embedding -> LSTM -> Linear -> log-softmax.

    Args:
        embedding_dim: Size of each word embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of distinct words the embedding table holds.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Score every tag for every word in one sentence.

        Args:
            sentence: Sequence of word indices for a single sentence
                (presumably a 1-D integer tensor, as ``nn.Embedding`` expects).

        Returns:
            Tensor of shape (len(sentence), tagset_size) with log-probabilities
            over tags for each word.
        """
        embeds = self.word_embeddings(sentence)
        # Reshape to (seq_len, batch=1, embedding_dim), the LSTM's default layout
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # Fully qualified call so this class does not rely on a module-level
        # ``F`` alias being imported correctly elsewhere in the file.
        tag_scores = torch.nn.functional.log_softmax(tag_space, dim=1)
        return tag_scores
- vocab_size: input 단어 수 (one-hot encoded vector의 크기)
- tagset_size: tag 수
- embedding_dim: embed 후 dimension
- hidden_dim: LSTM 기억 크기
Transformers
PyTorch는 Transformer class를 제공
- Attention head 수, encoder & decoder layer 수, dropout, activation function 등 조정 가능
- Encoder와 Decoder의 subcomponents도 조정 가능
Other Layers and Functions
Data Manipulation Layers
Max pooling
Cell들을 결합해 tensor의 차원을 줄여줌
# Random 6x6 input with a leading dimension of size 1.
my_tensor = torch.rand(1, 6, 6)
print(my_tensor)

# A 3x3 pooling window keeps only the maximum of each 3x3 cell group,
# shrinking the 6x6 input to 2x2.
maxpool_layer = torch.nn.MaxPool2d(kernel_size=3)
pooled = maxpool_layer(my_tensor)
print(pooled)
# OUTPUT
tensor([[[0.5036, 0.6285, 0.3460, 0.7817, 0.9876, 0.0074],
[0.3969, 0.7950, 0.1449, 0.4110, 0.8216, 0.6235],
[0.2347, 0.3741, 0.4997, 0.9737, 0.1741, 0.4616],
[0.3962, 0.9970, 0.8778, 0.4292, 0.2772, 0.9926],
[0.4406, 0.3624, 0.8960, 0.6484, 0.5544, 0.9501],
[0.2489, 0.8971, 0.7499, 0.1803, 0.9571, 0.6733]]])
tensor([[[0.7950, 0.9876],
[0.9970, 0.9926]]])
- 각 사분면의 최대 값을 가지는 2 * 2 tensor를 출력
Normalization layers
다른 layer로 넣기 전에 출력을 정규화 함
# Random values shifted and scaled away from zero mean / unit variance.
my_tensor = 5 + 20 * torch.rand(1, 4, 4)
print(my_tensor)
print(my_tensor.mean())

# Batch-normalize over 4 features; the output is re-centered around zero.
norm_layer = torch.nn.BatchNorm1d(num_features=4)
normed_tensor = norm_layer(my_tensor)
print(normed_tensor)
print(normed_tensor.mean())
# OUTPUT
tensor([[[ 7.7375, 23.5649, 6.8452, 16.3517],
[19.5792, 20.3254, 6.1930, 23.7576],
[23.7554, 20.8565, 18.4241, 8.5742],
[22.5100, 15.6154, 13.5698, 11.8411]]])
tensor(16.2188)
tensor([[[-0.8614, 1.4543, -0.9919, 0.3990],
[ 0.3160, 0.4274, -1.6834, 0.9400],
[ 1.0256, 0.5176, 0.0914, -1.6346],
[ 1.6352, -0.0663, -0.5711, -0.9978]]],
grad_fn=<NativeBatchNormBackward0>)
tensor(3.3528e-08, grad_fn=<MeanBackward0>)
- Vanishing/exploding gradient 해결 가능
Dropout layers
모델에 sparse representation을 장려
무작위로 input tensor의 일부를 mask 시킴 → overfitting 방지
# Input tensor that will be randomly masked.
my_tensor = torch.rand(1, 4, 4)

# Each element is zeroed with probability 0.4; a fresh random mask is drawn
# on every call, so the two printed results differ.
dropout = torch.nn.Dropout(p=0.4)
for _ in range(2):
    print(dropout(my_tensor))
# OUTPUT
tensor([[[0.8869, 0.6595, 0.2098, 0.0000],
[0.5379, 0.0000, 0.0000, 0.0000],
[0.1950, 0.2424, 1.3319, 0.5738],
[0.5676, 0.8335, 0.0000, 0.2928]]])
tensor([[[0.8869, 0.6595, 0.2098, 0.2878],
[0.5379, 0.0000, 0.4029, 0.0000],
[0.0000, 0.2424, 1.3319, 0.5738],
[0.0000, 0.8335, 0.9647, 0.0000]]])
- Inference 과정(eval mode)에서는 작동하지 않음
Activation Functions
Non-linearity를 모델링
- ReLU, Tanh, Hardtanh, sigmoid, etc
Loss Functions
- MSE, Cross Entropy Loss, Negative Log Likelihood Loss, etc