I'm trying to classify cats vs. dogs with GoogLeNet (PyTorch). Each class contains 4000 training images and 1000 test images, each of size 300*300. My computer has 32GB of RAM and an RTX 2080 Super graphics card. The error occurs as soon as training starts. Below is the GoogLeNet part of my code:
class Inception(nn.Module):
    """Inception block: four parallel branches concatenated along the
    channel dimension.

    Output channel count = out_dim1 + out_dim3 + out_dim5 + pool.
    All branches preserve the spatial size of the input.
    """

    def __init__(self, in_dim, out_dim1, mid_dim3, out_dim3, mid_dim5, out_dim5, pool):
        super(Inception, self).__init__()
        # Branch 1: plain 1x1 conv.
        self.lay1 = nn.Sequential(
            nn.Conv2d(in_dim, out_dim1, kernel_size=1),
            nn.BatchNorm2d(out_dim1),
            nn.ReLU(),
        )
        # Branch 2: 1x1 channel reduction followed by a 3x3 conv.
        self.lay2 = nn.Sequential(
            nn.Conv2d(in_dim, mid_dim3, kernel_size=1),
            nn.BatchNorm2d(mid_dim3),
            nn.ReLU(),
            nn.Conv2d(mid_dim3, out_dim3, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_dim3),
            nn.ReLU(),
        )
        # Branch 3: 1x1 reduction, then two stacked 3x3 convs
        # (equivalent receptive field to a single 5x5, fewer parameters).
        self.lay3 = nn.Sequential(
            nn.Conv2d(in_dim, mid_dim5, kernel_size=1),
            nn.BatchNorm2d(mid_dim5),
            nn.ReLU(),
            nn.Conv2d(mid_dim5, out_dim5, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_dim5),
            nn.ReLU(),
            nn.Conv2d(out_dim5, out_dim5, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_dim5),
            nn.ReLU(),
        )
        # Branch 4: 3x3 max-pool (stride 1, padded) followed by a 1x1 conv.
        self.lay4 = nn.Sequential(
            nn.MaxPool2d(3, 1, 1),
            nn.Conv2d(in_dim, pool, kernel_size=1),
            nn.BatchNorm2d(pool),
            nn.ReLU(),
        )

    def forward(self, x):
        """Apply all four branches to x and concatenate along dim 1 (channels)."""
        branches = (self.lay1, self.lay2, self.lay3, self.lay4)
        return torch.cat([branch(x) for branch in branches], 1)
class Google(nn.Module):
    """Simplified GoogLeNet-style classifier with a 2-class output head.

    Expects single-channel input (the stem is ``Conv2d(1, ...)``).

    NOTE(review): the final ``Linear(47872, 2)`` hard-codes the flattened
    feature size, which only matches one specific input resolution. With a
    300x300 input, the feature map after the two stride-2 max-pools and the
    8x8 average pool is 256 x 68 x 68 = 1,183,744 features, not 47872 —
    confirm the actual input size, or replace ``avgpool`` with
    ``nn.AdaptiveAvgPool2d(1)`` and ``Linear(256, 2)`` to make the head
    resolution-independent.
    """

    def __init__(self):
        super(Google, self).__init__()
        # Stem: one 3x3 conv producing 48 channels (spatial size preserved).
        self.pre_lay = nn.Sequential(
            nn.Conv2d(1, 48, 3, padding=1),
            nn.BatchNorm2d(48),
            nn.ReLU(),
        )
        # Each Inception's in_dim equals the previous block's concatenated
        # output width (out_dim1 + out_dim3 + out_dim5 + pool).
        self.glay1 = Inception(48, 16, 24, 32, 4, 8, 8)      # -> 64 channels
        self.glay2 = Inception(64, 32, 32, 48, 8, 24, 16)    # -> 120 channels
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)  # halves H and W
        self.glay3 = Inception(120, 48, 24, 52, 4, 12, 16)   # -> 128 channels
        self.glay4 = Inception(128, 40, 28, 56, 6, 16, 16)   # -> 128 channels
        self.glay5 = Inception(128, 32, 32, 64, 6, 16, 16)   # -> 128 channels
        self.glay6 = Inception(128, 28, 36, 72, 8, 16, 16)   # -> 132 channels
        self.glay7 = Inception(132, 64, 40, 80, 8, 32, 32)   # -> 208 channels
        self.glay8 = Inception(208, 64, 40, 80, 8, 32, 32)   # -> 208 channels
        self.glay9 = Inception(208, 96, 48, 96, 12, 32, 32)  # -> 256 channels
        self.avgpool = nn.AvgPool2d(8, stride=1)
        self.linear = nn.Linear(47872, 2)
        # Removed gc.collect() / torch.cuda.empty_cache() calls that were
        # here and in forward(): they do not prevent CUDA OOM (PyTorch's
        # caching allocator already reuses freed blocks), and calling
        # empty_cache() inside forward() forces an expensive GPU
        # synchronization on every training step. If activations don't fit,
        # reduce the batch size or input resolution instead.

    def forward(self, x):
        """Run the stem, nine Inception blocks, pooling, and the linear head.

        Returns a (batch, 2) logit tensor.
        """
        out = self.pre_lay(x)
        out = self.glay1(out)
        out = self.glay2(out)
        out = self.maxpool(out)
        out = self.glay3(out)
        out = self.glay4(out)
        out = self.glay5(out)
        out = self.glay6(out)
        out = self.glay7(out)
        out = self.maxpool(out)
        out = self.glay8(out)
        out = self.glay9(out)
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)  # flatten all features per sample
        return self.linear(out)
As I wrote, the error occurs at the very first step of GoogLeNet. Someone suggested adding `with torch.no_grad():` in the model's forward function when there is a CUDA memory error, but memory capacity was not the issue. With that change, the error `element 0 of tensors does not require grad and does not have a grad_fn` was raised instead. I also tried calling `empty_cache` at every step I suspected of using a lot of GPU memory, but it still did not work.
If anyone has run into a similar error or knows the cause, I would be very grateful for your advice.
Add full error message
This is the full traceback I got for the memory error.
The error below appeared after I added torch.no_grad() in forward().