Something wrong with my checkpoint file when using torch.load()

Question

When I use torch.load to load one checkpoint:

torch.load('./latest_net_G.pth', map_location='cpu')

I got the runtime error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/opt/conda/lib/python3.7/tarfile.py in nti(s)
    186             s = nts(s, "ascii", "strict")
--> 187             n = int(s.strip() or "0", 8)
    188         except ValueError:

ValueError: invalid literal for int() with base 8: '_v2\nq\x03(('

During handling of the above exception, another exception occurred:

InvalidHeaderError                        Traceback (most recent call last)
/opt/conda/lib/python3.7/tarfile.py in next(self)
   2288             try:
-> 2289                 tarinfo = self.tarinfo.fromtarfile(self)
   2290             except EOFHeaderError as e:

/opt/conda/lib/python3.7/tarfile.py in fromtarfile(cls, tarfile)
   1094         buf = tarfile.fileobj.read(BLOCKSIZE)
-> 1095         obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
   1096         obj.offset = tarfile.fileobj.tell() - BLOCKSIZE

/opt/conda/lib/python3.7/tarfile.py in frombuf(cls, buf, encoding, errors)
   1036 
-> 1037         chksum = nti(buf[148:156])
   1038         if chksum not in calc_chksums(buf):

/opt/conda/lib/python3.7/tarfile.py in nti(s)
    188         except ValueError:
--> 189             raise InvalidHeaderError("invalid header")
    190     return n

InvalidHeaderError: invalid header

During handling of the above exception, another exception occurred:

ReadError                                 Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/torch/serialization.py in _load(f, map_location, pickle_module, **pickle_load_args)
    555         try:
--> 556             return legacy_load(f)
    557         except tarfile.TarError:

/opt/conda/lib/python3.7/site-packages/torch/serialization.py in legacy_load(f)
    466 
--> 467         with closing(tarfile.open(fileobj=f, mode='r:', format=tarfile.PAX_FORMAT)) as tar, \
    468                 mkdtemp() as tmpdir:

/opt/conda/lib/python3.7/tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
   1590                 raise CompressionError("unknown compression type %r" % comptype)
-> 1591             return func(name, filemode, fileobj, **kwargs)
   1592 

/opt/conda/lib/python3.7/tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
   1620             raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621         return cls(name, mode, fileobj, **kwargs)
   1622 

/opt/conda/lib/python3.7/tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
   1483                 self.firstmember = None
-> 1484                 self.firstmember = self.next()
   1485 

/opt/conda/lib/python3.7/tarfile.py in next(self)
   2300                 elif self.offset == 0:
-> 2301                     raise ReadError(str(e))
   2302             except EmptyHeaderError:

ReadError: invalid header

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-15-2abbf3aab3ae> in <module>
----> 1 torch.load('multi_task/checkpoints/latest_pet/latest_net_G.pth.tar', map_location='cpu')

/opt/conda/lib/python3.7/site-packages/torch/serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
    385         f = f.open('rb')
    386     try:
--> 387         return _load(f, map_location, pickle_module, **pickle_load_args)
    388     finally:
    389         if new_fd:

/opt/conda/lib/python3.7/site-packages/torch/serialization.py in _load(f, map_location, pickle_module, **pickle_load_args)
    558             if zipfile.is_zipfile(f):
    559                 # .zip is used for torch.jit.save and will throw an un-pickling error here
--> 560                 raise RuntimeError("{} is a zip archive (did you mean to use torch.jit.load()?)".format(f.name))
    561             # if not a tarfile, reset file offset and proceed
    562             f.seek(0)

RuntimeError: multi_task/checkpoints/latest_pet/latest_net_G.pth.tar is a zip archive (did you mean to use torch.jit.load()?)

And here's how I save the model:

    def save_networks(self, epoch):
        """Write the state dict of every named network into ``self.save_dir``.

        Each string entry ``name`` in ``self.model_names`` is resolved to the
        attribute ``self.net<name>`` and written to '<epoch>_net_<name>.pth'.
        Entries that are not strings are skipped.

        The 'Rgr' network is saved as-is, with no device move. Every other
        network is moved to the CPU before saving. When GPU ids are configured
        and CUDA is available, the inner ``net.module`` is what gets saved
        (presumably ``net`` is a DataParallel-style wrapper -- confirm against
        the model setup) and ``net`` is then moved back to ``self.gpu_ids[0]``.

        Parameters:
            epoch -- epoch label (documented as an int by the original author);
                     formatted with %s into the file name prefix
        """
        for name in self.model_names:
            # Guard clause: only string names identify a network attribute.
            if not isinstance(name, str):
                continue

            checkpoint_path = os.path.join(
                self.save_dir, '%s_net_%s.pth' % (epoch, name))
            network = getattr(self, 'net' + name)
            use_gpu = len(self.gpu_ids) > 0 and torch.cuda.is_available()

            if name == 'Rgr':
                # Saved on whatever device it currently lives on.
                torch.save(network.state_dict(), checkpoint_path)
            elif use_gpu:
                # Save CPU weights of the wrapped module, then restore the
                # network to the first configured GPU so training can go on.
                torch.save(network.module.cpu().state_dict(), checkpoint_path)
                network.cuda(self.gpu_ids[0])
            else:
                torch.save(network.cpu().state_dict(), checkpoint_path)

I don't know what's wrong with this checkpoint file, because I can load my other checkpoint files successfully. Also, my PyTorch version is 1.1.0. Could you help me with this problem?

Thank you.

It's possible the file may be corrupted. Can you save a dummy model and try to load that file? That way, you can confirm whether the problem is with your code or with your file. — akshayk07
– akshayk07, Commented Aug 15, 2020 at 5:52
Thanks. It turned out to be a problem with the torch version, @akshayk07 — YYGX
– YYGX, Commented Aug 15, 2020 at 13:54

YYGX · Accepted Answer · 2020-08-15 13:53:01Z

7

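I have found the solution. I train and debug on different clusters, and each one has a different torch version: the model was saved with torch 1.6.0 but loaded with torch 1.1.0. Starting with version 1.6, torch.save writes a zipfile-based format by default, which older versions of torch.load cannot read, so either load the checkpoint with torch >= 1.6 or save it with torch.save(..., _use_new_zipfile_serialization=False).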

answered Aug 15, 2020 at 13:53

YYGX

1612 silver badges10 bronze badges

Sign up to request clarification or add additional context in comments.

1 Comment

Amr Keleg Over a year ago

This was exactly the case for me as well!

Collectives™ on Stack Overflow

Something wrong with my checkpoint file when using torch.load()

1 Answer 1

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related