
Commit aa79bb9

Myle Ott authored and facebook-github-bot committed
Use 1-based indexing for epochs everywhere (#1053)
Summary: We are somewhat inconsistent in whether we're using 0-based or 1-based indexing for epochs. This should fix things to be 1-based internally, with logging and checkpoint naming still using 1-based indexing.

Pull Request resolved: fairinternal/fairseq-py#1053
Reviewed By: spencerp
Differential Revision: D20160715
Pulled By: myleott
fbshipit-source-id: 4ed94f9c371e1bfe29bcfa087fa6756507d6e627
1 parent 4171b83 commit aa79bb9

26 files changed (+63, -116 lines)
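
Before the per-file diffs, a standalone sketch of the convention this commit adopts (illustrative only, not code taken from the diff): epochs are 1-based everywhere, so defaults move from epoch=0 to epoch=1 and values loaded from older state are clamped up to 1.

# Illustrative sketch only: not fairseq code.
def normalize_epoch(epoch=1):
    return max(epoch, 1)  # we use 1-based indexing for epochs

assert normalize_epoch() == 1   # new default: the first epoch is 1, not 0
assert normalize_epoch(0) == 1  # 0-based values from old state get bumped to 1
assert normalize_epoch(5) == 5  # valid 1-based values pass through unchanged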

examples/roberta/commonsense_qa/commonsense_qa_task.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def setup_task(cls, args, **kwargs):
 
         return cls(args, vocab)
 
-    def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
+    def load_dataset(self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs):
         """Load a given dataset split.
 
         Args:

examples/roberta/wsc/wsc_task.py

Lines changed: 2 additions & 2 deletions
@@ -101,7 +101,7 @@ def binarize_with_mask(self, txt, prefix, suffix, leading_space, trailing_space)
         mask[mask_start:mask_start + mask_size] = 1
         return toks, mask
 
-    def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
+    def load_dataset(self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs):
         """Load a given dataset split.
 
         Args:
@@ -281,7 +281,7 @@ def setup_task(cls, args, **kwargs):
 
         return cls(args, vocab)
 
-    def load_dataset(self, split, epoch=0, combine=False, data_path=None, return_only=False, **kwargs):
+    def load_dataset(self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs):
         """Load a given dataset split.
 
         Args:

fairseq/benchmark/dummy_lm.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def setup_task(cls, args, **kwargs):
 
         return cls(args, dictionary)
 
-    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
         """Load a given dataset split.
         Args:
             split (str): name of the split (e.g., train, valid, test)

fairseq/benchmark/dummy_masked_lm.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ def setup_task(cls, args, **kwargs):
 
         return cls(args, dictionary)
 
-    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
         """Load a given dataset split.
         Args:
             split (str): name of the split (e.g., train, valid, test)

fairseq/checkpoint_utils.py

Lines changed: 7 additions & 1 deletion
@@ -109,6 +109,7 @@ def is_better(a, b):
         if os.path.lexists(old_chk):
             os.remove(old_chk)
 
+
 def load_checkpoint(args, trainer, **passthrough_args):
     """
     Load a checkpoint and restore the training iterator.
@@ -150,7 +151,7 @@ def load_checkpoint(args, trainer, **passthrough_args):
         epoch_itr.load_state_dict(itr_state)
     else:
         epoch_itr = trainer.get_train_iterator(
-            epoch=0, load_dataset=True, **passthrough_args
+            epoch=1, load_dataset=True, **passthrough_args
         )
 
     trainer.lr_step(epoch_itr.epoch)
@@ -349,6 +350,11 @@ def _upgrade_state_dict(state):
         state["args"].dataset_impl = "raw"
     elif getattr(state["args"], "lazy_load", False):
         state["args"].dataset_impl = "lazy"
+    # epochs start at 1
+    state["extra_state"]["train_iterator"]["epoch"] = max(
+        getattr(state["extra_state"]["train_iterator"], "epoch", 1),
+        1,
+    )
 
     # set any missing default values in the task, model or other registries
     registry.set_defaults(state["args"], tasks.TASK_REGISTRY[state["args"].task])
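
The added _upgrade_state_dict lines make checkpoints saved under the old 0-based convention load cleanly under the new one. A hedged, standalone sketch of that behaviour, using a plain dict and .get() rather than the exact committed code:

# Sketch only: clamp the epoch stored in an old checkpoint's train_iterator state.
def upgrade_train_iterator_state(extra_state):
    itr_state = extra_state.setdefault("train_iterator", {})
    itr_state["epoch"] = max(itr_state.get("epoch", 1), 1)  # epochs start at 1
    return extra_state

old = {"train_iterator": {"epoch": 0, "iterations_in_epoch": 12}}
assert upgrade_train_iterator_state(old)["train_iterator"]["epoch"] == 1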

fairseq/data/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -38,7 +38,6 @@
 from .resampling_dataset import ResamplingDataset
 from .roll_dataset import RollDataset
 from .round_robin_zip_datasets import RoundRobinZipDatasets
-from .sharded_dataset import ShardedDataset
 from .sort_dataset import SortDataset
 from .strip_token_dataset import StripTokenDataset
 from .subsample_dataset import SubsampleDataset
@@ -96,7 +95,6 @@
     'ResamplingDataset',
     'RightPadDataset',
     'RoundRobinZipDatasets',
-    'ShardedDataset',
     'ShardedIterator',
     'SortDataset',
     'StripTokenDataset',

fairseq/data/iterators.py

Lines changed: 12 additions & 7 deletions
@@ -100,17 +100,18 @@ def load_state_dict(self, state_dict):
 
 class StreamingEpochBatchIterator(EpochBatchIterating):
     def __init__(
-        self, dataset, epoch=0, num_shards=1, shard_id=0,
+        self, dataset, epoch=1, num_shards=1, shard_id=0,
     ):
         assert isinstance(dataset, torch.utils.data.IterableDataset)
         self.dataset = dataset
-        self.epoch = epoch
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
         self._current_epoch_iterator = None
         self.num_shards = num_shards
         self.shard_id = shard_id
 
     def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
-        self.epoch += 1
+        if self._current_epoch_iterator is not None and self.end_of_epoch():
+            self.epoch += 1
         self.dataset.set_epoch(self.epoch)
         self._current_epoch_iterator = CountingIterator(
             iterable=ShardedIterator(
@@ -165,12 +166,12 @@ class EpochBatchIterator(EpochBatchIterating):
             loading. 0 means the data will be loaded in the main process
             (default: 0).
         epoch (int, optional): the epoch to start the iterator from
-            (default: 0).
+            (default: 1).
     """
 
     def __init__(
         self, dataset, collate_fn, batch_sampler, seed=1, num_shards=1, shard_id=0,
-        num_workers=0, epoch=0,
+        num_workers=0, epoch=1,
     ):
         assert isinstance(dataset, torch.utils.data.Dataset)
         self.dataset = dataset
@@ -181,7 +182,7 @@ def __init__(
         self.shard_id = shard_id
         self.num_workers = num_workers
 
-        self.epoch = epoch
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
         self.shuffle = True
         self._cur_epoch_itr = None
         self._next_epoch_itr = None
@@ -204,7 +205,8 @@ def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
             self._cur_epoch_itr = self._next_epoch_itr
             self._next_epoch_itr = None
         else:
-            self.epoch += 1
+            if self._cur_epoch_itr is not None and self.end_of_epoch():
+                self.epoch += 1
             self._cur_epoch_itr = self._get_iterator_for_epoch(
                 self.epoch, shuffle, fix_batches_to_gpus=fix_batches_to_gpus,
             )
@@ -244,6 +246,9 @@ def load_state_dict(self, state_dict):
                 shuffle=state_dict.get('shuffle', True),
                 offset=itr_pos,
             )
+            if self._next_epoch_itr is None:
+                # we finished the epoch, increment epoch counter
+                self.epoch += 1
 
     def _get_iterator_for_epoch(self, epoch, shuffle, fix_batches_to_gpus=False, offset=0):
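
The net effect of these iterator changes, shown as a self-contained stand-in rather than the real EpochBatchIterator (which needs a dataset, collate_fn and batch_sampler): the first call to next_epoch_itr() keeps epoch at 1, and the counter only advances once the previous epoch's iterator has been exhausted.

# Minimal stand-in for the new next_epoch_itr() bookkeeping (not fairseq code).
class ToyEpochIterator:
    def __init__(self, data, epoch=1):
        self.data = data
        self.epoch = max(epoch, 1)  # 1-based indexing for epochs
        self._cur = None
        self._consumed = 0

    def end_of_epoch(self):
        return self._consumed >= len(self.data)

    def next_epoch_itr(self):
        if self._cur is not None and self.end_of_epoch():
            self.epoch += 1  # only advance after the previous epoch finished
        self._consumed = 0
        self._cur = iter(self.data)
        return self

    def __iter__(self):
        for x in self._cur:
            self._consumed += 1
            yield x

itr = ToyEpochIterator([1, 2, 3])
assert itr.epoch == 1
list(itr.next_epoch_itr())  # first call: still epoch 1
itr.next_epoch_itr()
assert itr.epoch == 2       # previous epoch was exhausted, so we advanced
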
fairseq/data/resampling_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ class ResamplingDataset(BaseWrapperDataset):
         batch_by_size (bool): whether or not to batch by sequence length
             (default: True).
         seed (int): RNG seed to use (default: 0).
-        epoch (int): starting epoch number (default: 0).
+        epoch (int): starting epoch number (default: 1).
     """
 
     def __init__(
@@ -42,7 +42,7 @@ def __init__(
         size_ratio=1.0,
         batch_by_size=True,
         seed=0,
-        epoch=0,
+        epoch=1,
     ):
         super().__init__(dataset)
 
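ResamplingDataset keeps its seed argument but now starts at epoch 1. A sketch of the general idea of epoch-dependent, reproducible resampling (illustrative only, not the dataset's actual internals):

# Sketch: derive a per-epoch, reproducible resample from (seed, epoch).
import numpy as np

def resample_indices(num_items, size_ratio=1.0, seed=0, epoch=1):
    rng = np.random.RandomState([seed, epoch])  # distinct but reproducible per epoch
    return rng.choice(num_items, int(num_items * size_ratio), replace=True)

first = resample_indices(10, seed=0, epoch=1)
again = resample_indices(10, seed=0, epoch=1)
assert (first == again).all()  # same (seed, epoch) -> same sample
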
fairseq/data/sharded_dataset.py

Lines changed: 0 additions & 60 deletions
This file was deleted.

fairseq/tasks/cross_lingual_lm.py

Lines changed: 4 additions & 3 deletions
@@ -102,7 +102,7 @@ def _load_single_lang_dataset(self, split, epoch):
 
         paths = utils.split_paths(self.args.data)
         assert len(paths) > 0
-        data_path = paths[epoch % len(paths)]
+        data_path = paths[(epoch - 1) % len(paths)]
 
         for k in itertools.count():
             split_k = split + (str(k) if k > 0 else '')
@@ -136,8 +136,9 @@ def _load_single_lang_dataset(self, split, epoch):
 
         return dataset, sizes
 
-    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
         """Load a given dataset split.
+
         Args:
             split (str): name of the split (e.g., train, valid, test)
         """
@@ -165,5 +166,5 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs):
 
         self.datasets[split] = MultiCorpusSampledDataset(dataset_map)
         logger.info('{} {} {} examples'.format(
-            utils.split_paths(self.args.data)[epoch], split, len(self.datasets[split]))
+            utils.split_paths(self.args.data)[epoch - 1], split, len(self.datasets[split]))
         )
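
A small worked example of the shard selection above (the paths are hypothetical): with 1-based epochs, subtracting 1 before the modulo keeps epoch 1 on the first shard and lets later epochs wrap around correctly.

# Worked example of paths[(epoch - 1) % len(paths)] under 1-based epochs.
paths = ["/data/xlm/shard0", "/data/xlm/shard1"]  # hypothetical shard paths

for epoch in (1, 2, 3):
    data_path = paths[(epoch - 1) % len(paths)]
    print(epoch, data_path)  # epoch 1 -> shard0, epoch 2 -> shard1, epoch 3 -> shard0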
