save best and test on tiny/s0

Branch: pull/680/head
Author: Haoxin Ma · 4 years ago
Parent: 6487ca6022
Commit: 68bcc46940
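For orientation before the hunks: this commit replaces the flat checkpoint.save_parameters call in the Trainer with a KBestCheckpoint object (module deepspeech.utils.checkpoint) that keeps up to max_size checkpoints with the lowest val_loss plus the last_size most recent ones, and deletes a checkpoint tag only once it belongs to neither set. Below is a minimal standalone sketch of that retention policy, not the committed code; the class and method names are illustrative only.

    class RetentionPolicy:
        """Keep the K best (lowest val_loss) checkpoint tags plus the N most recent ones."""

        def __init__(self, max_size=5, last_size=1):
            self.best = {}    # tag -> val_loss of the current best-K set
            self.last = []    # most recent tags, oldest first
            self.max_size = max_size
            self.last_size = last_size

        def update(self, tag, val_loss):
            """Register a new checkpoint tag; return tags that are now safe to delete."""
            stale = set()
            # best-K by val_loss (strictly better than the current worst, as in the commit)
            if len(self.best) < self.max_size or val_loss < max(self.best.values()):
                if len(self.best) == self.max_size:
                    worst = max(self.best, key=self.best.get)
                    self.best.pop(worst)
                    stale.add(worst)
                self.best[tag] = val_loss
            # last-N by recency
            self.last.append(tag)
            if len(self.last) > self.last_size:
                stale.add(self.last.pop(0))
            # a tag is deletable only if neither set still references it
            return stale - set(self.best) - set(self.last)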

@@ -18,7 +18,7 @@ import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter
-from deepspeech.utils import checkpoint
+from deepspeech.utils.checkpoint import KBestCheckpoint
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log
@@ -139,9 +139,12 @@ class Trainer():
             "epoch": self.epoch,
             "lr": self.optimizer.get_lr()
         })
-        checkpoint.save_parameters(self.checkpoint_dir, self.iteration
+        self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration
                                    if tag is None else tag, self.model,
                                    self.optimizer, infos)
+        # checkpoint.save_parameters(self.checkpoint_dir, self.iteration
+        #                            if tag is None else tag, self.model,
+        #                            self.optimizer, infos)
 
     def resume_or_scratch(self):
         """Resume from latest checkpoint at checkpoints in the output
@@ -151,7 +154,7 @@ class Trainer():
        resume training.
        """
        scratch = None
-       infos = checkpoint.load_parameters(
+       infos = self.checkpoint.load_parameters(
            self.model,
            self.optimizer,
            checkpoint_dir=self.checkpoint_dir,
@@ -180,7 +183,7 @@ class Trainer():
         from_scratch = self.resume_or_scratch()
         if from_scratch:
             # save init model, i.e. 0 epoch
-            self.save(tag='init')
+            self.save(tag='init', infos=None)
         self.lr_scheduler.step(self.iteration)
 
         if self.parallel:
@@ -263,6 +266,9 @@ class Trainer():
         self.checkpoint_dir = checkpoint_dir
 
+        self.checkpoint = KBestCheckpoint(max_size=self.config.training.max_epoch,
+                                          last_size=self.config.training.last_epoch)
+
     @mp_tools.rank_zero_only
     def destory(self):
         """Close visualizer to avoid hanging after training"""

@@ -23,19 +23,110 @@ from paddle.optimizer import Optimizer
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log
 
+import glob
+
 logger = Log(__name__).getlog()
 
 __all__ = ["load_parameters", "save_parameters"]
 
+
+class KBestCheckpoint(object):
+    def __init__(self,
+                 max_size: int=5,
+                 last_size: int=1):
+        self.best_records: Mapping[Path, float] = {}
+        self.last_records = []
+        self.max_size = max_size
+        self.last_size = last_size
+        self._save_all = (max_size == -1)
+
+    def should_save_best(self, metric: float) -> bool:
+        if not self.best_full():
+            return True
+        # already full
+        worst_record_path = max(self.best_records, key=self.best_records.get)
+        worst_metric = self.best_records[worst_record_path]
+        return metric < worst_metric
+
+    def best_full(self):
+        return (not self._save_all) and len(self.best_records) == self.max_size
+
+    def last_full(self):
+        return len(self.last_records) == self.last_size
+
+    def add_checkpoint(self,
+                       checkpoint_dir, tag_or_iteration,
+                       model, optimizer, infos):
+        if("val_loss" not in infos.keys()):
+            self.save_parameters(checkpoint_dir, tag_or_iteration,
+                                 model, optimizer, infos)
+            return
+
+        #save best
+        if self.should_save_best(infos["val_loss"]):
+            self.save_checkpoint_and_update(infos["val_loss"],
+                                            checkpoint_dir, tag_or_iteration,
+                                            model, optimizer, infos)
+
+        #save last
+        self.save_last_checkpoint_and_update(checkpoint_dir, tag_or_iteration,
+                                             model, optimizer, infos)
+
+        if isinstance(tag_or_iteration, int):
+            self._save_record(checkpoint_dir, tag_or_iteration)
+
+    def save_checkpoint_and_update(self, metric,
+                                   checkpoint_dir, tag_or_iteration,
+                                   model, optimizer, infos):
+        # remove the worst
+        if self.best_full():
+            worst_record_path = max(self.best_records,
+                                    key=self.best_records.get)
+            self.best_records.pop(worst_record_path)
+            if(worst_record_path not in self.last_records):
+                print('----to remove (best)----')
+                print(worst_record_path)
+                self.del_checkpoint(checkpoint_dir, worst_record_path)
+
+        # add the new one
+        self.save_parameters(checkpoint_dir, tag_or_iteration,
+                             model, optimizer, infos)
+        self.best_records[tag_or_iteration] = metric
+
+    def save_last_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration,
+                                        model, optimizer, infos):
+        # remove the old
+        if self.last_full():
+            to_del_fn = self.last_records.pop(0)
+            if(to_del_fn not in self.best_records.keys()):
+                print('----to remove (last)----')
+                print(to_del_fn)
+                self.del_checkpoint(checkpoint_dir, to_del_fn)
+        self.last_records.append(tag_or_iteration)
+
+        self.save_parameters(checkpoint_dir, tag_or_iteration,
+                             model, optimizer, infos)
+
+        # with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as handle:
+        #     for iteration in self.best_records
+        #         handle.write("model_checkpoint_path:{}\n".format(iteration))
+
+    def del_checkpoint(self, checkpoint_dir, tag_or_iteration):
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "{}".format(tag_or_iteration))
+        for filename in glob.glob(checkpoint_path+".*"):
+            os.remove(filename)
+            print("delete file: "+filename)
+
-def _load_latest_checkpoint(checkpoint_dir: str) -> int:
+    def _load_latest_checkpoint(self, checkpoint_dir: str) -> int:
         """Get the iteration number corresponding to the latest saved checkpoint.
         Args:
             checkpoint_dir (str): the directory where checkpoint is saved.
         Returns:
             int: the latest iteration number. -1 for no checkpoint to load.
         """
-        checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
+        checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_last")
         if not os.path.isfile(checkpoint_record):
             return -1
@@ -46,7 +137,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
         return iteration
 
-def _save_record(checkpoint_dir: str, iteration: int):
+    def _save_record(self, checkpoint_dir: str, iteration: int):
         """Save the iteration number of the latest model to be checkpoint record.
         Args:
             checkpoint_dir (str): the directory where checkpoint is saved.
@@ -54,13 +145,20 @@ def _save_record(checkpoint_dir: str, iteration: int):
         Returns:
             None
         """
-    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
+        checkpoint_record_last = os.path.join(checkpoint_dir, "checkpoint_last")
+        checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best")
         # Update the latest checkpoint index.
-    with open(checkpoint_record, "a+") as handle:
-        handle.write("model_checkpoint_path:{}\n".format(iteration))
+        # with open(checkpoint_record, "a+") as handle:
+        #     handle.write("model_checkpoint_path:{}\n".format(iteration))
+        with open(checkpoint_record_best, "w") as handle:
+            for i in self.best_records.keys():
+                handle.write("model_checkpoint_path:{}\n".format(i))
+        with open(checkpoint_record_last, "w") as handle:
+            for i in self.last_records:
+                handle.write("model_checkpoint_path:{}\n".format(i))
 
-def load_parameters(model,
+    def load_parameters(self, model,
                         optimizer=None,
                         checkpoint_dir=None,
                         checkpoint_path=None):
@@ -81,7 +179,7 @@ def load_parameters(model,
         if checkpoint_path is not None:
             tag = os.path.basename(checkpoint_path).split(":")[-1]
         elif checkpoint_dir is not None:
-            iteration = _load_latest_checkpoint(checkpoint_dir)
+            iteration = self._load_latest_checkpoint(checkpoint_dir)
             if iteration == -1:
                 return configs
             checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration))
@@ -112,7 +210,7 @@ def load_parameters(model,
 
     @mp_tools.rank_zero_only
-def save_parameters(checkpoint_dir: str,
+    def save_parameters(self, checkpoint_dir: str,
                         tag_or_iteration: Union[int, str],
                         model: paddle.nn.Layer,
                         optimizer: Optimizer=None,
@@ -148,5 +246,3 @@ def save_parameters(checkpoint_dir: str,
             data = json.dumps(infos)
             fout.write(data)
 
-    if isinstance(tag_or_iteration, int):
-        _save_record(checkpoint_dir, tag_or_iteration)

@@ -43,12 +43,15 @@ model:
   share_rnn_weights: True
 
 training:
-  n_epoch: 24
+  n_epoch: 6
   lr: 1e-5
   lr_decay: 1.0
   weight_decay: 1e-06
   global_grad_clip: 5.0
   log_interval: 1
+  max_epoch: 3
+  last_epoch: 2
 
 decoding:
   batch_size: 128
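With the tiny/s0 settings above, max_epoch and last_epoch are handed to KBestCheckpoint as max_size and last_size (see the @@ -263,6 +266,9 @@ hunk in the Trainer), so a 6-epoch run keeps at most the 3 checkpoints with the lowest val_loss plus the 2 most recent ones. A short usage of the illustrative RetentionPolicy sketch from the top of this diff, with invented losses:

    policy = RetentionPolicy(max_size=3, last_size=2)   # max_epoch: 3, last_epoch: 2
    for epoch, val_loss in enumerate([2.0, 1.5, 1.8, 1.2, 1.9, 1.1]):   # n_epoch: 6, losses invented
        print(epoch, sorted(policy.update(epoch, val_loss)))
    # Only epochs 0 and 2 ever become deletable; {1, 3, 5} (best) and {4, 5} (last) survive.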
