#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # https://github.com/pytorch/fairseq/blob/master/LICENSE from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import argparse import sys import sentencepiece as spm def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", required=True, help="sentencepiece model to use for decoding") parser.add_argument("--input", default=None, help="input file to decode") parser.add_argument("--input_format", choices=["piece", "id"], default="piece") args = parser.parse_args() sp = spm.SentencePieceProcessor() sp.Load(args.model) if args.input_format == "piece": def decode(l): return "".join(sp.DecodePieces(l)) elif args.input_format == "id": def decode(l): return "".join(sp.DecodeIds(l)) else: raise NotImplementedError def tok2int(tok): # remap reference-side (represented as <>) to 0 return int(tok) if tok != "<>" else 0 if args.input is None: h = sys.stdin else: h = open(args.input, "r", encoding="utf-8") for line in h: print(decode(line.split())) if __name__ == "__main__": main()