## 获取测试的 logit 数据

In [1]:
!mkdir -p ./test_data
!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz
!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data


hlens.npy
logits.npy
ys_lens.npy
ys_pad.npy


In [2]:
import os
import numpy as np
import time

data_dir="./test_data"


In [3]:
logits_np = np.load(os.path.join(data_dir, "logits.npy"))
ys_pad_np = np.load(os.path.join(data_dir, "ys_pad.npy"))
hlens_np = np.load(os.path.join(data_dir, "hlens.npy"))
ys_lens_np = np.load(os.path.join(data_dir, "ys_lens.npy"))

## 使用 torch 的 ctc loss

In [4]:
import torch
torch.__version__

'1.10.1+cu102'

In [5]:
def torch_ctc_loss(use_cpu):
    if use_cpu:
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")

    reduction_type = "sum" 

    ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)

    ys_hat = torch.tensor(logits_np, device = device)
    ys_pad = torch.tensor(ys_pad_np, device = device)
    hlens = torch.tensor(hlens_np, device = device)
    ys_lens = torch.tensor(ys_lens_np, device = device)

    ys_hat = ys_hat.transpose(0, 1)
    
    # 开始计算时间
    start_time = time.time()
    ys_hat = ys_hat.log_softmax(2)
    loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)
    end_time = time.time()
    
    loss = loss / ys_hat.size(1)
    return end_time - start_time, loss.item()

## 使用 paddle 的 ctc loss

In [6]:
import paddle
paddle.__version__

'2.2.2'

In [7]:
def paddle_ctc_loss(use_cpu):    
    import paddle.nn as pn
    if use_cpu:
        device = "cpu"
    else:
        device = "gpu"

    paddle.set_device(device)

    logits = paddle.to_tensor(logits_np)
    ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')
    hlens = paddle.to_tensor(hlens_np, dtype='int64')
    ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')

    logits = logits.transpose([1,0,2])

    ctc_loss = pn.CTCLoss(reduction='sum')
    # 开始计算时间
    start_time = time.time()
    pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)
    end_time = time.time()
    
    pn_loss = pn_loss / logits.shape[1]
    return end_time - start_time, pn_loss.item()

In [8]:
# 使用 CPU

iteration = 10
use_cpu = True
torch_total_time = 0
paddle_total_time = 0
for _ in range(iteration):
    cost_time, torch_loss = torch_ctc_loss(use_cpu)
    torch_total_time += cost_time
for _ in range(iteration):
    cost_time, paddle_loss = paddle_ctc_loss(use_cpu)
    paddle_total_time += cost_time
print ("CPU, iteration", iteration)
print ("torch_ctc_loss", torch_loss)
print ("paddle_ctc_loss", paddle_loss)
print ("paddle average time", paddle_total_time / iteration)
print ("torch average time", torch_total_time / iteration)
print ("paddle time / torch time (cpu)" , paddle_total_time/ torch_total_time)

print ("")

# 使用 GPU

use_cpu = False
torch_total_time = 0
paddle_total_time = 0
for _ in range(iteration):
    cost_time, torch_loss  = torch_ctc_loss(use_cpu)
    torch_total_time += cost_time
for _ in range(iteration):
    cost_time, paddle_loss = paddle_ctc_loss(use_cpu)
    paddle_total_time += cost_time
print ("GPU, iteration", iteration)
print ("torch_ctc_loss", torch_loss)
print ("paddle_ctc_loss", paddle_loss)
print ("paddle average time", paddle_total_time / iteration)
print ("torch average time", torch_total_time / iteration)
print ("paddle time / torch time (gpu)" , paddle_total_time/ torch_total_time)

CPU, iteration 10
torch_ctc_loss 159.17137145996094
paddle_ctc_loss 159.16574096679688
paddle average time 1.718252992630005
torch average time 0.17536230087280275
paddle time / torch time (cpu) 9.798303193320452

GPU, iteration 10
torch_ctc_loss 159.172119140625
paddle_ctc_loss 159.17205810546875
paddle average time 0.018606925010681154
torch average time 0.0026710033416748047
paddle time / torch time (gpu) 6.966267963938231


## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值

In [9]:
logits_np = np.load(os.path.join(data_dir, "logits.npy"))
ys_pad_np = np.load(os.path.join(data_dir, "ys_pad.npy"))
hlens_np = np.load(os.path.join(data_dir, "hlens.npy"))
ys_lens_np = np.load(os.path.join(data_dir, "ys_lens.npy"))

In [10]:
use_cpu = False

from paddlespeech.s2t.modules.loss import CTCLoss

if use_cpu:
    device = "cpu"
else:
    device = "gpu"

paddle.set_device(device)

blank_id=0
reduction_type='sum'
batch_average= True
grad_norm_type='instance'

criterion = CTCLoss(
        blank=blank_id,
        reduction=reduction_type,
        batch_average=batch_average,
        grad_norm_type=grad_norm_type)

logits = paddle.to_tensor(logits_np)
ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')
hlens = paddle.to_tensor(hlens_np, dtype='int64')
ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')

pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)
print("loss", pn_ctc_loss.item())
    

2022-02-25 11:34:34.143 | INFO     | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True
2022-02-25 11:34:34.143 | INFO     | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance
2022-02-25 11:34:34.144 | INFO     | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}
loss 159.17205810546875


  format(lhs_dtype, rhs_dtype, lhs_dtype))


## 结论
在 CPU 环境下： torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍  
在 GPU 环境下： torch 的 CTC loss 的计算速度是 paddle 的 6.87 倍

## 其他结论
torch 的 ctc loss 在 CPU 和 GPU 下 都没有完全对齐。其中CPU的前向对齐精度大约为 1e-2。 GPU 的前向对齐精度大约为 1e-4 。