PaddleSpeech/third_party/python-pinyin/phrase-pinyin-data/Makefile

# Print a one-line summary of every user-facing target defined in this file.
# Each echo is silenced with @ so only the summary text itself is shown.
.PHONY: help
help:
	@echo "merge          update pinyin.txt and large_pinyin.txt"
	@echo "er             find r"
	@echo "tone_mark      fix misplaced tone marks in *.txt"
	@echo "check          check unexpected char"
	@echo "cedict_get     get latest cedict data"
	@echo "cedict         parse latest cedict data"

# Inputs for the two merge.py runs. The large list is the extra dictionaries
# followed by the same two files used for the base list.
merge_base_inputs := pinyin.txt overwrite.txt
merge_large_inputs := zdic_cibs.txt zdic_cybs.txt cc_cedict.txt $(merge_base_inputs)
# Scratch file that receives merge.py's stdout before it is moved into place.
merge_scratch := new.txt

# Regenerate pinyin.txt, then large_pinyin.txt.
# pinyin.txt is both an input and the destination of the first run, so the
# output goes to the scratch file and is moved over the destination only when
# merge.py exits successfully (&&). The second run reads the pinyin.txt that
# the first run has just produced, so the order of the two lines matters.
.PHONY: merge
merge:
	python merge.py $(merge_base_inputs) > $(merge_scratch) && mv $(merge_scratch) pinyin.txt
	python merge.py $(merge_large_inputs) > $(merge_scratch) && mv $(merge_scratch) large_pinyin.txt

# List the lines of overwrite.txt that contain 儿 but contain neither "ér"
# nor "er".
# grep reads the file directly (no cat), and a single inverted grep with two
# -e patterns drops a line when either pattern matches, which is the same
# selection as two chained `grep -v` stages.
# The final grep exits non-zero when no line is selected, so make reports an
# error for this target when nothing is found.
.PHONY: er
er:
	grep 儿 overwrite.txt | grep -v -e ér -e er

# Move tone marks that sit on the wrong vowel, editing every *.txt file in the
# current directory in place.
# All substitutions run in a single sed invocation; sed applies the -e
# expressions to each line in the order given, so the result matches running
# them as separate passes. The shell expands *.txt itself, so file names are
# passed to sed directly instead of being parsed out of `ls` output.
# The 's/ùi/uì/g' rule is listed once: a second application can never match
# because the first one already replaced every occurrence and no other rule
# produces that sequence.
.PHONY: tone_mark
tone_mark:
	sed -i \
		-e 's/ùo/uò/g' \
		-e 's/oǔ/ǒu/g' \
		-e 's/ùi/uì/g' \
		-e 's/íe/ié/g' \
		-e 's/ǐe/iě/g' \
		-e 's/aō/āo/g' \
		-e 's/ìan/iàn/g' \
		-e 's/īan/iān/g' \
		*.txt

# Regular expression handed to rg by the check target (two alternatives).
# NOTE(review): presumably these are look-alike characters that should not
# appear in the data files - confirm.
check_unexpected_chars := ɡ|ɑ

# Run tone_mark first (so the *.txt files are rewritten in place before the
# search), then print every match of the pattern above. rg is given no path,
# so it searches from the current directory.
# The leading '-' tells make to ignore rg's exit status, so the target does
# not fail when rg finds nothing.
.PHONY: check
check: tone_mark
	-rg '$(check_unexpected_chars)'

# Shared first step of both cedict targets: install/upgrade the packages
# listed in requirements_dev.txt with pip.
cedict_install_deps := python -m pip install -U -r requirements_dev.txt

.PHONY: cedict_get cedict

# Install the development requirements, then run get_latest_cc_cedict.py.
cedict_get:
	$(cedict_install_deps)
	python get_latest_cc_cedict.py

# Install the development requirements, then run parse_latest_cc_cedict.py.
cedict:
	$(cedict_install_deps)
	python parse_latest_cc_cedict.py
-												E2E/Streaming Transformer/Conformer ASR (#578)

* add cmvn and label smoothing loss layer

* add layer for transformer

* add glu and conformer conv

* add torch-compatible hack, mask funcs

* not hack size since it exists

* add test; attention

* add attention, common utils, hack paddle

* add audio utils

* conformer batch padding mask bug fix #223

* fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2

* fix ci

* fix ci

* add encoder

* refactor egs

* add decoder

* refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils

* refactor docs

* add fix

* fix readme

* fix bugs, refactor collator, add pad_sequence, fix ckpt bugs

* fix docstring

* refactor data feed order

* add u2 model

* refactor cmvn, test

* add utils

* add u2 config

* fix bugs

* fix bugs

* fix autograd maybe has problem when using inplace operation

* refactor data, build vocab; add format data

* fix text featurizer

* refactor build vocab

* add fbank, refactor feature of speech

* refactor audio feat

* refactor data prepare

* refactor data

* model init from config

* add u2 bins

* flake8

* can train

* fix bugs, add coverage, add scripts

* test can run

* fix data

* speed perturb with sox

* add spec aug

* fix for train

* fix train logic

* fix logger

* log valid loss, time dataset process

* using np for speed perturb, remove some debug log of grad clip

* fix logger

* fix build vocab

* fix logger name

* using module logger as default

* fix

* fix install

* reorder imports

* fix board logger

* fix logger

* kaldi fbank and mfcc

* fix cmvn and print params

* fix add_eos_sos and cmvn

* fix cmvn compute

* fix logger and cmvn

* fix subsampling, label smoothing loss, remove useless

* add notebook test

* fix log

* fix tb logger

* multi gpu valid

* fix log

* fix log

* fix config

* fix compute cmvn, need paddle 2.1

* add cmvn notebook

* fix layer tools

* fix compute cmvn

* add rtf

* fix decoding

* fix layer tools

* fix log, add avg script

* more avg and test info

* fix dataset pickle problem; using paddle 2.1; num_workers can be > 0; ckpt saved in exp dir; fix setup.sh

* add vimrc

* refactor tiny script, add transformer and stream conf

* spm demo; librispeech scripts and confs

* fix log

* add librispeech scripts

* refactor data pipe; fix conf; fix u2 default params

* fix bugs

* refactor aishell scripts

* fix test

* fix cmvn

* fix s0 scripts

* fix ds2 scripts and bugs

* fix dev & test dataset filter

* fix dataset filter

* filter dev

* fix ckpt path

* filter test, since librispeech will cause OOM; all test WER will be worse because of the mismatch between train and test

* add comment

* add syllable doc

* fix ds2 configs

* add doc

* add pypinyin tools

* fix decoder using blank_id=0

* mmseg with pybind11

* format code
											
										
										
											3 years ago
+								.PHONY: help
 								help:
 									@echo "merge          update pinyin.txt and large_pinyin.txt"
 									@echo "er             find r"
 									@echo "check          check unexpected char"
 									@echo "cedict_get     get latest cedict data"
 									@echo "cedict         parse latest cedict data"
 								.PHONY: merge
 								merge:
 									python merge.py pinyin.txt overwrite.txt > new.txt && mv new.txt pinyin.txt
 									python merge.py zdic_cibs.txt zdic_cybs.txt cc_cedict.txt pinyin.txt overwrite.txt > new.txt && mv new.txt large_pinyin.txt
 								.PHONY: er
 								er:
 									cat overwrite.txt|grep 儿|grep -v ér|grep -v er
 								.PHONY: tone_mark
 								tone_mark:
 									ls *.txt | xargs -L 1 sed -i 's/ùo/uò/g'
 									ls *.txt | xargs -L 1 sed -i 's/oǔ/ǒu/g'
 									ls *.txt | xargs -L 1 sed -i 's/ùi/uì/g'
 									ls *.txt | xargs -L 1 sed -i 's/íe/ié/g'
 									ls *.txt | xargs -L 1 sed -i 's/ùi/uì/g'
 									ls *.txt | xargs -L 1 sed -i 's/ǐe/iě/g'
 									ls *.txt | xargs -L 1 sed -i 's/aō/āo/g'
 									ls *.txt | xargs -L 1 sed -i 's/ìan/iàn/g'
 									ls *.txt | xargs -L 1 sed -i 's/īan/iān/g'
 								.PHONY: check
 								check: tone_mark
 									-rg 'ɡ|ɑ'
 								.PHONY: cedict_get
 								cedict_get:
 									python -m pip install -U -r requirements_dev.txt
 									python get_latest_cc_cedict.py
 								.PHONY: cedict
 								cedict:
 									python -m pip install -U -r requirements_dev.txt
 									python parse_latest_cc_cedict.py