From a679daece930be33b683fbc70eec09ff9119904d Mon Sep 17 00:00:00 2001 From: cjaneway <72281149+cjaneway@users.noreply.github.com> Date: Wed, 4 Nov 2020 08:05:10 -0800 Subject: [PATCH] Add files via upload Just copied the folder and pasted it into the data/librispeech to get the librispeech.py to run properly and avoid the error of numpy not seeing the data_utils.utility module to then import download and unpack. It works now and get the download. --- data/librispeech/data_utils/data.pyc | Bin 0 -> 15409 bytes .../data_utils/featurizer/text_featurizer.py | 68 +++++++++++++ data/librispeech/data_utils/normalizer.pyc | Bin 0 -> 4254 bytes data/librispeech/data_utils/utility.py | 90 ++++++++++++++++++ 4 files changed, 158 insertions(+) create mode 100644 data/librispeech/data_utils/data.pyc create mode 100644 data/librispeech/data_utils/featurizer/text_featurizer.py create mode 100644 data/librispeech/data_utils/normalizer.pyc create mode 100644 data/librispeech/data_utils/utility.py diff --git a/data/librispeech/data_utils/data.pyc b/data/librispeech/data_utils/data.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7daa7a484be740dd9536b889bee30f629d303d68 GIT binary patch literal 15409 zcmb_jOK=>=dG6W8Yw;jJfD{2mmPS@&EE5patK@jYC=y9iQ6f>yKonzG(qORD3ky|NZyh|FgILKXcQ+`sL5uw#t6$`2X&frIKHx;48I>r=|Rw>RPH-Q=2tq@x3nZ zb+uVH?_=^lrZ&gS`?%_ktIcs4i}HlpoXE?QYI8C#PpQpDUY=HaGiq~2UMEy{R`ur8 z=9~dHsk*0BZ(eQAoA)WzT~NK#YV)*tZ>a8~>Mg0wCG$S5x@T1HtlB(l-e*+zIn_I- zHqYUGR_SHc!6N3=1EuaO^@Gh7<&Ue4<|(W>`5Qd0L}BU$VPgAU>e)Lw)UlUFvE9KZ zig!XU_%aB0>^oi@M1#ck27VBkzWrGDZV-6FbvG#l& z+d-J>xZ|}on(Z5&?{~F%+OfVp2x8sSVVbN@2f#`-Ej-eBJiP5B>JC!v2EBe1r>0>F z4SsMZNP;LdHC#|1bEa-*5Vlj)UpEcoXy`_5uWMfC@JfW-o)-okon!#dp!&VR4pHzp z_(rxIGiU*=8-1ah@_zM&?coBPsXN@eP>txFN!ixSFx6ePT{0XH|m{b~tNwqh{-DS;`dN3=j*5HWBJO?`D(3XtVF<_ z$Xdx0cuPjM`d;ky>^oqJZMPG2b>B;OTlS}XA9TI=z|Je>6a=BK?~$3)XkUleNME3E zNhkcIu@@%oIOvm=+1^SI`uf;pST%T$XQYZ!PkW)8q<#gfYrxNLn?ou{>}t?XUuZ|Y zK2C*ibBxdi<9vOmI_U}Eu1BFhJb7}j!u~F1>;yY4yLu*w1}X;-5I7+?&-Pw||7ICx z-V+E?R9)2&G~L8bv>pPRAGLMY^`o}yR#$K$PMC0rXx=^7&yXU*6i|1Abl1)+?Iuu7 zG_d!R^Y#u-KLnAT?t0Ja4o!4d z=D8)sr=Fz(>DVmXf)12N%f6Z5B;4KA*yp}(?*La>k+Hu6<%} zL*Jde_3bE*b+c4iFW z$$pWYw_q-a8l&$~ec_GQZ2$7fD=m(=4H|)Ow5{(!4nhl4Y641z{Sf<{jL3Erb*sk+ z($tT7E@Xz^vK@J$02w-oLw#TYRLAdY-FHiw463;)kZai=bO0yif5Hw3_yn^)hHmnuzzgcJ=zAbOmR znaIFgB%vs2N|JoQn*CUcM&!V|5I}2oE_gs1a8?Si-B6Djyq=Q6oJvp8=ws!)0`e&A%`11IQa-H?=cEHHHCV9CX{_aXx`gKm ziZz`VDMTO=sD!!j2Ak&mbz^hs&cN6iL1K4KQVtayR%AZSO%?7+`V+n`A&iA<^BA z!+l8UQ5>`99L_3J4^fS^y(-b@zc_^8kO&UJJEXWnE>F+l@wgR|vVxN=rLw*z8k9Ve zH&ME{8};`ei(T^d>LbFuVtE%78w2JQuk738eSC{x;|Zq&CGid z#Vme8%chS&9|jJ)xGuKgDEdN1f772aVv~x;cqCN-r9)sJ3WU+wO6>@Bg41F=9C`s1 z^P`}vy-Uy~Ok)`V(x4CLro=M0adenqB_xRG4|FIYa!&Yf=~@A<(50~dV0s+y3cqF0 z^{J)BgR=u4ZX#6DFy=8$tCpSSI5pq!y*Ty`)@&HX>G@*1grWh*w$+rw!$XWPM3|&r z*w%8U%O%SzTH-XJA>d4MC?#emhEc;^k#p@PgKfURm_;?ZJ3NvS{gBmw8G4XxaBLik zBLg81L^v;?%AxQ}d04%EzkAT+!Ejz-9q}u-+Q^(@14YDVA~7-X3ptgj5+k}qK|ETQ zgntGHpOPpG9YG|(86)Yak`sz2HS;3Q{c>&5TEKZfi*q0RAAUp^YS5bI~KTAEVsfTs-a7;ZMR}UvtN8)flR_X|cc``qb|A@iGzFsEip@3DJ zo+6JFnX!m_t=oTAq=Fa3v9uC7EGZ<2eln#6r6Dg|@g4~)}^QA<|`(+@?V^B~fn=9KzNIkeggWfhA z56~n(LkS}bo>}>*WT}qKEqy`2gmg$BSfy?nwd~x&zwkPYdsx8GkUef@2g;=ni}b1Bu*`&0Aty0Z~c&C z=%)qN8Bsf34@W61EOO)47{0A=mSLo_1s9Ud1@WAEa3A24Wqy>I zjrBsIaxb?J^0SkLXbQXw!W4yUQ#wJMq_&tA z${Xe#ku$!R-t=$5%%Hfs%JW3Wy#-Z`{TDG*VpA++q_CQkBbQvO+(Dx;F3{uo8lbWh zV5SC~BtY24i@OIG7{~DoxCqrhE-*2I-Owvyie#=R7gi?h3f2stNSz@#d=XTK)>odn zs`NNE@NpH_K>Z%W3%&^@uEDL<2&6J1@b%H#+H!}8na3Zuw!XrS4vlo%-JsvsKDJ|Z zs9{w^_18x&C%`Te72u7alZrLO`03LF6C@!W*X^8+PY>&vY(T!s(T#ZW22kkaBFILJ zFOxpmxkd#dwp7f&HByi1DZ43lD-x&Ndn0wTLtY3E3ESl{HLjqQTeFwNfXI?vx&8-> ziuPwvM_8lO|0Z5Z8zK*98F0o9&25WfkJ_+p6m3shJ8#n3IehX;X&M{sg#>d~0m zlPnPnaBNilNJ%Ba5@Tu~9?^&D(FC7#UQ8xwf{C2_%Cf?rs7I1eV!uR&iA(bc6HGCV zald8(T&Sg6L%{w0v2K%jw|Au5v~>Gt>1G@!Oa@0Pd`X(&c4D$$sBnRvGV)Wv;5oGa zwWW4os$pGNh{QkWPu0}DkHld^67BQLblT|NA--XpGuX1mrvt_W`x_ZO&&o%2^$6}> zU43d9xVL|!@OP^ig1p%NxQaU#CmbCT3K7TlK*d@0-5R1jK4U4g{;!KsI%M}av_YNk z;30tx2|?}agEsS-{Lm5H;O^0I#Xey7R*ZXiT}$-^ky-)b)DX zzW2`00SLSURKLd(c`6fP=~S6x8F!6{42v8SVauhRu)$Mv00Tn?BSVsZ;%t9_hpI8s z(y(T0XRYVzvnZcMjKw^cP^zPS8lU;vf;9(^?2J6mS%xd806@@}bvX`SKjCA8k5Boy z$;T}`%m~uQ^ph?mhZ*q-OVXLco_e8Vv+6?gy@v!-;=rqmFTW~hlIO9GkA#rOHh~0; zAWq-*MQ*+gIRqK9(WG~sbnr0#Uv^`{5OOs^Bu7iq$Pd~Gh!kSGP!Ct(@{M$&yys6V zi#>2?G0=z)y2{3Vr%#@mpuHTbQQw^53`1Nl$#4_*JHyK*aMO;;r>87;E(7@=gu`Q^ zCeK)ffa8G|VlwhTMT0+*kM+q9@yOg~>RpL0o0L1P*OSOr$;z89hGZqts;-cP*-VUt zvzy>Ghso?hlW$G}=W~GmC09d2tI4mn@@;fIdFAkW7-4$C;)`CR>*v^lX-*^ndpuJD ze1Wz6H$FK5NC%z{@`7yr6%@d~P}$Hi*nOjzp|S71Om!{B36%I!9wbFF1&X(>F5UjN zx`hr1%SzN#c;+tv?1mS^%=hSdn+PX76uRJ2!(c=f@+=6!;GqYfzdv+$0+*N&c-Lj} z*GPGh)?Q3FqERB~29e&FqNf?3NTfbQU`51$IbB??jF~_IY2!gRMdTTEBI!7y#R~$6)?sM;B82BGx=LA@qI>NOzbp|c{7AcHmMw^4@@c- z*UXrZ2U|R5$Z+JF9MtyOyYQ(CuH%M^(-1PbNzqTcUako60Ovy}-e9kwwkhsl^tC$E)}U8l|FjCC2b6EN&UHWeeM zl2|f)|IL}vxrkZ+7oUU?XEo7;F)*Kf&LUvDgpaTzS&?j6r>)Dr4<#;(V-1e{vB(DO zX_grIFSHlb`yh*q2z?P6%*?iR`w9r2QVQh0k)3;4l_+ZgCbz{-inr!dv;eFK_=hGC zR^}rr8duzJPvkV(i#f7<0&a`o8l>?ii`aU}4%{fA1-D&1w?9PT-dpMsJ()EHqXg~= z&L+5>?g0OR-En4db<fT>j zGVeX>Gss?@xH$`UA)!I&ka3 z4cS~WmUH341|~(Q185?~VU}-JhOx8KTA&25z)Eg|ps6#4bI-(m@>B46A%SifpEZlI zSK|ED3{lV9mkd#_$Q9XTj>v9C<~k*{*Gaje^Af~_{b}-1?cJ(s?but@WX2YwaZk>; z6gdxORf@GdDZ65Sg$zmo3_82a(KO}iU#xL^NK$6>J`{K$$jFI^U6LW>o` zJ~@FRieGY;3B6kcD@!k*^a-O>XcRDol&7W!xhHWzFaHC!1*>NZBpDDCG#;$ zpYXvAkO)es@0$3MnNX7HF&n4x;O!X@V+TeQ!+P8onkr zEThjV>R!Zi*%Bq(0A%unDB*ucq3V}BFu`YvO$bIvMez(Eyq6)#sjB5_9Y(-ccnPu- zQWhHd%uk@`3THx;BH9?+g(9L-r#hcNX*SFEe^FPr_tfnz8CB=EEWm7lictW(%ok`+ z=rrm9=rZViv`ncP&S@1j#BNNz%sT3Pevd{h`5l~>8%UfzJ{Kn`HC)LGBq){HinveA zi|D%332?tSY#$WDi@t{l?JNyGltilttJl}<4MYYdhhZ|k4C3OZ@0F#H}6d}%vdb^#1PnR_WaNVYdAZ2StTN(|n@YO??{1^7^(0`kaphL-(NetQ8z z2hsvx%dkI0Ktyg~NT@Dfpvm{-(cYFA`=ek>%CzjulJAkA6t37puCqw_j+k}CI~Q^t z*v>D+7NGL^D*s2fgS3&UKPPJtv(Co58dl#oiGZMiga6>@|^>-m*-7> z5!d%{7uI|wrGj2j#1|>}y%U5qxna$4`)uFP> zf%kG%%g)b0L0kpwn^X|gt?KL&1pc?+)O`mc{&^^XbJq1HlQ~%ggMkOvHSsF9hf7bm z(&h?A*;jiqJ&v!+IDdj+&NV*1&&Q8)DEaMk=l57m=ha~VP-Z|XwT}z(2hL}#W*F9C zC{h9?5||J_(4kgyhDLu7!$-Ut+leh4g`2C6TI@DZ!i>L*JP5+`3F2Q;&cWGMS(b?cO79+8duH=AGl|3F8}oqQ4F$+ayXVkJqYan zfdhqU?)}ovz}-%o%dmIssP3Z+4@27FLYBn`iz10Y@Sqx&QzG literal 0 HcmV?d00001 diff --git a/data/librispeech/data_utils/featurizer/text_featurizer.py b/data/librispeech/data_utils/featurizer/text_featurizer.py new file mode 100644 index 000000000..89202163c --- /dev/null +++ b/data/librispeech/data_utils/featurizer/text_featurizer.py @@ -0,0 +1,68 @@ +"""Contains the text featurizer class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import codecs + + +class TextFeaturizer(object): + """Text featurizer, for processing or extracting features from text. + + Currently, it only supports char-level tokenizing and conversion into + a list of token indices. Note that the token indexing order follows the + given vocabulary file. + + :param vocab_filepath: Filepath to load vocabulary for token indices + conversion. + :type specgram_type: basestring + """ + + def __init__(self, vocab_filepath): + self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file( + vocab_filepath) + + def featurize(self, text): + """Convert text string to a list of token indices in char-level.Note + that the token indexing order follows the given vocabulary file. + + :param text: Text to process. + :type text: basestring + :return: List of char-level token indices. + :rtype: list + """ + tokens = self._char_tokenize(text) + return [self._vocab_dict[token] for token in tokens] + + @property + def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ + return len(self._vocab_list) + + @property + def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ + return self._vocab_list + + def _char_tokenize(self, text): + """Character tokenizer.""" + return list(text.strip()) + + def _load_vocabulary_from_file(self, vocab_filepath): + """Load vocabulary from file.""" + vocab_lines = [] + with codecs.open(vocab_filepath, 'r', 'utf-8') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + vocab_dict = dict( + [(token, id) for (id, token) in enumerate(vocab_list)]) + return vocab_dict, vocab_list diff --git a/data/librispeech/data_utils/normalizer.pyc b/data/librispeech/data_utils/normalizer.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5364f86a45fa0eb468277f27fe700901144e1631 GIT binary patch literal 4254 zcmb_fTW=dx5T3P@&7}@0h2AJg2UP-Bw2p)XhzKg6rBYD|(ng}f655=2kDU#BWA`|1 zD(8h(2#KH22gJ|d$MDWOyu&wh)*HJm6hxf(_}q5R%s1benQi~Rw))$zKZOITKW+Se zwBu3r13D2=AFW4GixQ907WG@?@whGHHuc+XyddKR>MywQA|;E|_vKmim#DuaeV>wL zN>`}A;`S|3vP$VG>Yu`RnM{XI?gIJG}?|! zqq4kAbrL@?Wz~zZfw~?VyN*WhR(UeECXCY}FRfcx#zGVy#8sSUZsrBd6lI**a5&Be z7V}$fVGRqV(NUP{EFPMwme9fU$Kxo@x6LRunYAxs;+9iqtI?(bbP=ncFrc5IV*n`Z zL)&62`Cq^Vo{t2)Xn_rW?HGf4DQqsJ7dO4(fP z8kG+fpvqNhbf$C`sc{zDRfSJwqsSceR%H+mWk-m~a2O}1&~{J7l`6{oAdXC=x>;^< zN@lKHRd#RI4eDN;Br1waGqA~_lZG>5_K_Nvc`Ex^Xit5CN6UM0Wv;5-94AbH4&^*n z&$$_oigTQzd-*tt*qBj#U@|q#OOpLNM$XJ&V#@a}r>51KJcT#%mhpw z+_pL!7%&`mHOfElY~g1u?jH`GhZ82(jAwS3`?k~r+c2i5Zd+9~U=MVX=-tGK9c=hB zKKEicac5`#l&5?1cET-n*_IVc3T`K_Bmsma_?HDuX14J(}%iK(z_obDQUeW*&1 z1eB^*{yk=M;&At8z2js;VH0htpgsTtkuE1OI~qsLnltxkBev9_c%{D7$=G~amU+3U zZVzk5;~vi6GG_hUJ@{L|yC8_WHqR6?5A7AD5iOY-PT%+s#7PO510R8jzlbGtgCiAU zK27j@nUFY`2_Aa%oktl$tVjDTnk*7#CPZPI+x-Q81p6lo^bn~bTa(#E z7KeQvBHwnm1@T+eHd>&-cWNF0gc(-zsw^GzTB6mwO;dJqEJ8RCi-IwE{YJanh9fXXC85iotOiIa z>?usmkWt1nEb3yJq^aIVZ(}b{&Bg$P8g1M(rr0iw8SHIDrbzO`jb3jf(pHBEcS*Hz zoEdw?q2;_6hA|>>7+yxd;xtCS?{&QO)*0_yYxMZ7pTB?n$p3J_CBXD6d^m>*IFS8I zbb)^`Xwmsg+ma}&_duopmT~5@4M&ptnmcs(j^EA@wwqv7r?1AQ$gx51J z)GqVPNHzu5ltz+^SXiAM(5rKrf)`8~BlchofiB*S?ctNBh>&5DYZov?d}&aKWE{Hc zN@SX~Of~a($yz?GoLmOWHG&Y&aT2<9*#QCjNIW-%#qupLW?*%`uPc;I^+dX5(E=$85bOZ$mZ)jjz^oAHD}7)d1TSGC zco~iOWCr_M`iWuAaY5_~Z7$K`IwlIz0lX<(#ti;McGU|lg2%~TkYI9wSuy&~c&eIM^J{JsB2ZVFj?FM zVaLb?2{@w#{IV6CMkBFmn({2q)V4IQP{G;hwrTxzVl3jEe}|9cJ4~&Pch>X4B;RX$ zm%MfFO>e6!sIc;3nCaAnp%e;XnC8(q;UTj-45NI2KI6nCMQ{e4;5;|{1}ba~_^@Nm zrT705bik3A<9)@)`DguwHWY7roz+ffWqHN#3es1xR_i<*^ZRcY2AHCNNf2<%kbs6% zRvd~~1GdPr$m7bO$Mpe{n=nPZG_xiwa~xqp= min_duration): + manifest.append(json_data) + return manifest + + +def getfile_insensitive(path): + """Get the actual file path when given insensitive filename.""" + directory, filename = os.path.split(path) + directory, filename = (directory or '.'), filename.lower() + for f in os.listdir(directory): + newpath = os.path.join(directory, f) + if os.path.isfile(newpath) and f.lower() == filename: + return newpath + + +def download_multi(url, target_dir, extra_args): + """Download multiple files from url to target_dir.""" + if not os.path.exists(target_dir): os.makedirs(target_dir) + print("Downloading %s ..." % url) + ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " + + target_dir) + return ret_code + + +def download(url, md5sum, target_dir): + """Download file from url to target_dir, and check md5sum.""" + if not os.path.exists(target_dir): os.makedirs(target_dir) + filepath = os.path.join(target_dir, url.split("/")[-1]) + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): + print("Downloading %s ..." % url) + os.system("wget -c " + url + " -P " + target_dir) + print("\nMD5 Chesksum %s ..." % filepath) + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) + return filepath + + +def unpack(filepath, target_dir, rm_tar=False): + """Unpack the file to the target_dir.""" + print("Unpacking %s ..." % filepath) + tar = tarfile.open(filepath) + tar.extractall(target_dir) + tar.close() + if rm_tar == True: + os.remove(filepath) + + +class XmapEndSignal(): + pass