# -*- coding: utf-8 -*-
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import xml.dom.minidom
import xml.parsers.expat
from xml.dom.minidom import Node
from xml.dom.minidom import parseString

# Note: XML has five special characters: & < > " '
# There are two ways to carry them inside text content.
#
# 1. Wrap the string that contains them in a CDATA section, e.g.
#        <![CDATA[ text with & < > " ' ]]>
#
# 2. Replace each character with its XML escape sequence:
#        &   ->  &amp;
#        <   ->  &lt;
#        >   ->  &gt;
#        "   ->  &quot;
#        '   ->  &apos;
#    e.g.
#        &quot;name&quot;

# NOTE(review): in the reviewed copy of this file the literal tag delimiters
# were missing from both patterns, which left expressions whose "xml" group
# could only ever match the empty string (so parsing always failed).  They are
# restored here as <speak>...</speak>, the SSML root element that encloses
# say-as tags.  Confirm against the upstream file.
_SPEAK_BLOCK_RE = re.compile(r"<speak>.*?</speak>", re.M | re.S)

# group(1): text before the block, group(2): the <speak> block itself,
# group(3): text after the block.  The leading greedy ``.*`` means that, when
# several blocks are present, the last one is the one that is captured.
_MIXED_TEXT_RE = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$',
                            re.M | re.S)


def _split_mixed(mixstr):
    """Return ``(before, xml, after)`` for *mixstr*, or None without a block."""
    mat = _MIXED_TEXT_RE.match(mixstr)
    if mat is None:
        return None
    return mat.group(1), mat.group(2), mat.group(3)


class MixTextProcessor():
    """Split strings that mix plain text with one ``<speak>`` XML block.

    Whitespace is never stripped from the pieces, because attribute lists
    inside XML tags rely on it.
    """

    def __repr__(self):
        # Must return a str: printing and returning None makes repr() raise
        # TypeError.
        return "@an MixTextProcessor class"

    def get_xml_content(self, mixstr):
        """Return the first ``<speak>...</speak>`` block found in *mixstr*.

        Args:
            mixstr: Text that may contain an XML block.

        Returns:
            The matched block including its tags, or None when there is none.
        """
        found = _SPEAK_BLOCK_RE.search(mixstr)
        return found.group(0) if found else None

    def get_content_split(self, mixstr):
        """Split *mixstr* into its non-XML and XML parts, in order.

        Args:
            mixstr: Text that may contain an XML block.

        Returns:
            ``[before, xml, after]`` (pieces may be empty strings) when a
            block is present, otherwise ``[mixstr]``.  Punctuation and
            whitespace are preserved.
        """
        parts = _split_mixed(mixstr)
        if parts is None:
            return [mixstr]
        return list(parts)

    @classmethod
    def get_pinyin_split(cls, mixstr):
        """Split *mixstr* into ``[sub_sentence, syllables]`` pairs.

        Text outside the XML block is paired with an empty syllable list;
        the block itself is expanded by ``DomXml.get_pinyins_for_xml``.

        Args:
            mixstr: Text that may contain an XML block.

        Returns:
            A list of ``[text, [syllable, ...]]`` pairs in document order.

        Raises:
            xml.parsers.expat.ExpatError: If the block is not well-formed XML.
        """
        parts = _split_mixed(mixstr)
        if parts is None:
            return [[mixstr, []]]

        pre_xml, in_xml, after_xml = parts
        pairs = []
        # Leading plain text carries no syllables.
        if pre_xml:
            pairs.append([pre_xml, []])
        # [(sub sentence, [syllables]), ...] for the XML block.
        pairs.extend(DomXml(in_xml).get_pinyins_for_xml())
        # Trailing plain text carries no syllables.
        if after_xml:
            pairs.append([after_xml, []])
        return pairs

    @classmethod
    def get_dom_split(cls, mixstr):
        """Split *mixstr* into plain text pieces and serialized child tags.

        Args:
            mixstr: Text that may contain an XML block.

        Returns:
            A list holding the text before the block (if non-empty), the
            serialized children of the block's root (text and say-as tags),
            and the text after the block (if non-empty).  ``[mixstr]`` when
            there is no block.

        Raises:
            xml.parsers.expat.ExpatError: If the block is not well-formed XML.
        """
        parts = _split_mixed(mixstr)
        if parts is None:
            return [mixstr]

        pre_xml, in_xml, after_xml = parts
        pieces = []
        if pre_xml:
            pieces.append(pre_xml)
        pieces.extend(DomXml(in_xml).get_text_and_sayas_tags())
        if after_xml:
            pieces.append(after_xml)
        return pieces


class DomXml():
    """Thin wrapper around a minidom document parsed from an XML string."""

    def __init__(self, xmlstr):
        """Parse *xmlstr*.

        Raises:
            xml.parsers.expat.ExpatError: If *xmlstr* is not well-formed XML.
        """
        self.tdom = parseString(xmlstr)  # Document
        self.root = self.tdom.documentElement  # Element
        self.rnode = self.tdom.childNodes  # NodeList

    def _collect_text(self):
        """Gather text found up to two element levels below the top nodes."""
        texts = []
        for top in self.rnode:
            if top.nodeType == Node.TEXT_NODE:
                # minidom Text nodes expose their content as ``data``.
                texts.append(top.data)
                continue
            for child in top.childNodes:
                if isinstance(child, xml.dom.minidom.Text):
                    texts.append(child.data)
                    continue
                for leaf in child.childNodes:
                    if isinstance(leaf, xml.dom.minidom.Text):
                        texts.append(leaf.data)
                    else:
                        # Deeper nesting is not descended into, only reported.
                        print("len(nodes of x3):", len(leaf.childNodes))
        return texts

    def get_text(self):
        """Return a list with all text content of the XML document."""
        return self._collect_text()

    def get_xmlchild_list(self):
        """Return all text content (without tags) and print the list."""
        res = self._collect_text()
        print(res)
        return res

    def get_pinyins_for_xml(self):
        """Return ``[text, syllables]`` pairs for the XML content.

        Whitespace is removed from every text piece.  Text that sits
        directly under the root gets an empty syllable list; text inside a
        child element gets that element's ``pinyin`` attribute split on
        single spaces, or an empty list when the attribute is absent.
        """
        res = []
        for top in self.rnode:
            if top.nodeType == Node.TEXT_NODE:
                res.append([re.sub(r"\s+", "", top.data), []])
                continue
            for child in top.childNodes:
                if isinstance(child, xml.dom.minidom.Text):
                    res.append([re.sub(r"\s+", "", child.data), []])
                    continue
                # Always bind the syllables for this element, so an element
                # without the attribute neither raises nor inherits the
                # syllables of a previous sibling.  Non-element children
                # (e.g. comments) have no hasAttribute().
                if (child.nodeType == Node.ELEMENT_NODE and
                        child.hasAttribute('pinyin')):
                    pinyins = child.getAttribute("pinyin").split(" ")
                else:
                    pinyins = []
                for leaf in child.childNodes:
                    if isinstance(leaf, xml.dom.minidom.Text):
                        res.append([re.sub(r"\s+", "", leaf.data), pinyins])
                    else:
                        print("len(nodes of x3):", len(leaf.childNodes))
        return res

    def get_all_tags(self, tag_name):
        """Print every *tag_name* element that has a ``pinyin`` attribute.

        For each such element the tag name, the attribute value and the
        data of its first child are printed.
        """
        for elem in self.root.getElementsByTagName(tag_name):
            if not elem.hasAttribute('pinyin'):
                continue
            # An empty element has no first child, and an element child has
            # no ``data``; print an empty string instead of raising.
            text = getattr(elem.firstChild, "data", "")
            print(elem.tagName, 'pinyin', elem.getAttribute('pinyin'), text)

    def get_text_and_sayas_tags(self):
        """Return the serialized children of each top-level node, in order.

        Text children come back as (XML-escaped) text and element children
        as their full markup, as produced by ``Node.toxml()``.
        """
        res = []
        for top in self.rnode:
            if top.nodeType == Node.TEXT_NODE:
                res.append(top.data)
            else:
                res.extend(child.toxml() for child in top.childNodes)
        return res